//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//*     http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_default_weighted_pred_av8.s
//*
//* @brief
//*  Contains function definitions for default weighted prediction.
//*
//* @author
//*  Kaushik Senthoor R
//*
//* @par List of Functions:
//*
//*  - ih264_default_weighted_pred_luma_av8()
//*  - ih264_default_weighted_pred_chroma_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//*******************************************************************************
//* @function
//*  ih264_default_weighted_pred_luma_av8()
//*
//* @brief
//*  This routine performs the default weighted prediction as described in sec
//*  8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
//*
//* @par Description:
//*  This function gets two ht x wd blocks, calculates their rounded-average and
//*  stores it in the destination block.
//*
//* @param[in] puc_src1:
//*  UWORD8 Pointer to the buffer containing the first input block.
//*
//* @param[in] puc_src2:
//*  UWORD8 Pointer to the buffer containing the second input block.
//*
//* @param[out] puc_dst
//*  UWORD8 pointer to the destination where the output block is stored.
//*
//* @param[in] src_strd1
//*  Stride of the first input buffer
//*
//* @param[in] src_strd2
//*  Stride of the second input buffer
//*
//* @param[in] dst_strd
//*  Stride of the destination buffer
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None
//*
//* @remarks
//*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
//*
//*  Dispatch is on wd only: 16 and 8 select their own loops, every other
//*  value falls through to the 4-wide loop.
//*  Each loop runs its body once before testing ht, so ht must be a positive
//*  multiple of the rows handled per iteration (4 rows for wd 4 and 8,
//*  8 rows for wd 16).
//*
//*******************************************************************************
//*/
//void ih264_default_weighted_pred_luma_av8(UWORD8 *puc_src1,
//                                          UWORD8 *puc_src2,
//                                          UWORD8 *puc_dst,
//                                          WORD32 src_strd1,
//                                          WORD32 src_strd2,
//                                          WORD32 dst_strd,
//                                          WORD32 ht,
//                                          WORD32 wd)
//
//**************Variables Vs Registers*****************************************
//    x0      => puc_src1
//    x1      => puc_src2
//    x2      => puc_dst
//    w3      => src_strd1
//    w4      => src_strd2
//    w5      => dst_strd
//    w6      => ht
//    w7      => wd
//
    .text
    .p2align 2
    .include "ih264_neon_macros.s"



    .global ih264_default_weighted_pred_luma_av8

ih264_default_weighted_pred_luma_av8:

    // push_v_regs / pop_v_regs are defined in ih264_neon_macros.s, which is
    // not visible here. NOTE(review): presumably they save and restore the
    // callee-saved v8-v15 that loop_16 overwrites - confirm against the macro.
    push_v_regs
    // x19/x20 are not touched by any instruction in this routine's body; the
    // save/restore pair is kept so the stack frame layout stays unchanged.
    stp       x19, x20, [sp, #-16]!
    sxtw      x3, w3                      //widen the 32-bit strides so they can
    sxtw      x4, w4                      //be used as 64-bit post-index
    sxtw      x5, w5                      //increments (negative strides too)
    cmp       w7, #16
    beq       loop_16                     //branch if wd is 16
    cmp       w7, #8
    beq       loop_8                      //branch if wd is 8

loop_4:                                   //each iteration processes four rows
                                          //two 4-byte rows share one D register

    ld1       {v0.s}[0], [x0], x3         //load row 1 in source 1
    ld1       {v0.s}[1], [x0], x3         //load row 2 in source 1
    ld1       {v2.s}[0], [x1], x4         //load row 1 in source 2
    ld1       {v2.s}[1], [x1], x4         //load row 2 in source 2
    ld1       {v1.s}[0], [x0], x3         //load row 3 in source 1
    ld1       {v1.s}[1], [x0], x3         //load row 4 in source 1
    urhadd    v0.8b, v0.8b , v2.8b        //rows 1-2: (src1 + src2 + 1) >> 1
    ld1       {v3.s}[0], [x1], x4         //load row 3 in source 2
    ld1       {v3.s}[1], [x1], x4         //load row 4 in source 2
    subs      w6, w6, #4                  //decrement ht by 4
    st1       {v0.s}[0], [x2], x5         //store row 1 in destination
    st1       {v0.s}[1], [x2], x5         //store row 2 in destination
    urhadd    v1.8b, v1.8b , v3.8b        //rows 3-4: (src1 + src2 + 1) >> 1
    st1       {v1.s}[0], [x2], x5         //store row 3 in destination
    st1       {v1.s}[1], [x2], x5         //store row 4 in destination
    bgt       loop_4                      //if greater than 0 repeat the loop again
    b         end_loops

loop_8:                                   //each iteration processes four rows

    // Only the low 8 bytes of each register are loaded and stored, so every
    // average uses the .8b arrangement (rows 1 and 2 previously used .16b on
    // the same 8-byte data; the stored bytes are identical).
    ld1       {v0.8b}, [x0], x3           //load row 1 in source 1
    ld1       {v4.8b}, [x1], x4           //load row 1 in source 2
    ld1       {v1.8b}, [x0], x3           //load row 2 in source 1
    ld1       {v5.8b}, [x1], x4           //load row 2 in source 2
    ld1       {v2.8b}, [x0], x3           //load row 3 in source 1
    urhadd    v0.8b, v0.8b , v4.8b        //average row 1
    urhadd    v1.8b, v1.8b , v5.8b        //average row 2
    ld1       {v6.8b}, [x1], x4           //load row 3 in source 2
    ld1       {v3.8b}, [x0], x3           //load row 4 in source 1
    urhadd    v2.8b, v2.8b , v6.8b        //average row 3
    ld1       {v7.8b}, [x1], x4           //load row 4 in source 2
    subs      w6, w6, #4                  //decrement ht by 4
    st1       {v0.8b}, [x2], x5           //store row 1 in destination
    urhadd    v3.8b, v3.8b , v7.8b        //average row 4
    st1       {v1.8b}, [x2], x5           //store row 2 in destination
    st1       {v2.8b}, [x2], x5           //store row 3 in destination
    st1       {v3.8b}, [x2], x5           //store row 4 in destination
    bgt       loop_8                      //if greater than 0 repeat the loop again
    b         end_loops

loop_16:                                  //each iteration processes eight rows

    // Each 16-byte row is held as two 8-byte halves in a register pair, so
    // the averages use the .8b arrangement to match the bytes that are
    // actually loaded and stored.
    ld1       {v0.8b, v1.8b}, [x0], x3    //load row 1 in source 1
    ld1       {v16.8b, v17.8b}, [x1], x4  //load row 1 in source 2
    ld1       {v2.8b, v3.8b}, [x0], x3    //load row 2 in source 1
    ld1       {v18.8b, v19.8b}, [x1], x4  //load row 2 in source 2
    urhadd    v0.8b, v0.8b , v16.8b       //average row 1, low half
    urhadd    v1.8b, v1.8b , v17.8b       //average row 1, high half
    ld1       {v4.8b, v5.8b}, [x0], x3    //load row 3 in source 1
    ld1       {v20.8b, v21.8b}, [x1], x4  //load row 3 in source 2
    urhadd    v2.8b, v2.8b , v18.8b       //average row 2, low half
    urhadd    v3.8b, v3.8b , v19.8b       //average row 2, high half
    ld1       {v6.8b, v7.8b}, [x0], x3    //load row 4 in source 1
    ld1       {v22.8b, v23.8b}, [x1], x4  //load row 4 in source 2
    urhadd    v4.8b, v4.8b , v20.8b       //average row 3, low half
    urhadd    v5.8b, v5.8b , v21.8b       //average row 3, high half
    ld1       {v8.8b, v9.8b}, [x0], x3    //load row 5 in source 1
    ld1       {v24.8b, v25.8b}, [x1], x4  //load row 5 in source 2
    urhadd    v6.8b, v6.8b , v22.8b       //average row 4, low half
    urhadd    v7.8b, v7.8b , v23.8b       //average row 4, high half
    ld1       {v10.8b, v11.8b}, [x0], x3  //load row 6 in source 1
    ld1       {v26.8b, v27.8b}, [x1], x4  //load row 6 in source 2
    urhadd    v8.8b, v8.8b , v24.8b       //average row 5, low half
    urhadd    v9.8b, v9.8b , v25.8b       //average row 5, high half
    ld1       {v12.8b, v13.8b}, [x0], x3  //load row 7 in source 1
    ld1       {v28.8b, v29.8b}, [x1], x4  //load row 7 in source 2
    urhadd    v10.8b, v10.8b , v26.8b     //average row 6, low half
    urhadd    v11.8b, v11.8b , v27.8b     //average row 6, high half
    ld1       {v14.8b, v15.8b}, [x0], x3  //load row 8 in source 1
    ld1       {v30.8b, v31.8b}, [x1], x4  //load row 8 in source 2
    urhadd    v12.8b, v12.8b , v28.8b     //average row 7, low half
    urhadd    v13.8b, v13.8b , v29.8b     //average row 7, high half
    st1       {v0.8b, v1.8b}, [x2], x5    //store row 1 in destination
    st1       {v2.8b, v3.8b}, [x2], x5    //store row 2 in destination
    urhadd    v14.8b, v14.8b , v30.8b     //average row 8, low half
    urhadd    v15.8b, v15.8b , v31.8b     //average row 8, high half
    st1       {v4.8b, v5.8b}, [x2], x5    //store row 3 in destination
    st1       {v6.8b, v7.8b}, [x2], x5    //store row 4 in destination
    subs      w6, w6, #8                  //decrement ht by 8
    st1       {v8.8b, v9.8b}, [x2], x5    //store row 5 in destination
    st1       {v10.8b, v11.8b}, [x2], x5  //store row 6 in destination
    st1       {v12.8b, v13.8b}, [x2], x5  //store row 7 in destination
    st1       {v14.8b, v15.8b}, [x2], x5  //store row 8 in destination
    bgt       loop_16                     //if greater than 0 repeat the loop again

end_loops:

    // Undo the prologue in reverse order: x19/x20 first, then the registers
    // saved by push_v_regs.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


//*******************************************************************************
//* @function
//*  ih264_default_weighted_pred_chroma_av8()
//*
//* @brief
//*  This routine performs the default weighted prediction as described in sec
//*  8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
//*
//* @par Description:
//*  This function gets two ht x wd blocks, calculates their rounded-average and
//*  stores it in the destination block for U and V.
//*
//* @param[in] puc_src1:
//*  UWORD8 Pointer to the buffer containing the first input block.
//*
//* @param[in] puc_src2:
//*  UWORD8 Pointer to the buffer containing the second input block.
//*
//* @param[out] puc_dst
//*  UWORD8 pointer to the destination where the output block is stored.
//*
//* @param[in] src_strd1
//*  Stride of the first input buffer
//*
//* @param[in] src_strd2
//*  Stride of the second input buffer
//*
//* @param[in] dst_strd
//*  Stride of the destination buffer
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None
//*
//* @remarks
//*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
//*
//*  Every row that is read or written is 2*wd bytes wide (4, 8 or 16 bytes
//*  for wd 2, 4 or 8) - presumably interleaved U and V samples; confirm
//*  against the caller.
//*  Dispatch is on wd only: 8 and 4 select their own loops, every other value
//*  falls through to the 2-wide loop.
//*  Each loop runs its body once before testing ht, so ht must be a positive
//*  multiple of the rows handled per iteration (2 rows for wd 2 and 4,
//*  4 rows for wd 8).
//*
//*******************************************************************************
//*/
//void ih264_default_weighted_pred_chroma_av8(UWORD8 *puc_src1,
//                                            UWORD8 *puc_src2,
//                                            UWORD8 *puc_dst,
//                                            WORD32 src_strd1,
//                                            WORD32 src_strd2,
//                                            WORD32 dst_strd,
//                                            WORD32 ht,
//                                            WORD32 wd)
//
//**************Variables Vs Registers*****************************************
//    x0      => puc_src1
//    x1      => puc_src2
//    x2      => puc_dst
//    w3      => src_strd1
//    w4      => src_strd2
//    w5      => dst_strd
//    w6      => ht
//    w7      => wd
//




    .global ih264_default_weighted_pred_chroma_av8

ih264_default_weighted_pred_chroma_av8:

    // push_v_regs / pop_v_regs are macros defined outside this routine.
    // NOTE(review): presumably they save and restore the callee-saved v8-v15
    // that loop_8_uv overwrites - confirm against the macro definition.
    push_v_regs
    // x19/x20 are not touched by any instruction in this routine's body; the
    // save/restore pair is kept so the stack frame layout stays unchanged.
    stp       x19, x20, [sp, #-16]!
    sxtw      x3, w3                      //widen the 32-bit strides so they can
    sxtw      x4, w4                      //be used as 64-bit post-index
    sxtw      x5, w5                      //increments (negative strides too)
    cmp       w7, #8
    beq       loop_8_uv                   //branch if wd is 8
    cmp       w7, #4
    beq       loop_4_uv                   //branch if wd is 4

loop_2_uv:                                //each iteration processes two rows
                                          //two 4-byte rows share one D register

    ld1       {v0.s}[0], [x0], x3         //load row 1 in source 1
    ld1       {v0.s}[1], [x0], x3         //load row 2 in source 1
    ld1       {v1.s}[0], [x1], x4         //load row 1 in source 2
    ld1       {v1.s}[1], [x1], x4         //load row 2 in source 2
    urhadd    v0.8b, v0.8b , v1.8b        //rows 1-2: (src1 + src2 + 1) >> 1
    subs      w6, w6, #2                  //decrement ht by 2
    st1       {v0.s}[0], [x2], x5         //store row 1 in destination
    st1       {v0.s}[1], [x2], x5         //store row 2 in destination
    bgt       loop_2_uv                   //if greater than 0 repeat the loop again
    b         end_loops_uv

loop_4_uv:                                //each iteration processes two rows

    ld1       {v0.8b}, [x0], x3           //load row 1 in source 1
    ld1       {v2.8b}, [x1], x4           //load row 1 in source 2
    ld1       {v1.8b}, [x0], x3           //load row 2 in source 1
    urhadd    v0.8b, v0.8b , v2.8b        //average row 1
    ld1       {v3.8b}, [x1], x4           //load row 2 in source 2
    urhadd    v1.8b, v1.8b , v3.8b        //average row 2
    st1       {v0.8b}, [x2], x5           //store row 1 in destination
    subs      w6, w6, #2                  //decrement ht by 2
    st1       {v1.8b}, [x2], x5           //store row 2 in destination
    bgt       loop_4_uv                   //if greater than 0 repeat the loop again
    b         end_loops_uv

loop_8_uv:                                //each iteration processes four rows

    // Each 16-byte row is held as two 8-byte halves in a register pair, so
    // the averages use the .8b arrangement to match the bytes that are
    // actually loaded and stored.
    ld1       {v0.8b, v1.8b}, [x0], x3    //load row 1 in source 1
    ld1       {v8.8b, v9.8b}, [x1], x4    //load row 1 in source 2
    ld1       {v2.8b, v3.8b}, [x0], x3    //load row 2 in source 1
    urhadd    v0.8b, v0.8b , v8.8b        //average row 1, low half
    urhadd    v1.8b, v1.8b , v9.8b        //average row 1, high half
    ld1       {v10.8b, v11.8b}, [x1], x4  //load row 2 in source 2
    ld1       {v4.8b, v5.8b}, [x0], x3    //load row 3 in source 1
    urhadd    v2.8b, v2.8b , v10.8b       //average row 2, low half
    urhadd    v3.8b, v3.8b , v11.8b       //average row 2, high half
    ld1       {v12.8b, v13.8b}, [x1], x4  //load row 3 in source 2
    ld1       {v6.8b, v7.8b}, [x0], x3    //load row 4 in source 1
    urhadd    v4.8b, v4.8b , v12.8b       //average row 3, low half
    urhadd    v5.8b, v5.8b , v13.8b       //average row 3, high half
    ld1       {v14.8b, v15.8b}, [x1], x4  //load row 4 in source 2
    st1       {v0.8b, v1.8b}, [x2], x5    //store row 1 in destination
    urhadd    v6.8b, v6.8b , v14.8b       //average row 4, low half
    urhadd    v7.8b, v7.8b , v15.8b       //average row 4, high half
    st1       {v2.8b, v3.8b}, [x2], x5    //store row 2 in destination
    subs      w6, w6, #4                  //decrement ht by 4
    st1       {v4.8b, v5.8b}, [x2], x5    //store row 3 in destination
    st1       {v6.8b, v7.8b}, [x2], x5    //store row 4 in destination
    bgt       loop_8_uv                   //if greater than 0 repeat the loop again

end_loops_uv:
    // Undo the prologue in reverse order: x19/x20 first, then the registers
    // saved by push_v_regs.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


