//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

///**
//******************************************************************************
//*
//* @brief :Evaluate the best intra chroma mode (among VERT, HORZ and DC)
//*  and do the prediction.
//*
//* @par Description
//*  This function evaluates the first three intra chroma modes, computes the
//*  corresponding SADs and returns the buffer predicted with the best mode.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_ngbr_pels
//*  UWORD8 pointer to neighbouring pels
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u4_n_avblty
//*  availability of neighbouring pixels
//*  (only bit 0 and bit 2 are examined: 1 = left only, 4 = top only,
//*   5 = both, 0 = none)
//*
//* @param[out] u4_intra_mode
//*  Pointer to the variable in which best mode is returned
//*  (0 = DC, 1 = HORZ, 2 = VERT)
//*
//* @param[out] pu4_sadmin
//*  Pointer to the variable in which minimum sad is returned
//*
//* @param[in] u4_valid_intra_modes
//*  Says what all modes are valid
//*  (bit 0 = DC, bit 1 = HORZ, bit 2 = VERT)
//*
//*
//* @return      none
//*
//******************************************************************************
//*/
//
//void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_ngbr_pels_i16,
//                                        UWORD8 *pu1_dst,
//                                        UWORD32 src_strd,
//                                        UWORD32 dst_strd,
//                                        WORD32 u4_n_avblty,
//                                        UWORD32 *u4_intra_mode,
//                                        WORD32 *pu4_sadmin,
//                                        UWORD32 u4_valid_intra_modes)
//
.text
.p2align 2
.include "ih264_neon_macros.s"

    .global ih264e_evaluate_intra_chroma_modes_av8

ih264e_evaluate_intra_chroma_modes_av8:

//------------------------------------------------------------------------------
// Register usage on entry
//   x0      = pu1_src
//   x1      = pu1_ngbr_pels_i16
//               [x1 +  0 .. 15] : 8 left-neighbour 2-byte pairs, stored bottom
//                                 row first (halfword 7 belongs to row 0)
//               [x1 + 18 .. 33] : 16 bytes of top neighbours
//               (pairs are presumably interleaved Cb/Cr samples - confirm
//                against the caller)
//   x2      = pu1_dst
//   x3      = src_strd   (UWORD32)
//   x4      = dst_strd   (UWORD32)
//   x5      = u4_n_avblty
//   x6      = u4_intra_mode (output pointer)
//   x7      = pu4_sadmin    (output pointer)
//   [sp]    = u4_valid_intra_modes (9th argument, passed on the stack)
//
// Working registers
//   x14/x15 = saved copies of x6/x7 (both are reused as scratch below)
//   x16     = u4_valid_intra_modes
//   x17     = zero-extended dst_strd
//   x8/x9/x10 = SAD of VERT / HORZ / DC
//   v28,v29 = DC prediction for rows 0-3 (left 8 bytes, right 8 bytes)
//   v30,v31 = DC prediction for rows 4-7 (left 8 bytes, right 8 bytes)
//------------------------------------------------------------------------------

    // push_v_regs / pop_v_regs are defined in ih264_neon_macros.s (not visible
    // in this file). The stack offset used below assumes push_v_regs pushes
    // 64 bytes - confirm against the macro file if it is ever changed.
    push_v_regs
    stp       x19, x20, [sp, #-16]!

    // 9th argument: 16 bytes (x19/x20) + the bytes pushed by push_v_regs above
    // the current sp. Only bits 0..2 of the value are tested later, so any
    // unspecified upper bits in the 8-byte stack slot are harmless.
    ldr       x16, [sp, #80]

    // src_strd / dst_strd are 32-bit arguments; AAPCS64 leaves the upper 32
    // bits of x3/x4 unspecified, and both registers are used as 64-bit
    // post-index offsets, so zero-extend them here.
    mov       w3, w3
    mov       w17, w4

    // NOTE: the original code also copied x5 into x18. x18 is the platform
    // register (reserved on Android/Apple/Windows) and the copy was never
    // read, so it has been removed.
    mov       x14, x6
    mov       x15, x7

    // Dispatch on neighbour availability (bit 0 and bit 2 only).
    mov       x19, #5
    ands      x6, x5, x19
    beq       none_available
    cmp       x6, #1
    beq       left_only_available
    cmp       x6, #4
    beq       top_only_available

all_available:
    // v0 = left pairs of rows 7..4, v1 = left pairs of rows 3..0
    // v2 = top bytes 0..7,          v3 = top bytes 8..15
    ld1       {v0.8b, v1.8b}, [x1]
    add       x6, x1, #18
    ld1       {v2.8b, v3.8b}, [x6]

    // Widen to 16 bit and fold the four pairs of every group together.
    // After the two addp steps, h[0] holds the sum of the first byte of each
    // pair and h[1] the sum of the second byte (sums are <= 4*255, so nothing
    // carries between the two halves of the 32-bit lane).
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    uxtl      v2.8h, v2.8b
    uxtl      v3.8h, v3.8b
    addp      v2.4s, v2.4s, v2.4s
    addp      v3.4s, v3.4s, v3.4s
    addp      v2.4s, v2.4s, v2.4s
    addp      v3.4s, v3.4s, v3.4s

    // rows 4-7, left 8 bytes  : rounded mean of the 4 lower left pairs
    rshrn     v5.8b, v0.8h, #2
    dup       v21.8h, v5.h[0]
    // rows 0-3, right 8 bytes : rounded mean of the 4 right top pairs
    rshrn     v6.8b, v3.8h, #2
    dup       v20.8h, v6.h[0]
    // rows 0-3, left 8 bytes  : rounded mean of upper left + left top pairs
    add       v1.8h, v1.8h, v2.8h
    rshrn     v1.8b, v1.8h, #3
    dup       v23.8h, v1.h[0]
    mov       v20.d[0], v23.d[0]
    // rows 4-7, right 8 bytes : rounded mean of lower left + right top pairs
    add       v0.8h, v0.8h, v3.8h
    rshrn     v0.8b, v0.8h, #3
    dup       v23.8h, v0.h[0]
    mov       v31.d[0], v23.d[0]

    mov       v28.d[0], v20.d[0]
    mov       v29.d[0], v20.d[1]
    mov       v30.d[0], v21.d[0]
    b         sad_comp

left_only_available:
    // Only the left column is usable: every row band uses the mean of its own
    // four left pairs for both the left and the right 8 bytes.
    ld1       {v0.8b, v1.8b}, [x1]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2

    dup       v28.8h, v1.h[0]           // rows 0-3 (left pairs of rows 3..0)
    dup       v29.8h, v1.h[0]
    dup       v30.8h, v0.h[0]           // rows 4-7 (left pairs of rows 7..4)
    dup       v31.8h, v0.h[0]
    b         sad_comp

top_only_available:
    // Only the top row is usable: the left/right 8 bytes use the mean of the
    // top pairs above them, identically for rows 0-3 and rows 4-7.
    add       x6, x1, #18
    ld1       {v0.8b, v1.8b}, [x6]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v28.8h, v0.h[0]           // left-half mean
    dup       v30.8h, v1.h[0]           // right-half mean (temporarily in v30)
    // The order of the next three moves matters: v30.d[1] must still hold the
    // right-half mean when it is copied out.
    mov       v29.d[0], v30.d[1]
    mov       v30.d[0], v28.d[0]
    mov       v31.d[0], v30.d[1]
    b         sad_comp

none_available:
    // No neighbours: predict the constant 128 everywhere.
    mov       w20, #128
    dup       v28.16b, w20
    dup       v29.16b, w20
    dup       v30.16b, w20
    dup       v31.16b, w20

sad_comp:
    // Accumulators (8 x 16 bit each):
    //   v16/v18 : VERT SAD, left/right 8 bytes
    //   v26/v14 : HORZ SAD, left/right 8 bytes
    //   v22/v24 : DC   SAD, left/right 8 bytes
    add       x6, x1, #18
    ld1       {v10.8b, v11.8b}, [x6]    // vertical prediction = top neighbours

    ld1       {v27.8h}, [x1]            // left pairs, halfword 7 = row 0

    dup       v20.8h, v27.h[7]          // horizontal value, row 0
    dup       v21.8h, v27.h[7]

    ld1       {v0.8b, v1.8b}, [x0], x3  // source row 0

    // vertical, row 0 (uabdl initialises the accumulators)
    uabdl     v16.8h, v0.8b, v10.8b
    uabdl     v18.8h, v1.8b, v11.8b

    // horizontal, row 0
    uabdl     v26.8h, v0.8b, v20.8b
    uabdl     v14.8h, v1.8b, v21.8b

    ld1       {v2.8b, v3.8b}, [x0], x3  // source row 1

    // dc, row 0
    uabdl     v22.8h, v0.8b, v28.8b
    uabdl     v24.8h, v1.8b, v29.8b

    dup       v20.8h, v27.h[6]          // horizontal value, row 1
    dup       v21.8h, v27.h[6]

    // vertical, row 1
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b

    ld1       {v4.8b, v5.8b}, [x0], x3  // source row 2

    // horizontal, row 1
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v14.8h, v3.8b, v21.8b

    // dc, row 1
    uabal     v22.8h, v2.8b, v28.8b
    uabal     v24.8h, v3.8b, v29.8b

    dup       v20.8h, v27.h[5]          // horizontal value, row 2
    dup       v21.8h, v27.h[5]

    // vertical, row 2
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b

    ld1       {v6.8b, v7.8b}, [x0], x3  // source row 3

    // horizontal, row 2
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v14.8h, v5.8b, v21.8b

    // dc, row 2
    uabal     v22.8h, v4.8b, v28.8b
    uabal     v24.8h, v5.8b, v29.8b

    dup       v20.8h, v27.h[4]          // horizontal value, row 3
    dup       v21.8h, v27.h[4]

    // vertical, row 3
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b

    // horizontal, row 3
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v14.8h, v7.8b, v21.8b

    // dc, row 3
    uabal     v22.8h, v6.8b, v28.8b
    uabal     v24.8h, v7.8b, v29.8b

    //--------------------------------------------------------------------------
    // Rows 4-7: same pattern, but the DC prediction switches to v30/v31.
    //--------------------------------------------------------------------------
    ld1       {v0.8b, v1.8b}, [x0], x3  // source row 4

    dup       v20.8h, v27.h[3]          // horizontal value, row 4
    dup       v21.8h, v27.h[3]

    // vertical, row 4
    uabal     v16.8h, v0.8b, v10.8b
    uabal     v18.8h, v1.8b, v11.8b

    // horizontal, row 4
    uabal     v26.8h, v0.8b, v20.8b
    uabal     v14.8h, v1.8b, v21.8b

    ld1       {v2.8b, v3.8b}, [x0], x3  // source row 5

    // dc, row 4
    uabal     v22.8h, v0.8b, v30.8b
    uabal     v24.8h, v1.8b, v31.8b

    dup       v20.8h, v27.h[2]          // horizontal value, row 5
    dup       v21.8h, v27.h[2]

    // vertical, row 5
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b

    // horizontal, row 5
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v14.8h, v3.8b, v21.8b

    ld1       {v4.8b, v5.8b}, [x0], x3  // source row 6

    // dc, row 5
    uabal     v22.8h, v2.8b, v30.8b
    uabal     v24.8h, v3.8b, v31.8b

    dup       v20.8h, v27.h[1]          // horizontal value, row 6
    dup       v21.8h, v27.h[1]

    // vertical, row 6
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b

    // horizontal, row 6
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v14.8h, v5.8b, v21.8b

    ld1       {v6.8b, v7.8b}, [x0], x3  // source row 7

    // dc, row 6
    uabal     v22.8h, v4.8b, v30.8b
    uabal     v24.8h, v5.8b, v31.8b

    dup       v20.8h, v27.h[0]          // horizontal value, row 7
    dup       v21.8h, v27.h[0]

    // vertical, row 7
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b

    // horizontal, row 7
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v14.8h, v7.8b, v21.8b

    // dc, row 7
    uabal     v22.8h, v6.8b, v30.8b
    uabal     v24.8h, v7.8b, v31.8b

    //--------------------------------------------------------------------------
    // Horizontal reduction of the three SAD accumulators into x8/x9/x10.
    //--------------------------------------------------------------------------

    // VERT SAD -> x8
    add       v16.8h, v16.8h, v18.8h
    mov       v18.d[0], v16.d[1]
    add       v16.4h, v16.4h, v18.4h
    uaddlp    v16.2s, v16.4h
    addp      v16.2s, v16.2s, v16.2s
    smov      x8, v16.s[0]

    // HORZ SAD -> x9
    add       v26.8h, v26.8h, v14.8h
    mov       v14.d[0], v26.d[1]
    add       v26.4h, v26.4h, v14.4h
    uaddlp    v26.2s, v26.4h
    addp      v26.2s, v26.2s, v26.2s
    smov      x9, v26.s[0]

    // DC SAD -> x10
    add       v24.8h, v22.8h, v24.8h
    mov       v25.d[0], v24.d[1]
    add       v24.4h, v24.4h, v25.4h
    uaddlp    v24.2s, v24.4h
    addp      v24.2s, v24.2s, v24.2s
    smov      x10, v24.s[0]

    //--------------------------------------------------------------------------
    // Modes that are not enabled in u4_valid_intra_modes get the sentinel
    // SAD 1 << 30 so that they lose every comparison against a real SAD.
    //--------------------------------------------------------------------------
    mov       x11, #1
    mov       x0, x16                   // u4_valid_intra_modes
    lsl       x11, x11, #30

    ands      x7, x0, #04               // VERT enabled?
    csel      x8, x11, x8, eq

    ands      x6, x0, #02               // HORZ enabled?
    csel      x9, x11, x9, eq

    ands      x6, x0, #01               // DC enabled?
    csel      x10, x11, x10, eq

    // Restore the arguments that were used as scratch registers.
    mov       x4, x17                   // dst_strd (zero-extended)
    mov       x6, x14                   // u4_intra_mode
    mov       x7, x15                   // pu4_sadmin

    //--------------------------------------------------------------------------
    // Pick the smallest SAD. Ties resolve as DC, then HORZ, then VERT.
    //--------------------------------------------------------------------------
    cmp       x10, x9
    bgt       not_dc
    cmp       x10, x8
    bgt       do_vert

    // DC prediction: v28-v31 already hold the predicted rows.
    str       w10, [x7]                 // minimum SAD
    mov       w10, #0
    str       w10, [x6]                 // mode = 0 (DC)
    b         do_dc_vert

not_dc:
    cmp       x9, x8
    bgt       do_vert

    // Horizontal prediction: every row is its left pair replicated 8 times.
    str       w9, [x7]                  // minimum SAD
    mov       w10, #1
    str       w10, [x6]                 // mode = 1 (HORZ)
    ld1       {v0.8h}, [x1]

    dup       v10.8h, v0.h[7]
    dup       v11.8h, v0.h[6]
    dup       v12.8h, v0.h[5]
    dup       v13.8h, v0.h[4]
    st1       {v10.8h}, [x2], x4        // row 0
    dup       v14.8h, v0.h[3]
    st1       {v11.8h}, [x2], x4        // row 1
    dup       v15.8h, v0.h[2]
    st1       {v12.8h}, [x2], x4        // row 2
    dup       v16.8h, v0.h[1]
    st1       {v13.8h}, [x2], x4        // row 3
    dup       v17.8h, v0.h[0]
    st1       {v14.8h}, [x2], x4        // row 4
    st1       {v15.8h}, [x2], x4        // row 5
    st1       {v16.8h}, [x2], x4        // row 6
    st1       {v17.8h}, [x2], x4        // row 7

    b         end_func

do_vert:
    // Vertical prediction: load the top row into both row-band register pairs
    // and share the store sequence with the DC path.
    str       w8, [x7]                  // minimum SAD
    mov       w8, #2
    str       w8, [x6]                  // mode = 2 (VERT)
    add       x6, x1, #18
    ld1       {v28.8b, v29.8b}, [x6]
    ld1       {v30.8b, v31.8b}, [x6]

do_dc_vert:
    st1       {v28.2s, v29.2s}, [x2], x4    // row 0
    st1       {v28.2s, v29.2s}, [x2], x4    // row 1
    st1       {v28.2s, v29.2s}, [x2], x4    // row 2
    st1       {v28.2s, v29.2s}, [x2], x4    // row 3
    st1       {v30.2s, v31.2s}, [x2], x4    // row 4
    st1       {v30.2s, v31.2s}, [x2], x4    // row 5
    st1       {v30.2s, v31.2s}, [x2], x4    // row 6
    st1       {v30.2s, v31.2s}, [x2], x4    // row 7

end_func:
    // Undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret