//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ih264_resi_trans_quant_av8.s
//*
//* @brief
//*  contains function definitions for residue calculation, forward transform
//*  and quantization
//*
//* @author
//*  ittiam
//*
//* @par list of functions:
//*  ih264_resi_trans_quant_4x4_av8
//*  ih264_resi_trans_quant_chroma_4x4_av8
//*  ih264_hadamard_quant_4x4_av8
//*  ih264_hadamard_quant_2x2_uv_av8
//*
//* @remarks
//*  none
//*
//*******************************************************************************
.include "ih264_neon_macros.s"
.text
.p2align 2

//*****************************************************************************
//*
//* function name     : ih264_resi_trans_quant_4x4
//* description       : this function does residue calculation, forward
//*                     transform and quantization for a 4x4 luma block
//*
//* arguments         : x0 :pointer to src buffer
//                      x1 :pointer to dst buffer
//                      x2 :pointer to dst buffer
//                      x3 :source stride
//                      x4 :pred stride
//                      x5 :pointer to scaling matrix
//                      x6 :pointer to threshold matrix
//                      x7 :qbits
//                      stack : rounding factor,
//                              pointer to store nnz,
//                              pointer to store non quantized dc value
// values returned    : none
//
// register usage     :
// stack usage        : 64 bytes
// cycles             :
// interruptibility   : interruptible
//
// known limitations
//   assumptions      :
//
// revision history   :
//     dd mm yyyy     author(s)    changes
//     1  12 2013     100633       first version
//     20 1  2014     100633       changed the api, optimization
//
//*****************************************************************************

.global ih264_resi_trans_quant_4x4_av8
ih264_resi_trans_quant_4x4_av8:

    //x0 :pointer to src buffer
    //x1 :pointer to pred buffer
    //x2 :pointer to dst buffer
    //x3 :source stride
    //x4 :pred stride
    //x5 :pointer to scaling matrix
    //x6 :pointer to threshold matrix
    //x7 :qbits
    //stack :round factor, pointer to nnz, pointer to store non quantized dc value
    push_v_regs
    //after push_v_regs the stack arguments are at sp + 64
    //x0  :pointer to src buffer
    //x1  :pointer to pred buffer
    //x2  :pointer to dst buffer
    //x3  :source stride
    //x4  :pred stride
    //x5  :pointer to scaling matrix
    //x6  :pointer to threshold matrix
    //x7  :qbits
    //w8  :round factor
    //x9  :pointer to nnz
    //x10 :pointer to store non quantized dc value

    ldr w8, [sp, #64]               //load round factor
    ldr x10, [sp, #80]              //load address for non quant val
    neg x7, x7                      //negate the qbit value so the sshl below becomes a right shift
    ldr x9, [sp, #72]               //load address for nnz

    //------------function loading done----------------;
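
    //illustrative c sketch of the arithmetic performed below (hypothetical
    //variable names, not the reference implementation); x4..x7 and x0..x3
    //follow the naming used in the instruction comments.
    //
    //    /* one pass of the forward 4x4 transform (run on rows, then columns) */
    //    x0 = x4 + x7;   x1 = x5 + x6;
    //    x2 = x5 - x6;   x3 = x4 - x7;
    //    out0 = x0 + x1;
    //    out1 = (x3 << 1) + x2;
    //    out2 = x0 - x1;
    //    out3 = x3 - (x2 << 1);
    //
    //    /* quantization of each transformed coefficient */
    //    sign   = (coef > 0) ? 1 : -1;
    //    level  = (abs(coef) * scale[i] + round) >> qbits;
    //    dst[i] = sign * level;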
    ld1 {v30.8b}, [x0], x3          //load first 8 pix src row 1
    ld1 {v31.8b}, [x1], x4          //load first 8 pix pred row 1
    ld1 {v28.8b}, [x0], x3          //load first 8 pix src row 2
    ld1 {v29.8b}, [x1], x4          //load first 8 pix pred row 2
    ld1 {v26.8b}, [x0], x3          //load first 8 pix src row 3
    ld1 {v27.8b}, [x1], x4          //load first 8 pix pred row 3
    ld1 {v24.8b}, [x0]              //load first 8 pix src row 4
    ld1 {v25.8b}, [x1]              //load first 8 pix pred row 4

    usubl v0.8h, v30.8b, v31.8b     //find residue row 1
    usubl v2.8h, v28.8b, v29.8b     //find residue row 2
    usubl v4.8h, v26.8b, v27.8b     //find residue row 3
    usubl v6.8h, v24.8b, v25.8b     //find residue row 4

    trn1 v1.4h, v0.4h, v2.4h
    trn2 v3.4h, v0.4h, v2.4h        //t12
    trn1 v5.4h, v4.4h, v6.4h
    trn2 v7.4h, v4.4h, v6.4h        //t23

    trn1 v0.2s, v1.2s, v5.2s
    trn2 v4.2s, v1.2s, v5.2s        //t13
    trn1 v2.2s, v3.2s, v7.2s
    trn2 v6.2s, v3.2s, v7.2s        //t14

    add v8.4h, v0.4h, v6.4h         //x0 = x4+x7
    add v9.4h, v2.4h, v4.4h         //x1 = x5+x6
    sub v10.4h, v2.4h, v4.4h        //x2 = x5-x6
    sub v11.4h, v0.4h, v6.4h        //x3 = x4-x7

    shl v12.4h, v10.4h, #1          //u_shift(x2,1,shft)
    shl v13.4h, v11.4h, #1          //u_shift(x3,1,shft)

    add v14.4h, v8.4h, v9.4h        //x4 = x0 + x1;
    sub v16.4h, v8.4h, v9.4h        //x6 = x0 - x1;
    add v15.4h, v13.4h, v10.4h      //x5 = u_shift(x3,1,shft) + x2;
    sub v17.4h, v11.4h, v12.4h      //x7 = x3 - u_shift(x2,1,shft);

    //take the transpose again so as to do the vertical transform
    trn1 v0.4h, v14.4h, v15.4h
    trn2 v1.4h, v14.4h, v15.4h      //t12
    trn1 v2.4h, v16.4h, v17.4h
    trn2 v3.4h, v16.4h, v17.4h      //t23

    trn1 v14.2s, v0.2s, v2.2s
    trn2 v16.2s, v0.2s, v2.2s       //t13
    trn1 v15.2s, v1.2s, v3.2s
    trn2 v17.2s, v1.2s, v3.2s       //t24

    //vertical transform, same code as the horizontal pass
    add v18.4h, v14.4h, v17.4h      //x0 = x4+x7
    add v19.4h, v15.4h, v16.4h      //x1 = x5+x6
    sub v20.4h, v15.4h, v16.4h      //x2 = x5-x6
    sub v21.4h, v14.4h, v17.4h      //x3 = x4-x7

    shl v22.4h, v20.4h, #1          //u_shift(x2,1,shft)
    shl v23.4h, v21.4h, #1          //u_shift(x3,1,shft)

    dup v8.4s, w8                   //load rounding value

    add v24.4h, v18.4h, v19.4h      //x5 = x0 + x1;
    sub v26.4h, v18.4h, v19.4h      //x7 = x0 - x1;
    add v25.4h, v23.4h, v20.4h      //x6 = u_shift(x3,1,shft) + x2;
    sub v27.4h, v21.4h, v22.4h      //x8 = x3 - u_shift(x2,1,shft);

    dup v23.4s, w8                  //load round factor values

    st1 {v24.h}[0], [x10]           //store the dc value to the alternate dc address
    //core transform is done for the 4x4 block
    ld1 {v28.4h-v31.4h}, [x5]       //load the scaling values

    abs v0.4h, v24.4h               //abs val of row 1
    abs v1.4h, v25.4h               //abs val of row 2
    abs v2.4h, v26.4h               //abs val of row 3
    abs v3.4h, v27.4h               //abs val of row 4

    cmgt v4.4h, v24.4h, #0          //get sign of row 1
    cmgt v5.4h, v25.4h, #0          //get sign of row 2
    cmgt v6.4h, v26.4h, #0          //get sign of row 3
    cmgt v7.4h, v27.4h, #0          //get sign of row 4

    smull v0.4s, v0.4h, v28.4h      //multiply by scale, row 1
    smull v1.4s, v1.4h, v29.4h      //multiply by scale, row 2
    smull v2.4s, v2.4h, v30.4h      //multiply by scale, row 3
    smull v3.4s, v3.4h, v31.4h      //multiply by scale, row 4

    add v20.4s, v0.4s, v23.4s       //add round factor, row 1
    add v21.4s, v1.4s, v23.4s       //add round factor, row 2
    add v22.4s, v2.4s, v23.4s       //add round factor, row 3
    add v23.4s, v3.4s, v23.4s       //add round factor, row 4

    dup v24.4s, w7                  //-qbits

    sshl v20.4s, v20.4s, v24.4s     //shift row 1
    sshl v21.4s, v21.4s, v24.4s     //shift row 2
    sshl v22.4s, v22.4s, v24.4s     //shift row 3
    sshl v23.4s, v23.4s, v24.4s     //shift row 4

    xtn v20.4h, v20.4s              //narrow row 1
    xtn v21.4h, v21.4s              //narrow row 2
    xtn v22.4h, v22.4s              //narrow row 3
    xtn v23.4h, v23.4s              //narrow row 4
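
    //illustrative c sketch of the sign restore and nnz computation that follows
    //(hypothetical names, not the reference implementation):
    //
    //    dst[i]  = (coef[i] > 0) ? level[i] : -level[i];    /* cmgt + bsl       */
    //    zeros  += (dst[i] == 0);                           /* cmeq, ushr, addp */
    //    *nnz    = 16 - zeros;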

    neg v24.8h, v20.8h              //get negative
    neg v25.8h, v21.8h              //get negative
    neg v26.8h, v22.8h              //get negative
    neg v27.8h, v23.8h              //get negative

    //compare with zero for computing nnz
    cmeq v0.4h, v20.4h, #0
    cmeq v1.4h, v21.4h, #0
    cmeq v2.4h, v22.4h, #0
    cmeq v3.4h, v23.4h, #0

    bsl v4.8b, v20.8b, v24.8b       //restore sign of row 1
    bsl v5.8b, v21.8b, v25.8b       //restore sign of row 2
    bsl v6.8b, v22.8b, v26.8b       //restore sign of row 3
    bsl v7.8b, v23.8b, v27.8b       //restore sign of row 4

    //narrow the comparison result
    mov v0.d[1], v2.d[0]
    mov v1.d[1], v3.d[0]

    xtn v0.8b, v0.8h
    xtn v1.8b, v1.8h

    ushr v0.8b, v0.8b, #7           //reduce each comparison to a single bit, rows 1 and 2
    ushr v1.8b, v1.8b, #7           //reduce each comparison to a single bit, rows 3 and 4

    add v0.8b, v0.8b, v1.8b         //add the zero flags
    addp v0.8b, v0.8b, v0.8b        //pairwise add to count zeros
    addp v0.8b, v0.8b, v0.8b        //pairwise add to count zeros
    addp v0.8b, v0.8b, v0.8b        //pairwise add to count zeros

    st1 {v4.4h-v7.4h}, [x2]         //store blk

    movi v25.8b, #16                //get max nnz
    sub v26.8b, v25.8b, v0.8b       //nnz = 16 - number of zero coefficients
    st1 {v26.b}[0], [x9]            //write nnz

    pop_v_regs
    ret


//*****************************************************************************
//*
//* function name     : ih264_resi_trans_quant_chroma_4x4
//* description       : this function does residue calculation, forward
//*                     transform and quantization for a 4x4 chroma block.
//*
//* arguments         : x0 :pointer to src buffer
//                      x1 :pointer to pred buffer
//                      x2 :pointer to dst buffer
//                      x3 :source stride
//                      x4 :pred stride
//                      x5 :pointer to scaling matrix
//                      x6 :pointer to threshold matrix
//                      x7 :qbits
//                      stack : rounding factor,
//                              pointer to store nnz,
//                              pointer to store unquantized dc values
// values returned    : none
//
// register usage     :
// stack usage        : 64 bytes
// cycles             :
// interruptibility   : interruptible
//
// known limitations
//   assumptions      :
//
// revision history   :
//     dd mm yyyy     author(s)    changes
//     11 2  2015     100664       first version
//     25 2  2015     100633       first av8 version
//*****************************************************************************

.global ih264_resi_trans_quant_chroma_4x4_av8
ih264_resi_trans_quant_chroma_4x4_av8:

    //x0 :pointer to src buffer
    //x1 :pointer to pred buffer
    //x2 :pointer to dst buffer
    //x3 :source stride
    //x4 :pred stride
    //x5 :pointer to scaling matrix
    //x6 :pointer to threshold matrix
    //x7 :qbits
    //stack :round factor, pointer to nnz, pointer to store non quantized dc value
    push_v_regs
    //after push_v_regs the stack arguments are at sp + 64
    //x0  :pointer to src buffer
    //x1  :pointer to pred buffer
    //x2  :pointer to dst buffer
    //x3  :source stride
    //x4  :pred stride
    //x5  :pointer to scaling matrix
    //x6  :pointer to threshold matrix
    //x7  :qbits
    //w8  :round factor
    //x9  :pointer to nnz
    //x10 :pointer to store non quantized dc value

    ldr w8, [sp, #64]               //load round factor
    ldr x10, [sp, #80]              //load address for non quant val
    neg x7, x7                      //negate the qbit value so the sshl below becomes a right shift
    ldr x9, [sp, #72]               //load address for nnz
    //------------function loading done----------------;
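
    //illustrative sketch (hypothetical names, not the reference implementation):
    //chroma src and pred are interleaved cb/cr, so the uzp1 step below keeps only
    //the plane being coded, roughly
    //
    //    for(i = 0; i < 4; i++)
    //        resi[i] = src[2 * i] - pred[2 * i];    /* every other byte: one plane */
    //
    //after this deinterleave, the transform and quantization are identical to the
    //luma path above.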

    ld1 {v30.8b}, [x0], x3          //load first 8 pix src row 1
    ld1 {v31.8b}, [x1], x4          //load first 8 pix pred row 1
    ld1 {v28.8b}, [x0], x3          //load first 8 pix src row 2
    ld1 {v29.8b}, [x1], x4          //load first 8 pix pred row 2
    ld1 {v26.8b}, [x0], x3          //load first 8 pix src row 3
    ld1 {v27.8b}, [x1], x4          //load first 8 pix pred row 3
    ld1 {v24.8b}, [x0]              //load first 8 pix src row 4
    ld1 {v25.8b}, [x1]              //load first 8 pix pred row 4

    //deinterleave the loaded values
    uzp1 v30.8b, v30.8b, v30.8b
    uzp1 v31.8b, v31.8b, v31.8b
    uzp1 v28.8b, v28.8b, v28.8b
    uzp1 v29.8b, v29.8b, v29.8b
    uzp1 v26.8b, v26.8b, v26.8b
    uzp1 v27.8b, v27.8b, v27.8b
    uzp1 v24.8b, v24.8b, v24.8b
    uzp1 v25.8b, v25.8b, v25.8b
    //this deinterleaving is the only difference between the chroma and luma functions

    usubl v0.8h, v30.8b, v31.8b     //find residue row 1
    usubl v2.8h, v28.8b, v29.8b     //find residue row 2
    usubl v4.8h, v26.8b, v27.8b     //find residue row 3
    usubl v6.8h, v24.8b, v25.8b     //find residue row 4

    trn1 v1.4h, v0.4h, v2.4h
    trn2 v3.4h, v0.4h, v2.4h        //t12
    trn1 v5.4h, v4.4h, v6.4h
    trn2 v7.4h, v4.4h, v6.4h        //t23

    trn1 v0.2s, v1.2s, v5.2s
    trn2 v4.2s, v1.2s, v5.2s        //t13
    trn1 v2.2s, v3.2s, v7.2s
    trn2 v6.2s, v3.2s, v7.2s        //t14

    add v8.4h, v0.4h, v6.4h         //x0 = x4+x7
    add v9.4h, v2.4h, v4.4h         //x1 = x5+x6
    sub v10.4h, v2.4h, v4.4h        //x2 = x5-x6
    sub v11.4h, v0.4h, v6.4h        //x3 = x4-x7

    shl v12.4h, v10.4h, #1          //u_shift(x2,1,shft)
    shl v13.4h, v11.4h, #1          //u_shift(x3,1,shft)

    add v14.4h, v8.4h, v9.4h        //x4 = x0 + x1;
    sub v16.4h, v8.4h, v9.4h        //x6 = x0 - x1;
    add v15.4h, v13.4h, v10.4h      //x5 = u_shift(x3,1,shft) + x2;
    sub v17.4h, v11.4h, v12.4h      //x7 = x3 - u_shift(x2,1,shft);

    //take the transpose again so as to do the vertical transform
    trn1 v0.4h, v14.4h, v15.4h
    trn2 v1.4h, v14.4h, v15.4h      //t12
    trn1 v2.4h, v16.4h, v17.4h
    trn2 v3.4h, v16.4h, v17.4h      //t23

    trn1 v14.2s, v0.2s, v2.2s
    trn2 v16.2s, v0.2s, v2.2s       //t13
    trn1 v15.2s, v1.2s, v3.2s
    trn2 v17.2s, v1.2s, v3.2s       //t24

    //vertical transform, same code as the horizontal pass
    add v18.4h, v14.4h, v17.4h      //x0 = x4+x7
    add v19.4h, v15.4h, v16.4h      //x1 = x5+x6
    sub v20.4h, v15.4h, v16.4h      //x2 = x5-x6
    sub v21.4h, v14.4h, v17.4h      //x3 = x4-x7

    shl v22.4h, v20.4h, #1          //u_shift(x2,1,shft)
    shl v23.4h, v21.4h, #1          //u_shift(x3,1,shft)

    dup v8.4s, w8                   //load rounding value

    add v24.4h, v18.4h, v19.4h      //x5 = x0 + x1;
    sub v26.4h, v18.4h, v19.4h      //x7 = x0 - x1;
    add v25.4h, v23.4h, v20.4h      //x6 = u_shift(x3,1,shft) + x2;
    sub v27.4h, v21.4h, v22.4h      //x8 = x3 - u_shift(x2,1,shft);

    dup v23.4s, w8                  //load round factor values

    st1 {v24.h}[0], [x10]           //store the dc value to the alternate dc address
    //core transform is done for the 4x4 block
    ld1 {v28.4h-v31.4h}, [x5]       //load the scaling values

    abs v0.4h, v24.4h               //abs val of row 1
    abs v1.4h, v25.4h               //abs val of row 2
    abs v2.4h, v26.4h               //abs val of row 3
    abs v3.4h, v27.4h               //abs val of row 4

    cmgt v4.4h, v24.4h, #0          //get sign of row 1
    cmgt v5.4h, v25.4h, #0          //get sign of row 2
    cmgt v6.4h, v26.4h, #0          //get sign of row 3
    cmgt v7.4h, v27.4h, #0          //get sign of row 4

    smull v0.4s, v0.4h, v28.4h      //multiply by scale, row 1
    smull v1.4s, v1.4h, v29.4h      //multiply by scale, row 2
    smull v2.4s, v2.4h, v30.4h      //multiply by scale, row 3
    smull v3.4s, v3.4h, v31.4h      //multiply by scale, row 4

    add v20.4s, v0.4s, v23.4s       //add round factor, row 1
    add v21.4s, v1.4s, v23.4s       //add round factor, row 2
    add v22.4s, v2.4s, v23.4s       //add round factor, row 3
    add v23.4s, v3.4s, v23.4s       //add round factor, row 4

    dup v24.4s, w7                  //-qbits
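
    //note: qbits was negated at function entry, so the vector sshl below shifts
    //right; an illustrative equivalent (hypothetical names) is
    //
    //    level = (abs(coef) * scale[i] + round) >> qbits;    /* sshl by -qbits */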
    sshl v20.4s, v20.4s, v24.4s     //shift row 1
    sshl v21.4s, v21.4s, v24.4s     //shift row 2
    sshl v22.4s, v22.4s, v24.4s     //shift row 3
    sshl v23.4s, v23.4s, v24.4s     //shift row 4

    xtn v20.4h, v20.4s              //narrow row 1
    xtn v21.4h, v21.4s              //narrow row 2
    xtn v22.4h, v22.4s              //narrow row 3
    xtn v23.4h, v23.4s              //narrow row 4

    neg v24.8h, v20.8h              //get negative
    neg v25.8h, v21.8h              //get negative
    neg v26.8h, v22.8h              //get negative
    neg v27.8h, v23.8h              //get negative

    //compare with zero for computing nnz
    cmeq v0.4h, v20.4h, #0
    cmeq v1.4h, v21.4h, #0
    cmeq v2.4h, v22.4h, #0
    cmeq v3.4h, v23.4h, #0

    bsl v4.8b, v20.8b, v24.8b       //restore sign of row 1
    bsl v5.8b, v21.8b, v25.8b       //restore sign of row 2
    bsl v6.8b, v22.8b, v26.8b       //restore sign of row 3
    bsl v7.8b, v23.8b, v27.8b       //restore sign of row 4

    //narrow the comparison result
    mov v0.d[1], v2.d[0]
    mov v1.d[1], v3.d[0]

    xtn v0.8b, v0.8h
    xtn v1.8b, v1.8h

    ushr v0.8b, v0.8b, #7           //reduce each comparison to a single bit, rows 1 and 2
    ushr v1.8b, v1.8b, #7           //reduce each comparison to a single bit, rows 3 and 4

    add v0.8b, v0.8b, v1.8b         //add the zero flags
    addp v0.8b, v0.8b, v0.8b        //pairwise add to count zeros
    addp v0.8b, v0.8b, v0.8b        //pairwise add to count zeros
    addp v0.8b, v0.8b, v0.8b        //pairwise add to count zeros

    st1 {v4.4h-v7.4h}, [x2]         //store blk

    movi v25.8b, #16                //get max nnz
    sub v26.8b, v25.8b, v0.8b       //nnz = 16 - number of zero coefficients
    st1 {v26.b}[0], [x9]            //write nnz

    pop_v_regs
    ret


//*****************************************************************************
//*
//* function name     : ih264_hadamard_quant_4x4_av8
//* description       : this function does forward hadamard transform and
//*                     quantization for the luma dc block
//*
//* arguments         : x0 :pointer to src buffer
//                      x1 :pointer to dst buffer
//                      x2 :pu2_scale_matrix
//                      x3 :pu2_threshold_matrix
//                      x4 :u4_qbits
//                      x5 :u4_round_factor
//                      x6 :pu1_nnz
// values returned    : none
//
// register usage     :
// stack usage        : 0 bytes
// cycles             :
// interruptibility   : interruptible
//
// known limitations
//   assumptions      :
//
// revision history   :
//     dd mm yyyy     author(s)    changes
//     20 2  2015     100633       first version
//
//*****************************************************************************
//ih264_hadamard_quant_4x4_av8(word16 *pi2_src, word16 *pi2_dst,
//                             const uword16 *pu2_scale_matrix,
//                             const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
//                             uword32 u4_round_factor, uword8 *pu1_nnz
//                             )
.global ih264_hadamard_quant_4x4_av8
ih264_hadamard_quant_4x4_av8:

    //x0 :pointer to src buffer
    //x1 :pointer to dst buffer
    //x2 :pu2_scale_matrix
    //x3 :pu2_threshold_matrix
    //x4 :u4_qbits
    //x5 :u4_round_factor
    //x6 :pu1_nnz

    push_v_regs
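
    //illustrative c sketch of the dc hadamard and quantization below (loop form
    //with hypothetical names; the code itself is fully vectorized):
    //
    //    /* one pass of the 4x4 hadamard (run on rows, then columns) */
    //    x0 = x4 + x7;   x1 = x5 + x6;
    //    x2 = x5 - x6;   x3 = x4 - x7;
    //    dst0 = x0 + x1;   dst1 = x3 + x2;
    //    dst2 = x0 - x1;   dst3 = x3 - x2;
    //    /* after the second pass every value is >> 1 */
    //
    //    /* quantization uses pu2_scale_matrix[0] for every dc coefficient */
    //    level = (abs(dc) * pu2_scale_matrix[0] + u4_round_factor) >> u4_qbits;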

    ld4 {v0.4h-v3.4h}, [x0]         //load 4x4 block
    ld1 {v30.h}[0], [x2]            //load pu2_scale_matrix[0]

    saddl v4.4s, v0.4h, v3.4h       //x0 = x4 + x7;
    saddl v5.4s, v1.4h, v2.4h       //x1 = x5 + x6;
    ssubl v6.4s, v1.4h, v2.4h       //x2 = x5 - x6;
    ssubl v7.4s, v0.4h, v3.4h       //x3 = x4 - x7;

    dup v30.8h, v30.h[0]            //pu2_scale_matrix[0]

    add v14.4s, v4.4s, v5.4s        //pi2_dst[0] = x0 + x1;
    add v15.4s, v7.4s, v6.4s        //pi2_dst[1] = x3 + x2;
    sub v16.4s, v4.4s, v5.4s        //pi2_dst[2] = x0 - x1;
    sub v17.4s, v7.4s, v6.4s        //pi2_dst[3] = x3 - x2;

    //transpose 4x4 block
    trn1 v18.4s, v14.4s, v15.4s
    trn2 v19.4s, v14.4s, v15.4s
    trn1 v20.4s, v16.4s, v17.4s
    trn2 v21.4s, v16.4s, v17.4s

    trn1 v14.2d, v18.2d, v20.2d
    trn2 v16.2d, v18.2d, v20.2d
    trn1 v15.2d, v19.2d, v21.2d
    trn2 v17.2d, v19.2d, v21.2d
    //end transpose

    add v18.4s, v14.4s, v17.4s      //x0 = x4 + x7;
    add v19.4s, v15.4s, v16.4s      //x1 = x5 + x6;
    sub v20.4s, v15.4s, v16.4s      //x2 = x5 - x6;
    sub v21.4s, v14.4s, v17.4s      //x3 = x4 - x7;

    dup v14.4s, w5                  //round factor
    dup v15.4s, v14.s[0]
    dup v16.4s, v14.s[0]
    dup v17.4s, v14.s[0]

    add v22.4s, v18.4s, v19.4s      //(x0 + x1)
    add v23.4s, v21.4s, v20.4s      //(x3 + x2)
    sub v24.4s, v18.4s, v19.4s      //(x0 - x1)
    sub v25.4s, v21.4s, v20.4s      //(x3 - x2)

    shrn v0.4h, v22.4s, #1          //i4_value = (x0 + x1) >> 1;
    shrn2 v0.8h, v23.4s, #1         //i4_value = (x3 + x2) >> 1;
    shrn v1.4h, v24.4s, #1          //i4_value = (x0 - x1) >> 1;
    shrn2 v1.8h, v25.4s, #1         //i4_value = (x3 - x2) >> 1;

    abs v2.8h, v0.8h                //abs of rows 1,2
    abs v3.8h, v1.8h                //abs of rows 3,4

    cmgt v4.8h, v0.8h, #0           //get the sign, rows 1,2
    cmgt v5.8h, v1.8h, #0           //get the sign, rows 3,4

    neg w4, w4                      //-u4_qbits
    dup v22.4s, w4                  //load -u4_qbits

    umlal v14.4s, v2.4h, v30.4h     //|coef| * scale + round, row 1
    umlal2 v15.4s, v2.8h, v30.8h    //|coef| * scale + round, row 2
    umlal v16.4s, v3.4h, v30.4h     //|coef| * scale + round, row 3
    umlal2 v17.4s, v3.8h, v30.8h    //|coef| * scale + round, row 4

    ushl v14.4s, v14.4s, v22.4s     //>> u4_qbits
    ushl v15.4s, v15.4s, v22.4s
    ushl v16.4s, v16.4s, v22.4s
    ushl v17.4s, v17.4s, v22.4s

    uqxtn v14.4h, v14.4s
    uqxtn2 v14.8h, v15.4s
    uqxtn v16.4h, v16.4s
    uqxtn2 v16.8h, v17.4s

    neg v15.8h, v14.8h
    neg v17.8h, v16.8h

    bsl v4.16b, v14.16b, v15.16b    //restore sign, rows 1,2
    bsl v5.16b, v16.16b, v17.16b    //restore sign, rows 3,4

    cmeq v0.8h, v14.8h, #0
    cmeq v1.8h, v16.8h, #0

    st1 {v4.8h-v5.8h}, [x1]

    movi v20.8b, #16

    xtn v2.8b, v0.8h
    xtn v3.8b, v1.8h

    ushr v2.8b, v2.8b, #7
    ushr v3.8b, v3.8b, #7

    add v2.8b, v2.8b, v3.8b
    addp v2.8b, v2.8b, v2.8b
    addp v2.8b, v2.8b, v2.8b
    addp v2.8b, v2.8b, v2.8b
    sub v20.8b, v20.8b, v2.8b       //nnz = 16 - number of zero coefficients
    st1 {v20.b}[0], [x6]

    pop_v_regs
    ret


//*****************************************************************************
//*
//* function name     : ih264_hadamard_quant_2x2_uv
//* description       : this function does forward hadamard transform and
//*                     quantization for the dc block of chroma, for both planes
//*
//* arguments         : x0 :pointer to src buffer
//                      x1 :pointer to dst buffer
//                      x2 :pu2_scale_matrix
//                      x3 :pu2_threshold_matrix
//                      x4 :u4_qbits
//                      x5 :u4_round_factor
//                      x6 :pu1_nnz
// values returned    : none
//
// register usage     :
// stack usage        : 0 bytes
// cycles             :
// interruptibility   : interruptible
//
// known limitations
//   assumptions      :
//
// revision history   :
//     dd mm yyyy     author(s)    changes
//     20 2  2015     100633       first version
//
//*****************************************************************************
//ih264_hadamard_quant_2x2_uv_av8(word16 *pi2_src, word16 *pi2_dst,
//                                const uword16 *pu2_scale_matrix,
//                                const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
//                                uword32 u4_round_factor, uword8 *pu1_nnz
//                                )

.global ih264_hadamard_quant_2x2_uv_av8
ih264_hadamard_quant_2x2_uv_av8:

    push_v_regs
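
    //illustrative c sketch of the 2x2 dc hadamard below (hypothetical names);
    //the u and v planes are processed together, and per plane it is roughly
    //
    //    x0 = x4 + x5;   x1 = x4 - x5;       /* horizontal */
    //    x2 = x6 + x7;   x3 = x6 - x7;
    //    dst0 = x0 + x2;   dst1 = x1 + x3;   /* vertical   */
    //    dst2 = x0 - x2;   dst3 = x1 - x3;
    //
    //    level = (abs(dst) * pu2_scale_matrix[0] + u4_round_factor) >> u4_qbits;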

    ld2 {v0.4h-v1.4h}, [x0]         //load src
    ld1 {v30.h}[0], [x2]            //load pu2_scale_matrix[0]
    dup v30.4h, v30.h[0]            //pu2_scale_matrix
    uxtl v30.4s, v30.4h             //pu2_scale_matrix

    neg w4, w4                      //-u4_qbits
    dup v24.4s, w4                  //-u4_qbits

    dup v25.4s, w5                  //round factor
    dup v26.4s, v25.s[0]

    saddl v2.4s, v0.4h, v1.4h       //x0 = x4 + x5; x2 = x6 + x7;
    ssubl v3.4s, v0.4h, v1.4h       //x1 = x4 - x5; x3 = x6 - x7;

    trn1 v4.4s, v2.4s, v3.4s
    trn2 v5.4s, v2.4s, v3.4s        //q1 -> x0 x1, q2 -> x2 x3

    add v0.4s, v4.4s, v5.4s         //(x0 + x2) (x1 + x3) (y0 + y2) (y1 + y3)
    sub v1.4s, v4.4s, v5.4s         //(x0 - x2) (x1 - x3) (y0 - y2) (y1 - y3)

    abs v2.4s, v0.4s
    abs v3.4s, v1.4s

    cmgt v4.4s, v0.4s, #0           //get the sign, rows 1,2
    cmgt v5.4s, v1.4s, #0           //get the sign, rows 3,4

    uqxtn v4.4h, v4.4s              //narrow the sign masks to 16 bit
    sqxtn2 v4.8h, v5.4s

    mla v25.4s, v2.4s, v30.4s       //|coef| * scale + round
    mla v26.4s, v3.4s, v30.4s       //|coef| * scale + round

    ushl v2.4s, v25.4s, v24.4s      //>> qbits
    ushl v3.4s, v26.4s, v24.4s      //>> qbits

    uqxtn v2.4h, v2.4s
    uqxtn2 v2.8h, v3.4s

    neg v5.8h, v2.8h

    bsl v4.16b, v2.16b, v5.16b      //*sign

    //rearrange so that each plane's coefficients are contiguous
    mov v5.s[0], v4.s[1]
    mov v4.s[1], v4.s[2]
    mov v4.s[2], v5.s[0]

    cmeq v5.8h, v4.8h, #0           //compute nnz
    xtn v5.8b, v5.8h                //reduce nnz comparison to 1 bit
    ushr v5.8b, v5.8b, #7           //reduce nnz comparison to 1 bit
    movi v20.8b, #4                 //since we sum the zero flags, subtract from 4 to get nnz
    addp v5.8b, v5.8b, v5.8b        //sum up the zero flags
    addp v5.8b, v5.8b, v5.8b        //sum up the zero flags

    st1 {v4.8h}, [x1]               //store the block
    sub v20.8b, v20.8b, v5.8b       //4 - number of zeros
    st1 {v20.h}[0], [x6]            //store nnz of both planes

    pop_v_regs
    ret