;//
;// This confidential and proprietary software may be used only as
;// authorised by a licensing agreement from ARM Limited
;//   (C) COPYRIGHT 2004 ARM Limited
;//       ALL RIGHTS RESERVED
;// The entire notice above must be reproduced on all authorised
;// copies and copies may only be made to the extent permitted
;// by a licensing agreement from ARM Limited.
;//
;// IDCT_s.s
;//
;// Inverse DCT module
;//
;//
;// ALGORITHM DESCRIPTION
;//
;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
;// column and then a 1D IDCT for each row.
;//
;// The 8-point 1D IDCT is defined by
;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
;//
;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
;//   c(u,x) = cos( (2x+1)*u*pi/16 )
;//
;// We compute the 8-point 1D IDCT using the reverse of
;// the Arai-Agui-Nakajima flow graph which we split into
;// 5 stages named in reverse order to identify with the
;// forward DCT. Direct inversion of the forward formulae
;// in file FDCT_s.s gives:
;//
;//  IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
;//                [ A(0) = 2*sqrt(2)
;//                  A(u) = 4*cos(u*pi/16)  for (u!=0) ]
;//
;//  IStage 4:   i0 = j0             i1 = j4
;//              i3 = (j2+j6)/2      i2 = (j2-j6)/2
;//              i7 = (j5+j3)/2      i4 = (j5-j3)/2
;//              i5 = (j1+j7)/2      i6 = (j1-j7)/2
;//
;//  IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
;//              h2 = (i2*sqrt2)-i3  h3 = i3
;//              h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
;//              h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
;//              [ The above two lines rotate by -(pi/8) ]
;//              h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
;//
;//  IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
;//              g1 = (h1+h2)/2      g2 = (h1-h2)/2
;//              g7 = h7             g6 = h6 - h7
;//              g5 = h5 - g6        g4 = h4 - g5
;//
;//  IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
;//              f1 = (g1+g6)/2      f6 = (g1-g6)/2
;//              f2 = (g2+g5)/2      f5 = (g2-g5)/2
;//              f3 = (g3+g4)/2      f4 = (g3-g4)/2
;//
;//  Note that most coefficients are halved 3 times during the
;//  above calculation. We can rescale the algorithm dividing
;//  the input by 8 to remove the halvings.
;//
;//  IStage 5:   j(u) = T(u)*A(u)/8
;//
;//  IStage 4:   i0 = j0             i1 = j4
;//              i3 = j2 + j6        i2 = j2 - j6
;//              i7 = j5 + j3        i4 = j5 - j3
;//              i5 = j1 + j7        i6 = j1 - j7
;//
;//  IStage 3:   h0 = i0 + i1        h1 = i0 - i1
;//              h2 = (i2*sqrt2)-i3  h3 = i3
;//              h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
;//              h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
;//              h5 = (i5-i7)*sqrt2  h7 = i5 + i7
;//
;//  IStage 2:   g0 = h0 + h3        g3 = h0 - h3
;//              g1 = h1 + h2        g2 = h1 - h2
;//              g7 = h7             g6 = h6 - h7
;//              g5 = h5 - g6        g4 = h4 - g5
;//
;//  IStage 1:   f0 = g0 + g7        f7 = g0 - g7
;//              f1 = g1 + g6        f6 = g1 - g6
;//              f2 = g2 + g5        f5 = g2 - g5
;//              f3 = g3 + g4        f4 = g3 - g4
;//
;//  Note:
;//  1. The scaling by A(u)/8 can often be combined with inverse
;//     quantization. The column and row scalings can be combined.
;//  2. The flowgraph in the AAN paper has h4,g6 negated compared
;//     to the above code but is otherwise identical.
;//  3. The rotation by -pi/8 can be performed using three multiplies
;//     Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
;//        -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
;//  4. If |T(u)|<=1 then from the IDCT definition,
;//     |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
;//            =  ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
;//            =  ((1/sqrt2) + (cot(pi/32)-1)/2)/2
;//            =  (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
;//            =  (approx)2.64
;//     So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
;//     The table below shows input patterns generating the maximum
;//     value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
;//     InputPattern      Max |f(x)|
;//       PPPPPPPP        |f0| =  2.64
;//       PPPMMMMM        |f1| =  2.64
;//       PPMMMPPP        |f2| =  2.64
;//       PPMMPPMM        |f3| =  2.64
;//       PMMPPMMP        |f4| =  2.64
;//       PMMPMMPM        |f5| =  2.64
;//       PMPPMPMP        |f6| =  2.64
;//       PMPMPMPM        |f7| =  2.64
;//   Note that this input pattern is the transpose of the
;//   corresponding max input pattern for the FDCT.
;// Arguments

pSrc    RN 0    ;// source data buffer
Stride  RN 1    ;// destination stride in bytes
pDest   RN 2    ;// destination data buffer
pScale  RN 3    ;// pointer to scaling table


        ;// DCT Inverse Macro
        ;// The DCT code should be parametrized according
        ;// to the following inputs:
        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
        ;// $stride  = "s"   : the destination stride is taken from the Stride register
        ;//            other : ARM1136JS path adds #($stride) to pDest after each stored row;
        ;//                    CortexA8 path stores rows contiguously (post-increment #8 / #16)
        ;//
        ;// Inputs:
        ;// pSrc   = r0 = Pointer to input data
        ;//               Range is -256 to +255 (9-bit)
        ;// Stride = r1 = Stride in bytes between output lines (read only when $stride = "s")
        ;// pDest  = r2 = Pointer to output data
        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
        ;//
        ;// Register usage (ARM1136JS path): r1, r4-r12 and r14 are all overwritten
        ;// through the aliases declared below, in addition to the argument registers.
        ;// NOTE(review): r14 is the link register - presumably the enclosing function
        ;// saves it; confirm against the caller.
        ;// M_STR / M_LDR / M_ADR and the ppDest, pStride, pBlk stack slots are not
        ;// defined in this part of the file; they must be provided by the includer.

        MACRO
        M_IDCT  $outsize, $inscale, $stride
        LCLA    SHIFT


        IF ARM1136JS

;// REGISTER ALLOCATION
;// This is hard since we have 8 values, 9 free registers and each
;// butterfly requires a temporary register. We also want to
;// maintain register order so we can use LDM/STM. The table below
;// summarises the register allocation that meets all these criteria.
;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
;//
;// r1  a01     g0  h0
;// r4  b01 f0  g1  h1  i0
;// r5  a23 f1  g2      i1
;// r6  b23 f2  g3  h2  i2
;// r7  a45 f3      h3  i3
;// r8  b45 f4  g4  h4  i4
;// r9  a67 f5  g5  h5  i5
;// r10 b67 f6  g6  h6  i6
;// r11     f7  g7  h7  i7
;//
ra01    RN 1
rb01    RN 4
ra23    RN 5
rb23    RN 6
ra45    RN 7
rb45    RN 8
ra67    RN 9
rb67    RN 10
rtmp    RN 11
csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
LoopRR2 RN 14   ;// [ loop counter in the top bits (stepped by 2<<29), (1/Sqrt(2))@Q15 ]
;// Transpose allocation
xft     RN ra01
xf0     RN rb01
xf1     RN ra23
xf2     RN rb23
xf3     RN ra45
xf4     RN rb45
xf5     RN ra67
xf6     RN rb67
xf7     RN rtmp
;// IStage 1 allocation
xg0     RN xft
xg1     RN xf0
xg2     RN xf1
xg3     RN xf2
xgt     RN xf3
xg4     RN xf4
xg5     RN xf5
xg6     RN xf6
xg7     RN xf7
;// IStage 2 allocation
xh0     RN xg0
xh1     RN xg1
xht     RN xg2
xh2     RN xg3
xh3     RN xgt
xh4     RN xg4
xh5     RN xg5
xh6     RN xg6
xh7     RN xg7
;// IStage 3,4 allocation
xit     RN xh0
xi0     RN xh1
xi1     RN xht
xi2     RN xh2
xi3     RN xh3
xi4     RN xh4
xi5     RN xh5
xi6     RN xh6
xi7     RN xh7

        ;// Stride (r1) and pDest (r2) are saved first: r1 doubles as xit/xh0/ra01
        ;// and pDest is redirected to the intermediate block pBlk for the column pass.
        M_STR   pDest,  ppDest
        IF "$stride"="s"
            M_STR   Stride, pStride
        ENDIF
        M_ADR   pDest,  pBlk
        LDR     csPiBy8, =0x30fc7642
        LDR     LoopRR2, =0x00005a82

        ;// Column pass: each iteration processes two 16-bit columns packed in one word
v6_idct_col$_F
        ;// Load even values
        LDR     xi4, [pSrc], #4             ;// j0
        LDR     xi5, [pSrc, #4*16-4]        ;// j4
        LDR     xi6, [pSrc, #2*16-4]        ;// j2
        LDR     xi7, [pSrc, #6*16-4]        ;// j6

        ;// Scale Even Values
        IF "$inscale"="s16" ;// 16x16 mul
SHIFT       SETA    12
            LDR     xi0, [pScale], #4
            LDR     xi1, [pScale, #4*16-4]
            LDR     xi2, [pScale, #2*16-4]
            MOV     xit, #1<<(SHIFT-1)      ;// rounding constant for the >>SHIFT
            SMLABB  xi3, xi0, xi4, xit
            SMLATT  xi4, xi0, xi4, xit
            SMLABB  xi0, xi1, xi5, xit
            SMLATT  xi5, xi1, xi5, xit
            MOV     xi3, xi3, ASR #SHIFT
            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
            LDR     xi3, [pScale, #6*16-4]
            SMLABB  xi1, xi2, xi6, xit
            SMLATT  xi6, xi2, xi6, xit
            MOV     xi0, xi0, ASR #SHIFT
            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
            SMLABB  xi2, xi3, xi7, xit
            SMLATT  xi7, xi3, xi7, xit
            MOV     xi1, xi1, ASR #SHIFT
            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
            MOV     xi2, xi2, ASR #SHIFT
            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
        ENDIF
        IF "$inscale"="s32" ;// 32x16 mul
SHIFT       SETA    (12+8-16)
            MOV     xit, #1<<(SHIFT-1)      ;// rounding constant for the >>SHIFT
            LDR     xi0, [pScale], #8
            LDR     xi1, [pScale, #0*32+4-8]
            LDR     xi2, [pScale, #4*32-8]
            LDR     xi3, [pScale, #4*32+4-8]
            SMLAWB  xi0, xi0, xi4, xit
            SMLAWT  xi1, xi1, xi4, xit
            SMLAWB  xi2, xi2, xi5, xit
            SMLAWT  xi3, xi3, xi5, xit
            MOV     xi0, xi0, ASR #SHIFT
            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
            MOV     xi2, xi2, ASR #SHIFT
            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
            LDR     xi0, [pScale, #2*32-8]
            LDR     xi1, [pScale, #2*32+4-8]
            LDR     xi2, [pScale, #6*32-8]
            LDR     xi3, [pScale, #6*32+4-8]
            SMLAWB  xi0, xi0, xi6, xit
            SMLAWT  xi1, xi1, xi6, xit
            SMLAWB  xi2, xi2, xi7, xit
            SMLAWT  xi3, xi3, xi7, xit
            MOV     xi0, xi0, ASR #SHIFT
            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
            MOV     xi2, xi2, ASR #SHIFT
            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
        ENDIF

        ;// Load odd values
        LDR     xi0, [pSrc, #1*16-4]        ;// j1
        LDR     xi1, [pSrc, #7*16-4]        ;// j7
        LDR     xi2, [pSrc, #5*16-4]        ;// j5
        LDR     xi3, [pSrc, #3*16-4]        ;// j3

        IF  {TRUE}
            ;// shortcut if odd values 0
            TEQ     xi0, #0
            TEQEQ   xi1, #0
            TEQEQ   xi2, #0
            TEQEQ   xi3, #0
            BEQ     v6OddZero$_F
        ENDIF

        ;// Store scaled even values
        STMIA   pDest, {xi4, xi5, xi6, xi7}

        ;// Scale odd values
        IF "$inscale"="s16"
            ;// Perform AAN Scale
            LDR     xi4, [pScale, #1*16-4]
            LDR     xi5, [pScale, #7*16-4]
            LDR     xi6, [pScale, #5*16-4]
            SMLABB  xi7, xi0, xi4, xit
            SMLATT  xi0, xi0, xi4, xit
            SMLABB  xi4, xi1, xi5, xit
            SMLATT  xi1, xi1, xi5, xit
            MOV     xi7, xi7, ASR #SHIFT
            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
            LDR     xi7, [pScale, #3*16-4]
            SMLABB  xi5, xi2, xi6, xit
            SMLATT  xi2, xi2, xi6, xit
            MOV     xi4, xi4, ASR #SHIFT
            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
            SMLABB  xi6, xi3, xi7, xit
            SMLATT  xi3, xi3, xi7, xit
            MOV     xi5, xi5, ASR #SHIFT
            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
            MOV     xi6, xi6, ASR #SHIFT
            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
        ENDIF
        IF "$inscale"="s32" ;// 32x16 mul
            LDR     xi4, [pScale, #1*32-8]
            LDR     xi5, [pScale, #1*32+4-8]
            LDR     xi6, [pScale, #7*32-8]
            LDR     xi7, [pScale, #7*32+4-8]
            SMLAWB  xi4, xi4, xi0, xit
            SMLAWT  xi5, xi5, xi0, xit
            SMLAWB  xi6, xi6, xi1, xit
            SMLAWT  xi7, xi7, xi1, xit
            MOV     xi4, xi4, ASR #SHIFT
            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
            MOV     xi6, xi6, ASR #SHIFT
            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
            LDR     xi4, [pScale, #5*32-8]
            LDR     xi5, [pScale, #5*32+4-8]
            LDR     xi6, [pScale, #3*32-8]
            LDR     xi7, [pScale, #3*32+4-8]
            SMLAWB  xi4, xi4, xi2, xit
            SMLAWT  xi5, xi5, xi2, xit
            SMLAWB  xi6, xi6, xi3, xit
            SMLAWT  xi7, xi7, xi3, xit
            MOV     xi4, xi4, ASR #SHIFT
            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
            MOV     xi6, xi6, ASR #SHIFT
            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
        ENDIF

        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
        SSUB16  xi6, xi0, xi1           ;// j1-j7
        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
        SSUB16  xi4, xi2, xi3           ;// j5-j3

        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2

        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b

        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]

        SMULBB  xi1, xi3, LoopRR2       ;// only the bottom half (1/sqrt2) of LoopRR2 is used
        SMULTB  xi3, xi3, LoopRR2

        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4

        ;// xi0,xi1,xi2,xi3 now free
        ;// IStage 4,3, rows 2to3 x1/2

        MOV     xi3, xi3, LSL #1
        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled

        ;// IStage 2, rows4to7
        SSUB16  xg6, xh6, xh7
        SSUB16  xg5, xh5, xg6
        SSUB16  xg4, xh4, xg5

        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2

        SMULBB  xi0, xi2, LoopRR2
        SMULTB  xi2, xi2, LoopRR2

        MOV     xi2, xi2, LSL #1
        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4

        ;// xi0, xi1 now free
        ;// IStage 4,3 rows 0to1 x 1/2
        LDRD    xi0, [pDest]            ;// j0, j4 scaled
        SSUB16  xh2, xh2, xi3
        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows (carry set on the 4th pass)

        SHADD16 xh0, xi0, xi1
        SHSUB16 xh1, xi0, xi1

        ;// IStage 2 rows 0to3 x 1/2
        SHSUB16 xg2, xh1, xh2
        SHADD16 xg1, xh1, xh2
        SHSUB16 xg3, xh0, xh3
        SHADD16 xg0, xh0, xh3

        ;// IStage 1 all rows
        SADD16  xf3, xg3, xg4
        SSUB16  xf4, xg3, xg4
        SADD16  xf2, xg2, xg5
        SSUB16  xf5, xg2, xg5
        SADD16  xf1, xg1, xg6
        SSUB16  xf6, xg1, xg6
        SADD16  xf0, xg0, xg7
        SSUB16  xf7, xg0, xg7

        ;// Transpose, store and loop
        ;// (none of the instructions below update the flags, so BCC still sees the
        ;//  carry produced by the ADDS on LoopRR2 above)
        PKHBT   ra01, xf0, xf1, LSL #16
        PKHTB   rb01, xf1, xf0, ASR #16

        PKHBT   ra23, xf2, xf3, LSL #16
        PKHTB   rb23, xf3, xf2, ASR #16

        PKHBT   ra45, xf4, xf5, LSL #16
        PKHTB   rb45, xf5, xf4, ASR #16

        PKHBT   ra67, xf6, xf7, LSL #16
        STMIA   pDest!, {ra01, ra23, ra45, ra67}
        PKHTB   rb67, xf7, xf6, ASR #16
        STMIA   pDest!, {rb01, rb23, rb45, rb67}
        BCC     v6_idct_col$_F

        ;// Column pass done: the row pass reads back the 64*2 bytes just written
        SUB     pSrc, pDest, #(64*2)
        M_LDR   pDest, ppDest
        IF "$stride"="s"
            M_LDR   pScale, pStride     ;// pScale now holds the destination stride
        ENDIF
        B       v6_idct_row$_F

        ;// Column shortcut taken when j1, j3, j5 and j7 are all zero for both columns:
        ;// the odd half of the flow graph contributes nothing, so f(k) = f(7-k) = g(k).
v6OddZero$_F
        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2

        SMULBB  xi0, xi2, LoopRR2
        SMULTB  xi2, xi2, LoopRR2

        MOV     xi2, xi2, LSL #1
        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
        SSUB16  xh2, xh2, xi3

        ;// xi0, xi1 now free
        ;// IStage 4,3 rows 0to1 x 1/2

        SHADD16 xh0, xi4, xi5
        SHSUB16 xh1, xi4, xi5

        ;// IStage 2 rows 0to3 x 1/2
        SHSUB16 xg2, xh1, xh2
        SHADD16 xg1, xh1, xh2
        SHSUB16 xg3, xh0, xh3
        SHADD16 xg0, xh0, xh3

        ;// IStage 1 all rows
        ;// (order matters: each source register is read before it is overwritten)
        MOV     xf3, xg3
        MOV     xf4, xg3
        MOV     xf2, xg2
        MOV     xf5, xg2
        MOV     xf1, xg1
        MOV     xf6, xg1
        MOV     xf0, xg0
        MOV     xf7, xg0

        ;// Transpose
        PKHBT   ra01, xf0, xf1, LSL #16
        PKHTB   rb01, xf1, xf0, ASR #16

        PKHBT   ra23, xf2, xf3, LSL #16
        PKHTB   rb23, xf3, xf2, ASR #16

        PKHBT   ra45, xf4, xf5, LSL #16
        PKHTB   rb45, xf5, xf4, ASR #16

        PKHBT   ra67, xf6, xf7, LSL #16
        PKHTB   rb67, xf7, xf6, ASR #16

        STMIA   pDest!, {ra01, ra23, ra45, ra67}
        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
        STMIA   pDest!, {rb01, rb23, rb45, rb67}

        BCC     v6_idct_col$_F
        SUB     pSrc, pDest, #(64*2)
        M_LDR   pDest, ppDest
        IF "$stride"="s"
            M_LDR   pScale, pStride     ;// pScale now holds the destination stride
        ENDIF


        ;// Row pass: each iteration produces two output rows
v6_idct_row$_F
        ;// IStage 4,3, rows4to7 x1/4
        LDR     xit, =0x00010001        ;// rounding constant
        LDR     xi0, [pSrc, #1*16]      ;// j1
        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
        LDR     xi2, [pSrc, #5*16]      ;// j5
        LDR     xi3, [pSrc, #3*16]      ;// j3

        SHADD16 xi1, xi1, xit           ;// 2*j7
        SHADD16 xi1, xi1, xit           ;// j7

        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
        SSUB16  xi6, xi0, xi1           ;// j1-j7
        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
        SSUB16  xi4, xi2, xi3           ;// j5-j3

        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2

        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b

        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]

        SMULBB  xi1, xi3, LoopRR2
        SMULTB  xi3, xi3, LoopRR2

        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4

        MOV     xi3, xi3, LSL #1
        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4

        ;// xi0,xi1,xi2,xi3 now free
        ;// IStage 4,3, rows 2to3 x1/2

        LDR     xi0, [pSrc, #2*16]      ;// j2
        LDR     xi1, [pSrc, #6*16]      ;// 2*j6

        ;// IStage 2, rows4to7
        SSUB16  xg6, xh6, xh7
        SSUB16  xg5, xh5, xg6
        SSUB16  xg4, xh4, xg5

        SHADD16 xi1, xi1, xit           ;// j6
        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2

        SMULBB  xi0, xi2, LoopRR2
        SMULTB  xi2, xi2, LoopRR2

        MOV     xi2, xi2, LSL #1

        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4

        ;// xi0, xi1 now free
        ;// IStage 4,3 rows 0to1 x 1/2
        LDR     xi1, [pSrc, #4*16]      ;// j4
        LDR     xi0, [pSrc], #4         ;// j0

        SSUB16  xh2, xh2, xi3
        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows (carry set on the 4th pass)

        ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
        SHADD16 xh0, xi0, xi1           ;// of DC result
        SHSUB16 xh1, xi0, xi1

        ;// IStage 2 rows 0to3 x 1/2
        SHSUB16 xg2, xh1, xh2
        SHADD16 xg1, xh1, xh2
        SHSUB16 xg3, xh0, xh3
        SHADD16 xg0, xh0, xh3

        ;// IStage 1 all rows
        SHADD16 xf3, xg3, xg4
        SHSUB16 xf4, xg3, xg4
        SHADD16 xf2, xg2, xg5
        SHSUB16 xf5, xg2, xg5
        SHADD16 xf1, xg1, xg6
        SHSUB16 xf6, xg1, xg6
        SHADD16 xf0, xg0, xg7
        SHSUB16 xf7, xg0, xg7

        ;// Saturate
        IF ("$outsize"="u8")
            USAT16  xf0, #8, xf0
            USAT16  xf1, #8, xf1
            USAT16  xf2, #8, xf2
            USAT16  xf3, #8, xf3
            USAT16  xf4, #8, xf4
            USAT16  xf5, #8, xf5
            USAT16  xf6, #8, xf6
            USAT16  xf7, #8, xf7
        ENDIF
        IF ("$outsize"="s9")
            SSAT16  xf0, #9, xf0
            SSAT16  xf1, #9, xf1
            SSAT16  xf2, #9, xf2
            SSAT16  xf3, #9, xf3
            SSAT16  xf4, #9, xf4
            SSAT16  xf5, #9, xf5
            SSAT16  xf6, #9, xf6
            SSAT16  xf7, #9, xf7
        ENDIF

        ;// Transpose to Row, Pack and store
        ;// (pScale holds the destination stride here when $stride = "s")
        IF ("$outsize"="u8")
            ORR     xf0, xf0, xf1, LSL #8   ;// [ b1 b0 a1 a0 ]
            ORR     xf2, xf2, xf3, LSL #8   ;// [ b3 b2 a3 a2 ]
            ORR     xf4, xf4, xf5, LSL #8   ;// [ b5 b4 a5 a4 ]
            ORR     xf6, xf6, xf7, LSL #8   ;// [ b7 b6 a7 a6 ]
            PKHBT   ra01, xf0, xf2, LSL #16
            PKHTB   rb01, xf2, xf0, ASR #16
            PKHBT   ra23, xf4, xf6, LSL #16
            PKHTB   rb23, xf6, xf4, ASR #16
            STMIA   pDest, {ra01, ra23}
            IF "$stride"="s"
                ADD     pDest, pDest, pScale
                STMIA   pDest, {rb01, rb23}
                ADD     pDest, pDest, pScale
            ELSE
                ADD     pDest, pDest, #($stride)
                STMIA   pDest, {rb01, rb23}
                ADD     pDest, pDest, #($stride)
            ENDIF
        ENDIF
        IF ("$outsize"="s9"):LOR:("$outsize"="s16")
            PKHBT   ra01, xf0, xf1, LSL #16
            PKHTB   rb01, xf1, xf0, ASR #16

            PKHBT   ra23, xf2, xf3, LSL #16
            PKHTB   rb23, xf3, xf2, ASR #16

            PKHBT   ra45, xf4, xf5, LSL #16
            PKHTB   rb45, xf5, xf4, ASR #16

            PKHBT   ra67, xf6, xf7, LSL #16
            PKHTB   rb67, xf7, xf6, ASR #16

            STMIA   pDest, {ra01, ra23, ra45, ra67}
            IF "$stride"="s"
                ADD     pDest, pDest, pScale
                STMIA   pDest, {rb01, rb23, rb45, rb67}
                ADD     pDest, pDest, pScale
            ELSE
                ADD     pDest, pDest, #($stride)
                STMIA   pDest, {rb01, rb23, rb45, rb67}
                ADD     pDest, pDest, #($stride)
            ENDIF
        ENDIF

        BCC     v6_idct_row$_F
        ENDIF ;// ARM1136JS


        IF CortexA8

        ;// Q register numbers holding the eight source rows and one temporary
Src0    EQU 7
Src1    EQU 8
Src2    EQU 9
Src3    EQU 10
Src4    EQU 11
Src5    EQU 12
Src6    EQU 13
Src7    EQU 14
Tmp     EQU 15

qXj0    QN Src0.S16
qXj1    QN Src1.S16
qXj2    QN Src2.S16
qXj3    QN Src3.S16
qXj4    QN Src4.S16
qXj5    QN Src5.S16
qXj6    QN Src6.S16
qXj7    QN Src7.S16
qXjt    QN Tmp.S16

dXj0lo  DN (Src0*2).S16
dXj0hi  DN (Src0*2+1).S16
dXj1lo  DN (Src1*2).S16
dXj1hi  DN (Src1*2+1).S16
dXj2lo  DN (Src2*2).S16
dXj2hi  DN (Src2*2+1).S16
dXj3lo  DN (Src3*2).S16
dXj3hi  DN (Src3*2+1).S16
dXj4lo  DN (Src4*2).S16
dXj4hi  DN (Src4*2+1).S16
dXj5lo  DN (Src5*2).S16
dXj5hi  DN (Src5*2+1).S16
dXj6lo  DN (Src6*2).S16
dXj6hi  DN (Src6*2+1).S16
dXj7lo  DN (Src7*2).S16
dXj7hi  DN (Src7*2+1).S16
dXjtlo  DN (Tmp*2).S16
dXjthi  DN (Tmp*2+1).S16

qXi0    QN qXj0
qXi1    QN qXj4
qXi2    QN qXj2
qXi3    QN qXj7
qXi4    QN qXj5
qXi5    QN qXjt
qXi6    QN qXj1
qXi7    QN qXj6
qXit    QN qXj3

dXi0lo  DN dXj0lo
dXi0hi  DN dXj0hi
dXi1lo  DN dXj4lo
dXi1hi  DN dXj4hi
dXi2lo  DN dXj2lo
dXi2hi  DN dXj2hi
dXi3lo  DN dXj7lo
dXi3hi  DN dXj7hi
dXi4lo  DN dXj5lo
dXi4hi  DN dXj5hi
dXi5lo  DN dXjtlo
dXi5hi  DN dXjthi
dXi6lo  DN dXj1lo
dXi6hi  DN dXj1hi
dXi7lo  DN dXj6lo
dXi7hi  DN dXj6hi
dXitlo  DN dXj3lo
dXithi  DN dXj3hi

qXh0    QN qXit
qXh1    QN qXi0
qXh2    QN qXi2
qXh3    QN qXi3
qXh4    QN qXi7
qXh5    QN qXi5
qXh6    QN qXi4
qXh7    QN qXi1
qXht    QN qXi6

dXh0lo  DN dXitlo
dXh0hi  DN dXithi
dXh1lo  DN dXi0lo
dXh1hi  DN dXi0hi
dXh2lo  DN dXi2lo
dXh2hi  DN dXi2hi
dXh3lo  DN dXi3lo
dXh3hi  DN dXi3hi
dXh4lo  DN dXi7lo
dXh4hi  DN dXi7hi
dXh5lo  DN dXi5lo
dXh5hi  DN dXi5hi
dXh6lo  DN dXi4lo
dXh6hi  DN dXi4hi
dXh7lo  DN dXi1lo
dXh7hi  DN dXi1hi
dXhtlo  DN dXi6lo
dXhthi  DN dXi6hi

qXg0    QN qXh2
qXg1    QN qXht
qXg2    QN qXh1
qXg3    QN qXh0
qXg4    QN qXh4
qXg5    QN qXh5
qXg6    QN qXh6
qXg7    QN qXh7
qXgt    QN qXh3

qXf0    QN qXg6
qXf1    QN qXg5
qXf2    QN qXg4
qXf3    QN qXgt
qXf4    QN qXg3
qXf5    QN qXg2
qXf6    QN qXg1
qXf7    QN qXg0
qXft    QN qXg7


qXt0        QN 1.S32
qXt1        QN 2.S32
qT0lo       QN 1.S32
qT0hi       QN 2.S32
qT1lo       QN 3.S32
qT1hi       QN 4.S32
qScalelo    QN 5.S32        ;// used to read post scale values
qScalehi    QN 6.S32
qTemp0      QN 5.S32
qTemp1      QN 6.S32


Scale1      EQU 6
Scale2      EQU 15
qScale1     QN Scale1.S16
qScale2     QN Scale2.S16
dScale1lo   DN (Scale1*2).S16
dScale1hi   DN (Scale1*2+1).S16
dScale2lo   DN (Scale2*2).S16
dScale2hi   DN (Scale2*2+1).S16

dCoefs      DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
InvSqrt2    DN dCoefs[0]    ;// 1/sqrt(2) in Q15
S           DN dCoefs[1]    ;// Sin(PI/8) in Q15
C           DN dCoefs[2]    ;// Cos(PI/8) in Q15

pTemp       RN 12


        IMPORT  armCOMM_IDCTCoef

        ;// Load the whole 8x8 S16 block (8 Q registers)
        VLD1        {qXj0,qXj1}, [pSrc @64]!
        VLD1        {qXj2,qXj3}, [pSrc @64]!
        VLD1        {qXj4,qXj5}, [pSrc @64]!
        VLD1        {qXj6,qXj7}, [pSrc @64]!

        ;// Load PreScale and multiply with Src
        ;// IStage 4

        IF "$inscale"="s16"                         ;// 16X16 Mul
            M_IDCT_PRESCALE16
        ENDIF

        IF "$inscale"="s32"                         ;// 32X32 mul
            M_IDCT_PRESCALE32
        ENDIF

        ;// IStage 3
        VQRDMULH    qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
        VQRDMULH    qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
        VSUB        qXh2, qXi2, qXi3                ;// h2, h3

        VMULL       qXt0, dXi4lo, C                 ;// c*i4
        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
        VMULL       qXt1, dXi4hi, C
        VMLAL       qXt1, dXi6hi, S
        VSHRN       dXh4lo, qXt0, #16               ;// h4
        VSHRN       dXh4hi, qXt1, #16

        VMULL       qXt0, dXi6lo, C                 ;// c*i6
        VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*i6
        VMULL       qXt1, dXi6hi, C
        VMLSL       qXt1, dXi4hi, S
        VSHRN       dXh6lo, qXt0, #16               ;// h6
        VSHRN       dXh6hi, qXt1, #16

        ;// IStage 2
        VSUB        qXg6, qXh6, qXh7
        VSUB        qXg5, qXh5, qXg6
        VSUB        qXg4, qXh4, qXg5
        VHADD       qXg1, qXh1, qXh2                ;// (h1+h2)/2
        VHSUB       qXg2, qXh1, qXh2                ;// (h1-h2)/2
        VHADD       qXg0, qXh0, qXh3                ;// (h0+h3)/2
        VHSUB       qXg3, qXh0, qXh3                ;// (h0-h3)/2

        ;// IStage 1 all rows
        VADD        qXf3, qXg3, qXg4
        VSUB        qXf4, qXg3, qXg4
        VADD        qXf2, qXg2, qXg5
        VSUB        qXf5, qXg2, qXg5
        VADD        qXf1, qXg1, qXg6
        VSUB        qXf6, qXg1, qXg6
        VADD        qXf0, qXg0, qXg7
        VSUB        qXf7, qXg0, qXg7

        ;// Transpose, store and loop
        ;// XTRn is the Q register number that holds f(n) after the first pass
XTR0    EQU Src5
XTR1    EQU Tmp
XTR2    EQU Src6
XTR3    EQU Src7
XTR4    EQU Src3
XTR5    EQU Src0
XTR6    EQU Src1
XTR7    EQU Src2
XTRt    EQU Src4

qA0     QN  XTR0.S32    ;// for XTRpose
qA1     QN  XTR1.S32
qA2     QN  XTR2.S32
qA3     QN  XTR3.S32
qA4     QN  XTR4.S32
qA5     QN  XTR5.S32
qA6     QN  XTR6.S32
qA7     QN  XTR7.S32

dB0     DN  XTR0*2+1    ;// for using VSWP
dB1     DN  XTR1*2+1
dB2     DN  XTR2*2+1
dB3     DN  XTR3*2+1
dB4     DN  XTR4*2
dB5     DN  XTR5*2
dB6     DN  XTR6*2
dB7     DN  XTR7*2


        ;// 8x8 transpose: 16-bit VTRN, then 32-bit VTRN, then swap 64-bit halves
        VTRN        qXf0, qXf1
        VTRN        qXf2, qXf3
        VTRN        qXf4, qXf5
        VTRN        qXf6, qXf7
        VTRN        qA0, qA2
        VTRN        qA1, qA3
        VTRN        qA4, qA6
        VTRN        qA5, qA7
        VSWP        dB0, dB4
        VSWP        dB1, dB5
        VSWP        dB2, dB6
        VSWP        dB3, dB7


        ;// Second pass aliases: the transposed first-pass outputs are its inputs
qYj0    QN qXf0
qYj1    QN qXf1
qYj2    QN qXf2
qYj3    QN qXf3
qYj4    QN qXf4
qYj5    QN qXf5
qYj6    QN qXf6
qYj7    QN qXf7
qYjt    QN qXft

dYj0lo  DN (XTR0*2).S16
dYj0hi  DN (XTR0*2+1).S16
dYj1lo  DN (XTR1*2).S16
dYj1hi  DN (XTR1*2+1).S16
dYj2lo  DN (XTR2*2).S16
dYj2hi  DN (XTR2*2+1).S16
dYj3lo  DN (XTR3*2).S16
dYj3hi  DN (XTR3*2+1).S16
dYj4lo  DN (XTR4*2).S16
dYj4hi  DN (XTR4*2+1).S16
dYj5lo  DN (XTR5*2).S16
dYj5hi  DN (XTR5*2+1).S16
dYj6lo  DN (XTR6*2).S16
dYj6hi  DN (XTR6*2+1).S16
dYj7lo  DN (XTR7*2).S16
dYj7hi  DN (XTR7*2+1).S16
dYjtlo  DN (XTRt*2).S16
dYjthi  DN (XTRt*2+1).S16

qYi0    QN qYj0
qYi1    QN qYj4
qYi2    QN qYj2
qYi3    QN qYj7
qYi4    QN qYj5
qYi5    QN qYjt
qYi6    QN qYj1
qYi7    QN qYj6
qYit    QN qYj3

dYi0lo  DN dYj0lo
dYi0hi  DN dYj0hi
dYi1lo  DN dYj4lo
dYi1hi  DN dYj4hi
dYi2lo  DN dYj2lo
dYi2hi  DN dYj2hi
dYi3lo  DN dYj7lo
dYi3hi  DN dYj7hi
dYi4lo  DN dYj5lo
dYi4hi  DN dYj5hi
dYi5lo  DN dYjtlo
dYi5hi  DN dYjthi
dYi6lo  DN dYj1lo
dYi6hi  DN dYj1hi
dYi7lo  DN dYj6lo
dYi7hi  DN dYj6hi
dYitlo  DN dYj3lo
dYithi  DN dYj3hi

qYh0    QN qYit
qYh1    QN qYi0
qYh2    QN qYi2
qYh3    QN qYi3
qYh4    QN qYi7
qYh5    QN qYi5
qYh6    QN qYi4
qYh7    QN qYi1
qYht    QN qYi6

dYh0lo  DN dYitlo
dYh0hi  DN dYithi
dYh1lo  DN dYi0lo
dYh1hi  DN dYi0hi
dYh2lo  DN dYi2lo
dYh2hi  DN dYi2hi
dYh3lo  DN dYi3lo
dYh3hi  DN dYi3hi
dYh4lo  DN dYi7lo
dYh4hi  DN dYi7hi
dYh5lo  DN dYi5lo
dYh5hi  DN dYi5hi
dYh6lo  DN dYi4lo
dYh6hi  DN dYi4hi
dYh7lo  DN dYi1lo
dYh7hi  DN dYi1hi
dYhtlo  DN dYi6lo
dYhthi  DN dYi6hi

qYg0    QN qYh2
qYg1    QN qYht
qYg2    QN qYh1
qYg3    QN qYh0
qYg4    QN qYh4
qYg5    QN qYh5
qYg6    QN qYh6
qYg7    QN qYh7
qYgt    QN qYh3

qYf0    QN qYg6
qYf1    QN qYg5
qYf2    QN qYg4
qYf3    QN qYgt
qYf4    QN qYg3
qYf5    QN qYg2
qYf6    QN qYg1
qYf7    QN qYg0
qYft    QN qYg7

        ;// Second pass, IStage 4 (j7 and j6 are first brought back to the common scale)
        VRSHR       qYj7, qYj7, #2
        VRSHR       qYj6, qYj6, #1

        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3

        VQRDMULH    qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
        ;// IStage 4,3 rows 0to1 x 1/2

        MOV         pTemp, #0x4             ;// ensure correct round
        VDUP        qScale1, pTemp          ;// of DC result
        VADD        qYi0, qYi0, qScale1

        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2

        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
        VSUB        qYh2, qYi2, qYi3        ;// h2, h3
        VQRDMULH    qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)

        VMULL       qXt0, dYi4lo, C         ;// c*i4
        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
        VMULL       qXt1, dYi4hi, C
        VMLAL       qXt1, dYi6hi, S
        VSHRN       dYh4lo, qXt0, #16       ;// h4
        VSHRN       dYh4hi, qXt1, #16

        VMULL       qXt0, dYi6lo, C         ;// c*i6
        VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*i6
        VMULL       qXt1, dYi6hi, C
        VMLSL       qXt1, dYi4hi, S
        VSHRN       dYh6lo, qXt0, #16       ;// h6
        VSHRN       dYh6hi, qXt1, #16

        VSUB        qYg6, qYh6, qYh7
        VSUB        qYg5, qYh5, qYg6
        VSUB        qYg4, qYh4, qYg5

        ;// IStage 2 rows 0to3 x 1/2
        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2


        ;// IStage 1 all rows
        VHADD       qYf3, qYg3, qYg4
        VHSUB       qYf4, qYg3, qYg4
        VHADD       qYf2, qYg2, qYg5
        VHSUB       qYf5, qYg2, qYg5
        VHADD       qYf1, qYg1, qYg6
        VHSUB       qYf6, qYg1, qYg6
        VHADD       qYf0, qYg0, qYg7
        VHSUB       qYf7, qYg0, qYg7

        ;// YTRn is the Q register number that holds f(n) after the second pass
YTR0    EQU Src0
YTR1    EQU Src4
YTR2    EQU Src1
YTR3    EQU Src2
YTR4    EQU Src7
YTR5    EQU Src5
YTR6    EQU Tmp
YTR7    EQU Src6
YTRt    EQU Src3

qC0     QN  YTR0.S32    ;// for YTRpose
qC1     QN  YTR1.S32
qC2     QN  YTR2.S32
qC3     QN  YTR3.S32
qC4     QN  YTR4.S32
qC5     QN  YTR5.S32
qC6     QN  YTR6.S32
qC7     QN  YTR7.S32

dD0     DN  YTR0*2+1    ;// for using VSWP
dD1     DN  YTR1*2+1
dD2     DN  YTR2*2+1
dD3     DN  YTR3*2+1
dD4     DN  YTR4*2
dD5     DN  YTR5*2
dD6     DN  YTR6*2
dD7     DN  YTR7*2

        VTRN        qYf0, qYf1
        VTRN        qYf2, qYf3
        VTRN        qYf4, qYf5
        VTRN        qYf6, qYf7
        VTRN        qC0, qC2
        VTRN        qC1, qC3
        VTRN        qC4, qC6
        VTRN        qC5, qC7
        VSWP        dD0, dD4
        VSWP        dD1, dD5
        VSWP        dD2, dD6
        VSWP        dD3, dD7


dYf0U8  DN YTR0*2.U8
dYf1U8  DN YTR1*2.U8
dYf2U8  DN YTR2*2.U8
dYf3U8  DN YTR3*2.U8
dYf4U8  DN YTR4*2.U8
dYf5U8  DN YTR5*2.U8
dYf6U8  DN YTR6*2.U8
dYf7U8  DN YTR7*2.U8

        ;//
        ;// Do saturation if outsize is other than S16
        ;//

        IF ("$outsize"="u8")
            ;// Output range [0-255]
            VQMOVN      dYf0U8, qYf0
            VQMOVN      dYf1U8, qYf1
            VQMOVN      dYf2U8, qYf2
            VQMOVN      dYf3U8, qYf3
            VQMOVN      dYf4U8, qYf4
            VQMOVN      dYf5U8, qYf5
            VQMOVN      dYf6U8, qYf6
            VQMOVN      dYf7U8, qYf7
        ENDIF

        IF ("$outsize"="s9")
            ;// Output range [-256 to +255]
            ;// (saturating shift up, then arithmetic shift back down)
            VQSHL       qYf0, qYf0, #16-9
            VQSHL       qYf1, qYf1, #16-9
            VQSHL       qYf2, qYf2, #16-9
            VQSHL       qYf3, qYf3, #16-9
            VQSHL       qYf4, qYf4, #16-9
            VQSHL       qYf5, qYf5, #16-9
            VQSHL       qYf6, qYf6, #16-9
            VQSHL       qYf7, qYf7, #16-9

            VSHR        qYf0, qYf0, #16-9
            VSHR        qYf1, qYf1, #16-9
            VSHR        qYf2, qYf2, #16-9
            VSHR        qYf3, qYf3, #16-9
            VSHR        qYf4, qYf4, #16-9
            VSHR        qYf5, qYf5, #16-9
            VSHR        qYf6, qYf6, #16-9
            VSHR        qYf7, qYf7, #16-9
        ENDIF

        ;// Store output depending on the Stride size
        ;// NOTE(review): the "s" branch always stores full Q registers, also for
        ;// $outsize = "u8" - confirm that combination is either unused or intended.
        IF "$stride"="s"
            VST1        qYf0, [pDest @64], Stride
            VST1        qYf1, [pDest @64], Stride
            VST1        qYf2, [pDest @64], Stride
            VST1        qYf3, [pDest @64], Stride
            VST1        qYf4, [pDest @64], Stride
            VST1        qYf5, [pDest @64], Stride
            VST1        qYf6, [pDest @64], Stride
            VST1        qYf7, [pDest @64]
        ELSE
            IF ("$outsize"="u8")
                VST1        dYf0U8, [pDest @64], #8
                VST1        dYf1U8, [pDest @64], #8
                VST1        dYf2U8, [pDest @64], #8
                VST1        dYf3U8, [pDest @64], #8
                VST1        dYf4U8, [pDest @64], #8
                VST1        dYf5U8, [pDest @64], #8
                VST1        dYf6U8, [pDest @64], #8
                VST1        dYf7U8, [pDest @64]
            ELSE
                ;// ("$outsize"="s9") or ("$outsize"="s16")
                VST1        qYf0, [pDest @64], #16
                VST1        qYf1, [pDest @64], #16
                VST1        qYf2, [pDest @64], #16
                VST1        qYf3, [pDest @64], #16
                VST1        qYf4, [pDest @64], #16
                VST1        qYf5, [pDest @64], #16
                VST1        qYf6, [pDest @64], #16
                VST1        qYf7, [pDest @64]
            ENDIF

        ENDIF



        ENDIF ;// CortexA8



        MEND

;// Scale TWO input rows with TWO rows of 16 bit scale values
;//
;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
;// input (eight input values each) with two rows of scale values. Also
;// loads the next scale values from pScale, if $LastRow is 0.
;//
;// Input Registers:
;//
;// $dAlo           - Input D register with first four S16 values of row n
;// $dAhi           - Input D register with next four S16 values of row n
;// $dBlo           - Input D register with first four S16 values of row n+1
;// $dBhi           - Input D register with next four S16 values of row n+1
;// pScale          - Pointer to next row of scale values
;// qT0lo           - Temporary scratch register
;// qT0hi           - Temporary scratch register
;// qT1lo           - Temporary scratch register
;// qT1hi           - Temporary scratch register
;// dScale1lo       - Scale value of row n
;// dScale1hi       - Scale value of row n
;// dScale2lo       - Scale value of row n+1
;// dScale2hi       - Scale value of row n+1
;//
;// Input Flag
;//
;// $LastRow        - Flag to indicate whether current row is last row
;//                   (the scale registers are reloaded only when it is "0")
;//
;// Output Registers:
;//
;// $dAlo           - Scaled output values (first four S16 of row n)
;// $dAhi           - Scaled output values (next four S16 of row n)
;// $dBlo           - Scaled output values (first four S16 of row n+1)
;// $dBhi           - Scaled output values (next four S16 of row n+1)
;// qScale1         - Scale values for next row
;// qScale2         - Scale values for next row+1
;// pScale          - Pointer to next row of scale values
;//
        MACRO
        M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
        ;// Widening multiplies: all four products are formed before the scale
        ;// registers are reloaded below, because the loads overwrite dScale1/dScale2.
        VMULL       qT0lo, $dAlo, dScale1lo
        VMULL       qT0hi, $dAhi, dScale1hi
        VMULL       qT1lo, $dBlo, dScale2lo
        VMULL       qT1hi, $dBhi, dScale2hi
        IF "$LastRow"="0"
            VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2
            VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
        ENDIF
        ;// Rounding, saturating narrow back to S16 with a right shift of 12
        VQRSHRN       $dAlo, qT0lo, #12
        VQRSHRN       $dAhi, qT0hi, #12
        VQRSHRN       $dBlo, qT1lo, #12
        VQRSHRN       $dBhi, qT1hi, #12
        MEND

;// Scale 8x8 block input values with 16 bit scale values
;//
;// This macro is used to pre-scale block of 8x8 input.
;// This also does the first-stage (IStage 4) transformations of the IDCT.
;//
;// Input Registers:
;//
;// dXjnlo      - n th input D register with first four S16 values
;// dXjnhi      - n th input D register with next four S16 values
;// qXjn        - n th input Q register with eight S16 values
;// pScale      - Pointer to scale values (eight rows of 16 bytes are read)
;//
;// Output Registers:
;//
;// qXin        - n th output Q register with eight S16 output values of
;//               the first stage. This macro writes qXi2..qXi7 explicitly.
;//               NOTE(review): qXi0 and qXi1 are not written here;
;//               presumably they alias the scaled rows j0 and j4 - confirm
;//               against the register definitions.
;// pSrc        - Address of armCOMM_IDCTCoef
;// dCoefs      - Inverse AAN constants loaded from armCOMM_IDCTCoef
;// pScale      - Advanced past the eight rows of scale values
;//
        MACRO
        M_IDCT_PRESCALE16

        ;// Prime the scale registers for the first row pair
        VLD1        qScale1, [pScale], #16      ;// Pre scale values for row 0
        VLD1        qScale2, [pScale], #16      ;// Pre scale values for row 1

        ;// Scale the block two rows at a time; only the last call skips
        ;// the fetch of further scale values
        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Rows 0 & 1
        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Rows 2 & 3
        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Rows 4 & 5
        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Rows 6 & 7 (last)

        ;// First IDCT stage: sums are halved (VHADD), differences are not.
        ;// The constant-table address and load are interleaved with the
        ;// arithmetic.
        VHADD       qXi5, qXj1, qXj7            ;// i5 = (j1+j7)/2
        VSUB        qXi6, qXj1, qXj7            ;// i6 = j1-j7
        LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
        VHADD       qXi3, qXj2, qXj6            ;// i3 = (j2+j6)/2
        VSUB        qXi2, qXj2, qXj6            ;// i2 = j2-j6
        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
        VHADD       qXi7, qXj5, qXj3            ;// i7 = (j5+j3)/2
        VSUB        qXi4, qXj5, qXj3            ;// i4 = j5-j3
        MEND


;// Scale 8x8 block input values with 32 bit scale values
;//
;// This macro pre-scales a block of 8x8 input values.
;// It also performs the first stage of the IDCT transformation.
;//
;// Input Registers:
;//
;// dXjnlo      - n th input D register with first four S16 values
;// dXjnhi      - n th input D register with next four S16 values
;// qXjn        - n th input Q register with eight S16 values
;// pScale      - Pointer to 32bit scale values in Q23 format
;//               (rows of eight 32 bit values, i.e. 32 bytes per row)
;// pSrc        - Used to reload row 4 of the input, see note in the body
;//
;// Output Registers:
;//
;// dXinlo      - n th output D register with first four S16 output values of 1st stage
;// dXinhi      - n th output D register with next four S16 output values of 1st stage
;// pSrc        - Address of armCOMM_IDCTCoef
;// dCoefs      - Inverse AAN constants loaded from armCOMM_IDCTCoef
;//
;// Scratch:
;//
;// pTemp       - Walks the scale rows 7, 6, 5 backwards
;// pScale      - Walks the scale rows 0, 1, 2, 3, 4 forwards
;// Q0..Q6 and the Q register numbered Src4 (see aliases below)
;//
;// Scaling of one value x with scale s:
;//     x is widened to 32 bits and shifted left by (12-1),
;//     VQRDMULH keeps the rounded high half of the doubled product,
;//     so the net result is (x * s) >> 20 (rounded, saturating),
;//     which is finally narrowed back to 16 bits.
;//
        MACRO
        M_IDCT_PRESCALE32

;// --- Physical scratch registers -------------------------------------
qScale0lo       QN 0.S32
qScale0hi       QN 1.S32
qScale1lo       QN 2.S32
qScale1hi       QN 3.S32
qSrc0lo         QN 4.S32
qSrc0hi         QN 5.S32
qSrc1lo         QN 6.S32
;// NOTE(review): Src4 is defined outside this macro. Presumably it is the
;// register number that also holds qXj4, which would explain why row 4
;// is reloaded from memory further down - confirm.
qSrc1hi         QN Src4.S32

;// --- Scale rows sharing the qScale1 pair (Q2/Q3) --------------------
qScale2lo       QN qScale1lo
qScale2hi       QN qScale1hi
qScale3lo       QN qScale1lo
qScale3hi       QN qScale1hi
qScale4lo       QN qScale1lo
qScale4hi       QN qScale1hi

;// --- Scale rows sharing the qScale0 pair (Q0/Q1) --------------------
qScale5lo       QN qScale0lo
qScale5hi       QN qScale0hi
qScale6lo       QN qScale0lo
qScale6hi       QN qScale0hi
qScale7lo       QN qScale0lo
qScale7hi       QN qScale0hi

;// --- Widened source rows sharing the qSrc0 pair (Q4/Q5) -------------
qSrc2lo         QN qSrc0lo
qSrc2hi         QN qSrc0hi
qSrc3lo         QN qSrc0lo
qSrc3hi         QN qSrc0hi
qSrc4lo         QN qSrc0lo
qSrc4hi         QN qSrc0hi
qSrc7lo         QN qSrc0lo
qSrc7hi         QN qSrc0hi

;// --- Widened source rows sharing the qSrc1 pair ---------------------
qSrc5lo         QN qSrc1lo
qSrc5hi         QN qSrc1hi
qSrc6lo         QN qSrc1lo
qSrc6hi         QN qSrc1hi

;// --- Butterfly results reuse the qScale0 pair (Q0/Q1) ---------------
qRes17lo        QN qScale0lo
qRes17hi        QN qScale0hi
qRes26lo        QN qScale0lo
qRes26hi        QN qScale0hi
qRes53lo        QN qScale0lo
qRes53hi        QN qScale0hi

        ADD         pTemp, pScale, #4*8*7           ;// Address of pScale[7] (row 7, 32 bytes per row)

        ;// ---- Scale row 0; start scaling rows 1 and 7 ----
        VLD1        {qScale0lo, qScale0hi}, [pScale]!       ;// Scale row 0
        VSHLL       qSrc0lo, dXj0lo, #(12-1)
        VSHLL       qSrc0hi, dXj0hi, #(12-1)
        VLD1        {qScale1lo, qScale1hi}, [pScale]!       ;// Scale row 1
        VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo
        VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
        VLD1        {qScale7lo, qScale7hi}, [pTemp]!        ;// Scale row 7 (overwrites scale row 0)
        VSHLL       qSrc1lo, dXj1lo, #(12-1)
        VSHLL       qSrc1hi, dXj1hi, #(12-1)
        VMOVN       dXi0lo, qSrc0lo                 ;// Output i0 = scaled j0
        VMOVN       dXi0hi, qSrc0hi
        VSHLL       qSrc7lo, dXj7lo, #(12-1)        ;// qSrc7 reuses the qSrc0 pair, free after i0 is stored
        VSHLL       qSrc7hi, dXj7hi, #(12-1)
        ;// Undo the 32 byte writeback and step one row back: pTemp -> scale row 6
        SUB         pTemp, pTemp, #((16*2)+(4*8*1))
        VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
        VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
        VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
        VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
        VLD1        {qScale2lo, qScale2hi}, [pScale]!       ;// Scale row 2

        ;// ---- Butterfly of rows 1 & 7; start scaling rows 2 and 6 ----
        VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
        VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
        VMOVN       dXi5lo, qRes17lo                ;// Output i5
        VMOVN       dXi5hi, qRes17hi
        VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
        VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
        VMOVN       dXi6lo, qRes17lo                ;// Output i6
        VMOVN       dXi6hi, qRes17hi
        VSHLL       qSrc2lo, dXj2lo, #(12-1)
        VSHLL       qSrc2hi, dXj2hi, #(12-1)
        VLD1        {qScale6lo, qScale6hi}, [pTemp]!        ;// Scale row 6
        VSHLL       qSrc6lo, dXj6lo, #(12-1)
        VSHLL       qSrc6hi, dXj6hi, #(12-1)
        ;// Undo the 32 byte writeback and step one row back: pTemp -> scale row 5
        SUB         pTemp, pTemp, #((16*2)+(4*8*1))
        VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
        VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
        VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
        VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
        VLD1        {qScale3lo, qScale3hi}, [pScale]!       ;// Scale row 3

        ;// ---- Butterfly of rows 2 & 6; start scaling rows 3 and 5 ----
        VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
        VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
        VMOVN       dXi3lo, qRes26lo                ;// Output i3
        VMOVN       dXi3hi, qRes26hi
        VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
        VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
        VMOVN       dXi2lo, qRes26lo                ;// Output i2
        VMOVN       dXi2hi, qRes26hi
        VSHLL       qSrc3lo, dXj3lo, #(12-1)
        VSHLL       qSrc3hi, dXj3hi, #(12-1)
        VLD1        {qScale5lo, qScale5hi}, [pTemp]!        ;// Scale row 5 (pTemp is not used afterwards)
        VSHLL       qSrc5lo, dXj5lo, #(12-1)
        VSHLL       qSrc5hi, dXj5hi, #(12-1)
        VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
        VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
        VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
        VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi

        ;// ---- Butterfly of rows 3 & 5; reload and scale row 4 ----
        VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
        VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
        ;// Step pSrc back by 64 bytes for the row 4 reload below.
        ;// NOTE(review): presumably pSrc was left just past the 8x8 S16
        ;// source block by the caller, so -64 bytes addresses row 4 - confirm.
        SUB         pSrc, pSrc, #16*2*2
        VMOVN       dXi7lo, qRes53lo                ;// Output i7
        VMOVN       dXi7hi, qRes53hi
        VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
        VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
        VLD1        qXj4, [pSrc @64]                ;// Reload input row 4 from memory
        VMOVN       dXi4lo, qRes53lo                ;// Output i4
        VMOVN       dXi4hi, qRes53hi
        VSHLL       qSrc4lo, dXj4lo, #(12-1)
        VSHLL       qSrc4hi, dXj4hi, #(12-1)
        VLD1        {qScale4lo, qScale4hi}, [pScale]        ;// Scale row 4 (no writeback, last use of pScale)
        LDR         pSrc, =armCOMM_IDCTCoef         ;// Address of DCT inverse AAN constants
        VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
        VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
        VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants

        ;// ---- Row 4 ----
        VMOVN       dXi1lo, qSrc4lo                 ;// Output i1 = scaled j4
        VMOVN       dXi1hi, qSrc4hi

        MEND

        END