1 ;// 2 ;// Copyright (C) 2004 ARM Limited 3 ;// 4 ;// Licensed under the Apache License, Version 2.0 (the "License"); 5 ;// you may not use this file except in compliance with the License. 6 ;// You may obtain a copy of the License at 7 ;// 8 ;// http://www.apache.org/licenses/LICENSE-2.0 9 ;// 10 ;// Unless required by applicable law or agreed to in writing, software 11 ;// distributed under the License is distributed on an "AS IS" BASIS, 12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 ;// See the License for the specific language governing permissions and 14 ;// limitations under the License. 15 ;// 16 ;// 17 ;// 18 ;// IDCT_s.s 19 ;// 20 ;// Inverse DCT module 21 ;// 22 ;// 23 ;// ALGORITHM DESCRIPTION 24 ;// 25 ;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each 26 ;// column and then a 1D IDCT for each row. 27 ;// 28 ;// The 8-point 1D IDCT is defined by 29 ;// f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2 30 ;// 31 ;// C(u) = 1/sqrt(2) if u=0 or 1 if u!=0 32 ;// c(u,x) = cos( (2x+1)*u*pi/16 ) 33 ;// 34 ;// We compute the 8-point 1D IDCT using the reverse of 35 ;// the Arai-Agui-Nakajima flow graph which we split into 36 ;// 5 stages named in reverse order to identify with the 37 ;// forward DCT. 
Direct inversion of the forward formulae 38 ;// in file FDCT_s.s gives: 39 ;// 40 ;// IStage 5: j(u) = T(u)*A(u) [ A(u)=4*C(u)*c(u,0) ] 41 ;// [ A(0) = 2*sqrt(2) 42 ;// A(u) = 4*cos(u*pi/16) for (u!=0) ] 43 ;// 44 ;// IStage 4: i0 = j0 i1 = j4 45 ;// i3 = (j2+j6)/2 i2 = (j2-j6)/2 46 ;// i7 = (j5+j3)/2 i4 = (j5-j3)/2 47 ;// i5 = (j1+j7)/2 i6 = (j1-j7)/2 48 ;// 49 ;// IStage 3: h0 = (i0+i1)/2 h1 = (i0-i1)/2 50 ;// h2 = (i2*sqrt2)-i3 h3 = i3 51 ;// h4 = cos(pi/8)*i4 + sin(pi/8)*i6 52 ;// h6 = -sin(pi/8)*i4 + cos(pi/8)*i6 53 ;// [ The above two lines rotate by -(pi/8) ] 54 ;// h5 = (i5-i7)/sqrt2 h7 = (i5+i7)/2 55 ;// 56 ;// IStage 2: g0 = (h0+h3)/2 g3 = (h0-h3)/2 57 ;// g1 = (h1+h2)/2 g2 = (h1-h2)/2 58 ;// g7 = h7 g6 = h6 - h7 59 ;// g5 = h5 - g6 g4 = h4 - g5 60 ;// 61 ;// IStage 1: f0 = (g0+g7)/2 f7 = (g0-g7)/2 62 ;// f1 = (g1+g6)/2 f6 = (g1-g6)/2 63 ;// f2 = (g2+g5)/2 f5 = (g2-g5)/2 64 ;// f3 = (g3+g4)/2 f4 = (g3-g4)/2 65 ;// 66 ;// Note that most coefficients are halved 3 times during the 67 ;// above calculation. We can rescale the algorithm dividing 68 ;// the input by 8 to remove the halvings. 69 ;// 70 ;// IStage 5: j(u) = T(u)*A(u)/8 71 ;// 72 ;// IStage 4: i0 = j0 i1 = j4 73 ;// i3 = j2 + j6 i2 = j2 - j6 74 ;// i7 = j5 + j3 i4 = j5 - j3 75 ;// i5 = j1 + j7 i6 = j1 - j7 76 ;// 77 ;// IStage 3: h0 = i0 + i1 h1 = i0 - i1 78 ;// h2 = (i2*sqrt2)-i3 h3 = i3 79 ;// h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6) 80 ;// h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6) 81 ;// h5 = (i5-i7)*sqrt2 h7 = i5 + i7 82 ;// 83 ;// IStage 2: g0 = h0 + h3 g3 = h0 - h3 84 ;// g1 = h1 + h2 g2 = h1 - h2 85 ;// g7 = h7 g6 = h6 - h7 86 ;// g5 = h5 - g6 g4 = h4 - g5 87 ;// 88 ;// IStage 1: f0 = g0 + g7 f7 = g0 - g7 89 ;// f1 = g1 + g6 f6 = g1 - g6 90 ;// f2 = g2 + g5 f5 = g2 - g5 91 ;// f3 = g3 + g4 f4 = g3 - g4 92 ;// 93 ;// Note: 94 ;// 1. The scaling by A(u)/8 can often be combined with inverse 95 ;// quantization. The column and row scalings can be combined. 96 ;// 2. 
;//        The flowgraph in the AAN paper has h4,g6 negated compared
;//        to the above code but is otherwise identical.
;//     3. The rotation by -pi/8 can be performed using three multiplies
;//        Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
;//           -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
;//     4. If |T(u)|<=1 then from the IDCT definition,
;//        |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
;//                = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
;//                = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
;//                = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
;//                = (approx)2.64
;//        So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
;//        The table below shows input patterns generating the maximum
;//        value of |f(u)| for input in the range |T(x)|<=1. M=-1, P=+1
;//        InputPattern   Max |f(x)|
;//        PPPPPPPP       |f0| = 2.64
;//        PPPMMMMM       |f1| = 2.64
;//        PPMMMPPP       |f2| = 2.64
;//        PPMMPPMM       |f3| = 2.64
;//        PMMPPMMP       |f4| = 2.64
;//        PMMPMMPM       |f5| = 2.64
;//        PMPPMPMP       |f6| = 2.64
;//        PMPMPMPM       |f7| = 2.64
;//        Note that this input pattern is the transpose of the
;//        corresponding max input pattern for the FDCT.
121 122 ;// Arguments 123 124 pSrc RN 0 ;// source data buffer 125 Stride RN 1 ;// destination stride in bytes 126 pDest RN 2 ;// destination data buffer 127 pScale RN 3 ;// pointer to scaling table 128 129 130 ;// DCT Inverse Macro 131 ;// The DCT code should be parametrized according 132 ;// to the following inputs: 133 ;// $outsize = "u8" : 8-bit unsigned data saturated (0 to +255) 134 ;// "s9" : 16-bit signed data saturated to 9-bit (-256 to +255) 135 ;// "s16" : 16-bit signed data not saturated (max size ~+/-14273) 136 ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment 137 ;// "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment 138 ;// 139 ;// Inputs: 140 ;// pSrc = r0 = Pointer to input data 141 ;// Range is -256 to +255 (9-bit) 142 ;// Stride = r1 = Stride between input lines 143 ;// pDest = r2 = Pointer to output data 144 ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale 145 146 147 148 MACRO 149 M_IDCT $outsize, $inscale, $stride 150 LCLA SHIFT 151 152 153 IF ARM1136JS 154 155 ;// REGISTER ALLOCATION 156 ;// This is hard since we have 8 values, 9 free registers and each 157 ;// butterfly requires a temporary register. We also want to 158 ;// maintain register order so we can use LDM/STM. The table below 159 ;// summarises the register allocation that meets all these criteria. 160 ;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above. 
161 ;// 162 ;// r1 a01 g0 h0 163 ;// r4 b01 f0 g1 h1 i0 164 ;// r5 a23 f1 g2 i1 165 ;// r6 b23 f2 g3 h2 i2 166 ;// r7 a45 f3 h3 i3 167 ;// r8 b45 f4 g4 h4 i4 168 ;// r9 a67 f5 g5 h5 i5 169 ;// r10 b67 f6 g6 h6 i6 170 ;// r11 f7 g7 h7 i7 171 ;// 172 ra01 RN 1 173 rb01 RN 4 174 ra23 RN 5 175 rb23 RN 6 176 ra45 RN 7 177 rb45 RN 8 178 ra67 RN 9 179 rb67 RN 10 180 rtmp RN 11 181 csPiBy8 RN 12 ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ] 182 LoopRR2 RN 14 ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ] 183 ;// Transpose allocation 184 xft RN ra01 185 xf0 RN rb01 186 xf1 RN ra23 187 xf2 RN rb23 188 xf3 RN ra45 189 xf4 RN rb45 190 xf5 RN ra67 191 xf6 RN rb67 192 xf7 RN rtmp 193 ;// IStage 1 allocation 194 xg0 RN xft 195 xg1 RN xf0 196 xg2 RN xf1 197 xg3 RN xf2 198 xgt RN xf3 199 xg4 RN xf4 200 xg5 RN xf5 201 xg6 RN xf6 202 xg7 RN xf7 203 ;// IStage 2 allocation 204 xh0 RN xg0 205 xh1 RN xg1 206 xht RN xg2 207 xh2 RN xg3 208 xh3 RN xgt 209 xh4 RN xg4 210 xh5 RN xg5 211 xh6 RN xg6 212 xh7 RN xg7 213 ;// IStage 3,4 allocation 214 xit RN xh0 215 xi0 RN xh1 216 xi1 RN xht 217 xi2 RN xh2 218 xi3 RN xh3 219 xi4 RN xh4 220 xi5 RN xh5 221 xi6 RN xh6 222 xi7 RN xh7 223 224 M_STR pDest, ppDest 225 IF "$stride"="s" 226 M_STR Stride, pStride 227 ENDIF 228 M_ADR pDest, pBlk 229 LDR csPiBy8, =0x30fc7642 230 LDR LoopRR2, =0x00005a82 231 232 v6_idct_col$_F 233 ;// Load even values 234 LDR xi4, [pSrc], #4 ;// j0 235 LDR xi5, [pSrc, #4*16-4] ;// j4 236 LDR xi6, [pSrc, #2*16-4] ;// j2 237 LDR xi7, [pSrc, #6*16-4] ;// j6 238 239 ;// Scale Even Values 240 IF "$inscale"="s16" ;// 16x16 mul 241 SHIFT SETA 12 242 LDR xi0, [pScale], #4 243 LDR xi1, [pScale, #4*16-4] 244 LDR xi2, [pScale, #2*16-4] 245 MOV xit, #1<<(SHIFT-1) 246 SMLABB xi3, xi0, xi4, xit 247 SMLATT xi4, xi0, xi4, xit 248 SMLABB xi0, xi1, xi5, xit 249 SMLATT xi5, xi1, xi5, xit 250 MOV xi3, xi3, ASR #SHIFT 251 PKHBT xi4, xi3, xi4, LSL #(16-SHIFT) 252 LDR xi3, [pScale, #6*16-4] 253 SMLABB xi1, xi2, xi6, xit 254 SMLATT xi6, xi2, xi6, xit 255 MOV 
xi0, xi0, ASR #SHIFT 256 PKHBT xi5, xi0, xi5, LSL #(16-SHIFT) 257 SMLABB xi2, xi3, xi7, xit 258 SMLATT xi7, xi3, xi7, xit 259 MOV xi1, xi1, ASR #SHIFT 260 PKHBT xi6, xi1, xi6, LSL #(16-SHIFT) 261 MOV xi2, xi2, ASR #SHIFT 262 PKHBT xi7, xi2, xi7, LSL #(16-SHIFT) 263 ENDIF 264 IF "$inscale"="s32" ;// 32x16 mul 265 SHIFT SETA (12+8-16) 266 MOV xit, #1<<(SHIFT-1) 267 LDR xi0, [pScale], #8 268 LDR xi1, [pScale, #0*32+4-8] 269 LDR xi2, [pScale, #4*32-8] 270 LDR xi3, [pScale, #4*32+4-8] 271 SMLAWB xi0, xi0, xi4, xit 272 SMLAWT xi1, xi1, xi4, xit 273 SMLAWB xi2, xi2, xi5, xit 274 SMLAWT xi3, xi3, xi5, xit 275 MOV xi0, xi0, ASR #SHIFT 276 PKHBT xi4, xi0, xi1, LSL #(16-SHIFT) 277 MOV xi2, xi2, ASR #SHIFT 278 PKHBT xi5, xi2, xi3, LSL #(16-SHIFT) 279 LDR xi0, [pScale, #2*32-8] 280 LDR xi1, [pScale, #2*32+4-8] 281 LDR xi2, [pScale, #6*32-8] 282 LDR xi3, [pScale, #6*32+4-8] 283 SMLAWB xi0, xi0, xi6, xit 284 SMLAWT xi1, xi1, xi6, xit 285 SMLAWB xi2, xi2, xi7, xit 286 SMLAWT xi3, xi3, xi7, xit 287 MOV xi0, xi0, ASR #SHIFT 288 PKHBT xi6, xi0, xi1, LSL #(16-SHIFT) 289 MOV xi2, xi2, ASR #SHIFT 290 PKHBT xi7, xi2, xi3, LSL #(16-SHIFT) 291 ENDIF 292 293 ;// Load odd values 294 LDR xi0, [pSrc, #1*16-4] ;// j1 295 LDR xi1, [pSrc, #7*16-4] ;// j7 296 LDR xi2, [pSrc, #5*16-4] ;// j5 297 LDR xi3, [pSrc, #3*16-4] ;// j3 298 299 IF {TRUE} 300 ;// shortcut if odd values 0 301 TEQ xi0, #0 302 TEQEQ xi1, #0 303 TEQEQ xi2, #0 304 TEQEQ xi3, #0 305 BEQ v6OddZero$_F 306 ENDIF 307 308 ;// Store scaled even values 309 STMIA pDest, {xi4, xi5, xi6, xi7} 310 311 ;// Scale odd values 312 IF "$inscale"="s16" 313 ;// Perform AAN Scale 314 LDR xi4, [pScale, #1*16-4] 315 LDR xi5, [pScale, #7*16-4] 316 LDR xi6, [pScale, #5*16-4] 317 SMLABB xi7, xi0, xi4, xit 318 SMLATT xi0, xi0, xi4, xit 319 SMLABB xi4, xi1, xi5, xit 320 SMLATT xi1, xi1, xi5, xit 321 MOV xi7, xi7, ASR #SHIFT 322 PKHBT xi0, xi7, xi0, LSL #(16-SHIFT) 323 LDR xi7, [pScale, #3*16-4] 324 SMLABB xi5, xi2, xi6, xit 325 SMLATT xi2, xi2, xi6, xit 326 
MOV xi4, xi4, ASR #SHIFT 327 PKHBT xi1, xi4, xi1, LSL #(16-SHIFT) 328 SMLABB xi6, xi3, xi7, xit 329 SMLATT xi3, xi3, xi7, xit 330 MOV xi5, xi5, ASR #SHIFT 331 PKHBT xi2, xi5, xi2, LSL #(16-SHIFT) 332 MOV xi6, xi6, ASR #SHIFT 333 PKHBT xi3, xi6, xi3, LSL #(16-SHIFT) 334 ENDIF 335 IF "$inscale"="s32" ;// 32x16 mul 336 LDR xi4, [pScale, #1*32-8] 337 LDR xi5, [pScale, #1*32+4-8] 338 LDR xi6, [pScale, #7*32-8] 339 LDR xi7, [pScale, #7*32+4-8] 340 SMLAWB xi4, xi4, xi0, xit 341 SMLAWT xi5, xi5, xi0, xit 342 SMLAWB xi6, xi6, xi1, xit 343 SMLAWT xi7, xi7, xi1, xit 344 MOV xi4, xi4, ASR #SHIFT 345 PKHBT xi0, xi4, xi5, LSL #(16-SHIFT) 346 MOV xi6, xi6, ASR #SHIFT 347 PKHBT xi1, xi6, xi7, LSL #(16-SHIFT) 348 LDR xi4, [pScale, #5*32-8] 349 LDR xi5, [pScale, #5*32+4-8] 350 LDR xi6, [pScale, #3*32-8] 351 LDR xi7, [pScale, #3*32+4-8] 352 SMLAWB xi4, xi4, xi2, xit 353 SMLAWT xi5, xi5, xi2, xit 354 SMLAWB xi6, xi6, xi3, xit 355 SMLAWT xi7, xi7, xi3, xit 356 MOV xi4, xi4, ASR #SHIFT 357 PKHBT xi2, xi4, xi5, LSL #(16-SHIFT) 358 MOV xi6, xi6, ASR #SHIFT 359 PKHBT xi3, xi6, xi7, LSL #(16-SHIFT) 360 ENDIF 361 362 LDR xit, =0x00010001 ;// rounding constant 363 SADD16 xi5, xi0, xi1 ;// (j1+j7)/2 364 SHADD16 xi5, xi5, xit 365 366 SSUB16 xi6, xi0, xi1 ;// j1-j7 367 SADD16 xi7, xi2, xi3 ;// (j5+j3)/2 368 SHADD16 xi7, xi7, xit 369 370 SSUB16 xi4, xi2, xi3 ;// j5-j3 371 372 SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2 373 374 PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a 375 PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b 376 377 SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s] 378 SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s] 379 SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c] 380 SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c] 381 382 SMULBB xi1, xi3, LoopRR2 383 SMULTB xi3, xi3, LoopRR2 384 385 PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4 386 PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4 387 SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4 388 389 ;// xi0,xi1,xi2,xi3 now free 390 ;// IStage 4,3, rows 2to3 x1/2 391 392 MOV xi3, xi3, LSL #1 393 PKHTB 
xh5, xi3, xi1, ASR#15 ;// h5/4 394 LDRD xi0, [pDest, #8] ;// j2,j6 scaled 395 396 ;// IStage 2, rows4to7 397 SSUB16 xg6, xh6, xh7 398 SSUB16 xg5, xh5, xg6 399 SSUB16 xg4, xh4, xg5 400 401 SSUB16 xi2, xi0, xi1 ;// (j2-j6) 402 403 SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2 404 405 SMULBB xi0, xi2, LoopRR2 406 SMULTB xi2, xi2, LoopRR2 407 408 MOV xi2, xi2, LSL #1 409 PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4 410 411 ;// xi0, xi1 now free 412 ;// IStage 4,3 rows 0to1 x 1/2 413 LDRD xi0, [pDest] ;// j0, j4 scaled 414 SSUB16 xh2, xh2, xi3 415 ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows 416 417 SHADD16 xh0, xi0, xi1 418 SHSUB16 xh1, xi0, xi1 419 420 ;// IStage 2 rows 0to3 x 1/2 421 SHSUB16 xg2, xh1, xh2 422 SHADD16 xg1, xh1, xh2 423 SHSUB16 xg3, xh0, xh3 424 SHADD16 xg0, xh0, xh3 425 426 ;// IStage 1 all rows 427 SADD16 xf3, xg3, xg4 428 SSUB16 xf4, xg3, xg4 429 SADD16 xf2, xg2, xg5 430 SSUB16 xf5, xg2, xg5 431 SADD16 xf1, xg1, xg6 432 SSUB16 xf6, xg1, xg6 433 SADD16 xf0, xg0, xg7 434 SSUB16 xf7, xg0, xg7 435 436 ;// Transpose, store and loop 437 PKHBT ra01, xf0, xf1, LSL #16 438 PKHTB rb01, xf1, xf0, ASR #16 439 440 PKHBT ra23, xf2, xf3, LSL #16 441 PKHTB rb23, xf3, xf2, ASR #16 442 443 PKHBT ra45, xf4, xf5, LSL #16 444 PKHTB rb45, xf5, xf4, ASR #16 445 446 PKHBT ra67, xf6, xf7, LSL #16 447 STMIA pDest!, {ra01, ra23, ra45, ra67} 448 PKHTB rb67, xf7, xf6, ASR #16 449 STMIA pDest!, {rb01, rb23, rb45, rb67} 450 BCC v6_idct_col$_F 451 452 SUB pSrc, pDest, #(64*2) 453 M_LDR pDest, ppDest 454 IF "$stride"="s" 455 M_LDR pScale, pStride 456 ENDIF 457 B v6_idct_row$_F 458 459 v6OddZero$_F 460 SSUB16 xi2, xi6, xi7 ;// (j2-j6) 461 SHADD16 xi3, xi6, xi7 ;// (j2+j6)/2 462 463 SMULBB xi0, xi2, LoopRR2 464 SMULTB xi2, xi2, LoopRR2 465 466 MOV xi2, xi2, LSL #1 467 PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4 468 SSUB16 xh2, xh2, xi3 469 470 ;// xi0, xi1 now free 471 ;// IStage 4,3 rows 0to1 x 1/2 472 473 SHADD16 xh0, xi4, xi5 474 SHSUB16 xh1, xi4, xi5 475 476 ;// IStage 2 rows 0to3 x 
1/2 477 SHSUB16 xg2, xh1, xh2 478 SHADD16 xg1, xh1, xh2 479 SHSUB16 xg3, xh0, xh3 480 SHADD16 xg0, xh0, xh3 481 482 ;// IStage 1 all rows 483 MOV xf3, xg3 484 MOV xf4, xg3 485 MOV xf2, xg2 486 MOV xf5, xg2 487 MOV xf1, xg1 488 MOV xf6, xg1 489 MOV xf0, xg0 490 MOV xf7, xg0 491 492 ;// Transpose 493 PKHBT ra01, xf0, xf1, LSL #16 494 PKHTB rb01, xf1, xf0, ASR #16 495 496 PKHBT ra23, xf2, xf3, LSL #16 497 PKHTB rb23, xf3, xf2, ASR #16 498 499 PKHBT ra45, xf4, xf5, LSL #16 500 PKHTB rb45, xf5, xf4, ASR #16 501 502 PKHBT ra67, xf6, xf7, LSL #16 503 PKHTB rb67, xf7, xf6, ASR #16 504 505 STMIA pDest!, {ra01, ra23, ra45, ra67} 506 ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows 507 STMIA pDest!, {rb01, rb23, rb45, rb67} 508 509 BCC v6_idct_col$_F 510 SUB pSrc, pDest, #(64*2) 511 M_LDR pDest, ppDest 512 IF "$stride"="s" 513 M_LDR pScale, pStride 514 ENDIF 515 516 517 v6_idct_row$_F 518 ;// IStage 4,3, rows4to7 x1/4 519 LDR xit, =0x00010001 ;// rounding constant 520 LDR xi0, [pSrc, #1*16] ;// j1 521 LDR xi1, [pSrc, #7*16] ;// 4*j7 522 LDR xi2, [pSrc, #5*16] ;// j5 523 LDR xi3, [pSrc, #3*16] ;// j3 524 525 SHADD16 xi1, xi1, xit ;// 2*j7 526 SHADD16 xi1, xi1, xit ;// j7 527 528 SHADD16 xi5, xi0, xi1 ;// (j1+j7)/2 529 SSUB16 xi6, xi0, xi1 ;// j1-j7 530 SHADD16 xi7, xi2, xi3 ;// (j5+j3)/2 531 SSUB16 xi4, xi2, xi3 ;// j5-j3 532 533 SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2 534 535 PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a 536 PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b 537 538 SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s] 539 SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s] 540 SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c] 541 SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c] 542 543 SMULBB xi1, xi3, LoopRR2 544 SMULTB xi3, xi3, LoopRR2 545 546 PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4 547 PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4 548 SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4 549 550 MOV xi3, xi3, LSL #1 551 PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4 552 553 ;// xi0,xi1,xi2,xi3 now free 554 ;// IStage 4,3, rows 2to3 x1/2 
555 556 LDR xi0, [pSrc, #2*16] ;// j2 557 LDR xi1, [pSrc, #6*16] ;// 2*j6 558 559 ;// IStage 2, rows4to7 560 SSUB16 xg6, xh6, xh7 561 SSUB16 xg5, xh5, xg6 562 SSUB16 xg4, xh4, xg5 563 564 SHADD16 xi1, xi1, xit ;// j6 565 SSUB16 xi2, xi0, xi1 ;// (j2-j6) 566 SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2 567 568 SMULBB xi0, xi2, LoopRR2 569 SMULTB xi2, xi2, LoopRR2 570 571 MOV xi2, xi2, LSL #1 572 573 PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4 574 575 ;// xi0, xi1 now free 576 ;// IStage 4,3 rows 0to1 x 1/2 577 LDR xi1, [pSrc, #4*16] ;// j4 578 LDR xi0, [pSrc], #4 ;// j0 579 580 SSUB16 xh2, xh2, xi3 581 ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows 582 583 ADD xi0, xi0, xit, LSL #2 ;// ensure correct round 584 SHADD16 xh0, xi0, xi1 ;// of DC result 585 SHSUB16 xh1, xi0, xi1 586 587 ;// IStage 2 rows 0to3 x 1/2 588 SHSUB16 xg2, xh1, xh2 589 SHADD16 xg1, xh1, xh2 590 SHSUB16 xg3, xh0, xh3 591 SHADD16 xg0, xh0, xh3 592 593 ;// IStage 1 all rows 594 SHADD16 xf3, xg3, xg4 595 SHSUB16 xf4, xg3, xg4 596 SHADD16 xf2, xg2, xg5 597 SHSUB16 xf5, xg2, xg5 598 SHADD16 xf1, xg1, xg6 599 SHSUB16 xf6, xg1, xg6 600 SHADD16 xf0, xg0, xg7 601 SHSUB16 xf7, xg0, xg7 602 603 ;// Saturate 604 IF ("$outsize"="u8") 605 USAT16 xf0, #8, xf0 606 USAT16 xf1, #8, xf1 607 USAT16 xf2, #8, xf2 608 USAT16 xf3, #8, xf3 609 USAT16 xf4, #8, xf4 610 USAT16 xf5, #8, xf5 611 USAT16 xf6, #8, xf6 612 USAT16 xf7, #8, xf7 613 ENDIF 614 IF ("$outsize"="s9") 615 SSAT16 xf0, #9, xf0 616 SSAT16 xf1, #9, xf1 617 SSAT16 xf2, #9, xf2 618 SSAT16 xf3, #9, xf3 619 SSAT16 xf4, #9, xf4 620 SSAT16 xf5, #9, xf5 621 SSAT16 xf6, #9, xf6 622 SSAT16 xf7, #9, xf7 623 ENDIF 624 625 ;// Transpose to Row, Pack and store 626 IF ("$outsize"="u8") 627 ORR xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ] 628 ORR xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ] 629 ORR xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ] 630 ORR xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ] 631 PKHBT ra01, xf0, xf2, LSL #16 632 PKHTB rb01, xf2, xf0, ASR #16 633 PKHBT ra23, xf4, xf6, LSL 
#16 634 PKHTB rb23, xf6, xf4, ASR #16 635 STMIA pDest, {ra01, ra23} 636 IF "$stride"="s" 637 ADD pDest, pDest, pScale 638 STMIA pDest, {rb01, rb23} 639 ADD pDest, pDest, pScale 640 ELSE 641 ADD pDest, pDest, #($stride) 642 STMIA pDest, {rb01, rb23} 643 ADD pDest, pDest, #($stride) 644 ENDIF 645 ENDIF 646 IF ("$outsize"="s9"):LOR:("$outsize"="s16") 647 PKHBT ra01, xf0, xf1, LSL #16 648 PKHTB rb01, xf1, xf0, ASR #16 649 650 PKHBT ra23, xf2, xf3, LSL #16 651 PKHTB rb23, xf3, xf2, ASR #16 652 653 PKHBT ra45, xf4, xf5, LSL #16 654 PKHTB rb45, xf5, xf4, ASR #16 655 656 PKHBT ra67, xf6, xf7, LSL #16 657 PKHTB rb67, xf7, xf6, ASR #16 658 659 STMIA pDest, {ra01, ra23, ra45, ra67} 660 IF "$stride"="s" 661 ADD pDest, pDest, pScale 662 STMIA pDest, {rb01, rb23, rb45, rb67} 663 ADD pDest, pDest, pScale 664 ELSE 665 ADD pDest, pDest, #($stride) 666 STMIA pDest, {rb01, rb23, rb45, rb67} 667 ADD pDest, pDest, #($stride) 668 ENDIF 669 ENDIF 670 671 BCC v6_idct_row$_F 672 ENDIF ;// ARM1136JS 673 674 675 IF CortexA8 676 677 Src0 EQU 7 678 Src1 EQU 8 679 Src2 EQU 9 680 Src3 EQU 10 681 Src4 EQU 11 682 Src5 EQU 12 683 Src6 EQU 13 684 Src7 EQU 14 685 Tmp EQU 15 686 687 qXj0 QN Src0.S16 688 qXj1 QN Src1.S16 689 qXj2 QN Src2.S16 690 qXj3 QN Src3.S16 691 qXj4 QN Src4.S16 692 qXj5 QN Src5.S16 693 qXj6 QN Src6.S16 694 qXj7 QN Src7.S16 695 qXjt QN Tmp.S16 696 697 dXj0lo DN (Src0*2).S16 698 dXj0hi DN (Src0*2+1).S16 699 dXj1lo DN (Src1*2).S16 700 dXj1hi DN (Src1*2+1).S16 701 dXj2lo DN (Src2*2).S16 702 dXj2hi DN (Src2*2+1).S16 703 dXj3lo DN (Src3*2).S16 704 dXj3hi DN (Src3*2+1).S16 705 dXj4lo DN (Src4*2).S16 706 dXj4hi DN (Src4*2+1).S16 707 dXj5lo DN (Src5*2).S16 708 dXj5hi DN (Src5*2+1).S16 709 dXj6lo DN (Src6*2).S16 710 dXj6hi DN (Src6*2+1).S16 711 dXj7lo DN (Src7*2).S16 712 dXj7hi DN (Src7*2+1).S16 713 dXjtlo DN (Tmp*2).S16 714 dXjthi DN (Tmp*2+1).S16 715 716 qXi0 QN qXj0 717 qXi1 QN qXj4 718 qXi2 QN qXj2 719 qXi3 QN qXj7 720 qXi4 QN qXj5 721 qXi5 QN qXjt 722 qXi6 QN qXj1 723 qXi7 QN qXj6 724 
qXit QN qXj3 725 726 dXi0lo DN dXj0lo 727 dXi0hi DN dXj0hi 728 dXi1lo DN dXj4lo 729 dXi1hi DN dXj4hi 730 dXi2lo DN dXj2lo 731 dXi2hi DN dXj2hi 732 dXi3lo DN dXj7lo 733 dXi3hi DN dXj7hi 734 dXi4lo DN dXj5lo 735 dXi4hi DN dXj5hi 736 dXi5lo DN dXjtlo 737 dXi5hi DN dXjthi 738 dXi6lo DN dXj1lo 739 dXi6hi DN dXj1hi 740 dXi7lo DN dXj6lo 741 dXi7hi DN dXj6hi 742 dXitlo DN dXj3lo 743 dXithi DN dXj3hi 744 745 qXh0 QN qXit 746 qXh1 QN qXi0 747 qXh2 QN qXi2 748 qXh3 QN qXi3 749 qXh4 QN qXi7 750 qXh5 QN qXi5 751 qXh6 QN qXi4 752 qXh7 QN qXi1 753 qXht QN qXi6 754 755 dXh0lo DN dXitlo 756 dXh0hi DN dXithi 757 dXh1lo DN dXi0lo 758 dXh1hi DN dXi0hi 759 dXh2lo DN dXi2lo 760 dXh2hi DN dXi2hi 761 dXh3lo DN dXi3lo 762 dXh3hi DN dXi3hi 763 dXh4lo DN dXi7lo 764 dXh4hi DN dXi7hi 765 dXh5lo DN dXi5lo 766 dXh5hi DN dXi5hi 767 dXh6lo DN dXi4lo 768 dXh6hi DN dXi4hi 769 dXh7lo DN dXi1lo 770 dXh7hi DN dXi1hi 771 dXhtlo DN dXi6lo 772 dXhthi DN dXi6hi 773 774 qXg0 QN qXh2 775 qXg1 QN qXht 776 qXg2 QN qXh1 777 qXg3 QN qXh0 778 qXg4 QN qXh4 779 qXg5 QN qXh5 780 qXg6 QN qXh6 781 qXg7 QN qXh7 782 qXgt QN qXh3 783 784 qXf0 QN qXg6 785 qXf1 QN qXg5 786 qXf2 QN qXg4 787 qXf3 QN qXgt 788 qXf4 QN qXg3 789 qXf5 QN qXg2 790 qXf6 QN qXg1 791 qXf7 QN qXg0 792 qXft QN qXg7 793 794 795 qXt0 QN 1.S32 796 qXt1 QN 2.S32 797 qT0lo QN 1.S32 798 qT0hi QN 2.S32 799 qT1lo QN 3.S32 800 qT1hi QN 4.S32 801 qScalelo QN 5.S32 ;// used to read post scale values 802 qScalehi QN 6.S32 803 qTemp0 QN 5.S32 804 qTemp1 QN 6.S32 805 806 807 Scale1 EQU 6 808 Scale2 EQU 15 809 qScale1 QN Scale1.S16 810 qScale2 QN Scale2.S16 811 dScale1lo DN (Scale1*2).S16 812 dScale1hi DN (Scale1*2+1).S16 813 dScale2lo DN (Scale2*2).S16 814 dScale2hi DN (Scale2*2+1).S16 815 816 dCoefs DN 0.S16 ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]} 817 InvSqrt2 DN dCoefs[0] ;// 1/sqrt(2) in Q15 818 S DN dCoefs[1] ;// Sin(PI/8) in Q15 819 C DN dCoefs[2] ;// Cos(PI/8) in Q15 820 821 pTemp RN 12 822 823 824 IMPORT armCOMM_IDCTCoef 825 826 VLD1 
{qXj0,qXj1}, [pSrc @64]! 827 VLD1 {qXj2,qXj3}, [pSrc @64]! 828 VLD1 {qXj4,qXj5}, [pSrc @64]! 829 VLD1 {qXj6,qXj7}, [pSrc @64]! 830 831 ;// Load PreScale and multiply with Src 832 ;// IStage 4 833 834 IF "$inscale"="s16" ;// 16X16 Mul 835 M_IDCT_PRESCALE16 836 ENDIF 837 838 IF "$inscale"="s32" ;// 32X32 ,ul 839 M_IDCT_PRESCALE32 840 ENDIF 841 842 ;// IStage 3 843 VQDMULH qXi2, qXi2, InvSqrt2 ;// i2/sqrt(2) 844 VHADD qXh0, qXi0, qXi1 ;// (i0+i1)/2 845 VHSUB qXh1, qXi0, qXi1 ;// (i0-i1)/2 846 VHADD qXh7, qXi5, qXi7 ;// (i5+i7)/4 847 VSUB qXh5, qXi5, qXi7 ;// (i5-i7)/2 848 VQDMULH qXh5, qXh5, InvSqrt2 ;// h5/sqrt(2) 849 VSUB qXh2, qXi2, qXi3 ;// h2, h3 850 851 VMULL qXt0, dXi4lo, C ;// c*i4 852 VMLAL qXt0, dXi6lo, S ;// c*i4+s*i6 853 VMULL qXt1, dXi4hi, C 854 VMLAL qXt1, dXi6hi, S 855 VSHRN dXh4lo, qXt0, #16 ;// h4 856 VSHRN dXh4hi, qXt1, #16 857 858 VMULL qXt0, dXi6lo, C ;// c*i6 859 VMLSL qXt0, dXi4lo, S ;// -s*i4 + c*h6 860 VMULL qXt1, dXi6hi, C 861 VMLSL qXt1, dXi4hi, S 862 VSHRN dXh6lo, qXt0, #16 ;// h6 863 VSHRN dXh6hi, qXt1, #16 864 865 ;// IStage 2 866 VSUB qXg6, qXh6, qXh7 867 VSUB qXg5, qXh5, qXg6 868 VSUB qXg4, qXh4, qXg5 869 VHADD qXg1, qXh1, qXh2 ;// (h1+h2)/2 870 VHSUB qXg2, qXh1, qXh2 ;// (h1-h2)/2 871 VHADD qXg0, qXh0, qXh3 ;// (h0+h3)/2 872 VHSUB qXg3, qXh0, qXh3 ;// (h0-h3)/2 873 874 ;// IStage 1 all rows 875 VADD qXf3, qXg3, qXg4 876 VSUB qXf4, qXg3, qXg4 877 VADD qXf2, qXg2, qXg5 878 VSUB qXf5, qXg2, qXg5 879 VADD qXf1, qXg1, qXg6 880 VSUB qXf6, qXg1, qXg6 881 VADD qXf0, qXg0, qXg7 882 VSUB qXf7, qXg0, qXg7 883 884 ;// Transpose, store and loop 885 XTR0 EQU Src5 886 XTR1 EQU Tmp 887 XTR2 EQU Src6 888 XTR3 EQU Src7 889 XTR4 EQU Src3 890 XTR5 EQU Src0 891 XTR6 EQU Src1 892 XTR7 EQU Src2 893 XTRt EQU Src4 894 895 qA0 QN XTR0.S32 ;// for XTRpose 896 qA1 QN XTR1.S32 897 qA2 QN XTR2.S32 898 qA3 QN XTR3.S32 899 qA4 QN XTR4.S32 900 qA5 QN XTR5.S32 901 qA6 QN XTR6.S32 902 qA7 QN XTR7.S32 903 904 dB0 DN XTR0*2+1 ;// for using VSWP 905 dB1 DN XTR1*2+1 906 dB2 
DN XTR2*2+1 907 dB3 DN XTR3*2+1 908 dB4 DN XTR4*2 909 dB5 DN XTR5*2 910 dB6 DN XTR6*2 911 dB7 DN XTR7*2 912 913 914 VTRN qXf0, qXf1 915 VTRN qXf2, qXf3 916 VTRN qXf4, qXf5 917 VTRN qXf6, qXf7 918 VTRN qA0, qA2 919 VTRN qA1, qA3 920 VTRN qA4, qA6 921 VTRN qA5, qA7 922 VSWP dB0, dB4 923 VSWP dB1, dB5 924 VSWP dB2, dB6 925 VSWP dB3, dB7 926 927 928 qYj0 QN qXf0 929 qYj1 QN qXf1 930 qYj2 QN qXf2 931 qYj3 QN qXf3 932 qYj4 QN qXf4 933 qYj5 QN qXf5 934 qYj6 QN qXf6 935 qYj7 QN qXf7 936 qYjt QN qXft 937 938 dYj0lo DN (XTR0*2).S16 939 dYj0hi DN (XTR0*2+1).S16 940 dYj1lo DN (XTR1*2).S16 941 dYj1hi DN (XTR1*2+1).S16 942 dYj2lo DN (XTR2*2).S16 943 dYj2hi DN (XTR2*2+1).S16 944 dYj3lo DN (XTR3*2).S16 945 dYj3hi DN (XTR3*2+1).S16 946 dYj4lo DN (XTR4*2).S16 947 dYj4hi DN (XTR4*2+1).S16 948 dYj5lo DN (XTR5*2).S16 949 dYj5hi DN (XTR5*2+1).S16 950 dYj6lo DN (XTR6*2).S16 951 dYj6hi DN (XTR6*2+1).S16 952 dYj7lo DN (XTR7*2).S16 953 dYj7hi DN (XTR7*2+1).S16 954 dYjtlo DN (XTRt*2).S16 955 dYjthi DN (XTRt*2+1).S16 956 957 qYi0 QN qYj0 958 qYi1 QN qYj4 959 qYi2 QN qYj2 960 qYi3 QN qYj7 961 qYi4 QN qYj5 962 qYi5 QN qYjt 963 qYi6 QN qYj1 964 qYi7 QN qYj6 965 qYit QN qYj3 966 967 dYi0lo DN dYj0lo 968 dYi0hi DN dYj0hi 969 dYi1lo DN dYj4lo 970 dYi1hi DN dYj4hi 971 dYi2lo DN dYj2lo 972 dYi2hi DN dYj2hi 973 dYi3lo DN dYj7lo 974 dYi3hi DN dYj7hi 975 dYi4lo DN dYj5lo 976 dYi4hi DN dYj5hi 977 dYi5lo DN dYjtlo 978 dYi5hi DN dYjthi 979 dYi6lo DN dYj1lo 980 dYi6hi DN dYj1hi 981 dYi7lo DN dYj6lo 982 dYi7hi DN dYj6hi 983 dYitlo DN dYj3lo 984 dYithi DN dYj3hi 985 986 qYh0 QN qYit 987 qYh1 QN qYi0 988 qYh2 QN qYi2 989 qYh3 QN qYi3 990 qYh4 QN qYi7 991 qYh5 QN qYi5 992 qYh6 QN qYi4 993 qYh7 QN qYi1 994 qYht QN qYi6 995 996 dYh0lo DN dYitlo 997 dYh0hi DN dYithi 998 dYh1lo DN dYi0lo 999 dYh1hi DN dYi0hi 1000 dYh2lo DN dYi2lo 1001 dYh2hi DN dYi2hi 1002 dYh3lo DN dYi3lo 1003 dYh3hi DN dYi3hi 1004 dYh4lo DN dYi7lo 1005 dYh4hi DN dYi7hi 1006 dYh5lo DN dYi5lo 1007 dYh5hi DN dYi5hi 1008 dYh6lo DN dYi4lo 1009 dYh6hi 
DN dYi4hi 1010 dYh7lo DN dYi1lo 1011 dYh7hi DN dYi1hi 1012 dYhtlo DN dYi6lo 1013 dYhthi DN dYi6hi 1014 1015 qYg0 QN qYh2 1016 qYg1 QN qYht 1017 qYg2 QN qYh1 1018 qYg3 QN qYh0 1019 qYg4 QN qYh4 1020 qYg5 QN qYh5 1021 qYg6 QN qYh6 1022 qYg7 QN qYh7 1023 qYgt QN qYh3 1024 1025 qYf0 QN qYg6 1026 qYf1 QN qYg5 1027 qYf2 QN qYg4 1028 qYf3 QN qYgt 1029 qYf4 QN qYg3 1030 qYf5 QN qYg2 1031 qYf6 QN qYg1 1032 qYf7 QN qYg0 1033 qYft QN qYg7 1034 1035 VRSHR qYj7, qYj7, #2 1036 VRSHR qYj6, qYj6, #1 1037 1038 VHADD qYi5, qYj1, qYj7 ;// i5 = (j1+j7)/2 1039 VSUB qYi6, qYj1, qYj7 ;// i6 = j1-j7 1040 VHADD qYi3, qYj2, qYj6 ;// i3 = (j2+j6)/2 1041 VSUB qYi2, qYj2, qYj6 ;// i2 = j2-j6 1042 VHADD qYi7, qYj5, qYj3 ;// i7 = (j5+j3)/2 1043 VSUB qYi4, qYj5, qYj3 ;// i4 = j5-j3 1044 1045 VQDMULH qYi2, qYi2, InvSqrt2 ;// i2/sqrt(2) 1046 ;// IStage 4,3 rows 0to1 x 1/2 1047 1048 MOV pTemp, #0x4 ;// ensure correct round 1049 VDUP qScale1, pTemp ;// of DC result 1050 VADD qYi0, qYi0, qScale1 1051 1052 VHADD qYh0, qYi0, qYi1 ;// (i0+i1)/2 1053 VHSUB qYh1, qYi0, qYi1 ;// (i0-i1)/2 1054 1055 VHADD qYh7, qYi5, qYi7 ;// (i5+i7)/4 1056 VSUB qYh5, qYi5, qYi7 ;// (i5-i7)/2 1057 VSUB qYh2, qYi2, qYi3 ;// h2, h3 1058 VQDMULH qYh5, qYh5, InvSqrt2 ;// h5/sqrt(2) 1059 1060 VMULL qXt0, dYi4lo, C ;// c*i4 1061 VMLAL qXt0, dYi6lo, S ;// c*i4+s*i6 1062 VMULL qXt1, dYi4hi, C 1063 VMLAL qXt1, dYi6hi, S 1064 VSHRN dYh4lo, qXt0, #16 ;// h4 1065 VSHRN dYh4hi, qXt1, #16 1066 1067 VMULL qXt0, dYi6lo, C ;// c*i6 1068 VMLSL qXt0, dYi4lo, S ;// -s*i4 + c*h6 1069 VMULL qXt1, dYi6hi, C 1070 VMLSL qXt1, dYi4hi, S 1071 VSHRN dYh6lo, qXt0, #16 ;// h6 1072 VSHRN dYh6hi, qXt1, #16 1073 1074 VSUB qYg6, qYh6, qYh7 1075 VSUB qYg5, qYh5, qYg6 1076 VSUB qYg4, qYh4, qYg5 1077 1078 ;// IStage 2 rows 0to3 x 1/2 1079 VHADD qYg1, qYh1, qYh2 ;// (h1+h2)/2 1080 VHSUB qYg2, qYh1, qYh2 ;// (h1-h2)/2 1081 VHADD qYg0, qYh0, qYh3 ;// (h0+h3)/2 1082 VHSUB qYg3, qYh0, qYh3 ;// (h0-h3)/2 1083 1084 1085 ;// IStage 1 all rows 1086 VHADD qYf3, qYg3, 
qYg4 1087 VHSUB qYf4, qYg3, qYg4 1088 VHADD qYf2, qYg2, qYg5 1089 VHSUB qYf5, qYg2, qYg5 1090 VHADD qYf1, qYg1, qYg6 1091 VHSUB qYf6, qYg1, qYg6 1092 VHADD qYf0, qYg0, qYg7 1093 VHSUB qYf7, qYg0, qYg7 1094 1095 YTR0 EQU Src0 1096 YTR1 EQU Src4 1097 YTR2 EQU Src1 1098 YTR3 EQU Src2 1099 YTR4 EQU Src7 1100 YTR5 EQU Src5 1101 YTR6 EQU Tmp 1102 YTR7 EQU Src6 1103 YTRt EQU Src3 1104 1105 qC0 QN YTR0.S32 ;// for YTRpose 1106 qC1 QN YTR1.S32 1107 qC2 QN YTR2.S32 1108 qC3 QN YTR3.S32 1109 qC4 QN YTR4.S32 1110 qC5 QN YTR5.S32 1111 qC6 QN YTR6.S32 1112 qC7 QN YTR7.S32 1113 1114 dD0 DN YTR0*2+1 ;// for using VSWP 1115 dD1 DN YTR1*2+1 1116 dD2 DN YTR2*2+1 1117 dD3 DN YTR3*2+1 1118 dD4 DN YTR4*2 1119 dD5 DN YTR5*2 1120 dD6 DN YTR6*2 1121 dD7 DN YTR7*2 1122 1123 VTRN qYf0, qYf1 1124 VTRN qYf2, qYf3 1125 VTRN qYf4, qYf5 1126 VTRN qYf6, qYf7 1127 VTRN qC0, qC2 1128 VTRN qC1, qC3 1129 VTRN qC4, qC6 1130 VTRN qC5, qC7 1131 VSWP dD0, dD4 1132 VSWP dD1, dD5 1133 VSWP dD2, dD6 1134 VSWP dD3, dD7 1135 1136 1137 dYf0U8 DN YTR0*2.U8 1138 dYf1U8 DN YTR1*2.U8 1139 dYf2U8 DN YTR2*2.U8 1140 dYf3U8 DN YTR3*2.U8 1141 dYf4U8 DN YTR4*2.U8 1142 dYf5U8 DN YTR5*2.U8 1143 dYf6U8 DN YTR6*2.U8 1144 dYf7U8 DN YTR7*2.U8 1145 1146 ;// 1147 ;// Do saturation if outsize is other than S16 1148 ;// 1149 1150 IF ("$outsize"="u8") 1151 ;// Output range [0-255] 1152 VQMOVN dYf0U8, qYf0 1153 VQMOVN dYf1U8, qYf1 1154 VQMOVN dYf2U8, qYf2 1155 VQMOVN dYf3U8, qYf3 1156 VQMOVN dYf4U8, qYf4 1157 VQMOVN dYf5U8, qYf5 1158 VQMOVN dYf6U8, qYf6 1159 VQMOVN dYf7U8, qYf7 1160 ENDIF 1161 1162 IF ("$outsize"="s9") 1163 ;// Output range [-256 to +255] 1164 VQSHL qYf0, qYf0, #16-9 1165 VQSHL qYf1, qYf1, #16-9 1166 VQSHL qYf2, qYf2, #16-9 1167 VQSHL qYf3, qYf3, #16-9 1168 VQSHL qYf4, qYf4, #16-9 1169 VQSHL qYf5, qYf5, #16-9 1170 VQSHL qYf6, qYf6, #16-9 1171 VQSHL qYf7, qYf7, #16-9 1172 1173 VSHR qYf0, qYf0, #16-9 1174 VSHR qYf1, qYf1, #16-9 1175 VSHR qYf2, qYf2, #16-9 1176 VSHR qYf3, qYf3, #16-9 1177 VSHR qYf4, qYf4, #16-9 1178 
VSHR qYf5, qYf5, #16-9 1179 VSHR qYf6, qYf6, #16-9 1180 VSHR qYf7, qYf7, #16-9 1181 ENDIF 1182 1183 ;// Store output depending on the Stride size 1184 IF "$stride"="s" 1185 VST1 qYf0, [pDest @64], Stride 1186 VST1 qYf1, [pDest @64], Stride 1187 VST1 qYf2, [pDest @64], Stride 1188 VST1 qYf3, [pDest @64], Stride 1189 VST1 qYf4, [pDest @64], Stride 1190 VST1 qYf5, [pDest @64], Stride 1191 VST1 qYf6, [pDest @64], Stride 1192 VST1 qYf7, [pDest @64] 1193 ELSE 1194 IF ("$outsize"="u8") 1195 VST1 dYf0U8, [pDest @64], #8 1196 VST1 dYf1U8, [pDest @64], #8 1197 VST1 dYf2U8, [pDest @64], #8 1198 VST1 dYf3U8, [pDest @64], #8 1199 VST1 dYf4U8, [pDest @64], #8 1200 VST1 dYf5U8, [pDest @64], #8 1201 VST1 dYf6U8, [pDest @64], #8 1202 VST1 dYf7U8, [pDest @64] 1203 ELSE 1204 ;// ("$outsize"="s9") or ("$outsize"="s16") 1205 VST1 qYf0, [pDest @64], #16 1206 VST1 qYf1, [pDest @64], #16 1207 VST1 qYf2, [pDest @64], #16 1208 VST1 qYf3, [pDest @64], #16 1209 VST1 qYf4, [pDest @64], #16 1210 VST1 qYf5, [pDest @64], #16 1211 VST1 qYf6, [pDest @64], #16 1212 VST1 qYf7, [pDest @64] 1213 ENDIF 1214 1215 ENDIF 1216 1217 1218 1219 ENDIF ;// CortexA8 1220 1221 1222 1223 MEND 1224 1225 ;// Scale TWO input rows with TWO rows of 16 bit scale values 1226 ;// 1227 ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale one row 1228 ;// input (Eight input values) with one row of scale values. Also 1229 ;// Loads next scale values from pScale, if $LastRow flag is not set. 
1230 ;// 1231 ;// Input Registers: 1232 ;// 1233 ;// $dAlo - Input D register with first four S16 values of row n 1234 ;// $dAhi - Input D register with next four S16 values of row n 1235 ;// $dBlo - Input D register with first four S16 values of row n+1 1236 ;// $dBhi - Input D register with next four S16 values of row n+1 1237 ;// pScale - Pointer to next row of scale values 1238 ;// qT0lo - Temporary scratch register 1239 ;// qT0hi - Temporary scratch register 1240 ;// qT1lo - Temporary scratch register 1241 ;// qT1hi - Temporary scratch register 1242 ;// dScale1lo - Scale value of row n 1243 ;// dScale1hi - Scale value of row n 1244 ;// dScale2lo - Scale value of row n+1 1245 ;// dScale2hi - Scale value of row n+1 1246 ;// 1247 ;// Input Flag 1248 ;// 1249 ;// $LastRow - Flag to indicate whether current row is last row 1250 ;// 1251 ;// Output Registers: 1252 ;// 1253 ;// $dAlo - Scaled output values (first four S16 of row n) 1254 ;// $dAhi - Scaled output values (next four S16 of row n) 1255 ;// $dBlo - Scaled output values (first four S16 of row n+1) 1256 ;// $dBhi - Scaled output values (next four S16 of row n+1) 1257 ;// qScale1 - Scale values for next row 1258 ;// qScale2 - Scale values for next row+1 1259 ;// pScale - Pointer to next row of scale values 1260 ;// 1261 MACRO 1262 M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow 1263 VMULL qT0lo, $dAlo, dScale1lo 1264 VMULL qT0hi, $dAhi, dScale1hi 1265 VMULL qT1lo, $dBlo, dScale2lo 1266 VMULL qT1hi, $dBhi, dScale2hi 1267 IF "$LastRow"="0" 1268 VLD1 qScale1, [pScale], #16 ;// Load scale for row n+1 1269 VLD1 qScale2, [pScale], #16 ;// Load scale for row n+2 1270 ENDIF 1271 VQRSHRN $dAlo, qT0lo, #12 1272 VQRSHRN $dAhi, qT0hi, #12 1273 VQRSHRN $dBlo, qT1lo, #12 1274 VQRSHRN $dBhi, qT1hi, #12 1275 MEND 1276 1277 ;// Scale 8x8 block input values with 16 bit scale values 1278 ;// 1279 ;// This macro is used to pre-scale block of 8x8 input. 1280 ;// This also do the Ist stage transformations of IDCT. 
;//
;// Input Registers:
;//
;// dXjnlo      - n th input D register with first four S16 values
;// dXjnhi      - n th input D register with next four S16 values
;// qXjn        - n th input Q register with eight S16 values
;// pScale      - Pointer to scale values
;//
;// Output Registers:
;//
;// qXin        - n th output Q register with eight S16 output values of 1st stage
;//
        MACRO
        M_IDCT_PRESCALE16
        ;// Prime the scale pipeline: load the scale values for the first
        ;// two rows here; M_IDCT_SCALE16 then reloads qScale1/qScale2 for
        ;// the following row pair while its multiplies are in flight.
        VLD1        qScale1, [pScale], #16      ;// Load pre-scale for row 0
        VLD1        qScale2, [pScale], #16      ;// Load pre-scale for row 1
        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0    ;// Pre scale row 0 & 1
        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0    ;// Pre scale row 2 & 3
        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0    ;// Pre scale row 4 & 5
        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1    ;// Pre scale row 6 & 7 (last pair: no scale reload)
        ;// IStage 4 butterflies (see algorithm description in file header).
        ;// Halving adds (VHADD) keep the sums within S16 range.
        VHADD       qXi5, qXj1, qXj7            ;// i5 = (j1+j7)/2
        VSUB        qXi6, qXj1, qXj7            ;// i6 = j1-j7
        LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
        VHADD       qXi3, qXj2, qXj6            ;// i3 = (j2+j6)/2
        VSUB        qXi2, qXj2, qXj6            ;// i2 = j2-j6
        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
        VHADD       qXi7, qXj5, qXj3            ;// i7 = (j5+j3)/2
        VSUB        qXi4, qXj5, qXj3            ;// i4 = j5-j3
        MEND


;// Scale 8x8 block input values with 32 bit scale values
;//
;// This macro is used to pre-scale a block of 8x8 input.
;// It also performs the first-stage (IStage 4) transformations of the IDCT.
;//
;// Input Registers:
;//
;// dXjnlo      - n th input D register with first four S16 values
;// dXjnhi      - n th input D register with next four S16 values
;// qXjn        - n th input Q register with eight S16 values
;// pScale      - Pointer to 32bit scale values in Q23 format
;//
;// Output Registers:
;//
;// dXinlo      - n th output D register with first four S16 output values of 1st stage
;// dXinhi      - n th output D register with next four S16 output values of 1st stage
;//
        MACRO
        M_IDCT_PRESCALE32

        ;// Register aliasing: eight rows of 32-bit scales and eight rows of
        ;// widened sources are time-multiplexed onto Q0-Q7.  Scale rows use
        ;// Q0-Q3; widened source rows use Q4-Q7.  A later row may reuse the
        ;// registers of an earlier row only after the earlier row's result
        ;// has been narrowed out with VMOVN.
qScale0lo       QN 0.S32
qScale0hi       QN 1.S32
qScale1lo       QN 2.S32
qScale1hi       QN 3.S32
qScale2lo       QN qScale1lo
qScale2hi       QN qScale1hi
qScale3lo       QN qScale1lo
qScale3hi       QN qScale1hi
qScale4lo       QN qScale1lo
qScale4hi       QN qScale1hi
qScale5lo       QN qScale0lo
qScale5hi       QN qScale0hi
qScale6lo       QN qScale0lo
qScale6hi       QN qScale0hi
qScale7lo       QN qScale0lo
qScale7hi       QN qScale0hi

qSrc0lo         QN 4.S32
qSrc0hi         QN 5.S32
qSrc1lo         QN 6.S32
qSrc1hi         QN 7.S32        ;// FIX: was "Src4.S32" (undefined name); Q7 completes the Q4-Q7 source set
qSrc2lo         QN qSrc0lo
qSrc2hi         QN qSrc0hi
qSrc3lo         QN qSrc0lo
qSrc3hi         QN qSrc0hi
qSrc4lo         QN qSrc0lo
qSrc4hi         QN qSrc0hi
qSrc5lo         QN qSrc1lo
qSrc5hi         QN qSrc1hi
qSrc6lo         QN qSrc1lo
qSrc6hi         QN qSrc1hi
qSrc7lo         QN qSrc0lo
qSrc7hi         QN qSrc0hi

qRes17lo        QN qScale0lo
qRes17hi        QN qScale0hi
qRes26lo        QN qScale0lo
qRes26hi        QN qScale0hi
qRes53lo        QN qScale0lo
qRes53hi        QN qScale0hi

        ;// Each scale row is 8 entries x 4 bytes = 32 bytes.  pScale walks
        ;// forward from row 0 while pTemp walks backward from row 7, so the
        ;// paired rows (1&7, 2&6, 3&5) are available together.
        ADD         pTemp, pScale, #4*8*7       ;// Address of pScale row 7

        ;// Row 0
        VLD1        {qScale0lo, qScale0hi}, [pScale]!
        ;// VSHLL promotes S16 -> S32 pre-shifted by 2^11; VQRDMULH then
        ;// performs a doubling, rounding, high-half multiply against the
        ;// Q23 scale value.
        VSHLL       qSrc0lo, dXj0lo, #(12-1)
        VSHLL       qSrc0hi, dXj0hi, #(12-1)
        VLD1        {qScale1lo, qScale1hi}, [pScale]!
        VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo
        VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
        VLD1        {qScale7lo, qScale7hi}, [pTemp]!
        VSHLL       qSrc1lo, dXj1lo, #(12-1)
        VSHLL       qSrc1hi, dXj1hi, #(12-1)
        VMOVN       dXi0lo, qSrc0lo             ;// Output i0 = scaled j0
        VMOVN       dXi0hi, qSrc0hi
        VSHLL       qSrc7lo, dXj7lo, #(12-1)
        VSHLL       qSrc7hi, dXj7hi, #(12-1)
        ;// Rewind pTemp two scale rows: 32 bytes to undo the post-increment
        ;// plus 32 bytes to reach the previous row.
        SUB         pTemp, pTemp, #((16*2)+(4*8*1))
        VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
        VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
        VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
        VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
        VLD1        {qScale2lo, qScale2hi}, [pScale]!

        ;// Rows 1 & 7: IStage 4 butterfly on the scaled values.
        VHADD       qRes17lo, qSrc1lo, qSrc7lo  ;// (j1+j7)/2
        VHADD       qRes17hi, qSrc1hi, qSrc7hi  ;// (j1+j7)/2
        VMOVN       dXi5lo, qRes17lo            ;// Output i5
        VMOVN       dXi5hi, qRes17hi
        VSUB        qRes17lo, qSrc1lo, qSrc7lo  ;// j1-j7
        VSUB        qRes17hi, qSrc1hi, qSrc7hi  ;// j1-j7
        VMOVN       dXi6lo, qRes17lo            ;// Output i6
        VMOVN       dXi6hi, qRes17hi
        VSHLL       qSrc2lo, dXj2lo, #(12-1)
        VSHLL       qSrc2hi, dXj2hi, #(12-1)
        VLD1        {qScale6lo, qScale6hi}, [pTemp]!
        VSHLL       qSrc6lo, dXj6lo, #(12-1)
        VSHLL       qSrc6hi, dXj6hi, #(12-1)
        SUB         pTemp, pTemp, #((16*2)+(4*8*1))
        VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
        VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
        VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
        VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
        VLD1        {qScale3lo, qScale3hi}, [pScale]!

        ;// Rows 2 & 6
        VHADD       qRes26lo, qSrc2lo, qSrc6lo  ;// (j2+j6)/2
        VHADD       qRes26hi, qSrc2hi, qSrc6hi  ;// (j2+j6)/2
        VMOVN       dXi3lo, qRes26lo            ;// Output i3
        VMOVN       dXi3hi, qRes26hi
        VSUB        qRes26lo, qSrc2lo, qSrc6lo  ;// j2-j6
        VSUB        qRes26hi, qSrc2hi, qSrc6hi  ;// j2-j6
        VMOVN       dXi2lo, qRes26lo            ;// Output i2
        VMOVN       dXi2hi, qRes26hi
        VSHLL       qSrc3lo, dXj3lo, #(12-1)
        VSHLL       qSrc3hi, dXj3hi, #(12-1)
        VLD1        {qScale5lo, qScale5hi}, [pTemp]!
        VSHLL       qSrc5lo, dXj5lo, #(12-1)
        VSHLL       qSrc5hi, dXj5hi, #(12-1)
        VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
        VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
        VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
        VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi

        ;// Rows 3 & 5
        VHADD       qRes53lo, qSrc5lo, qSrc3lo  ;// (j5+j3)/2
        VHADD       qRes53hi, qSrc5hi, qSrc3hi  ;// (j5+j3)/2
        ;// Step pSrc back 4 source rows (4 * 8 * S16 = 64 bytes) to re-read
        ;// input row 4, whose register pair has been recycled as scratch.
        ;// NOTE(review): assumes pSrc points just past source row 7 here --
        ;// confirm against the caller that loads the dXjn registers.
        SUB         pSrc, pSrc, #16*2*2
        VMOVN       dXi7lo, qRes53lo            ;// Output i7
        VMOVN       dXi7hi, qRes53hi
        VSUB        qRes53lo, qSrc5lo, qSrc3lo  ;// j5-j3
        VSUB        qRes53hi, qSrc5hi, qSrc3hi  ;// j5-j3
        VLD1        qXj4, [pSrc @64]
        VMOVN       dXi4lo, qRes53lo            ;// Output i4
        VMOVN       dXi4hi, qRes53hi
        VSHLL       qSrc4lo, dXj4lo, #(12-1)
        VSHLL       qSrc4hi, dXj4hi, #(12-1)
        VLD1        {qScale4lo, qScale4hi}, [pScale]
        LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
        VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
        VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
        ;// Row 4 (IStage 4: i1 = j4)
        VMOVN       dXi1lo, qSrc4lo             ;// Output i1
        VMOVN       dXi1hi, qSrc4hi

        MEND

        END