;//
;// Copyright (C) 2004 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// IDCT_s.s
;//
;// Inverse DCT module
;//
;//
;// ALGORITHM DESCRIPTION
;//
;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
;// column and then a 1D IDCT for each row.
;//
;// The 8-point 1D IDCT is defined by
;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
;//
;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
;//   c(u,x) = cos( (2x+1)*u*pi/16 )
;//
;// We compute the 8-point 1D IDCT using the reverse of
;// the Arai-Agui-Nakajima flow graph which we split into
;// 5 stages named in reverse order to identify with the
;// forward DCT. Direct inversion of the forward formulae
;// in file FDCT_s.s gives:
;//
;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
;//             [ A(0) = 2*sqrt(2)
;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
;//
;// IStage 4:   i0 = j0             i1 = j4
;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
;//
;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
;//             h2 = (i2*sqrt2)-i3  h3 = i3
;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
;//             [ The above two lines rotate by -(pi/8) ]
;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
;//
;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
;//             g7 = h7             g6 = h6 - h7
;//             g5 = h5 - g6        g4 = h4 - g5
;//
;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
;//
;// Note that most coefficients are halved 3 times during the
;// above calculation. We can rescale the algorithm dividing
;// the input by 8 to remove the halvings.
;//
;// IStage 5:   j(u) = T(u)*A(u)/8
;//
;// IStage 4:   i0 = j0             i1 = j4
;//             i3 = j2 + j6        i2 = j2 - j6
;//             i7 = j5 + j3        i4 = j5 - j3
;//             i5 = j1 + j7        i6 = j1 - j7
;//
;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
;//             h2 = (i2*sqrt2)-i3  h3 = i3
;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
;//
;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
;//             g1 = h1 + h2        g2 = h1 - h2
;//             g7 = h7             g6 = h6 - h7
;//             g5 = h5 - g6        g4 = h4 - g5
;//
;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
;//             f1 = g1 + g6        f6 = g1 - g6
;//             f2 = g2 + g5        f5 = g2 - g5
;//             f3 = g3 + g4        f4 = g3 - g4
;//
;// Note:
;// 1. The scaling by A(u)/8 can often be combined with inverse
;//    quantization. The column and row scalings can be combined.
;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
;//    to the above code but is otherwise identical.
;// 3. The rotation by -pi/8 can be performed using three multiplies
;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
;// 4. If |T(u)|<=1 then from the IDCT definition,
;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
;//            = (approx)2.64
;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
;//    The table below shows input patterns generating the maximum
;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
;//    InputPattern      Max |f(x)|
;//      PPPPPPPP        |f0| =  2.64
;//      PPPMMMMM        |f1| =  2.64
;//      PPMMMPPP        |f2| =  2.64
;//      PPMMPPMM        |f3| =  2.64
;//      PMMPPMMP        |f4| =  2.64
;//      PMMPMMPM        |f5| =  2.64
;//      PMPPMPMP        |f6| =  2.64
;//      PMPMPMPM        |f7| =  2.64
;//   Note that this input pattern is the transpose of the
;//   corresponding max input pattern for the FDCT.
;// Arguments

pSrc    RN 0    ;// source data buffer
Stride  RN 1    ;// destination stride in bytes
pDest   RN 2    ;// destination data buffer
pScale  RN 3    ;// pointer to scaling table


        ;// DCT Inverse Macro
        ;// The DCT code should be parametrized according
        ;// to the following inputs:
        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
        ;// $stride  = "s"   : the destination stride is read at run time from
        ;//                    register Stride
        ;//            other : assemble-time constant. The ARM1136JS path uses it as
        ;//                    the immediate row increment; the CortexA8 path ignores
        ;//                    its value and advances by the row size (8 bytes for
        ;//                    "u8", 16 bytes otherwise).
        ;//
        ;// Inputs:
        ;// pSrc   = r0 = Pointer to input data
        ;//               Range is -256 to +255 (9-bit)
        ;// Stride = r1 = Stride between output lines in bytes (only read when
        ;//               $stride is "s"; r1 is reused as a temporary afterwards)
        ;// pDest  = r2 = Pointer to output data
        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
        ;//
        ;// The ARM1136JS path accesses ppDest, pStride and pBlk through
        ;// M_STR/M_LDR/M_ADR; those symbols are not defined in this macro.
        ;// NOTE(review): presumably the caller allocates them on the stack
        ;// (pBlk must hold the 64*2 byte intermediate block) - confirm.
        ;// The CortexA8 path overwrites pSrc in M_IDCT_PRESCALE16.



        MACRO
        M_IDCT  $outsize, $inscale, $stride
        LCLA    SHIFT


        IF ARM1136JS

;// REGISTER ALLOCATION
;// This is hard since we have 8 values, 9 free registers and each
;// butterfly requires a temporary register. We also want to
;// maintain register order so we can use LDM/STM. The table below
;// summarises the register allocation that meets all these criteria.
;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
;//
;// r1  a01     g0  h0
;// r4  b01 f0  g1  h1  i0
;// r5  a23 f1  g2      i1
;// r6  b23 f2  g3  h2  i2
;// r7  a45 f3      h3  i3
;// r8  b45 f4  g4  h4  i4
;// r9  a67 f5  g5  h5  i5
;// r10 b67 f6  g6  h6  i6
;// r11     f7  g7  h7  i7
;//
ra01    RN 1
rb01    RN 4
ra23    RN 5
rb23    RN 6
ra45    RN 7
rb45    RN 8
ra67    RN 9
rb67    RN 10
rtmp    RN 11
csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
;// Transpose allocation
xft     RN ra01
xf0     RN rb01
xf1     RN ra23
xf2     RN rb23
xf3     RN ra45
xf4     RN rb45
xf5     RN ra67
xf6     RN rb67
xf7     RN rtmp
;// IStage 1 allocation
xg0     RN xft
xg1     RN xf0
xg2     RN xf1
xg3     RN xf2
xgt     RN xf3
xg4     RN xf4
xg5     RN xf5
xg6     RN xf6
xg7     RN xf7
;// IStage 2 allocation
xh0     RN xg0
xh1     RN xg1
xht     RN xg2
xh2     RN xg3
xh3     RN xgt
xh4     RN xg4
xh5     RN xg5
xh6     RN xg6
xh7     RN xg7
;// IStage 3,4 allocation
xit     RN xh0
xi0     RN xh1
xi1     RN xht
xi2     RN xh2
xi3     RN xh3
xi4     RN xh4
xi5     RN xh5
xi6     RN xh6
xi7     RN xh7

        ;// Save the final destination (and run-time stride); the column
        ;// pass writes into the intermediate block pBlk instead.
        M_STR   pDest,  ppDest
        IF "$stride"="s"
            M_STR   Stride, pStride
        ENDIF
        M_ADR   pDest,  pBlk
        LDR     csPiBy8, =0x30fc7642
        LDR     LoopRR2, =0x00005a82

v6_idct_col$_F
        ;// Load even values
        LDR     xi4, [pSrc], #4             ;// j0
        LDR     xi5, [pSrc, #4*16-4]        ;// j4
        LDR     xi6, [pSrc, #2*16-4]        ;// j2
        LDR     xi7, [pSrc, #6*16-4]        ;// j6

        ;// Scale Even Values
        IF "$inscale"="s16" ;// 16x16 mul
SHIFT       SETA    12
            LDR     xi0, [pScale], #4
            LDR     xi1, [pScale, #4*16-4]
            LDR     xi2, [pScale, #2*16-4]
            MOV     xit, #1<<(SHIFT-1)      ;// rounding constant
            SMLABB  xi3, xi0, xi4, xit
            SMLATT  xi4, xi0, xi4, xit
            SMLABB  xi0, xi1, xi5, xit
            SMLATT  xi5, xi1, xi5, xit
            MOV     xi3, xi3, ASR #SHIFT
            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
            LDR     xi3, [pScale, #6*16-4]
            SMLABB  xi1, xi2, xi6, xit
            SMLATT  xi6, xi2, xi6, xit
            MOV     xi0, xi0, ASR #SHIFT
            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
            SMLABB  xi2, xi3, xi7, xit
            SMLATT  xi7, xi3, xi7, xit
            MOV     xi1, xi1, ASR #SHIFT
            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
            MOV     xi2, xi2, ASR #SHIFT
            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
        ENDIF
        IF "$inscale"="s32" ;// 32x16 mul
SHIFT       SETA    (12+8-16)
            MOV     xit, #1<<(SHIFT-1)      ;// rounding constant
            LDR     xi0, [pScale], #8
            LDR     xi1, [pScale, #0*32+4-8]
            LDR     xi2, [pScale, #4*32-8]
            LDR     xi3, [pScale, #4*32+4-8]
            SMLAWB  xi0, xi0, xi4, xit
            SMLAWT  xi1, xi1, xi4, xit
            SMLAWB  xi2, xi2, xi5, xit
            SMLAWT  xi3, xi3, xi5, xit
            MOV     xi0, xi0, ASR #SHIFT
            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
            MOV     xi2, xi2, ASR #SHIFT
            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
            LDR     xi0, [pScale, #2*32-8]
            LDR     xi1, [pScale, #2*32+4-8]
            LDR     xi2, [pScale, #6*32-8]
            LDR     xi3, [pScale, #6*32+4-8]
            SMLAWB  xi0, xi0, xi6, xit
            SMLAWT  xi1, xi1, xi6, xit
            SMLAWB  xi2, xi2, xi7, xit
            SMLAWT  xi3, xi3, xi7, xit
            MOV     xi0, xi0, ASR #SHIFT
            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
            MOV     xi2, xi2, ASR #SHIFT
            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
        ENDIF

        ;// Load odd values
        LDR     xi0, [pSrc, #1*16-4]        ;// j1
        LDR     xi1, [pSrc, #7*16-4]        ;// j7
        LDR     xi2, [pSrc, #5*16-4]        ;// j5
        LDR     xi3, [pSrc, #3*16-4]        ;// j3

        IF  {TRUE}
            ;// shortcut if odd values 0
            TEQ     xi0, #0
            TEQEQ   xi1, #0
            TEQEQ   xi2, #0
            TEQEQ   xi3, #0
            BEQ     v6OddZero$_F
        ENDIF

        ;// Store scaled even values
        STMIA   pDest, {xi4, xi5, xi6, xi7}

        ;// Scale odd values
        IF "$inscale"="s16"
            ;// Perform AAN Scale
            LDR     xi4, [pScale, #1*16-4]
            LDR     xi5, [pScale, #7*16-4]
            LDR     xi6, [pScale, #5*16-4]
            SMLABB  xi7, xi0, xi4, xit
            SMLATT  xi0, xi0, xi4, xit
            SMLABB  xi4, xi1, xi5, xit
            SMLATT  xi1, xi1, xi5, xit
            MOV     xi7, xi7, ASR #SHIFT
            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
            LDR     xi7, [pScale, #3*16-4]
            SMLABB  xi5, xi2, xi6, xit
            SMLATT  xi2, xi2, xi6, xit
            MOV     xi4, xi4, ASR #SHIFT
            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
            SMLABB  xi6, xi3, xi7, xit
            SMLATT  xi3, xi3, xi7, xit
            MOV     xi5, xi5, ASR #SHIFT
            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
            MOV     xi6, xi6, ASR #SHIFT
            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
        ENDIF
        IF "$inscale"="s32" ;// 32x16 mul
            LDR     xi4, [pScale, #1*32-8]
            LDR     xi5, [pScale, #1*32+4-8]
            LDR     xi6, [pScale, #7*32-8]
            LDR     xi7, [pScale, #7*32+4-8]
            SMLAWB  xi4, xi4, xi0, xit
            SMLAWT  xi5, xi5, xi0, xit
            SMLAWB  xi6, xi6, xi1, xit
            SMLAWT  xi7, xi7, xi1, xit
            MOV     xi4, xi4, ASR #SHIFT
            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
            MOV     xi6, xi6, ASR #SHIFT
            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
            LDR     xi4, [pScale, #5*32-8]
            LDR     xi5, [pScale, #5*32+4-8]
            LDR     xi6, [pScale, #3*32-8]
            LDR     xi7, [pScale, #3*32+4-8]
            SMLAWB  xi4, xi4, xi2, xit
            SMLAWT  xi5, xi5, xi2, xit
            SMLAWB  xi6, xi6, xi3, xit
            SMLAWT  xi7, xi7, xi3, xit
            MOV     xi4, xi4, ASR #SHIFT
            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
            MOV     xi6, xi6, ASR #SHIFT
            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
        ENDIF

        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
        SSUB16  xi6, xi0, xi1           ;// j1-j7
        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
        SSUB16  xi4, xi2, xi3           ;// j5-j3

        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2

        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b

        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]

        SMULBB  xi1, xi3, LoopRR2
        SMULTB  xi3, xi3, LoopRR2

        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4

        ;// xi0,xi1,xi2,xi3 now free
        ;// IStage 4,3, rows 2to3 x1/2

        MOV     xi3, xi3, LSL #1
        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled

        ;// IStage 2, rows4to7
        SSUB16  xg6, xh6, xh7
        SSUB16  xg5, xh5, xg6
        SSUB16  xg4, xh4, xg5

        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2

        SMULBB  xi0, xi2, LoopRR2
        SMULTB  xi2, xi2, LoopRR2

        MOV     xi2, xi2, LSL #1
        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4

        ;// xi0, xi1 now free
        ;// IStage 4,3 rows 0to1 x 1/2
        LDRD    xi0, [pDest]            ;// j0, j4 scaled
        SSUB16  xh2, xh2, xi3
        ;// The loop counter lives in the top bits of LoopRR2: the carry
        ;// out of this add (tested by BCC below) ends the loop.
        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows

        SHADD16 xh0, xi0, xi1
        SHSUB16 xh1, xi0, xi1

        ;// IStage 2 rows 0to3 x 1/2
        SHSUB16 xg2, xh1, xh2
        SHADD16 xg1, xh1, xh2
        SHSUB16 xg3, xh0, xh3
        SHADD16 xg0, xh0, xh3

        ;// IStage 1 all rows
        SADD16  xf3, xg3, xg4
        SSUB16  xf4, xg3, xg4
        SADD16  xf2, xg2, xg5
        SSUB16  xf5, xg2, xg5
        SADD16  xf1, xg1, xg6
        SSUB16  xf6, xg1, xg6
        SADD16  xf0, xg0, xg7
        SSUB16  xf7, xg0, xg7

        ;// Transpose, store and loop
        PKHBT   ra01, xf0, xf1, LSL #16
        PKHTB   rb01, xf1, xf0, ASR #16

        PKHBT   ra23, xf2, xf3, LSL #16
        PKHTB   rb23, xf3, xf2, ASR #16

        PKHBT   ra45, xf4, xf5, LSL #16
        PKHTB   rb45, xf5, xf4, ASR #16

        PKHBT   ra67, xf6, xf7, LSL #16
        STMIA   pDest!, {ra01, ra23, ra45, ra67}
        PKHTB   rb67, xf7, xf6, ASR #16
        STMIA   pDest!, {rb01, rb23, rb45, rb67}
        BCC     v6_idct_col$_F

        ;// Column pass done: the row pass reads back the intermediate
        ;// block and writes to the saved destination. With a run-time
        ;// stride, pScale is reused to hold the stride.
        SUB     pSrc, pDest, #(64*2)
        M_LDR   pDest, ppDest
        IF "$stride"="s"
            M_LDR   pScale, pStride
        ENDIF
        B       v6_idct_row$_F

v6OddZero$_F
        ;// Column shortcut taken when j1, j3, j5 and j7 are all zero
        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2

        SMULBB  xi0, xi2, LoopRR2
        SMULTB  xi2, xi2, LoopRR2

        MOV     xi2, xi2, LSL #1
        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
        SSUB16  xh2, xh2, xi3

        ;// xi0, xi1 now free
        ;// IStage 4,3 rows 0to1 x 1/2

        SHADD16 xh0, xi4, xi5
        SHSUB16 xh1, xi4, xi5

        ;// IStage 2 rows 0to3 x 1/2
        SHSUB16 xg2, xh1, xh2
        SHADD16 xg1, xh1, xh2
        SHSUB16 xg3, xh0, xh3
        SHADD16 xg0, xh0, xh3

        ;// IStage 1 all rows
        ;// (the order of these moves matters: the f and g aliases overlap)
        MOV     xf3, xg3
        MOV     xf4, xg3
        MOV     xf2, xg2
        MOV     xf5, xg2
        MOV     xf1, xg1
        MOV     xf6, xg1
        MOV     xf0, xg0
        MOV     xf7, xg0

        ;// Transpose
        PKHBT   ra01, xf0, xf1, LSL #16
        PKHTB   rb01, xf1, xf0, ASR #16

        PKHBT   ra23, xf2, xf3, LSL #16
        PKHTB   rb23, xf3, xf2, ASR #16

        PKHBT   ra45, xf4, xf5, LSL #16
        PKHTB   rb45, xf5, xf4, ASR #16

        PKHBT   ra67, xf6, xf7, LSL #16
        PKHTB   rb67, xf7, xf6, ASR #16

        STMIA   pDest!, {ra01, ra23, ra45, ra67}
        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
        STMIA   pDest!, {rb01, rb23, rb45, rb67}

        BCC     v6_idct_col$_F
        SUB     pSrc, pDest, #(64*2)
        M_LDR   pDest, ppDest
        IF "$stride"="s"
            M_LDR   pScale, pStride
        ENDIF


v6_idct_row$_F
        ;// IStage 4,3, rows4to7 x1/4
        LDR     xit, =0x00010001        ;// rounding constant
        LDR     xi0, [pSrc, #1*16]      ;// j1
        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
        LDR     xi2, [pSrc, #5*16]      ;// j5
        LDR     xi3, [pSrc, #3*16]      ;// j3

        SHADD16 xi1, xi1, xit           ;// 2*j7
        SHADD16 xi1, xi1, xit           ;// j7

        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
        SSUB16  xi6, xi0, xi1           ;// j1-j7
        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
        SSUB16  xi4, xi2, xi3           ;// j5-j3

        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2

        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b

        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]

        SMULBB  xi1, xi3, LoopRR2
        SMULTB  xi3, xi3, LoopRR2

        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4

        MOV     xi3, xi3, LSL #1
        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4

        ;// xi0,xi1,xi2,xi3 now free
        ;// IStage 4,3, rows 2to3 x1/2

        LDR     xi0, [pSrc, #2*16]      ;// j2
        LDR     xi1, [pSrc, #6*16]      ;// 2*j6

        ;// IStage 2, rows4to7
        SSUB16  xg6, xh6, xh7
        SSUB16  xg5, xh5, xg6
        SSUB16  xg4, xh4, xg5

        SHADD16 xi1, xi1, xit           ;// j6
        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2

        SMULBB  xi0, xi2, LoopRR2
        SMULTB  xi2, xi2, LoopRR2

        MOV     xi2, xi2, LSL #1

        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4

        ;// xi0, xi1 now free
        ;// IStage 4,3 rows 0to1 x 1/2
        LDR     xi1, [pSrc, #4*16]      ;// j4
        LDR     xi0, [pSrc], #4         ;// j0

        SSUB16  xh2, xh2, xi3
        ;// Carry from this add is consumed by the BCC at the end of the
        ;// row loop; nothing in between updates the carry flag.
        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows

        ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
        SHADD16 xh0, xi0, xi1           ;// of DC result
        SHSUB16 xh1, xi0, xi1

        ;// IStage 2 rows 0to3 x 1/2
        SHSUB16 xg2, xh1, xh2
        SHADD16 xg1, xh1, xh2
        SHSUB16 xg3, xh0, xh3
        SHADD16 xg0, xh0, xh3

        ;// IStage 1 all rows
        SHADD16 xf3, xg3, xg4
        SHSUB16 xf4, xg3, xg4
        SHADD16 xf2, xg2, xg5
        SHSUB16 xf5, xg2, xg5
        SHADD16 xf1, xg1, xg6
        SHSUB16 xf6, xg1, xg6
        SHADD16 xf0, xg0, xg7
        SHSUB16 xf7, xg0, xg7

        ;// Saturate
        IF ("$outsize"="u8")
            USAT16  xf0, #8, xf0
            USAT16  xf1, #8, xf1
            USAT16  xf2, #8, xf2
            USAT16  xf3, #8, xf3
            USAT16  xf4, #8, xf4
            USAT16  xf5, #8, xf5
            USAT16  xf6, #8, xf6
            USAT16  xf7, #8, xf7
        ENDIF
        IF ("$outsize"="s9")
            SSAT16  xf0, #9, xf0
            SSAT16  xf1, #9, xf1
            SSAT16  xf2, #9, xf2
            SSAT16  xf3, #9, xf3
            SSAT16  xf4, #9, xf4
            SSAT16  xf5, #9, xf5
            SSAT16  xf6, #9, xf6
            SSAT16  xf7, #9, xf7
        ENDIF

        ;// Transpose to Row, Pack and store
        IF ("$outsize"="u8")
            ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
            ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
            ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
            ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
            PKHBT   ra01, xf0, xf2, LSL #16
            PKHTB   rb01, xf2, xf0, ASR #16
            PKHBT   ra23, xf4, xf6, LSL #16
            PKHTB   rb23, xf6, xf4, ASR #16
            STMIA   pDest, {ra01, ra23}             ;// 8 bytes per u8 row
            IF "$stride"="s"
                ADD     pDest, pDest, pScale        ;// pScale holds the stride here
                STMIA   pDest, {rb01, rb23}
                ADD     pDest, pDest, pScale
            ELSE
                ADD     pDest, pDest, #($stride)
                STMIA   pDest, {rb01, rb23}
                ADD     pDest, pDest, #($stride)
            ENDIF
        ENDIF
        IF ("$outsize"="s9"):LOR:("$outsize"="s16")
            PKHBT   ra01, xf0, xf1, LSL #16
            PKHTB   rb01, xf1, xf0, ASR #16

            PKHBT   ra23, xf2, xf3, LSL #16
            PKHTB   rb23, xf3, xf2, ASR #16

            PKHBT   ra45, xf4, xf5, LSL #16
            PKHTB   rb45, xf5, xf4, ASR #16

            PKHBT   ra67, xf6, xf7, LSL #16
            PKHTB   rb67, xf7, xf6, ASR #16

            STMIA   pDest, {ra01, ra23, ra45, ra67} ;// 16 bytes per 16-bit row
            IF "$stride"="s"
                ADD     pDest, pDest, pScale        ;// pScale holds the stride here
                STMIA   pDest, {rb01, rb23, rb45, rb67}
                ADD     pDest, pDest, pScale
            ELSE
                ADD     pDest, pDest, #($stride)
                STMIA   pDest, {rb01, rb23, rb45, rb67}
                ADD     pDest, pDest, #($stride)
            ENDIF
        ENDIF

        BCC     v6_idct_row$_F
        ENDIF ;// ARM1136JS


        IF CortexA8

;// NEON register numbers (Q register indices) used for the eight rows
Src0            EQU  7
Src1            EQU  8
Src2            EQU  9
Src3            EQU  10
Src4            EQU  11
Src5            EQU  12
Src6            EQU  13
Src7            EQU  14
Tmp             EQU  15

qXj0            QN Src0.S16
qXj1            QN Src1.S16
qXj2            QN Src2.S16
qXj3            QN Src3.S16
qXj4            QN Src4.S16
qXj5            QN Src5.S16
qXj6            QN Src6.S16
qXj7            QN Src7.S16
qXjt            QN Tmp.S16

dXj0lo          DN (Src0*2).S16
dXj0hi          DN (Src0*2+1).S16
dXj1lo          DN (Src1*2).S16
dXj1hi          DN (Src1*2+1).S16
dXj2lo          DN (Src2*2).S16
dXj2hi          DN (Src2*2+1).S16
dXj3lo          DN (Src3*2).S16
dXj3hi          DN (Src3*2+1).S16
dXj4lo          DN (Src4*2).S16
dXj4hi          DN (Src4*2+1).S16
dXj5lo          DN (Src5*2).S16
dXj5hi          DN (Src5*2+1).S16
dXj6lo          DN (Src6*2).S16
dXj6hi          DN (Src6*2+1).S16
dXj7lo          DN (Src7*2).S16
dXj7hi          DN (Src7*2+1).S16
dXjtlo          DN (Tmp*2).S16
dXjthi          DN (Tmp*2+1).S16

qXi0            QN qXj0
qXi1            QN qXj4
qXi2            QN qXj2
qXi3            QN qXj7
qXi4            QN qXj5
qXi5            QN qXjt
qXi6            QN qXj1
qXi7            QN qXj6
qXit            QN qXj3

dXi0lo          DN dXj0lo
dXi0hi          DN dXj0hi
dXi1lo          DN dXj4lo
dXi1hi          DN dXj4hi
dXi2lo          DN dXj2lo
dXi2hi          DN dXj2hi
dXi3lo          DN dXj7lo
dXi3hi          DN dXj7hi
dXi4lo          DN dXj5lo
dXi4hi          DN dXj5hi
dXi5lo          DN dXjtlo
dXi5hi          DN dXjthi
dXi6lo          DN dXj1lo
dXi6hi          DN dXj1hi
dXi7lo          DN dXj6lo
dXi7hi          DN dXj6hi
dXitlo          DN dXj3lo
dXithi          DN dXj3hi

qXh0            QN qXit
qXh1            QN qXi0
qXh2            QN qXi2
qXh3            QN qXi3
qXh4            QN qXi7
qXh5            QN qXi5
qXh6            QN qXi4
qXh7            QN qXi1
qXht            QN qXi6

dXh0lo          DN dXitlo
dXh0hi          DN dXithi
dXh1lo          DN dXi0lo
dXh1hi          DN dXi0hi
dXh2lo          DN dXi2lo
dXh2hi          DN dXi2hi
dXh3lo          DN dXi3lo
dXh3hi          DN dXi3hi
dXh4lo          DN dXi7lo
dXh4hi          DN dXi7hi
dXh5lo          DN dXi5lo
dXh5hi          DN dXi5hi
dXh6lo          DN dXi4lo
dXh6hi          DN dXi4hi
dXh7lo          DN dXi1lo
dXh7hi          DN dXi1hi
dXhtlo          DN dXi6lo
dXhthi          DN dXi6hi

qXg0            QN qXh2
qXg1            QN qXht
qXg2            QN qXh1
qXg3            QN qXh0
qXg4            QN qXh4
qXg5            QN qXh5
qXg6            QN qXh6
qXg7            QN qXh7
qXgt            QN qXh3

qXf0            QN qXg6
qXf1            QN qXg5
qXf2            QN qXg4
qXf3            QN qXgt
qXf4            QN qXg3
qXf5            QN qXg2
qXf6            QN qXg1
qXf7            QN qXg0
qXft            QN qXg7


qXt0            QN 1.S32
qXt1            QN 2.S32
qT0lo           QN 1.S32
qT0hi           QN 2.S32
qT1lo           QN 3.S32
qT1hi           QN 4.S32
qScalelo        QN 5.S32        ;// used to read post scale values
qScalehi        QN 6.S32
qTemp0          QN 5.S32
qTemp1          QN 6.S32


Scale1          EQU 6
Scale2          EQU 15
qScale1         QN Scale1.S16
qScale2         QN Scale2.S16
dScale1lo       DN (Scale1*2).S16
dScale1hi       DN (Scale1*2+1).S16
dScale2lo       DN (Scale2*2).S16
dScale2hi       DN (Scale2*2+1).S16

dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
C               DN dCoefs[2]    ;// Cos(PI/8) in Q15

pTemp           RN 12


        IMPORT  armCOMM_IDCTCoef

        ;// Load the whole 8x8 block of 16-bit coefficients
        VLD1        {qXj0,qXj1}, [pSrc @64]!
        VLD1        {qXj2,qXj3}, [pSrc @64]!
        VLD1        {qXj4,qXj5}, [pSrc @64]!
        VLD1        {qXj6,qXj7}, [pSrc @64]!

        ;// Load PreScale and multiply with Src
        ;// IStage 4

        IF "$inscale"="s16"                         ;// 16X16 Mul
            M_IDCT_PRESCALE16
        ENDIF

        IF "$inscale"="s32"                         ;// 32X32 Mul
            M_IDCT_PRESCALE32
        ENDIF

        ;// IStage 3
        VQRDMULH    qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
        VQRDMULH    qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
        VSUB        qXh2, qXi2, qXi3                ;// h2, h3

        VMULL       qXt0, dXi4lo, C                 ;// c*i4
        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
        VMULL       qXt1, dXi4hi, C
        VMLAL       qXt1, dXi6hi, S
        VSHRN       dXh4lo, qXt0, #16               ;// h4
        VSHRN       dXh4hi, qXt1, #16

        VMULL       qXt0, dXi6lo, C                 ;// c*i6
        VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*i6
        VMULL       qXt1, dXi6hi, C
        VMLSL       qXt1, dXi4hi, S
        VSHRN       dXh6lo, qXt0, #16               ;// h6
        VSHRN       dXh6hi, qXt1, #16

        ;// IStage 2
        VSUB        qXg6, qXh6, qXh7
        VSUB        qXg5, qXh5, qXg6
        VSUB        qXg4, qXh4, qXg5
        VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
        VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
        VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
        VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2

        ;// IStage 1 all rows
        VADD        qXf3, qXg3, qXg4
        VSUB        qXf4, qXg3, qXg4
        VADD        qXf2, qXg2, qXg5
        VSUB        qXf5, qXg2, qXg5
        VADD        qXf1, qXg1, qXg6
        VSUB        qXf6, qXg1, qXg6
        VADD        qXf0, qXg0, qXg7
        VSUB        qXf7, qXg0, qXg7

        ;// Transpose (8x8 of 16-bit values held in registers): 16-bit VTRN,
        ;// then 32-bit VTRN, then swap of D-register halves
XTR0            EQU Src5
XTR1            EQU Tmp
XTR2            EQU Src6
XTR3            EQU Src7
XTR4            EQU Src3
XTR5            EQU Src0
XTR6            EQU Src1
XTR7            EQU Src2
XTRt            EQU Src4

qA0             QN  XTR0.S32  ;// for XTRpose
qA1             QN  XTR1.S32
qA2             QN  XTR2.S32
qA3             QN  XTR3.S32
qA4             QN  XTR4.S32
qA5             QN  XTR5.S32
qA6             QN  XTR6.S32
qA7             QN  XTR7.S32

dB0             DN  XTR0*2+1      ;// for using VSWP
dB1             DN  XTR1*2+1
dB2             DN  XTR2*2+1
dB3             DN  XTR3*2+1
dB4             DN  XTR4*2
dB5             DN  XTR5*2
dB6             DN  XTR6*2
dB7             DN  XTR7*2


        VTRN        qXf0, qXf1
        VTRN        qXf2, qXf3
        VTRN        qXf4, qXf5
        VTRN        qXf6, qXf7
        VTRN        qA0, qA2
        VTRN        qA1, qA3
        VTRN        qA4, qA6
        VTRN        qA5, qA7
        VSWP        dB0, dB4
        VSWP        dB1, dB5
        VSWP        dB2, dB6
        VSWP        dB3, dB7


qYj0            QN qXf0
qYj1            QN qXf1
qYj2            QN qXf2
qYj3            QN qXf3
qYj4            QN qXf4
qYj5            QN qXf5
qYj6            QN qXf6
qYj7            QN qXf7
qYjt            QN qXft

dYj0lo          DN (XTR0*2).S16
dYj0hi          DN (XTR0*2+1).S16
dYj1lo          DN (XTR1*2).S16
dYj1hi          DN (XTR1*2+1).S16
dYj2lo          DN (XTR2*2).S16
dYj2hi          DN (XTR2*2+1).S16
dYj3lo          DN (XTR3*2).S16
dYj3hi          DN (XTR3*2+1).S16
dYj4lo          DN (XTR4*2).S16
dYj4hi          DN (XTR4*2+1).S16
dYj5lo          DN (XTR5*2).S16
dYj5hi          DN (XTR5*2+1).S16
dYj6lo          DN (XTR6*2).S16
dYj6hi          DN (XTR6*2+1).S16
dYj7lo          DN (XTR7*2).S16
dYj7hi          DN (XTR7*2+1).S16
dYjtlo          DN (XTRt*2).S16
dYjthi          DN (XTRt*2+1).S16

qYi0            QN qYj0
qYi1            QN qYj4
qYi2            QN qYj2
qYi3            QN qYj7
qYi4            QN qYj5
qYi5            QN qYjt
qYi6            QN qYj1
qYi7            QN qYj6
qYit            QN qYj3

dYi0lo          DN dYj0lo
dYi0hi          DN dYj0hi
dYi1lo          DN dYj4lo
dYi1hi          DN dYj4hi
dYi2lo          DN dYj2lo
dYi2hi          DN dYj2hi
dYi3lo          DN dYj7lo
dYi3hi          DN dYj7hi
dYi4lo          DN dYj5lo
dYi4hi          DN dYj5hi
dYi5lo          DN dYjtlo
dYi5hi          DN dYjthi
dYi6lo          DN dYj1lo
dYi6hi          DN dYj1hi
dYi7lo          DN dYj6lo
dYi7hi          DN dYj6hi
dYitlo          DN dYj3lo
dYithi          DN dYj3hi

qYh0            QN qYit
qYh1            QN qYi0
qYh2            QN qYi2
qYh3            QN qYi3
qYh4            QN qYi7
qYh5            QN qYi5
qYh6            QN qYi4
qYh7            QN qYi1
qYht            QN qYi6

dYh0lo          DN dYitlo
dYh0hi          DN dYithi
dYh1lo          DN dYi0lo
dYh1hi          DN dYi0hi
dYh2lo          DN dYi2lo
dYh2hi          DN dYi2hi
dYh3lo          DN dYi3lo
dYh3hi          DN dYi3hi
dYh4lo          DN dYi7lo
dYh4hi          DN dYi7hi
dYh5lo          DN dYi5lo
dYh5hi          DN dYi5hi
dYh6lo          DN dYi4lo
dYh6hi          DN dYi4hi
dYh7lo          DN dYi1lo
dYh7hi          DN dYi1hi
dYhtlo          DN dYi6lo
dYhthi          DN dYi6hi

qYg0            QN qYh2
qYg1            QN qYht
qYg2            QN qYh1
qYg3            QN qYh0
qYg4            QN qYh4
qYg5            QN qYh5
qYg6            QN qYh6
qYg7            QN qYh7
qYgt            QN qYh3

qYf0            QN qYg6
qYf1            QN qYg5
qYf2            QN qYg4
qYf3            QN qYgt
qYf4            QN qYg3
qYf5            QN qYg2
qYf6            QN qYg1
qYf7            QN qYg0
qYft            QN qYg7

        ;// Row pass. j7 and j6 are rounded down by 2 and 1 bits first
        ;// (the ARM1136JS row pass annotates the same inputs as 4*j7, 2*j6).
        VRSHR       qYj7, qYj7, #2
        VRSHR       qYj6, qYj6, #1

        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3

        VQRDMULH    qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
        ;// IStage 4,3 rows 0to1 x 1/2

        MOV         pTemp, #0x4             ;// ensure correct round
        VDUP        qScale1, pTemp          ;// of DC result
        VADD        qYi0, qYi0, qScale1

        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2

        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
        VSUB        qYh2, qYi2, qYi3        ;// h2, h3
        VQRDMULH    qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)

        VMULL       qXt0, dYi4lo, C         ;// c*i4
        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
        VMULL       qXt1, dYi4hi, C
        VMLAL       qXt1, dYi6hi, S
        VSHRN       dYh4lo, qXt0, #16       ;// h4
        VSHRN       dYh4hi, qXt1, #16

        VMULL       qXt0, dYi6lo, C         ;// c*i6
        VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*i6
        VMULL       qXt1, dYi6hi, C
        VMLSL       qXt1, dYi4hi, S
        VSHRN       dYh6lo, qXt0, #16       ;// h6
        VSHRN       dYh6hi, qXt1, #16

        VSUB        qYg6, qYh6, qYh7
        VSUB        qYg5, qYh5, qYg6
        VSUB        qYg4, qYh4, qYg5

        ;// IStage 2 rows 0to3 x 1/2
        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2


        ;// IStage 1 all rows
        VHADD       qYf3, qYg3, qYg4
        VHSUB       qYf4, qYg3, qYg4
        VHADD       qYf2, qYg2, qYg5
        VHSUB       qYf5, qYg2, qYg5
        VHADD       qYf1, qYg1, qYg6
        VHSUB       qYf6, qYg1, qYg6
        VHADD       qYf0, qYg0, qYg7
        VHSUB       qYf7, qYg0, qYg7

        ;// YTRn is the Q register number that holds qYfn
YTR0            EQU Src0
YTR1            EQU Src4
YTR2            EQU Src1
YTR3            EQU Src2
YTR4            EQU Src7
YTR5            EQU Src5
YTR6            EQU Tmp
YTR7            EQU Src6
YTRt            EQU Src3

qC0             QN  YTR0.S32                ;// for YTRpose
qC1             QN  YTR1.S32
qC2             QN  YTR2.S32
qC3             QN  YTR3.S32
qC4             QN  YTR4.S32
qC5             QN  YTR5.S32
qC6             QN  YTR6.S32
qC7             QN  YTR7.S32

dD0             DN  YTR0*2+1                ;// for using VSWP
dD1             DN  YTR1*2+1
dD2             DN  YTR2*2+1
dD3             DN  YTR3*2+1
dD4             DN  YTR4*2
dD5             DN  YTR5*2
dD6             DN  YTR6*2
dD7             DN  YTR7*2

        VTRN        qYf0, qYf1
        VTRN        qYf2, qYf3
        VTRN        qYf4, qYf5
        VTRN        qYf6, qYf7
        VTRN        qC0, qC2
        VTRN        qC1, qC3
        VTRN        qC4, qC6
        VTRN        qC5, qC7
        VSWP        dD0, dD4
        VSWP        dD1, dD5
        VSWP        dD2, dD6
        VSWP        dD3, dD7


;// dYfnU8 is the low D register of qYfn; it receives the narrowed u8 row
dYf0U8          DN YTR0*2.U8
dYf1U8          DN YTR1*2.U8
dYf2U8          DN YTR2*2.U8
dYf3U8          DN YTR3*2.U8
dYf4U8          DN YTR4*2.U8
dYf5U8          DN YTR5*2.U8
dYf6U8          DN YTR6*2.U8
dYf7U8          DN YTR7*2.U8

        ;//
        ;// Do saturation if outsize is other than S16
        ;//

        IF ("$outsize"="u8")
            ;// Output range [0-255]
            ;// NOTE(review): the source rows are S16 typed while the
            ;// destination is U8 typed; confirm the assembler emits the
            ;// signed-to-unsigned saturating narrow needed for [0-255].
            VQMOVN      dYf0U8, qYf0
            VQMOVN      dYf1U8, qYf1
            VQMOVN      dYf2U8, qYf2
            VQMOVN      dYf3U8, qYf3
            VQMOVN      dYf4U8, qYf4
            VQMOVN      dYf5U8, qYf5
            VQMOVN      dYf6U8, qYf6
            VQMOVN      dYf7U8, qYf7
        ENDIF

        IF ("$outsize"="s9")
            ;// Output range [-256 to +255]
            ;// Saturating shift left then arithmetic shift right by the
            ;// same amount clamps each value to 9 bits.
            VQSHL       qYf0, qYf0, #16-9
            VQSHL       qYf1, qYf1, #16-9
            VQSHL       qYf2, qYf2, #16-9
            VQSHL       qYf3, qYf3, #16-9
            VQSHL       qYf4, qYf4, #16-9
            VQSHL       qYf5, qYf5, #16-9
            VQSHL       qYf6, qYf6, #16-9
            VQSHL       qYf7, qYf7, #16-9

            VSHR        qYf0, qYf0, #16-9
            VSHR        qYf1, qYf1, #16-9
            VSHR        qYf2, qYf2, #16-9
            VSHR        qYf3, qYf3, #16-9
            VSHR        qYf4, qYf4, #16-9
            VSHR        qYf5, qYf5, #16-9
            VSHR        qYf6, qYf6, #16-9
            VSHR        qYf7, qYf7, #16-9
        ENDIF

        ;// Store output depending on the Stride size
        IF "$stride"="s"
            IF ("$outsize"="u8")
                ;// A u8 row is 8 bytes: store only the narrowed D register.
                ;// Storing the full Q register would write 8 stale bytes
                ;// past the end of every row.
                VST1        dYf0U8, [pDest @64], Stride
                VST1        dYf1U8, [pDest @64], Stride
                VST1        dYf2U8, [pDest @64], Stride
                VST1        dYf3U8, [pDest @64], Stride
                VST1        dYf4U8, [pDest @64], Stride
                VST1        dYf5U8, [pDest @64], Stride
                VST1        dYf6U8, [pDest @64], Stride
                VST1        dYf7U8, [pDest @64]
            ELSE
                ;// ("$outsize"="s9") or ("$outsize"="s16")
                VST1        qYf0, [pDest @64], Stride
                VST1        qYf1, [pDest @64], Stride
                VST1        qYf2, [pDest @64], Stride
                VST1        qYf3, [pDest @64], Stride
                VST1        qYf4, [pDest @64], Stride
                VST1        qYf5, [pDest @64], Stride
                VST1        qYf6, [pDest @64], Stride
                VST1        qYf7, [pDest @64]
            ENDIF
        ELSE
            IF ("$outsize"="u8")
                VST1        dYf0U8, [pDest @64], #8
                VST1        dYf1U8, [pDest @64], #8
                VST1        dYf2U8, [pDest @64], #8
                VST1        dYf3U8, [pDest @64], #8
                VST1        dYf4U8, [pDest @64], #8
                VST1        dYf5U8, [pDest @64], #8
                VST1        dYf6U8, [pDest @64], #8
                VST1        dYf7U8, [pDest @64]
            ELSE
                ;// ("$outsize"="s9") or ("$outsize"="s16")
                VST1        qYf0, [pDest @64], #16
                VST1        qYf1, [pDest @64], #16
                VST1        qYf2, [pDest @64], #16
                VST1        qYf3, [pDest @64], #16
                VST1        qYf4, [pDest @64], #16
                VST1        qYf5, [pDest @64], #16
                VST1        qYf6, [pDest @64], #16
                VST1        qYf7, [pDest @64]
            ENDIF

        ENDIF



        ENDIF ;// CortexA8



        MEND

        ;// Scale TWO input rows with TWO rows of 16 bit scale values
        ;//
        ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
        ;// input (eight input values each) with two rows of scale values. Also
        ;// loads the next scale values from pScale, if $LastRow flag is not set.
1224 ;// 1225 ;// Input Registers: 1226 ;// 1227 ;// $dAlo - Input D register with first four S16 values of row n 1228 ;// $dAhi - Input D register with next four S16 values of row n 1229 ;// $dBlo - Input D register with first four S16 values of row n+1 1230 ;// $dBhi - Input D register with next four S16 values of row n+1 1231 ;// pScale - Pointer to next row of scale values 1232 ;// qT0lo - Temporary scratch register 1233 ;// qT0hi - Temporary scratch register 1234 ;// qT1lo - Temporary scratch register 1235 ;// qT1hi - Temporary scratch register 1236 ;// dScale1lo - Scale value of row n 1237 ;// dScale1hi - Scale value of row n 1238 ;// dScale2lo - Scale value of row n+1 1239 ;// dScale2hi - Scale value of row n+1 1240 ;// 1241 ;// Input Flag 1242 ;// 1243 ;// $LastRow - Flag to indicate whether current row is last row 1244 ;// 1245 ;// Output Registers: 1246 ;// 1247 ;// $dAlo - Scaled output values (first four S16 of row n) 1248 ;// $dAhi - Scaled output values (next four S16 of row n) 1249 ;// $dBlo - Scaled output values (first four S16 of row n+1) 1250 ;// $dBhi - Scaled output values (next four S16 of row n+1) 1251 ;// qScale1 - Scale values for next row 1252 ;// qScale2 - Scale values for next row+1 1253 ;// pScale - Pointer to next row of scale values 1254 ;// 1255 MACRO 1256 M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow 1257 VMULL qT0lo, $dAlo, dScale1lo 1258 VMULL qT0hi, $dAhi, dScale1hi 1259 VMULL qT1lo, $dBlo, dScale2lo 1260 VMULL qT1hi, $dBhi, dScale2hi 1261 IF "$LastRow"="0" 1262 VLD1 qScale1, [pScale], #16 ;// Load scale for row n+1 1263 VLD1 qScale2, [pScale], #16 ;// Load scale for row n+2 1264 ENDIF 1265 VQRSHRN $dAlo, qT0lo, #12 1266 VQRSHRN $dAhi, qT0hi, #12 1267 VQRSHRN $dBlo, qT1lo, #12 1268 VQRSHRN $dBhi, qT1hi, #12 1269 MEND 1270 1271 ;// Scale 8x8 block input values with 16 bit scale values 1272 ;// 1273 ;// This macro is used to pre-scale block of 8x8 input. 1274 ;// This also do the Ist stage transformations of IDCT. 
;//
;// Input Registers:
;//
;// dXjnlo          - n th input D register with first four S16 values
;// dXjnhi          - n th input D register with next four S16 values
;// qXjn            - n th input Q register with eight S16 values
;// pScale          - Pointer to scale values
;//
;// Output Registers:
;//
;// qXin            - n th output Q register with eight S16 output values of 1st stage
;// dCoefs          - DCT inverse AAN constants loaded from armCOMM_IDCTCoef
;// pSrc            - Overwritten with the address of armCOMM_IDCTCoef
;// pScale          - Advanced by 8 loads of 16 bytes (128 bytes in total)
;//
;// Notes:
;//
;// - The qXjn registers are scaled in place by M_IDCT_SCALE16, two rows per
;//   invocation; the scale registers and the scratch registers used by that
;//   macro are clobbered.
;// - Only i2..i7 are written explicitly here.  NOTE(review): i0 and i1 are
;//   presumably obtained by aliasing qXi0/qXi1 onto the scaled row 0 and
;//   row 4 registers - confirm against the register definitions.
;// - Sums are formed with VHADD (halving add) while differences use a plain
;//   VSUB, exactly as the per-line comments state.
;// - The constant-table address load and the VLDR are interleaved with the
;//   arithmetic; the statement order is kept as written.
;//
        MACRO
        M_IDCT_PRESCALE16
        VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
        VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1
        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3
        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5
        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7 (last pair: no further scale load)
        VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
        VSUB        qXi6, qXj1, qXj7            ;// j1-j7
        LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
        VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
        VSUB        qXi2, qXj2, qXj6            ;// j2-j6
        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
        VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
        VSUB        qXi4, qXj5, qXj3            ;// j5-j3
        MEND


;// Scale 8x8 block input values with 32 bit scale values
;//
;// This macro is used to pre-scale a block of 8x8 input values.
;// It also performs the first stage of the IDCT (the j -> i step).
;//
;// Input Registers:
;//
;// dXjnlo          - n th input D register with first four S16 values
;// dXjnhi          - n th input D register with next four S16 values
;// qXjn            - n th input Q register with eight S16 values
;// pScale          - Pointer to 32bit scale values in Q23 format
;// pSrc            - Used to reload row 4 from memory at (pSrc - 64).
;//                   NOTE(review): this assumes pSrc points just past the
;//                   8x8 S16 source block on entry - confirm against caller.
;//
;// Output Registers:
;//
;// dXinlo          - n th output D register with first four S16 output values of 1st stage
;// dXinhi          - n th output D register with next four S16 output values of 1st stage
;// dCoefs          - DCT inverse AAN constants loaded from armCOMM_IDCTCoef
;// pSrc            - Overwritten with the address of armCOMM_IDCTCoef
;// pScale          - Advanced by four 32-byte loads (left pointing at scale row 4)
;// pTemp           - Clobbered (walks scale rows 7, 6, 5 backwards)
;//
;// Notes:
;//
;// - Per element: the S16 input is widened with a left shift of (12-1),
;//   multiplied by its S32 scale with VQRDMULH (saturating, rounding,
;//   doubling, high half) and narrowed back to 16 bits with VMOVN, i.e.
;//   roughly input * scale * 2^-20.
;// - Register sharing: all sixteen qScale names and the six qRes names live
;//   in Q0-Q3; the qSrc names live in Q4, Q5, Q6 and the register numbered
;//   by the symbol Src4 (defined outside this macro).  A name is only valid
;//   between the instruction that writes it and the next write to a name
;//   sharing its register, so the instruction order below must be kept.
;// - NOTE(review): Src4 is presumably the register number of qXj4, which
;//   would explain why qXj4 is reloaded from memory before row 4 is
;//   processed - confirm against the register definitions.
;//
        MACRO
        M_IDCT_PRESCALE32
        ;// Scale values: rows 0,5,6,7 share Q0/Q1; rows 1,2,3,4 share Q2/Q3
qScale0lo       QN 0.S32
qScale0hi       QN 1.S32
qScale1lo       QN 2.S32
qScale1hi       QN 3.S32
qScale2lo       QN qScale1lo
qScale2hi       QN qScale1hi
qScale3lo       QN qScale1lo
qScale3hi       QN qScale1hi
qScale4lo       QN qScale1lo
qScale4hi       QN qScale1hi
qScale5lo       QN qScale0lo
qScale5hi       QN qScale0hi
qScale6lo       QN qScale0lo
qScale6hi       QN qScale0hi
qScale7lo       QN qScale0lo
qScale7hi       QN qScale0hi

        ;// Widened inputs: rows 0,2,3,4,7 share Q4/Q5; rows 1,5,6 share Q6/Q(Src4)
qSrc0lo         QN 4.S32
qSrc0hi         QN 5.S32
qSrc1lo         QN 6.S32
qSrc1hi         QN Src4.S32
qSrc2lo         QN qSrc0lo
qSrc2hi         QN qSrc0hi
qSrc3lo         QN qSrc0lo
qSrc3hi         QN qSrc0hi
qSrc4lo         QN qSrc0lo
qSrc4hi         QN qSrc0hi
qSrc5lo         QN qSrc1lo
qSrc5hi         QN qSrc1hi
qSrc6lo         QN qSrc1lo
qSrc6hi         QN qSrc1hi
qSrc7lo         QN qSrc0lo
qSrc7hi         QN qSrc0hi

        ;// Sum/difference results reuse Q0/Q1 once the scale held there is consumed
qRes17lo        QN qScale0lo
qRes17hi        QN qScale0hi
qRes26lo        QN qScale0lo
qRes26hi        QN qScale0hi
qRes53lo        QN qScale0lo
qRes53hi        QN qScale0hi

        ;// 4 bytes * 8 values * 7 rows: pTemp starts at the last scale row
        ADD         pTemp, pScale, #4*8*7           ;// Address of pScale[7]

        ;// Row 0 (the scaling of rows 1 and 7 is interleaved here)
        VLD1        {qScale0lo, qScale0hi}, [pScale]!
        VSHLL       qSrc0lo, dXj0lo, #(12-1)
        VSHLL       qSrc0hi, dXj0hi, #(12-1)
        VLD1        {qScale1lo, qScale1hi}, [pScale]!
        VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo
        VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
        VLD1        {qScale7lo, qScale7hi}, [pTemp]!    ;// Reuses Q0/Q1: scale row 0 is consumed
        VSHLL       qSrc1lo, dXj1lo, #(12-1)
        VSHLL       qSrc1hi, dXj1hi, #(12-1)            ;// Overwrites the register numbered Src4
        VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
        VMOVN       dXi0hi, qSrc0hi
        VSHLL       qSrc7lo, dXj7lo, #(12-1)            ;// Reuses Q4/Q5: i0 has been written out
        VSHLL       qSrc7hi, dXj7hi, #(12-1)
        ;// Undo the 32-byte writeback and step back one 32-byte row: pTemp -> scale row 6
        SUB         pTemp, pTemp, #((16*2)+(4*8*1))
        VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
        VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
        VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
        VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
        VLD1        {qScale2lo, qScale2hi}, [pScale]!

        ;// Row 1 & 7 (the scaling of rows 2 and 6 is interleaved here)
        VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
        VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
        VMOVN       dXi5lo, qRes17lo                ;// Output i5
        VMOVN       dXi5hi, qRes17hi
        VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
        VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
        VMOVN       dXi6lo, qRes17lo                ;// Output i6
        VMOVN       dXi6hi, qRes17hi
        VSHLL       qSrc2lo, dXj2lo, #(12-1)
        VSHLL       qSrc2hi, dXj2hi, #(12-1)
        VLD1        {qScale6lo, qScale6hi}, [pTemp]!
        VSHLL       qSrc6lo, dXj6lo, #(12-1)
        VSHLL       qSrc6hi, dXj6hi, #(12-1)
        ;// Same adjustment as above: pTemp -> scale row 5
        SUB         pTemp, pTemp, #((16*2)+(4*8*1))
        VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
        VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
        VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
        VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
        VLD1        {qScale3lo, qScale3hi}, [pScale]!

        ;// Row 2 & 6 (the scaling of rows 3 and 5 is interleaved here)
        VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
        VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
        VMOVN       dXi3lo, qRes26lo                ;// Output i3
        VMOVN       dXi3hi, qRes26hi
        VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
        VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
        VMOVN       dXi2lo, qRes26lo                ;// Output i2
        VMOVN       dXi2hi, qRes26hi
        VSHLL       qSrc3lo, dXj3lo, #(12-1)
        VSHLL       qSrc3hi, dXj3hi, #(12-1)
        VLD1        {qScale5lo, qScale5hi}, [pTemp]!
        VSHLL       qSrc5lo, dXj5lo, #(12-1)
        VSHLL       qSrc5hi, dXj5hi, #(12-1)
        VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
        VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
        VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
        VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi

        ;// Row 3 & 5 (the reload and scaling of row 4 is interleaved here)
        VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
        VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
        SUB         pSrc, pSrc, #16*2*2             ;// Step pSrc back by 64 bytes for the qXj4 reload
        VMOVN       dXi7lo, qRes53lo                ;// Output i7
        VMOVN       dXi7hi, qRes53hi
        VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
        VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
        VLD1        qXj4, [pSrc @64]                ;// Reload qXj4 from memory (64-bit aligned access)
        VMOVN       dXi4lo, qRes53lo                ;// Output i4
        VMOVN       dXi4hi, qRes53hi
        VSHLL       qSrc4lo, dXj4lo, #(12-1)
        VSHLL       qSrc4hi, dXj4hi, #(12-1)
        VLD1        {qScale4lo, qScale4hi}, [pScale]    ;// Last scale load: no writeback
        LDR         pSrc, =armCOMM_IDCTCoef         ;// Address of DCT inverse AAN constants
        VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
        VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
        VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
        ;// Row 4
        VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
        VMOVN       dXi1hi, qSrc4hi

        MEND

        END