Home | History | Annotate | Download | only in api
      1 ;//
      2 ;// This confidential and proprietary software may be used only as
      3 ;// authorised by a licensing agreement from ARM Limited
      4 ;//   (C) COPYRIGHT 2004 ARM Limited
      5 ;//       ALL RIGHTS RESERVED
      6 ;// The entire notice above must be reproduced on all authorised
      7 ;// copies and copies may only be made to the extent permitted
      8 ;// by a licensing agreement from ARM Limited.
      9 ;//
     10 ;// IDCT_s.s
     11 ;//
     12 ;// Inverse DCT module
     13 ;//
     14 ;//
     15 ;// ALGORITHM DESCRIPTION
     16 ;//
     17 ;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
     18 ;// column and then a 1D IDCT for each row.
     19 ;//
     20 ;// The 8-point 1D IDCT is defined by
     21 ;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
     22 ;//
     23 ;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
     24 ;//   c(u,x) = cos( (2x+1)*u*pi/16 )
     25 ;//
     26 ;// We compute the 8-point 1D IDCT using the reverse of
     27 ;// the Arai-Agui-Nakajima flow graph which we split into
     28 ;// 5 stages named in reverse order to identify with the
     29 ;// forward DCT. Direct inversion of the forward formulae
     30 ;// in file FDCT_s.s gives:
     31 ;//
     32 ;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
     33 ;//             [ A(0) = 2*sqrt(2)
     34 ;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
     35 ;//
     36 ;// IStage 4:   i0 = j0             i1 = j4
     37 ;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
     38 ;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
     39 ;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
     40 ;//
     41 ;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
     42 ;//             h2 = (i2*sqrt2)-i3  h3 = i3
     43 ;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
     44 ;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
     45 ;//             [ The above two lines rotate by -(pi/8) ]
     46 ;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
     47 ;//
     48 ;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
     49 ;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
     50 ;//             g7 = h7             g6 = h6 - h7
     51 ;//             g5 = h5 - g6        g4 = h4 - g5
     52 ;//
     53 ;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
     54 ;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
     55 ;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
     56 ;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
     57 ;//
     58 ;// Note that most coefficients are halved 3 times during the
     59 ;// above calculation. We can rescale the algorithm dividing
     60 ;// the input by 8 to remove the halvings.
     61 ;//
     62 ;// IStage 5:   j(u) = T(u)*A(u)/8
     63 ;//
     64 ;// IStage 4:   i0 = j0             i1 = j4
     65 ;//             i3 = j2 + j6        i2 = j2 - j6
     66 ;//             i7 = j5 + j3        i4 = j5 - j3
     67 ;//             i5 = j1 + j7        i6 = j1 - j7
     68 ;//
     69 ;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
     70 ;//             h2 = (i2*sqrt2)-i3  h3 = i3
     71 ;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
     72 ;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
     73 ;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
     74 ;//
     75 ;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
     76 ;//             g1 = h1 + h2        g2 = h1 - h2
     77 ;//             g7 = h7             g6 = h6 - h7
     78 ;//             g5 = h5 - g6        g4 = h4 - g5
     79 ;//
     80 ;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
     81 ;//             f1 = g1 + g6        f6 = g1 - g6
     82 ;//             f2 = g2 + g5        f5 = g2 - g5
     83 ;//             f3 = g3 + g4        f4 = g3 - g4
     84 ;//
     85 ;// Note:
     86 ;// 1. The scaling by A(u)/8 can often be combined with inverse
     87 ;//    quantization. The column and row scalings can be combined.
     88 ;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
     89 ;//    to the above code but is otherwise identical.
     90 ;// 3. The rotation by -pi/8 can be performed using three multiplies
     91 ;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
     92 ;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
     93 ;// 4. If |T(u)|<=1 then from the IDCT definition,
     94 ;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
     95 ;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
     96 ;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
     97 ;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
     98 ;//            = (approx)2.64
     99 ;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
    100 ;//    The table below shows input patterns generating the maximum
    101 ;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
    102 ;//    InputPattern      Max |f(x)|
    103 ;//      PPPPPPPP        |f0| =  2.64
    104 ;//      PPPMMMMM        |f1| =  2.64
    105 ;//      PPMMMPPP        |f2| =  2.64
    106 ;//      PPMMPPMM        |f3| =  2.64
    107 ;//      PMMPPMMP        |f4| =  2.64
    108 ;//      PMMPMMPM        |f5| =  2.64
    109 ;//      PMPPMPMP        |f6| =  2.64
    110 ;//      PMPMPMPM        |f7| =  2.64
    111 ;//   Note that this input pattern is the transpose of the
    112 ;//   corresponding max input pattern for the FDCT.
    113 
    114 ;// Arguments
    115 
    116 pSrc    RN 0    ;// r0: source (input) data buffer
    117 Stride  RN 1    ;// r1: destination stride in bytes
    118 pDest   RN 2    ;// r2: destination (output) data buffer
    119 pScale  RN 3    ;// r3: pointer to aan-scale table (format selected by $inscale)
    120 
    121 
    122         ;// DCT Inverse Macro
    123         ;// The DCT code should be parametrized according
    124         ;// to the following inputs:
    125         ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
    126         ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
    127         ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
    128         ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
    129         ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
    130         ;//
    131         ;// Inputs:
    132         ;// pSrc   = r0 = Pointer to input data
    133         ;//               Range is -256 to +255 (9-bit)
    134         ;// Stride = r1 = Stride in bytes between output (destination) lines
    135         ;// pDest  = r2 = Pointer to output data
    136         ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
    137 
    138 
    139 
    140         MACRO
    141         M_IDCT  $outsize, $inscale, $stride
    142         LCLA    SHIFT
    143 
    144 
    145         IF ARM1136JS
    146 
    147 ;// REGISTER ALLOCATION
    148 ;// This is hard since we have 8 values, 9 free registers and each
    149 ;// butterfly requires a temporary register. We also want to
    150 ;// maintain register order so we can use LDM/STM. The table below
    151 ;// summarises the register allocation that meets all these criteria.
    152 ;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
    153 ;//
    154 ;// r1  a01     g0  h0
    155 ;// r4  b01 f0  g1  h1  i0
    156 ;// r5  a23 f1  g2      i1
    157 ;// r6  b23 f2  g3  h2  i2
    158 ;// r7  a45 f3      h3  i3
    159 ;// r8  b45 f4  g4  h4  i4
    160 ;// r9  a67 f5  g5  h5  i5
    161 ;// r10 b67 f6  g6  h6  i6
    162 ;// r11     f7  g7  h7  i7
    163 ;//
    164 ra01    RN 1
    165 rb01    RN 4
    166 ra23    RN 5
    167 rb23    RN 6
    168 ra45    RN 7
    169 rb45    RN 8
    170 ra67    RN 9
    171 rb67    RN 10
    172 rtmp    RN 11
    173 csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
    174 LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
    175 ;// Transpose allocation
    176 xft     RN ra01
    177 xf0     RN rb01
    178 xf1     RN ra23
    179 xf2     RN rb23
    180 xf3     RN ra45
    181 xf4     RN rb45
    182 xf5     RN ra67
    183 xf6     RN rb67
    184 xf7     RN rtmp
    185 ;// IStage 1 allocation
    186 xg0     RN xft
    187 xg1     RN xf0
    188 xg2     RN xf1
    189 xg3     RN xf2
    190 xgt     RN xf3
    191 xg4     RN xf4
    192 xg5     RN xf5
    193 xg6     RN xf6
    194 xg7     RN xf7
    195 ;// IStage 2 allocation
    196 xh0     RN xg0
    197 xh1     RN xg1
    198 xht     RN xg2
    199 xh2     RN xg3
    200 xh3     RN xgt
    201 xh4     RN xg4
    202 xh5     RN xg5
    203 xh6     RN xg6
    204 xh7     RN xg7
    205 ;// IStage 3,4 allocation
    206 xit     RN xh0
    207 xi0     RN xh1
    208 xi1     RN xht
    209 xi2     RN xh2
    210 xi3     RN xh3
    211 xi4     RN xh4
    212 xi5     RN xh5
    213 xi6     RN xh6
    214 xi7     RN xh7
    215 
    216         M_STR   pDest,  ppDest
    217         IF "$stride"="s"
    218             M_STR   Stride, pStride
    219         ENDIF
    220         M_ADR   pDest,  pBlk
    221         LDR     csPiBy8, =0x30fc7642
    222         LDR     LoopRR2, =0x00005a82
    223 
    224 v6_idct_col$_F
    225         ;// Load even values
    226         LDR     xi4, [pSrc], #4  ;// j0
    227         LDR     xi5, [pSrc, #4*16-4]  ;// j4
    228         LDR     xi6, [pSrc, #2*16-4]  ;// j2
    229         LDR     xi7, [pSrc, #6*16-4]  ;// j6
    230 
    231         ;// Scale Even Values
    232         IF "$inscale"="s16" ;// 16x16 mul
    233 SHIFT       SETA    12
    234             LDR     xi0, [pScale], #4
    235             LDR     xi1, [pScale, #4*16-4]
    236             LDR     xi2, [pScale, #2*16-4]
    237             MOV     xit, #1<<(SHIFT-1)
    238             SMLABB  xi3, xi0, xi4, xit
    239             SMLATT  xi4, xi0, xi4, xit
    240             SMLABB  xi0, xi1, xi5, xit
    241             SMLATT  xi5, xi1, xi5, xit
    242             MOV     xi3, xi3, ASR #SHIFT
    243             PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
    244             LDR     xi3, [pScale, #6*16-4]
    245             SMLABB  xi1, xi2, xi6, xit
    246             SMLATT  xi6, xi2, xi6, xit
    247             MOV     xi0, xi0, ASR #SHIFT
    248             PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
    249             SMLABB  xi2, xi3, xi7, xit
    250             SMLATT  xi7, xi3, xi7, xit
    251             MOV     xi1, xi1, ASR #SHIFT
    252             PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
    253             MOV     xi2, xi2, ASR #SHIFT
    254             PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
    255         ENDIF
    256         IF "$inscale"="s32" ;// 32x16 mul
    257 SHIFT       SETA    (12+8-16)
    258             MOV     xit, #1<<(SHIFT-1)
    259             LDR     xi0, [pScale], #8
    260             LDR     xi1, [pScale, #0*32+4-8]
    261             LDR     xi2, [pScale, #4*32-8]
    262             LDR     xi3, [pScale, #4*32+4-8]
    263             SMLAWB  xi0, xi0, xi4, xit
    264             SMLAWT  xi1, xi1, xi4, xit
    265             SMLAWB  xi2, xi2, xi5, xit
    266             SMLAWT  xi3, xi3, xi5, xit
    267             MOV     xi0, xi0, ASR #SHIFT
    268             PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
    269             MOV     xi2, xi2, ASR #SHIFT
    270             PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
    271             LDR     xi0, [pScale, #2*32-8]
    272             LDR     xi1, [pScale, #2*32+4-8]
    273             LDR     xi2, [pScale, #6*32-8]
    274             LDR     xi3, [pScale, #6*32+4-8]
    275             SMLAWB  xi0, xi0, xi6, xit
    276             SMLAWT  xi1, xi1, xi6, xit
    277             SMLAWB  xi2, xi2, xi7, xit
    278             SMLAWT  xi3, xi3, xi7, xit
    279             MOV     xi0, xi0, ASR #SHIFT
    280             PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
    281             MOV     xi2, xi2, ASR #SHIFT
    282             PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
    283         ENDIF
    284 
    285         ;// Load odd values
    286         LDR     xi0, [pSrc, #1*16-4]      ;// j1
    287         LDR     xi1, [pSrc, #7*16-4]      ;// j7
    288         LDR     xi2, [pSrc, #5*16-4]      ;// j5
    289         LDR     xi3, [pSrc, #3*16-4]      ;// j3
    290 
    291         IF  {TRUE}
    292             ;// shortcut if odd values 0
    293             TEQ     xi0, #0
    294             TEQEQ   xi1, #0
    295             TEQEQ   xi2, #0
    296             TEQEQ   xi3, #0
    297             BEQ     v6OddZero$_F
    298         ENDIF
    299 
    300         ;// Store scaled even values
    301         STMIA   pDest, {xi4, xi5, xi6, xi7}
    302 
    303         ;// Scale odd values
    304         IF "$inscale"="s16"
    305             ;// Perform AAN Scale
    306             LDR     xi4, [pScale, #1*16-4]
    307             LDR     xi5, [pScale, #7*16-4]
    308             LDR     xi6, [pScale, #5*16-4]
    309             SMLABB  xi7, xi0, xi4, xit
    310             SMLATT  xi0, xi0, xi4, xit
    311             SMLABB  xi4, xi1, xi5, xit
    312             SMLATT  xi1, xi1, xi5, xit
    313             MOV     xi7, xi7, ASR #SHIFT
    314             PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
    315             LDR     xi7, [pScale, #3*16-4]
    316             SMLABB  xi5, xi2, xi6, xit
    317             SMLATT  xi2, xi2, xi6, xit
    318             MOV     xi4, xi4, ASR #SHIFT
    319             PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
    320             SMLABB  xi6, xi3, xi7, xit
    321             SMLATT  xi3, xi3, xi7, xit
    322             MOV     xi5, xi5, ASR #SHIFT
    323             PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
    324             MOV     xi6, xi6, ASR #SHIFT
    325             PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
    326         ENDIF
    327         IF "$inscale"="s32" ;// 32x16 mul
    328             LDR     xi4, [pScale, #1*32-8]
    329             LDR     xi5, [pScale, #1*32+4-8]
    330             LDR     xi6, [pScale, #7*32-8]
    331             LDR     xi7, [pScale, #7*32+4-8]
    332             SMLAWB  xi4, xi4, xi0, xit
    333             SMLAWT  xi5, xi5, xi0, xit
    334             SMLAWB  xi6, xi6, xi1, xit
    335             SMLAWT  xi7, xi7, xi1, xit
    336             MOV     xi4, xi4, ASR #SHIFT
    337             PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
    338             MOV     xi6, xi6, ASR #SHIFT
    339             PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
    340             LDR     xi4, [pScale, #5*32-8]
    341             LDR     xi5, [pScale, #5*32+4-8]
    342             LDR     xi6, [pScale, #3*32-8]
    343             LDR     xi7, [pScale, #3*32+4-8]
    344             SMLAWB  xi4, xi4, xi2, xit
    345             SMLAWT  xi5, xi5, xi2, xit
    346             SMLAWB  xi6, xi6, xi3, xit
    347             SMLAWT  xi7, xi7, xi3, xit
    348             MOV     xi4, xi4, ASR #SHIFT
    349             PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
    350             MOV     xi6, xi6, ASR #SHIFT
    351             PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
    352         ENDIF
    353 
    354         SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
    355         SSUB16  xi6, xi0, xi1           ;// j1-j7
    356         SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
    357         SSUB16  xi4, xi2, xi3           ;// j5-j3
    358 
    359         SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
    360 
    361         PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
    362         PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
    363 
    364         SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
    365         SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
    366         SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
    367         SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
    368 
    369         SMULBB  xi1, xi3, LoopRR2
    370         SMULTB  xi3, xi3, LoopRR2
    371 
    372         PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
    373         PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
    374         SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
    375 
    376         ;// xi0,xi1,xi2,xi3 now free
    377         ;// IStage 4,3, rows 2to3 x1/2
    378 
    379         MOV     xi3, xi3, LSL #1
    380         PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
    381         LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
    382 
    383         ;// IStage 2, rows4to7
    384         SSUB16  xg6, xh6, xh7
    385         SSUB16  xg5, xh5, xg6
    386         SSUB16  xg4, xh4, xg5
    387 
    388         SSUB16  xi2, xi0, xi1           ;// (j2-j6)
    389         SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
    390 
    391         SMULBB  xi0, xi2, LoopRR2
    392         SMULTB  xi2, xi2, LoopRR2
    393 
    394         MOV     xi2, xi2, LSL #1
    395         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    396 
    397         ;// xi0, xi1 now free
    398         ;// IStage 4,3 rows 0to1 x 1/2
    399         LDRD    xi0, [pDest]            ;// j0, j4 scaled
    400         SSUB16  xh2, xh2, xi3
    401         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    402 
    403         SHADD16 xh0, xi0, xi1
    404         SHSUB16 xh1, xi0, xi1
    405 
    406         ;// IStage 2 rows 0to3 x 1/2
    407         SHSUB16 xg2, xh1, xh2
    408         SHADD16 xg1, xh1, xh2
    409         SHSUB16 xg3, xh0, xh3
    410         SHADD16 xg0, xh0, xh3
    411 
    412         ;// IStage 1 all rows
    413         SADD16  xf3, xg3, xg4
    414         SSUB16  xf4, xg3, xg4
    415         SADD16  xf2, xg2, xg5
    416         SSUB16  xf5, xg2, xg5
    417         SADD16  xf1, xg1, xg6
    418         SSUB16  xf6, xg1, xg6
    419         SADD16  xf0, xg0, xg7
    420         SSUB16  xf7, xg0, xg7
    421 
    422         ;// Transpose, store and loop
    423         PKHBT   ra01, xf0, xf1, LSL #16
    424         PKHTB   rb01, xf1, xf0, ASR #16
    425 
    426         PKHBT   ra23, xf2, xf3, LSL #16
    427         PKHTB   rb23, xf3, xf2, ASR #16
    428 
    429         PKHBT   ra45, xf4, xf5, LSL #16
    430         PKHTB   rb45, xf5, xf4, ASR #16
    431 
    432         PKHBT   ra67, xf6, xf7, LSL #16
    433         STMIA   pDest!, {ra01, ra23, ra45, ra67}
    434         PKHTB   rb67, xf7, xf6, ASR #16
    435         STMIA   pDest!, {rb01, rb23, rb45, rb67}
    436         BCC     v6_idct_col$_F
    437 
    438         SUB     pSrc, pDest, #(64*2)
    439         M_LDR   pDest, ppDest
    440         IF "$stride"="s"
    441             M_LDR   pScale, pStride
    442         ENDIF
    443         B       v6_idct_row$_F
    444 
    445 v6OddZero$_F
    446         SSUB16  xi2, xi6, xi7           ;// (j2-j6)
    447         SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
    448 
    449         SMULBB  xi0, xi2, LoopRR2
    450         SMULTB  xi2, xi2, LoopRR2
    451 
    452         MOV     xi2, xi2, LSL #1
    453         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    454         SSUB16  xh2, xh2, xi3
    455 
    456         ;// xi0, xi1 now free
    457         ;// IStage 4,3 rows 0to1 x 1/2
    458 
    459         SHADD16 xh0, xi4, xi5
    460         SHSUB16 xh1, xi4, xi5
    461 
    462         ;// IStage 2 rows 0to3 x 1/2
    463         SHSUB16 xg2, xh1, xh2
    464         SHADD16 xg1, xh1, xh2
    465         SHSUB16 xg3, xh0, xh3
    466         SHADD16 xg0, xh0, xh3
    467 
    468         ;// IStage 1 all rows
    469         MOV  xf3, xg3
    470         MOV  xf4, xg3
    471         MOV  xf2, xg2
    472         MOV  xf5, xg2
    473         MOV  xf1, xg1
    474         MOV  xf6, xg1
    475         MOV  xf0, xg0
    476         MOV  xf7, xg0
    477 
    478         ;// Transpose
    479         PKHBT   ra01, xf0, xf1, LSL #16
    480         PKHTB   rb01, xf1, xf0, ASR #16
    481 
    482         PKHBT   ra23, xf2, xf3, LSL #16
    483         PKHTB   rb23, xf3, xf2, ASR #16
    484 
    485         PKHBT   ra45, xf4, xf5, LSL #16
    486         PKHTB   rb45, xf5, xf4, ASR #16
    487 
    488         PKHBT   ra67, xf6, xf7, LSL #16
    489         PKHTB   rb67, xf7, xf6, ASR #16
    490 
    491         STMIA   pDest!, {ra01, ra23, ra45, ra67}
    492         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    493         STMIA   pDest!, {rb01, rb23, rb45, rb67}
    494 
    495         BCC     v6_idct_col$_F
    496         SUB     pSrc, pDest, #(64*2)
    497         M_LDR   pDest, ppDest
    498         IF "$stride"="s"
    499             M_LDR   pScale, pStride
    500         ENDIF
    501 
    502 
    503 v6_idct_row$_F
    504         ;// IStage 4,3, rows4to7 x1/4
    505         LDR     xit, =0x00010001        ;// rounding constant
    506         LDR     xi0, [pSrc, #1*16]      ;// j1
    507         LDR     xi1, [pSrc, #7*16]      ;// 4*j7
    508         LDR     xi2, [pSrc, #5*16]      ;// j5
    509         LDR     xi3, [pSrc, #3*16]      ;// j3
    510 
    511         SHADD16 xi1, xi1, xit           ;// 2*j7
    512         SHADD16 xi1, xi1, xit           ;// j7
    513 
    514         SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
    515         SSUB16  xi6, xi0, xi1           ;// j1-j7
    516         SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
    517         SSUB16  xi4, xi2, xi3           ;// j5-j3
    518 
    519         SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
    520 
    521         PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
    522         PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
    523 
    524         SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
    525         SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
    526         SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
    527         SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
    528 
    529         SMULBB  xi1, xi3, LoopRR2
    530         SMULTB  xi3, xi3, LoopRR2
    531 
    532         PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
    533         PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
    534         SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
    535 
    536         MOV     xi3, xi3, LSL #1
    537         PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
    538 
    539         ;// xi0,xi1,xi2,xi3 now free
    540         ;// IStage 4,3, rows 2to3 x1/2
    541 
    542         LDR     xi0, [pSrc, #2*16]      ;// j2
    543         LDR     xi1, [pSrc, #6*16]      ;// 2*j6
    544 
    545         ;// IStage 2, rows4to7
    546         SSUB16  xg6, xh6, xh7
    547         SSUB16  xg5, xh5, xg6
    548         SSUB16  xg4, xh4, xg5
    549 
    550         SHADD16 xi1, xi1, xit           ;// j6
    551         SSUB16  xi2, xi0, xi1           ;// (j2-j6)
    552         SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
    553 
    554         SMULBB  xi0, xi2, LoopRR2
    555         SMULTB  xi2, xi2, LoopRR2
    556 
    557         MOV     xi2, xi2, LSL #1
    558 
    559         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    560 
    561         ;// xi0, xi1 now free
    562         ;// IStage 4,3 rows 0to1 x 1/2
    563         LDR     xi1, [pSrc, #4*16]      ;// j4
    564         LDR     xi0, [pSrc], #4         ;// j0
    565 
    566         SSUB16  xh2, xh2, xi3
    567         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    568 
    569         ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
    570         SHADD16 xh0, xi0, xi1           ;// of DC result
    571         SHSUB16 xh1, xi0, xi1
    572 
    573         ;// IStage 2 rows 0to3 x 1/2
    574         SHSUB16 xg2, xh1, xh2
    575         SHADD16 xg1, xh1, xh2
    576         SHSUB16 xg3, xh0, xh3
    577         SHADD16 xg0, xh0, xh3
    578 
    579         ;// IStage 1 all rows
    580         SHADD16 xf3, xg3, xg4
    581         SHSUB16 xf4, xg3, xg4
    582         SHADD16 xf2, xg2, xg5
    583         SHSUB16 xf5, xg2, xg5
    584         SHADD16 xf1, xg1, xg6
    585         SHSUB16 xf6, xg1, xg6
    586         SHADD16 xf0, xg0, xg7
    587         SHSUB16 xf7, xg0, xg7
    588 
    589         ;// Saturate
    590         IF ("$outsize"="u8")
    591             USAT16  xf0, #8, xf0
    592             USAT16  xf1, #8, xf1
    593             USAT16  xf2, #8, xf2
    594             USAT16  xf3, #8, xf3
    595             USAT16  xf4, #8, xf4
    596             USAT16  xf5, #8, xf5
    597             USAT16  xf6, #8, xf6
    598             USAT16  xf7, #8, xf7
    599         ENDIF
    600         IF ("$outsize"="s9")
    601             SSAT16  xf0, #9, xf0
    602             SSAT16  xf1, #9, xf1
    603             SSAT16  xf2, #9, xf2
    604             SSAT16  xf3, #9, xf3
    605             SSAT16  xf4, #9, xf4
    606             SSAT16  xf5, #9, xf5
    607             SSAT16  xf6, #9, xf6
    608             SSAT16  xf7, #9, xf7
    609         ENDIF
    610 
    611         ;// Transpose to Row, Pack and store
    612         IF ("$outsize"="u8")
    613             ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
    614             ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
    615             ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
    616             ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
    617             PKHBT   ra01, xf0, xf2, LSL #16
    618             PKHTB   rb01, xf2, xf0, ASR #16
    619             PKHBT   ra23, xf4, xf6, LSL #16
    620             PKHTB   rb23, xf6, xf4, ASR #16
    621             STMIA   pDest, {ra01, ra23}
    622             IF "$stride"="s"
    623                 ADD     pDest, pDest, pScale
    624                 STMIA   pDest, {rb01, rb23}
    625                 ADD     pDest, pDest, pScale
    626             ELSE
    627                 ADD     pDest, pDest, #($stride)
    628                 STMIA   pDest, {rb01, rb23}
    629                 ADD     pDest, pDest, #($stride)
    630             ENDIF
    631         ENDIF
    632         IF ("$outsize"="s9"):LOR:("$outsize"="s16")
    633             PKHBT   ra01, xf0, xf1, LSL #16
    634             PKHTB   rb01, xf1, xf0, ASR #16
    635 
    636             PKHBT   ra23, xf2, xf3, LSL #16
    637             PKHTB   rb23, xf3, xf2, ASR #16
    638 
    639             PKHBT   ra45, xf4, xf5, LSL #16
    640             PKHTB   rb45, xf5, xf4, ASR #16
    641 
    642             PKHBT   ra67, xf6, xf7, LSL #16
    643             PKHTB   rb67, xf7, xf6, ASR #16
    644 
    645             STMIA   pDest, {ra01, ra23, ra45, ra67}
    646             IF "$stride"="s"
    647                 ADD     pDest, pDest, pScale
    648                 STMIA   pDest, {rb01, rb23, rb45, rb67}
    649                 ADD     pDest, pDest, pScale
    650             ELSE
    651                 ADD     pDest, pDest, #($stride)
    652                 STMIA   pDest, {rb01, rb23, rb45, rb67}
    653                 ADD     pDest, pDest, #($stride)
    654             ENDIF
    655         ENDIF
    656 
    657         BCC     v6_idct_row$_F
    658         ENDIF ;// ARM1136JS
    659 
    660 
    661         IF CortexA8
    662 
    663 Src0            EQU  7
    664 Src1            EQU  8
    665 Src2            EQU  9
    666 Src3            EQU  10
    667 Src4            EQU  11
    668 Src5            EQU  12
    669 Src6            EQU  13
    670 Src7            EQU  14
    671 Tmp             EQU  15
    672 
    673 qXj0            QN Src0.S16
    674 qXj1            QN Src1.S16
    675 qXj2            QN Src2.S16
    676 qXj3            QN Src3.S16
    677 qXj4            QN Src4.S16
    678 qXj5            QN Src5.S16
    679 qXj6            QN Src6.S16
    680 qXj7            QN Src7.S16
    681 qXjt            QN Tmp.S16
    682 
    683 dXj0lo          DN (Src0*2).S16
    684 dXj0hi          DN (Src0*2+1).S16
    685 dXj1lo          DN (Src1*2).S16
    686 dXj1hi          DN (Src1*2+1).S16
    687 dXj2lo          DN (Src2*2).S16
    688 dXj2hi          DN (Src2*2+1).S16
    689 dXj3lo          DN (Src3*2).S16
    690 dXj3hi          DN (Src3*2+1).S16
    691 dXj4lo          DN (Src4*2).S16
    692 dXj4hi          DN (Src4*2+1).S16
    693 dXj5lo          DN (Src5*2).S16
    694 dXj5hi          DN (Src5*2+1).S16
    695 dXj6lo          DN (Src6*2).S16
    696 dXj6hi          DN (Src6*2+1).S16
    697 dXj7lo          DN (Src7*2).S16
    698 dXj7hi          DN (Src7*2+1).S16
    699 dXjtlo          DN (Tmp*2).S16
    700 dXjthi          DN (Tmp*2+1).S16
    701 
    702 qXi0            QN qXj0
    703 qXi1            QN qXj4
    704 qXi2            QN qXj2
    705 qXi3            QN qXj7
    706 qXi4            QN qXj5
    707 qXi5            QN qXjt
    708 qXi6            QN qXj1
    709 qXi7            QN qXj6
    710 qXit            QN qXj3
    711 
    712 dXi0lo          DN dXj0lo
    713 dXi0hi          DN dXj0hi
    714 dXi1lo          DN dXj4lo
    715 dXi1hi          DN dXj4hi
    716 dXi2lo          DN dXj2lo
    717 dXi2hi          DN dXj2hi
    718 dXi3lo          DN dXj7lo
    719 dXi3hi          DN dXj7hi
    720 dXi4lo          DN dXj5lo
    721 dXi4hi          DN dXj5hi
    722 dXi5lo          DN dXjtlo
    723 dXi5hi          DN dXjthi
    724 dXi6lo          DN dXj1lo
    725 dXi6hi          DN dXj1hi
    726 dXi7lo          DN dXj6lo
    727 dXi7hi          DN dXj6hi
    728 dXitlo          DN dXj3lo
    729 dXithi          DN dXj3hi
    730 
    731 qXh0            QN qXit
    732 qXh1            QN qXi0
    733 qXh2            QN qXi2
    734 qXh3            QN qXi3
    735 qXh4            QN qXi7
    736 qXh5            QN qXi5
    737 qXh6            QN qXi4
    738 qXh7            QN qXi1
    739 qXht            QN qXi6
    740 
    741 dXh0lo          DN dXitlo
    742 dXh0hi          DN dXithi
    743 dXh1lo          DN dXi0lo
    744 dXh1hi          DN dXi0hi
    745 dXh2lo          DN dXi2lo
    746 dXh2hi          DN dXi2hi
    747 dXh3lo          DN dXi3lo
    748 dXh3hi          DN dXi3hi
    749 dXh4lo          DN dXi7lo
    750 dXh4hi          DN dXi7hi
    751 dXh5lo          DN dXi5lo
    752 dXh5hi          DN dXi5hi
    753 dXh6lo          DN dXi4lo
    754 dXh6hi          DN dXi4hi
    755 dXh7lo          DN dXi1lo
    756 dXh7hi          DN dXi1hi
    757 dXhtlo          DN dXi6lo
    758 dXhthi          DN dXi6hi
    759 
    760 qXg0            QN qXh2
    761 qXg1            QN qXht
    762 qXg2            QN qXh1
    763 qXg3            QN qXh0
    764 qXg4            QN qXh4
    765 qXg5            QN qXh5
    766 qXg6            QN qXh6
    767 qXg7            QN qXh7
    768 qXgt            QN qXh3
    769 
    770 qXf0            QN qXg6
    771 qXf1            QN qXg5
    772 qXf2            QN qXg4
    773 qXf3            QN qXgt
    774 qXf4            QN qXg3
    775 qXf5            QN qXg2
    776 qXf6            QN qXg1
    777 qXf7            QN qXg0
    778 qXft            QN qXg7
    779 
    780 
    781 qXt0            QN 1.S32
    782 qXt1            QN 2.S32
    783 qT0lo           QN 1.S32
    784 qT0hi           QN 2.S32
    785 qT1lo           QN 3.S32
    786 qT1hi           QN 4.S32
    787 qScalelo        QN 5.S32        ;// used to read post scale values
    788 qScalehi        QN 6.S32
    789 qTemp0          QN 5.S32
    790 qTemp1          QN 6.S32
    791 
    792 
    793 Scale1          EQU 6
    794 Scale2          EQU 15
    795 qScale1         QN Scale1.S16
    796 qScale2         QN Scale2.S16
    797 dScale1lo       DN (Scale1*2).S16
    798 dScale1hi       DN (Scale1*2+1).S16
    799 dScale2lo       DN (Scale2*2).S16
    800 dScale2hi       DN (Scale2*2+1).S16
    801 
    802 dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
    803 InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
    804 S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
    805 C               DN dCoefs[2]    ;// Cos(PI/8) in Q15
    806 
    807 pTemp           RN 12
    808 
    809 
    810         IMPORT  armCOMM_IDCTCoef
    811 
    812         VLD1        {qXj0,qXj1}, [pSrc @64]!
    813         VLD1        {qXj2,qXj3}, [pSrc @64]!
    814         VLD1        {qXj4,qXj5}, [pSrc @64]!
    815         VLD1        {qXj6,qXj7}, [pSrc @64]!
    816 
    817         ;// Load PreScale and multiply with Src
    818         ;// IStage 4
    819 
    820         IF "$inscale"="s16"                         ;// 16X16 Mul
    821             M_IDCT_PRESCALE16
    822         ENDIF
    823 
    824         IF "$inscale"="s32"                         ;// 32X32 ,ul
    825             M_IDCT_PRESCALE32
    826         ENDIF
    827 
    828         ;// IStage 3
    829         VQRDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
    830         VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
    831         VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
    832         VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
    833         VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
    834         VQRDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
    835         VSUB        qXh2, qXi2, qXi3                ;// h2, h3
    836 
    837         VMULL       qXt0, dXi4lo, C                 ;// c*i4
    838         VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
    839         VMULL       qXt1, dXi4hi, C
    840         VMLAL       qXt1, dXi6hi, S
    841         VSHRN       dXh4lo, qXt0, #16               ;// h4
    842         VSHRN       dXh4hi, qXt1, #16
    843 
    844         VMULL       qXt0, dXi6lo, C                 ;// c*i6
    845         VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*h6
    846         VMULL       qXt1, dXi6hi, C
    847         VMLSL       qXt1, dXi4hi, S
    848         VSHRN       dXh6lo, qXt0, #16               ;// h6
    849         VSHRN       dXh6hi, qXt1, #16
    850 
    851         ;// IStage 2
    852         VSUB        qXg6, qXh6, qXh7
    853         VSUB        qXg5, qXh5, qXg6
    854         VSUB        qXg4, qXh4, qXg5
    855         VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
    856         VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
    857         VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
    858         VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
    859 
    860         ;// IStage 1 all rows
    861         VADD        qXf3, qXg3, qXg4
    862         VSUB        qXf4, qXg3, qXg4
    863         VADD        qXf2, qXg2, qXg5
    864         VSUB        qXf5, qXg2, qXg5
    865         VADD        qXf1, qXg1, qXg6
    866         VSUB        qXf6, qXg1, qXg6
    867         VADD        qXf0, qXg0, qXg7
    868         VSUB        qXf7, qXg0, qXg7
    869 
    870         ;// Transpose, store and loop
    871 XTR0            EQU Src5
    872 XTR1            EQU Tmp
    873 XTR2            EQU Src6
    874 XTR3            EQU Src7
    875 XTR4            EQU Src3
    876 XTR5            EQU Src0
    877 XTR6            EQU Src1
    878 XTR7            EQU Src2
    879 XTRt            EQU Src4
    880 
    881 qA0             QN  XTR0.S32  ;// for XTRpose
    882 qA1             QN  XTR1.S32
    883 qA2             QN  XTR2.S32
    884 qA3             QN  XTR3.S32
    885 qA4             QN  XTR4.S32
    886 qA5             QN  XTR5.S32
    887 qA6             QN  XTR6.S32
    888 qA7             QN  XTR7.S32
    889 
    890 dB0             DN  XTR0*2+1      ;// for using VSWP
    891 dB1             DN  XTR1*2+1
    892 dB2             DN  XTR2*2+1
    893 dB3             DN  XTR3*2+1
    894 dB4             DN  XTR4*2
    895 dB5             DN  XTR5*2
    896 dB6             DN  XTR6*2
    897 dB7             DN  XTR7*2
    898 
    899 
    900         VTRN        qXf0, qXf1
    901         VTRN        qXf2, qXf3
    902         VTRN        qXf4, qXf5
    903         VTRN        qXf6, qXf7
    904         VTRN        qA0, qA2
    905         VTRN        qA1, qA3
    906         VTRN        qA4, qA6
    907         VTRN        qA5, qA7
    908         VSWP        dB0, dB4
    909         VSWP        dB1, dB5
    910         VSWP        dB2, dB6
    911         VSWP        dB3, dB7
    912 
    913 
    914 qYj0            QN qXf0
    915 qYj1            QN qXf1
    916 qYj2            QN qXf2
    917 qYj3            QN qXf3
    918 qYj4            QN qXf4
    919 qYj5            QN qXf5
    920 qYj6            QN qXf6
    921 qYj7            QN qXf7
    922 qYjt            QN qXft
    923 
    924 dYj0lo          DN (XTR0*2).S16
    925 dYj0hi          DN (XTR0*2+1).S16
    926 dYj1lo          DN (XTR1*2).S16
    927 dYj1hi          DN (XTR1*2+1).S16
    928 dYj2lo          DN (XTR2*2).S16
    929 dYj2hi          DN (XTR2*2+1).S16
    930 dYj3lo          DN (XTR3*2).S16
    931 dYj3hi          DN (XTR3*2+1).S16
    932 dYj4lo          DN (XTR4*2).S16
    933 dYj4hi          DN (XTR4*2+1).S16
    934 dYj5lo          DN (XTR5*2).S16
    935 dYj5hi          DN (XTR5*2+1).S16
    936 dYj6lo          DN (XTR6*2).S16
    937 dYj6hi          DN (XTR6*2+1).S16
    938 dYj7lo          DN (XTR7*2).S16
    939 dYj7hi          DN (XTR7*2+1).S16
    940 dYjtlo          DN (XTRt*2).S16
    941 dYjthi          DN (XTRt*2+1).S16
    942 
    943 qYi0            QN qYj0
    944 qYi1            QN qYj4
    945 qYi2            QN qYj2
    946 qYi3            QN qYj7
    947 qYi4            QN qYj5
    948 qYi5            QN qYjt
    949 qYi6            QN qYj1
    950 qYi7            QN qYj6
    951 qYit            QN qYj3
    952 
    953 dYi0lo          DN dYj0lo
    954 dYi0hi          DN dYj0hi
    955 dYi1lo          DN dYj4lo
    956 dYi1hi          DN dYj4hi
    957 dYi2lo          DN dYj2lo
    958 dYi2hi          DN dYj2hi
    959 dYi3lo          DN dYj7lo
    960 dYi3hi          DN dYj7hi
    961 dYi4lo          DN dYj5lo
    962 dYi4hi          DN dYj5hi
    963 dYi5lo          DN dYjtlo
    964 dYi5hi          DN dYjthi
    965 dYi6lo          DN dYj1lo
    966 dYi6hi          DN dYj1hi
    967 dYi7lo          DN dYj6lo
    968 dYi7hi          DN dYj6hi
    969 dYitlo          DN dYj3lo
    970 dYithi          DN dYj3hi
    971 
    972 qYh0            QN qYit
    973 qYh1            QN qYi0
    974 qYh2            QN qYi2
    975 qYh3            QN qYi3
    976 qYh4            QN qYi7
    977 qYh5            QN qYi5
    978 qYh6            QN qYi4
    979 qYh7            QN qYi1
    980 qYht            QN qYi6
    981 
    982 dYh0lo          DN dYitlo
    983 dYh0hi          DN dYithi
    984 dYh1lo          DN dYi0lo
    985 dYh1hi          DN dYi0hi
    986 dYh2lo          DN dYi2lo
    987 dYh2hi          DN dYi2hi
    988 dYh3lo          DN dYi3lo
    989 dYh3hi          DN dYi3hi
    990 dYh4lo          DN dYi7lo
    991 dYh4hi          DN dYi7hi
    992 dYh5lo          DN dYi5lo
    993 dYh5hi          DN dYi5hi
    994 dYh6lo          DN dYi4lo
    995 dYh6hi          DN dYi4hi
    996 dYh7lo          DN dYi1lo
    997 dYh7hi          DN dYi1hi
    998 dYhtlo          DN dYi6lo
    999 dYhthi          DN dYi6hi
   1000 
   1001 qYg0            QN qYh2
   1002 qYg1            QN qYht
   1003 qYg2            QN qYh1
   1004 qYg3            QN qYh0
   1005 qYg4            QN qYh4
   1006 qYg5            QN qYh5
   1007 qYg6            QN qYh6
   1008 qYg7            QN qYh7
   1009 qYgt            QN qYh3
   1010 
   1011 qYf0            QN qYg6
   1012 qYf1            QN qYg5
   1013 qYf2            QN qYg4
   1014 qYf3            QN qYgt
   1015 qYf4            QN qYg3
   1016 qYf5            QN qYg2
   1017 qYf6            QN qYg1
   1018 qYf7            QN qYg0
   1019 qYft            QN qYg7
   1020 
   1021         VRSHR       qYj7, qYj7, #2
   1022         VRSHR       qYj6, qYj6, #1
   1023 
   1024         VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
   1025         VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
   1026         VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
   1027         VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
   1028         VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
   1029         VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
   1030 
   1031         VQRDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
   1032         ;// IStage 4,3 rows 0to1 x 1/2
   1033 
   1034         MOV         pTemp, #0x4             ;// ensure correct round
   1035         VDUP        qScale1, pTemp           ;// of DC result
   1036         VADD        qYi0, qYi0, qScale1
   1037 
   1038         VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
   1039         VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
   1040 
   1041         VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
   1042         VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
   1043         VSUB        qYh2, qYi2, qYi3        ;// h2, h3
   1044         VQRDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
   1045 
   1046         VMULL       qXt0, dYi4lo, C         ;// c*i4
   1047         VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
   1048         VMULL       qXt1, dYi4hi, C
   1049         VMLAL       qXt1, dYi6hi, S
   1050         VSHRN       dYh4lo, qXt0, #16       ;// h4
   1051         VSHRN       dYh4hi, qXt1, #16
   1052 
   1053         VMULL       qXt0, dYi6lo, C         ;// c*i6
   1054         VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*h6
   1055         VMULL       qXt1, dYi6hi, C
   1056         VMLSL       qXt1, dYi4hi, S
   1057         VSHRN       dYh6lo, qXt0, #16       ;// h6
   1058         VSHRN       dYh6hi, qXt1, #16
   1059 
   1060         VSUB        qYg6, qYh6, qYh7
   1061         VSUB        qYg5, qYh5, qYg6
   1062         VSUB        qYg4, qYh4, qYg5
   1063 
   1064         ;// IStage 2 rows 0to3 x 1/2
   1065         VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
   1066         VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
   1067         VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
   1068         VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
   1069 
   1070 
   1071         ;// IStage 1 all rows
   1072         VHADD        qYf3, qYg3, qYg4
   1073         VHSUB        qYf4, qYg3, qYg4
   1074         VHADD        qYf2, qYg2, qYg5
   1075         VHSUB        qYf5, qYg2, qYg5
   1076         VHADD        qYf1, qYg1, qYg6
   1077         VHSUB        qYf6, qYg1, qYg6
   1078         VHADD        qYf0, qYg0, qYg7
   1079         VHSUB        qYf7, qYg0, qYg7
   1080 
   1081 YTR0            EQU Src0
   1082 YTR1            EQU Src4
   1083 YTR2            EQU Src1
   1084 YTR3            EQU Src2
   1085 YTR4            EQU Src7
   1086 YTR5            EQU Src5
   1087 YTR6            EQU Tmp
   1088 YTR7            EQU Src6
   1089 YTRt            EQU Src3
   1090 
   1091 qC0             QN  YTR0.S32                ;// for YTRpose
   1092 qC1             QN  YTR1.S32
   1093 qC2             QN  YTR2.S32
   1094 qC3             QN  YTR3.S32
   1095 qC4             QN  YTR4.S32
   1096 qC5             QN  YTR5.S32
   1097 qC6             QN  YTR6.S32
   1098 qC7             QN  YTR7.S32
   1099 
   1100 dD0             DN  YTR0*2+1                ;// for using VSWP
   1101 dD1             DN  YTR1*2+1
   1102 dD2             DN  YTR2*2+1
   1103 dD3             DN  YTR3*2+1
   1104 dD4             DN  YTR4*2
   1105 dD5             DN  YTR5*2
   1106 dD6             DN  YTR6*2
   1107 dD7             DN  YTR7*2
   1108 
   1109         VTRN        qYf0, qYf1
   1110         VTRN        qYf2, qYf3
   1111         VTRN        qYf4, qYf5
   1112         VTRN        qYf6, qYf7
   1113         VTRN        qC0, qC2
   1114         VTRN        qC1, qC3
   1115         VTRN        qC4, qC6
   1116         VTRN        qC5, qC7
   1117         VSWP        dD0, dD4
   1118         VSWP        dD1, dD5
   1119         VSWP        dD2, dD6
   1120         VSWP        dD3, dD7
   1121 
   1122 
   1123 dYf0U8          DN YTR0*2.U8
   1124 dYf1U8          DN YTR1*2.U8
   1125 dYf2U8          DN YTR2*2.U8
   1126 dYf3U8          DN YTR3*2.U8
   1127 dYf4U8          DN YTR4*2.U8
   1128 dYf5U8          DN YTR5*2.U8
   1129 dYf6U8          DN YTR6*2.U8
   1130 dYf7U8          DN YTR7*2.U8
   1131 
   1132         ;//
   1133         ;// Do saturation if outsize is other than S16
   1134         ;//
   1135 
   1136         IF ("$outsize"="u8")
   1137             ;// Output range [0-255]
   1138             VQMOVN            dYf0U8, qYf0
   1139             VQMOVN            dYf1U8, qYf1
   1140             VQMOVN            dYf2U8, qYf2
   1141             VQMOVN            dYf3U8, qYf3
   1142             VQMOVN            dYf4U8, qYf4
   1143             VQMOVN            dYf5U8, qYf5
   1144             VQMOVN            dYf6U8, qYf6
   1145             VQMOVN            dYf7U8, qYf7
   1146         ENDIF
   1147 
   1148         IF ("$outsize"="s9")
   1149             ;// Output range [-256 to +255]
   1150             VQSHL            qYf0, qYf0, #16-9
   1151             VQSHL            qYf1, qYf1, #16-9
   1152             VQSHL            qYf2, qYf2, #16-9
   1153             VQSHL            qYf3, qYf3, #16-9
   1154             VQSHL            qYf4, qYf4, #16-9
   1155             VQSHL            qYf5, qYf5, #16-9
   1156             VQSHL            qYf6, qYf6, #16-9
   1157             VQSHL            qYf7, qYf7, #16-9
   1158 
   1159             VSHR             qYf0, qYf0, #16-9
   1160             VSHR             qYf1, qYf1, #16-9
   1161             VSHR             qYf2, qYf2, #16-9
   1162             VSHR             qYf3, qYf3, #16-9
   1163             VSHR             qYf4, qYf4, #16-9
   1164             VSHR             qYf5, qYf5, #16-9
   1165             VSHR             qYf6, qYf6, #16-9
   1166             VSHR             qYf7, qYf7, #16-9
   1167         ENDIF
   1168 
   1169         ;// Store output depending on the Stride size
   1170         IF "$stride"="s"
   1171             VST1        qYf0, [pDest @64], Stride
   1172             VST1        qYf1, [pDest @64], Stride
   1173             VST1        qYf2, [pDest @64], Stride
   1174             VST1        qYf3, [pDest @64], Stride
   1175             VST1        qYf4, [pDest @64], Stride
   1176             VST1        qYf5, [pDest @64], Stride
   1177             VST1        qYf6, [pDest @64], Stride
   1178             VST1        qYf7, [pDest @64]
   1179         ELSE
   1180             IF ("$outsize"="u8")
   1181                 VST1        dYf0U8, [pDest @64], #8
   1182                 VST1        dYf1U8, [pDest @64], #8
   1183                 VST1        dYf2U8, [pDest @64], #8
   1184                 VST1        dYf3U8, [pDest @64], #8
   1185                 VST1        dYf4U8, [pDest @64], #8
   1186                 VST1        dYf5U8, [pDest @64], #8
   1187                 VST1        dYf6U8, [pDest @64], #8
   1188                 VST1        dYf7U8, [pDest @64]
   1189             ELSE
   1190                 ;// ("$outsize"="s9") or ("$outsize"="s16")
   1191                 VST1        qYf0, [pDest @64], #16
   1192                 VST1        qYf1, [pDest @64], #16
   1193                 VST1        qYf2, [pDest @64], #16
   1194                 VST1        qYf3, [pDest @64], #16
   1195                 VST1        qYf4, [pDest @64], #16
   1196                 VST1        qYf5, [pDest @64], #16
   1197                 VST1        qYf6, [pDest @64], #16
   1198                 VST1        qYf7, [pDest @64]
   1199             ENDIF
   1200 
   1201         ENDIF
   1202 
   1203 
   1204 
   1205         ENDIF ;// CortexA8
   1206 
   1207 
   1208 
   1209         MEND
   1210 
    1211         ;// Scale TWO input rows with TWO rows of 16 bit scale values
    1212         ;//
    1213         ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
    1214         ;// input (eight S16 values each) with two rows of scale values. It also
    1215         ;// loads the next two rows of scale values from pScale when $LastRow is 0.
    1216         ;//
    1217         ;// Input Registers:
    1218         ;//
    1219         ;// $dAlo           - Input D register with first four S16 values of row n
    1220         ;// $dAhi           - Input D register with next four S16 values of row n
    1221         ;// $dBlo           - Input D register with first four S16 values of row n+1
    1222         ;// $dBhi           - Input D register with next four S16 values of row n+1
    1223         ;// pScale          - Pointer to the scale values of row n+2 (read only when $LastRow is 0)
    1224         ;// qT0lo           - Temporary scratch register (32 bit products, row n, first four)
    1225         ;// qT0hi           - Temporary scratch register (32 bit products, row n, next four)
    1226         ;// qT1lo           - Temporary scratch register (32 bit products, row n+1, first four)
    1227         ;// qT1hi           - Temporary scratch register (32 bit products, row n+1, next four)
    1228         ;// dScale1lo       - Scale values of row n (first four)
    1229         ;// dScale1hi       - Scale values of row n (next four)
    1230         ;// dScale2lo       - Scale values of row n+1 (first four)
    1231         ;// dScale2hi       - Scale values of row n+1 (next four)
    1232         ;//
    1233         ;// Input Flag
    1234         ;//
    1235         ;// $LastRow        - 0 if more rows follow (next scale values are loaded), anything else for the last row pair
    1236         ;//
    1237         ;// Output Registers:
    1238         ;//
    1239         ;// $dAlo           - Scaled output values (first four S16 of row n)
    1240         ;// $dAhi           - Scaled output values (next four S16 of row n)
    1241         ;// $dBlo           - Scaled output values (first four S16 of row n+1)
    1242         ;// $dBhi           - Scaled output values (next four S16 of row n+1)
    1243         ;// qScale1         - Scale values for row n+2 (only when $LastRow is 0)
    1244         ;// qScale2         - Scale values for row n+3 (only when $LastRow is 0)
    1245         ;// pScale          - Advanced by 32 bytes (two rows) when $LastRow is 0
    1246         ;//
    1247         MACRO
    1248         M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
    1249         VMULL       qT0lo, $dAlo, dScale1lo         ;// Widening multiply: row n, first four
    1250         VMULL       qT0hi, $dAhi, dScale1hi         ;// row n, next four
    1251         VMULL       qT1lo, $dBlo, dScale2lo         ;// row n+1, first four
    1252         VMULL       qT1hi, $dBhi, dScale2hi         ;// row n+1, next four
    1253         IF "$LastRow"="0"                           ;// Reload only after all four multiplies have read the old scales
    1254             VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2
    1255             VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
    1256         ENDIF
    1257         VQRSHRN       $dAlo, qT0lo, #12             ;// Rounding shift right by 12, saturate and narrow back in place
    1258         VQRSHRN       $dAhi, qT0hi, #12
    1259         VQRSHRN       $dBlo, qT1lo, #12
    1260         VQRSHRN       $dBhi, qT1hi, #12
    1261         MEND
   1262 
    1263         ;// Scale 8x8 block input values with 16 bit scale values
    1264         ;//
    1265         ;// This macro is used to pre-scale a block of 8x8 input values.
    1266         ;// It also performs the IStage 4 butterflies of the IDCT.
    1267         ;//
    1268         ;// Input Registers:
    1269         ;//
    1270         ;// dXjnlo          - n th input D register with first four S16 values
    1271         ;// dXjnhi          - n th input D register with next four S16 values
    1272         ;// qXjn            - n th input Q register with eight S16 values
    1273         ;// pScale          - Pointer to scale values (eight rows of 16 bytes are read)
    1274         ;//
    1275         ;// Output Registers:
    1276         ;//
    1277         ;// qXin            - n th output Q register with eight S16 output values of IStage 4
    1278         ;// dCoefs          - AAN constants loaded from armCOMM_IDCTCoef (pSrc is overwritten with that address)
    1279         MACRO
    1280         M_IDCT_PRESCALE16
    1281         VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
    1282         VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
    1283         M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1, load scales for row 2 & 3
    1284         M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3, load scales for row 4 & 5
    1285         M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5, load scales for row 6 & 7
    1286         M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7, last pair: no further load
    1287         VHADD       qXi5, qXj1, qXj7            ;// i5 = (j1+j7)/2
    1288         VSUB        qXi6, qXj1, qXj7            ;// i6 = j1-j7 (not halved here)
    1289         LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants (pSrc is reused as a scratch pointer)
    1290         VHADD       qXi3, qXj2, qXj6            ;// i3 = (j2+j6)/2
    1291         VSUB        qXi2, qXj2, qXj6            ;// i2 = j2-j6 (not halved here)
    1292         VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
    1293         VHADD       qXi7, qXj5, qXj3            ;// i7 = (j5+j3)/2
    1294         VSUB        qXi4, qXj5, qXj3            ;// i4 = j5-j3 (not halved here)
    1295         MEND
   1296 
   1297 
    1298         ;// Scale 8x8 block input values with 32 bit scale values
    1299         ;//
    1300         ;// This macro is used to pre-scale a block of 8x8 input values.
    1301         ;// It also performs the IStage 4 butterflies of the IDCT.
    1302         ;//
    1303         ;// Input Registers:
    1304         ;//
    1305         ;// dXjnlo          - n th input D register with first four S16 values
    1306         ;// dXjnhi          - n th input D register with next four S16 values
    1307         ;// qXjn            - n th input Q register with eight S16 values
    1308         ;// pScale          - Pointer to 32bit scale values in Q23 format (eight rows of 32 bytes)
    1309         ;//
    1310         ;// Output Registers:
    1311         ;//
    1312         ;// dXinlo          - n th output D register with first four S16 output values of IStage 4
    1313         ;// dXinhi          - n th output D register with next four S16 output values of IStage 4
    1314         ;// Also: dCoefs is loaded from armCOMM_IDCTCoef; pSrc and pTemp are overwritten; pScale ends at scale row 4
    1315         MACRO
    1316         M_IDCT_PRESCALE32
    1317 qScale0lo       QN 0.S32                        ;// Q0-Q1 hold scale rows 0, 7, 6 and 5 in turn
    1318 qScale0hi       QN 1.S32
    1319 qScale1lo       QN 2.S32                        ;// Q2-Q3 hold scale rows 1, 2, 3 and 4 in turn
    1320 qScale1hi       QN 3.S32
    1321 qScale2lo       QN qScale1lo
    1322 qScale2hi       QN qScale1hi
    1323 qScale3lo       QN qScale1lo
    1324 qScale3hi       QN qScale1hi
    1325 qScale4lo       QN qScale1lo
    1326 qScale4hi       QN qScale1hi
    1327 qScale5lo       QN qScale0lo
    1328 qScale5hi       QN qScale0hi
    1329 qScale6lo       QN qScale0lo
    1330 qScale6hi       QN qScale0hi
    1331 qScale7lo       QN qScale0lo
    1332 qScale7hi       QN qScale0hi
    1333 
    1334 qSrc0lo         QN 4.S32                        ;// Q4-Q5 hold widened source rows 0, 7, 2, 3 and 4 in turn
    1335 qSrc0hi         QN 5.S32
    1336 qSrc1lo         QN 6.S32                        ;// Q6 and Q<Src4> hold widened source rows 1, 6 and 5 in turn
    1337 qSrc1hi         QN Src4.S32                     ;// Src4 is a register number defined outside this macro
    1338 qSrc2lo         QN qSrc0lo
    1339 qSrc2hi         QN qSrc0hi
    1340 qSrc3lo         QN qSrc0lo
    1341 qSrc3hi         QN qSrc0hi
    1342 qSrc4lo         QN qSrc0lo
    1343 qSrc4hi         QN qSrc0hi
    1344 qSrc5lo         QN qSrc1lo
    1345 qSrc5hi         QN qSrc1hi
    1346 qSrc6lo         QN qSrc1lo
    1347 qSrc6hi         QN qSrc1hi
    1348 qSrc7lo         QN qSrc0lo
    1349 qSrc7hi         QN qSrc0hi
    1350 
    1351 qRes17lo        QN qScale0lo                    ;// Butterfly results reuse Q0-Q1, i.e. they overwrite the
    1352 qRes17hi        QN qScale0hi                    ;// scale row most recently held there once it has been used
    1353 qRes26lo        QN qScale0lo
    1354 qRes26hi        QN qScale0hi
    1355 qRes53lo        QN qScale0lo
    1356 qRes53hi        QN qScale0hi
    1357 
    1358             ADD         pTemp, pScale, #4*8*7           ;// Address of scale row 7 (each row is 8 x 4 bytes)
    1359 
    1360             ;// Row 0 (scaling of rows 1 & 7 is interleaved here)
    1361             VLD1        {qScale0lo, qScale0hi}, [pScale]!   ;// Scale row 0, pScale += 32
    1362             VSHLL       qSrc0lo, dXj0lo, #(12-1)            ;// Widen to 32 bit and shift left by 11
    1363             VSHLL       qSrc0hi, dXj0hi, #(12-1)
    1364             VLD1        {qScale1lo, qScale1hi}, [pScale]!   ;// Scale row 1, pScale += 32
    1365             VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo         ;// Saturated (2*scale*src + 2^31) >> 32
    1366             VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
    1367             VLD1        {qScale7lo, qScale7hi}, [pTemp]!    ;// Scale row 7 replaces row 0 in Q0-Q1, pTemp += 32
    1368             VSHLL       qSrc1lo, dXj1lo, #(12-1)
    1369             VSHLL       qSrc1hi, dXj1hi, #(12-1)
    1370             VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
    1371             VMOVN       dXi0hi, qSrc0hi
    1372             VSHLL       qSrc7lo, dXj7lo, #(12-1)            ;// Reuses Q4-Q5 now that i0 has been narrowed out
    1373             VSHLL       qSrc7hi, dXj7hi, #(12-1)
    1374             SUB         pTemp, pTemp, #((16*2)+(4*8*1))     ;// Undo the 32 byte writeback and step back one row: scale row 6
    1375             VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
    1376             VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
    1377             VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
    1378             VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
    1379             VLD1        {qScale2lo, qScale2hi}, [pScale]!   ;// Scale row 2 replaces row 1 in Q2-Q3, pScale += 32
    1380 
    1381             ;// Row 1 & 7 (scaling of rows 2 & 6 is interleaved here)
    1382             VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
    1383             VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
    1384             VMOVN       dXi5lo, qRes17lo                ;// Output i5
    1385             VMOVN       dXi5hi, qRes17hi
    1386             VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
    1387             VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
    1388             VMOVN       dXi6lo, qRes17lo                ;// Output i6
    1389             VMOVN       dXi6hi, qRes17hi
    1390             VSHLL       qSrc2lo, dXj2lo, #(12-1)
    1391             VSHLL       qSrc2hi, dXj2hi, #(12-1)
    1392             VLD1        {qScale6lo, qScale6hi}, [pTemp]!    ;// Scale row 6 into Q0-Q1, pTemp += 32
    1393             VSHLL       qSrc6lo, dXj6lo, #(12-1)
    1394             VSHLL       qSrc6hi, dXj6hi, #(12-1)
    1395             SUB         pTemp, pTemp, #((16*2)+(4*8*1))     ;// Undo the 32 byte writeback and step back one row: scale row 5
    1396             VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
    1397             VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
    1398             VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
    1399             VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
    1400             VLD1        {qScale3lo, qScale3hi}, [pScale]!   ;// Scale row 3 replaces row 2 in Q2-Q3, pScale += 32
    1401 
    1402             ;// Row 2 & 6 (scaling of rows 3 & 5 is interleaved here)
    1403             VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
    1404             VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
    1405             VMOVN       dXi3lo, qRes26lo                ;// Output i3
    1406             VMOVN       dXi3hi, qRes26hi
    1407             VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
    1408             VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
    1409             VMOVN       dXi2lo, qRes26lo                ;// Output i2
    1410             VMOVN       dXi2hi, qRes26hi
    1411             VSHLL       qSrc3lo, dXj3lo, #(12-1)
    1412             VSHLL       qSrc3hi, dXj3hi, #(12-1)
    1413             VLD1        {qScale5lo, qScale5hi}, [pTemp]!    ;// Scale row 5 into Q0-Q1 (last use of pTemp)
    1414             VSHLL       qSrc5lo, dXj5lo, #(12-1)
    1415             VSHLL       qSrc5hi, dXj5hi, #(12-1)
    1416             VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
    1417             VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
    1418             VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
    1419             VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
    1420 
    1421             ;// Row 3 & 5 (reload and scaling of row 4 is interleaved here)
    1422             VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
    1423             VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
    1424             SUB         pSrc, pSrc, #16*2*2             ;// Step pSrc back 64 bytes; NOTE(review): assumes pSrc is 128 bytes past the block start so this is row 4 - confirm against caller
    1425             VMOVN       dXi7lo, qRes53lo                ;// Output i7
    1426             VMOVN       dXi7hi, qRes53hi
    1427             VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
    1428             VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
    1429             VLD1        qXj4, [pSrc @64]                ;// Reload input row 4; NOTE(review): presumably its register was reused as scratch above (qSrc1hi is Src4) - confirm
    1430             VMOVN       dXi4lo, qRes53lo                ;// Output i4
    1431             VMOVN       dXi4hi, qRes53hi
    1432             VSHLL       qSrc4lo, dXj4lo, #(12-1)
    1433             VSHLL       qSrc4hi, dXj4hi, #(12-1)
    1434             VLD1        {qScale4lo, qScale4hi}, [pScale]    ;// Scale row 4 into Q2-Q3, no writeback
    1435             LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
    1436             VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
    1437             VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
    1438             VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
    1439             ;// Row 4
    1440             VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
    1441             VMOVN       dXi1hi, qSrc4hi
    1442 
    1443         MEND
   1444 
   1445         END
   1446