Home | History | Annotate | Download | only in api
      1 ;//
      2 ;// This confidential and proprietary software may be used only as
      3 ;// authorised by a licensing agreement from ARM Limited
      4 ;//   (C) COPYRIGHT 2004 ARM Limited
      5 ;//       ALL RIGHTS RESERVED
      6 ;// The entire notice above must be reproduced on all authorised
      7 ;// copies and copies may only be made to the extent permitted
      8 ;// by a licensing agreement from ARM Limited.
      9 ;//
     10 ;// IDCT_s.s
     11 ;//
     12 ;// Inverse DCT module
     13 ;//
     14 ;//
     15 ;// ALGORITHM DESCRIPTION
     16 ;//
     17 ;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
     18 ;// column and then a 1D IDCT for each row.
     19 ;//
     20 ;// The 8-point 1D IDCT is defined by
     21 ;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
     22 ;//
     23 ;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
     24 ;//   c(u,x) = cos( (2x+1)*u*pi/16 )
     25 ;//
     26 ;// We compute the 8-point 1D IDCT using the reverse of
     27 ;// the Arai-Agui-Nakajima flow graph which we split into
     28 ;// 5 stages named in reverse order to identify with the
     29 ;// forward DCT. Direct inversion of the forward formulae
     30 ;// in file FDCT_s.s gives:
     31 ;//
     32 ;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
     33 ;//             [ A(0) = 2*sqrt(2)
     34 ;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
     35 ;//
     36 ;// IStage 4:   i0 = j0             i1 = j4
     37 ;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
     38 ;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
     39 ;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
     40 ;//
     41 ;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
     42 ;//             h2 = (i2*sqrt2)-i3  h3 = i3
     43 ;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
     44 ;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
     45 ;//             [ The above two lines rotate by -(pi/8) ]
     46 ;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
     47 ;//
     48 ;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
     49 ;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
     50 ;//             g7 = h7             g6 = h6 - h7
     51 ;//             g5 = h5 - g6        g4 = h4 - g5
     52 ;//
     53 ;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
     54 ;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
     55 ;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
     56 ;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
     57 ;//
     58 ;// Note that most coefficients are halved 3 times during the
     59 ;// above calculation. We can rescale the algorithm dividing
     60 ;// the input by 8 to remove the halvings.
     61 ;//
     62 ;// IStage 5:   j(u) = T(u)*A(u)/8
     63 ;//
     64 ;// IStage 4:   i0 = j0             i1 = j4
     65 ;//             i3 = j2 + j6        i2 = j2 - j6
     66 ;//             i7 = j5 + j3        i4 = j5 - j3
     67 ;//             i5 = j1 + j7        i6 = j1 - j7
     68 ;//
     69 ;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
     70 ;//             h2 = (i2*sqrt2)-i3  h3 = i3
     71 ;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
     72 ;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
     73 ;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
     74 ;//
     75 ;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
     76 ;//             g1 = h1 + h2        g2 = h1 - h2
     77 ;//             g7 = h7             g6 = h6 - h7
     78 ;//             g5 = h5 - g6        g4 = h4 - g5
     79 ;//
     80 ;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
     81 ;//             f1 = g1 + g6        f6 = g1 - g6
     82 ;//             f2 = g2 + g5        f5 = g2 - g5
     83 ;//             f3 = g3 + g4        f4 = g3 - g4
     84 ;//
     85 ;// Note:
     86 ;// 1. The scaling by A(u)/8 can often be combined with inverse
     87 ;//    quantization. The column and row scalings can be combined.
     88 ;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
     89 ;//    to the above code but is otherwise identical.
     90 ;// 3. The rotation by -pi/8 can be performed using three multiplies
     91 ;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
     92 ;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
     93 ;// 4. If |T(u)|<=1 then from the IDCT definition,
     94 ;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
     95 ;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
     96 ;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
     97 ;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
     98 ;//            = (approx)2.64
     99 ;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
    100 ;//    The table below shows input patterns generating the maximum
    101 ;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
    102 ;//    InputPattern      Max |f(x)|
    103 ;//      PPPPPPPP        |f0| =  2.64
    104 ;//      PPPMMMMM        |f1| =  2.64
    105 ;//      PPMMMPPP        |f2| =  2.64
    106 ;//      PPMMPPMM        |f3| =  2.64
    107 ;//      PMMPPMMP        |f4| =  2.64
    108 ;//      PMMPMMPM        |f5| =  2.64
    109 ;//      PMPPMPMP        |f6| =  2.64
    110 ;//      PMPMPMPM        |f7| =  2.64
    111 ;//   Note that this input pattern is the transpose of the
    112 ;//   corresponding max input pattern for the FDCT.
    113 
    114 ;// Arguments
    115 
    116 pSrc    RN 0    ;// r0: source data buffer (pointer to input data)
    117 Stride  RN 1    ;// r1: destination stride in bytes; r1 is also ra01 in the ARM1136JS path, which saves it to pStride first when $stride="s"
    118 pDest   RN 2    ;// r2: destination data buffer; the ARM1136JS path saves it to ppDest and points r2 at pBlk for the column pass
    119 pScale  RN 3    ;// r3: pointer to scaling table, in the format selected by $inscale
    120 
    121 
    122         ;// DCT Inverse Macro
    123         ;// The DCT code should be parametrized according
    124         ;// to the following inputs:
    125         ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
    126         ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
    127         ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
    128         ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
    129         ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
    130         ;//
    131         ;// Inputs:
    132         ;// pSrc   = r0 = Pointer to input data
    133         ;//               Range is -256 to +255 (9-bit)
    134         ;// Stride = r1 = Stride between output (destination) lines, in bytes
    135         ;// pDest  = r2 = Pointer to output data
    136         ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
    137 
    138 
    139 
    140         MACRO
    141         M_IDCT  $outsize, $inscale, $stride
    142         LCLA    SHIFT
    143 
    144 
    145         IF ARM1136JS
    146 
    147 ;// REGISTER ALLOCATION
    148 ;// This is hard since we have 8 values, 9 free registers and each
    149 ;// butterfly requires a temporary register. We also want to
    150 ;// maintain register order so we can use LDM/STM. The table below
    151 ;// summarises the register allocation that meets all these criteria.
    152 ;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
    153 ;//
    154 ;// r1  a01     g0  h0
    155 ;// r4  b01 f0  g1  h1  i0
    156 ;// r5  a23 f1  g2      i1
    157 ;// r6  b23 f2  g3  h2  i2
    158 ;// r7  a45 f3      h3  i3
    159 ;// r8  b45 f4  g4  h4  i4
    160 ;// r9  a67 f5  g5  h5  i5
    161 ;// r10 b67 f6  g6  h6  i6
    162 ;// r11     f7  g7  h7  i7
    163 ;//
    164 ra01    RN 1
    165 rb01    RN 4
    166 ra23    RN 5
    167 rb23    RN 6
    168 ra45    RN 7
    169 rb45    RN 8
    170 ra67    RN 9
    171 rb67    RN 10
    172 rtmp    RN 11
    173 csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
    174 LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
    175 ;// Transpose allocation
    176 xft     RN ra01
    177 xf0     RN rb01
    178 xf1     RN ra23
    179 xf2     RN rb23
    180 xf3     RN ra45
    181 xf4     RN rb45
    182 xf5     RN ra67
    183 xf6     RN rb67
    184 xf7     RN rtmp
    185 ;// IStage 1 allocation
    186 xg0     RN xft
    187 xg1     RN xf0
    188 xg2     RN xf1
    189 xg3     RN xf2
    190 xgt     RN xf3
    191 xg4     RN xf4
    192 xg5     RN xf5
    193 xg6     RN xf6
    194 xg7     RN xf7
    195 ;// IStage 2 allocation
    196 xh0     RN xg0
    197 xh1     RN xg1
    198 xht     RN xg2
    199 xh2     RN xg3
    200 xh3     RN xgt
    201 xh4     RN xg4
    202 xh5     RN xg5
    203 xh6     RN xg6
    204 xh7     RN xg7
    205 ;// IStage 3,4 allocation
    206 xit     RN xh0
    207 xi0     RN xh1
    208 xi1     RN xht
    209 xi2     RN xh2
    210 xi3     RN xh3
    211 xi4     RN xh4
    212 xi5     RN xh5
    213 xi6     RN xh6
    214 xi7     RN xh7
    215 
    216         M_STR   pDest,  ppDest
    217         IF "$stride"="s"
    218             M_STR   Stride, pStride
    219         ENDIF
    220         M_ADR   pDest,  pBlk
    221         LDR     csPiBy8, =0x30fc7642
    222         LDR     LoopRR2, =0x00005a82
    223 
    224 v6_idct_col$_F
    225         ;// Load even values
    226         LDR     xi4, [pSrc], #4  ;// j0
    227         LDR     xi5, [pSrc, #4*16-4]  ;// j4
    228         LDR     xi6, [pSrc, #2*16-4]  ;// j2
    229         LDR     xi7, [pSrc, #6*16-4]  ;// j6
    230 
    231         ;// Scale Even Values
    232         IF "$inscale"="s16" ;// 16x16 mul
    233 SHIFT       SETA    12
    234             LDR     xi0, [pScale], #4
    235             LDR     xi1, [pScale, #4*16-4]
    236             LDR     xi2, [pScale, #2*16-4]
    237             MOV     xit, #1<<(SHIFT-1)
    238             SMLABB  xi3, xi0, xi4, xit
    239             SMLATT  xi4, xi0, xi4, xit
    240             SMLABB  xi0, xi1, xi5, xit
    241             SMLATT  xi5, xi1, xi5, xit
    242             MOV     xi3, xi3, ASR #SHIFT
    243             PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
    244             LDR     xi3, [pScale, #6*16-4]
    245             SMLABB  xi1, xi2, xi6, xit
    246             SMLATT  xi6, xi2, xi6, xit
    247             MOV     xi0, xi0, ASR #SHIFT
    248             PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
    249             SMLABB  xi2, xi3, xi7, xit
    250             SMLATT  xi7, xi3, xi7, xit
    251             MOV     xi1, xi1, ASR #SHIFT
    252             PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
    253             MOV     xi2, xi2, ASR #SHIFT
    254             PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
    255         ENDIF
    256         IF "$inscale"="s32" ;// 32x16 mul
    257 SHIFT       SETA    (12+8-16)
    258             MOV     xit, #1<<(SHIFT-1)
    259             LDR     xi0, [pScale], #8
    260             LDR     xi1, [pScale, #0*32+4-8]
    261             LDR     xi2, [pScale, #4*32-8]
    262             LDR     xi3, [pScale, #4*32+4-8]
    263             SMLAWB  xi0, xi0, xi4, xit
    264             SMLAWT  xi1, xi1, xi4, xit
    265             SMLAWB  xi2, xi2, xi5, xit
    266             SMLAWT  xi3, xi3, xi5, xit
    267             MOV     xi0, xi0, ASR #SHIFT
    268             PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
    269             MOV     xi2, xi2, ASR #SHIFT
    270             PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
    271             LDR     xi0, [pScale, #2*32-8]
    272             LDR     xi1, [pScale, #2*32+4-8]
    273             LDR     xi2, [pScale, #6*32-8]
    274             LDR     xi3, [pScale, #6*32+4-8]
    275             SMLAWB  xi0, xi0, xi6, xit
    276             SMLAWT  xi1, xi1, xi6, xit
    277             SMLAWB  xi2, xi2, xi7, xit
    278             SMLAWT  xi3, xi3, xi7, xit
    279             MOV     xi0, xi0, ASR #SHIFT
    280             PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
    281             MOV     xi2, xi2, ASR #SHIFT
    282             PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
    283         ENDIF
    284 
    285         ;// Load odd values
    286         LDR     xi0, [pSrc, #1*16-4]      ;// j1
    287         LDR     xi1, [pSrc, #7*16-4]      ;// j7
    288         LDR     xi2, [pSrc, #5*16-4]      ;// j5
    289         LDR     xi3, [pSrc, #3*16-4]      ;// j3
    290 
    291         IF  {TRUE}
    292             ;// shortcut if odd values 0
    293             TEQ     xi0, #0
    294             TEQEQ   xi1, #0
    295             TEQEQ   xi2, #0
    296             TEQEQ   xi3, #0
    297             BEQ     v6OddZero$_F
    298         ENDIF
    299 
    300         ;// Store scaled even values
    301         STMIA   pDest, {xi4, xi5, xi6, xi7}
    302 
    303         ;// Scale odd values
    304         IF "$inscale"="s16"
    305             ;// Perform AAN Scale
    306             LDR     xi4, [pScale, #1*16-4]
    307             LDR     xi5, [pScale, #7*16-4]
    308             LDR     xi6, [pScale, #5*16-4]
    309             SMLABB  xi7, xi0, xi4, xit
    310             SMLATT  xi0, xi0, xi4, xit
    311             SMLABB  xi4, xi1, xi5, xit
    312             SMLATT  xi1, xi1, xi5, xit
    313             MOV     xi7, xi7, ASR #SHIFT
    314             PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
    315             LDR     xi7, [pScale, #3*16-4]
    316             SMLABB  xi5, xi2, xi6, xit
    317             SMLATT  xi2, xi2, xi6, xit
    318             MOV     xi4, xi4, ASR #SHIFT
    319             PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
    320             SMLABB  xi6, xi3, xi7, xit
    321             SMLATT  xi3, xi3, xi7, xit
    322             MOV     xi5, xi5, ASR #SHIFT
    323             PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
    324             MOV     xi6, xi6, ASR #SHIFT
    325             PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
    326         ENDIF
    327         IF "$inscale"="s32" ;// 32x16 mul
    328             LDR     xi4, [pScale, #1*32-8]
    329             LDR     xi5, [pScale, #1*32+4-8]
    330             LDR     xi6, [pScale, #7*32-8]
    331             LDR     xi7, [pScale, #7*32+4-8]
    332             SMLAWB  xi4, xi4, xi0, xit
    333             SMLAWT  xi5, xi5, xi0, xit
    334             SMLAWB  xi6, xi6, xi1, xit
    335             SMLAWT  xi7, xi7, xi1, xit
    336             MOV     xi4, xi4, ASR #SHIFT
    337             PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
    338             MOV     xi6, xi6, ASR #SHIFT
    339             PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
    340             LDR     xi4, [pScale, #5*32-8]
    341             LDR     xi5, [pScale, #5*32+4-8]
    342             LDR     xi6, [pScale, #3*32-8]
    343             LDR     xi7, [pScale, #3*32+4-8]
    344             SMLAWB  xi4, xi4, xi2, xit
    345             SMLAWT  xi5, xi5, xi2, xit
    346             SMLAWB  xi6, xi6, xi3, xit
    347             SMLAWT  xi7, xi7, xi3, xit
    348             MOV     xi4, xi4, ASR #SHIFT
    349             PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
    350             MOV     xi6, xi6, ASR #SHIFT
    351             PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
    352         ENDIF
    353 
    354         LDR     xit, =0x00010001        ;// rounding constant
    355         SADD16 xi5, xi0, xi1           ;// (j1+j7)/2
    356         SHADD16 xi5, xi5, xit
    357 
    358         SSUB16  xi6, xi0, xi1           ;// j1-j7
    359         SADD16 xi7, xi2, xi3           ;// (j5+j3)/2
    360         SHADD16 xi7, xi7, xit
    361 
    362         SSUB16  xi4, xi2, xi3           ;// j5-j3
    363 
    364         SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
    365 
    366         PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
    367         PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
    368 
    369         SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
    370         SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
    371         SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
    372         SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
    373 
    374         SMULBB  xi1, xi3, LoopRR2
    375         SMULTB  xi3, xi3, LoopRR2
    376 
    377         PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
    378         PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
    379         SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
    380 
    381         ;// xi0,xi1,xi2,xi3 now free
    382         ;// IStage 4,3, rows 2to3 x1/2
    383 
    384         MOV     xi3, xi3, LSL #1
    385         PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
    386         LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
    387 
    388         ;// IStage 2, rows4to7
    389         SSUB16  xg6, xh6, xh7
    390         SSUB16  xg5, xh5, xg6
    391         SSUB16  xg4, xh4, xg5
    392 
    393         SSUB16  xi2, xi0, xi1           ;// (j2-j6)
    394 
    395         SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
    396 
    397         SMULBB  xi0, xi2, LoopRR2
    398         SMULTB  xi2, xi2, LoopRR2
    399 
    400         MOV     xi2, xi2, LSL #1
    401         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    402 
    403         ;// xi0, xi1 now free
    404         ;// IStage 4,3 rows 0to1 x 1/2
    405         LDRD    xi0, [pDest]            ;// j0, j4 scaled
    406         SSUB16  xh2, xh2, xi3
    407         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    408 
    409         SHADD16 xh0, xi0, xi1
    410         SHSUB16 xh1, xi0, xi1
    411 
    412         ;// IStage 2 rows 0to3 x 1/2
    413         SHSUB16 xg2, xh1, xh2
    414         SHADD16 xg1, xh1, xh2
    415         SHSUB16 xg3, xh0, xh3
    416         SHADD16 xg0, xh0, xh3
    417 
    418         ;// IStage 1 all rows
    419         SADD16  xf3, xg3, xg4
    420         SSUB16  xf4, xg3, xg4
    421         SADD16  xf2, xg2, xg5
    422         SSUB16  xf5, xg2, xg5
    423         SADD16  xf1, xg1, xg6
    424         SSUB16  xf6, xg1, xg6
    425         SADD16  xf0, xg0, xg7
    426         SSUB16  xf7, xg0, xg7
    427 
    428         ;// Transpose, store and loop
    429         PKHBT   ra01, xf0, xf1, LSL #16
    430         PKHTB   rb01, xf1, xf0, ASR #16
    431 
    432         PKHBT   ra23, xf2, xf3, LSL #16
    433         PKHTB   rb23, xf3, xf2, ASR #16
    434 
    435         PKHBT   ra45, xf4, xf5, LSL #16
    436         PKHTB   rb45, xf5, xf4, ASR #16
    437 
    438         PKHBT   ra67, xf6, xf7, LSL #16
    439         STMIA   pDest!, {ra01, ra23, ra45, ra67}
    440         PKHTB   rb67, xf7, xf6, ASR #16
    441         STMIA   pDest!, {rb01, rb23, rb45, rb67}
    442         BCC     v6_idct_col$_F
    443 
    444         SUB     pSrc, pDest, #(64*2)
    445         M_LDR   pDest, ppDest
    446         IF "$stride"="s"
    447             M_LDR   pScale, pStride
    448         ENDIF
    449         B       v6_idct_row$_F
    450 
    451 v6OddZero$_F
    452         SSUB16  xi2, xi6, xi7           ;// (j2-j6)
    453         SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
    454 
    455         SMULBB  xi0, xi2, LoopRR2
    456         SMULTB  xi2, xi2, LoopRR2
    457 
    458         MOV     xi2, xi2, LSL #1
    459         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    460         SSUB16  xh2, xh2, xi3
    461 
    462         ;// xi0, xi1 now free
    463         ;// IStage 4,3 rows 0to1 x 1/2
    464 
    465         SHADD16 xh0, xi4, xi5
    466         SHSUB16 xh1, xi4, xi5
    467 
    468         ;// IStage 2 rows 0to3 x 1/2
    469         SHSUB16 xg2, xh1, xh2
    470         SHADD16 xg1, xh1, xh2
    471         SHSUB16 xg3, xh0, xh3
    472         SHADD16 xg0, xh0, xh3
    473 
    474         ;// IStage 1 all rows
    475         MOV  xf3, xg3
    476         MOV  xf4, xg3
    477         MOV  xf2, xg2
    478         MOV  xf5, xg2
    479         MOV  xf1, xg1
    480         MOV  xf6, xg1
    481         MOV  xf0, xg0
    482         MOV  xf7, xg0
    483 
    484         ;// Transpose
    485         PKHBT   ra01, xf0, xf1, LSL #16
    486         PKHTB   rb01, xf1, xf0, ASR #16
    487 
    488         PKHBT   ra23, xf2, xf3, LSL #16
    489         PKHTB   rb23, xf3, xf2, ASR #16
    490 
    491         PKHBT   ra45, xf4, xf5, LSL #16
    492         PKHTB   rb45, xf5, xf4, ASR #16
    493 
    494         PKHBT   ra67, xf6, xf7, LSL #16
    495         PKHTB   rb67, xf7, xf6, ASR #16
    496 
    497         STMIA   pDest!, {ra01, ra23, ra45, ra67}
    498         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    499         STMIA   pDest!, {rb01, rb23, rb45, rb67}
    500 
    501         BCC     v6_idct_col$_F
    502         SUB     pSrc, pDest, #(64*2)
    503         M_LDR   pDest, ppDest
    504         IF "$stride"="s"
    505             M_LDR   pScale, pStride
    506         ENDIF
    507 
    508 
    509 v6_idct_row$_F
    510         ;// IStage 4,3, rows4to7 x1/4
    511         LDR     xit, =0x00010001        ;// rounding constant
    512         LDR     xi0, [pSrc, #1*16]      ;// j1
    513         LDR     xi1, [pSrc, #7*16]      ;// 4*j7
    514         LDR     xi2, [pSrc, #5*16]      ;// j5
    515         LDR     xi3, [pSrc, #3*16]      ;// j3
    516 
    517         SHADD16 xi1, xi1, xit           ;// 2*j7
    518         SHADD16 xi1, xi1, xit           ;// j7
    519 
    520         SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
    521         SSUB16  xi6, xi0, xi1           ;// j1-j7
    522         SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
    523         SSUB16  xi4, xi2, xi3           ;// j5-j3
    524 
    525         SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
    526 
    527         PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
    528         PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
    529 
    530         SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
    531         SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
    532         SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
    533         SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
    534 
    535         SMULBB  xi1, xi3, LoopRR2
    536         SMULTB  xi3, xi3, LoopRR2
    537 
    538         PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
    539         PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
    540         SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
    541 
    542         MOV     xi3, xi3, LSL #1
    543         PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
    544 
    545         ;// xi0,xi1,xi2,xi3 now free
    546         ;// IStage 4,3, rows 2to3 x1/2
    547 
    548         LDR     xi0, [pSrc, #2*16]      ;// j2
    549         LDR     xi1, [pSrc, #6*16]      ;// 2*j6
    550 
    551         ;// IStage 2, rows4to7
    552         SSUB16  xg6, xh6, xh7
    553         SSUB16  xg5, xh5, xg6
    554         SSUB16  xg4, xh4, xg5
    555 
    556         SHADD16 xi1, xi1, xit           ;// j6
    557         SSUB16  xi2, xi0, xi1           ;// (j2-j6)
    558         SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
    559 
    560         SMULBB  xi0, xi2, LoopRR2
    561         SMULTB  xi2, xi2, LoopRR2
    562 
    563         MOV     xi2, xi2, LSL #1
    564 
    565         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    566 
    567         ;// xi0, xi1 now free
    568         ;// IStage 4,3 rows 0to1 x 1/2
    569         LDR     xi1, [pSrc, #4*16]      ;// j4
    570         LDR     xi0, [pSrc], #4         ;// j0
    571 
    572         SSUB16  xh2, xh2, xi3
    573         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    574 
    575         ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
    576         SHADD16 xh0, xi0, xi1           ;// of DC result
    577         SHSUB16 xh1, xi0, xi1
    578 
    579         ;// IStage 2 rows 0to3 x 1/2
    580         SHSUB16 xg2, xh1, xh2
    581         SHADD16 xg1, xh1, xh2
    582         SHSUB16 xg3, xh0, xh3
    583         SHADD16 xg0, xh0, xh3
    584 
    585         ;// IStage 1 all rows
    586         SHADD16 xf3, xg3, xg4
    587         SHSUB16 xf4, xg3, xg4
    588         SHADD16 xf2, xg2, xg5
    589         SHSUB16 xf5, xg2, xg5
    590         SHADD16 xf1, xg1, xg6
    591         SHSUB16 xf6, xg1, xg6
    592         SHADD16 xf0, xg0, xg7
    593         SHSUB16 xf7, xg0, xg7
    594 
    595         ;// Saturate
    596         IF ("$outsize"="u8")
    597             USAT16  xf0, #8, xf0
    598             USAT16  xf1, #8, xf1
    599             USAT16  xf2, #8, xf2
    600             USAT16  xf3, #8, xf3
    601             USAT16  xf4, #8, xf4
    602             USAT16  xf5, #8, xf5
    603             USAT16  xf6, #8, xf6
    604             USAT16  xf7, #8, xf7
    605         ENDIF
    606         IF ("$outsize"="s9")
    607             SSAT16  xf0, #9, xf0
    608             SSAT16  xf1, #9, xf1
    609             SSAT16  xf2, #9, xf2
    610             SSAT16  xf3, #9, xf3
    611             SSAT16  xf4, #9, xf4
    612             SSAT16  xf5, #9, xf5
    613             SSAT16  xf6, #9, xf6
    614             SSAT16  xf7, #9, xf7
    615         ENDIF
    616 
    617         ;// Transpose to Row, Pack and store
    618         IF ("$outsize"="u8")
    619             ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
    620             ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
    621             ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
    622             ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
    623             PKHBT   ra01, xf0, xf2, LSL #16
    624             PKHTB   rb01, xf2, xf0, ASR #16
    625             PKHBT   ra23, xf4, xf6, LSL #16
    626             PKHTB   rb23, xf6, xf4, ASR #16
    627             STMIA   pDest, {ra01, ra23}
    628             IF "$stride"="s"
    629                 ADD     pDest, pDest, pScale
    630                 STMIA   pDest, {rb01, rb23}
    631                 ADD     pDest, pDest, pScale
    632             ELSE
    633                 ADD     pDest, pDest, #($stride)
    634                 STMIA   pDest, {rb01, rb23}
    635                 ADD     pDest, pDest, #($stride)
    636             ENDIF
    637         ENDIF
    638         IF ("$outsize"="s9"):LOR:("$outsize"="s16")
    639             PKHBT   ra01, xf0, xf1, LSL #16
    640             PKHTB   rb01, xf1, xf0, ASR #16
    641 
    642             PKHBT   ra23, xf2, xf3, LSL #16
    643             PKHTB   rb23, xf3, xf2, ASR #16
    644 
    645             PKHBT   ra45, xf4, xf5, LSL #16
    646             PKHTB   rb45, xf5, xf4, ASR #16
    647 
    648             PKHBT   ra67, xf6, xf7, LSL #16
    649             PKHTB   rb67, xf7, xf6, ASR #16
    650 
    651             STMIA   pDest, {ra01, ra23, ra45, ra67}
    652             IF "$stride"="s"
    653                 ADD     pDest, pDest, pScale
    654                 STMIA   pDest, {rb01, rb23, rb45, rb67}
    655                 ADD     pDest, pDest, pScale
    656             ELSE
    657                 ADD     pDest, pDest, #($stride)
    658                 STMIA   pDest, {rb01, rb23, rb45, rb67}
    659                 ADD     pDest, pDest, #($stride)
    660             ENDIF
    661         ENDIF
    662 
    663         BCC     v6_idct_row$_F
    664         ENDIF ;// ARM1136JS
    665 
    666 
    667         IF CortexA8
    668 
    669 Src0            EQU  7
    670 Src1            EQU  8
    671 Src2            EQU  9
    672 Src3            EQU  10
    673 Src4            EQU  11
    674 Src5            EQU  12
    675 Src6            EQU  13
    676 Src7            EQU  14
    677 Tmp             EQU  15
    678 
    679 qXj0            QN Src0.S16
    680 qXj1            QN Src1.S16
    681 qXj2            QN Src2.S16
    682 qXj3            QN Src3.S16
    683 qXj4            QN Src4.S16
    684 qXj5            QN Src5.S16
    685 qXj6            QN Src6.S16
    686 qXj7            QN Src7.S16
    687 qXjt            QN Tmp.S16
    688 
    689 dXj0lo          DN (Src0*2).S16
    690 dXj0hi          DN (Src0*2+1).S16
    691 dXj1lo          DN (Src1*2).S16
    692 dXj1hi          DN (Src1*2+1).S16
    693 dXj2lo          DN (Src2*2).S16
    694 dXj2hi          DN (Src2*2+1).S16
    695 dXj3lo          DN (Src3*2).S16
    696 dXj3hi          DN (Src3*2+1).S16
    697 dXj4lo          DN (Src4*2).S16
    698 dXj4hi          DN (Src4*2+1).S16
    699 dXj5lo          DN (Src5*2).S16
    700 dXj5hi          DN (Src5*2+1).S16
    701 dXj6lo          DN (Src6*2).S16
    702 dXj6hi          DN (Src6*2+1).S16
    703 dXj7lo          DN (Src7*2).S16
    704 dXj7hi          DN (Src7*2+1).S16
    705 dXjtlo          DN (Tmp*2).S16
    706 dXjthi          DN (Tmp*2+1).S16
    707 
    708 qXi0            QN qXj0
    709 qXi1            QN qXj4
    710 qXi2            QN qXj2
    711 qXi3            QN qXj7
    712 qXi4            QN qXj5
    713 qXi5            QN qXjt
    714 qXi6            QN qXj1
    715 qXi7            QN qXj6
    716 qXit            QN qXj3
    717 
    718 dXi0lo          DN dXj0lo
    719 dXi0hi          DN dXj0hi
    720 dXi1lo          DN dXj4lo
    721 dXi1hi          DN dXj4hi
    722 dXi2lo          DN dXj2lo
    723 dXi2hi          DN dXj2hi
    724 dXi3lo          DN dXj7lo
    725 dXi3hi          DN dXj7hi
    726 dXi4lo          DN dXj5lo
    727 dXi4hi          DN dXj5hi
    728 dXi5lo          DN dXjtlo
    729 dXi5hi          DN dXjthi
    730 dXi6lo          DN dXj1lo
    731 dXi6hi          DN dXj1hi
    732 dXi7lo          DN dXj6lo
    733 dXi7hi          DN dXj6hi
    734 dXitlo          DN dXj3lo
    735 dXithi          DN dXj3hi
    736 
    737 qXh0            QN qXit
    738 qXh1            QN qXi0
    739 qXh2            QN qXi2
    740 qXh3            QN qXi3
    741 qXh4            QN qXi7
    742 qXh5            QN qXi5
    743 qXh6            QN qXi4
    744 qXh7            QN qXi1
    745 qXht            QN qXi6
    746 
    747 dXh0lo          DN dXitlo
    748 dXh0hi          DN dXithi
    749 dXh1lo          DN dXi0lo
    750 dXh1hi          DN dXi0hi
    751 dXh2lo          DN dXi2lo
    752 dXh2hi          DN dXi2hi
    753 dXh3lo          DN dXi3lo
    754 dXh3hi          DN dXi3hi
    755 dXh4lo          DN dXi7lo
    756 dXh4hi          DN dXi7hi
    757 dXh5lo          DN dXi5lo
    758 dXh5hi          DN dXi5hi
    759 dXh6lo          DN dXi4lo
    760 dXh6hi          DN dXi4hi
    761 dXh7lo          DN dXi1lo
    762 dXh7hi          DN dXi1hi
    763 dXhtlo          DN dXi6lo
    764 dXhthi          DN dXi6hi
    765 
    766 qXg0            QN qXh2
    767 qXg1            QN qXht
    768 qXg2            QN qXh1
    769 qXg3            QN qXh0
    770 qXg4            QN qXh4
    771 qXg5            QN qXh5
    772 qXg6            QN qXh6
    773 qXg7            QN qXh7
    774 qXgt            QN qXh3
    775 
    776 qXf0            QN qXg6
    777 qXf1            QN qXg5
    778 qXf2            QN qXg4
    779 qXf3            QN qXgt
    780 qXf4            QN qXg3
    781 qXf5            QN qXg2
    782 qXf6            QN qXg1
    783 qXf7            QN qXg0
    784 qXft            QN qXg7
    785 
    786 
    787 qXt0            QN 1.S32
    788 qXt1            QN 2.S32
    789 qT0lo           QN 1.S32
    790 qT0hi           QN 2.S32
    791 qT1lo           QN 3.S32
    792 qT1hi           QN 4.S32
    793 qScalelo        QN 5.S32        ;// used to read post scale values
    794 qScalehi        QN 6.S32
    795 qTemp0          QN 5.S32
    796 qTemp1          QN 6.S32
    797 
    798 
    799 Scale1          EQU 6
    800 Scale2          EQU 15
    801 qScale1         QN Scale1.S16
    802 qScale2         QN Scale2.S16
    803 dScale1lo       DN (Scale1*2).S16
    804 dScale1hi       DN (Scale1*2+1).S16
    805 dScale2lo       DN (Scale2*2).S16
    806 dScale2hi       DN (Scale2*2+1).S16
    807 
    808 dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
    809 InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
    810 S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
    811 C               DN dCoefs[2]    ;// Cos(PI/8) in Q15
    812 
    813 pTemp           RN 12
    814 
    815 
    816         IMPORT  armCOMM_IDCTCoef
    817 
    818         VLD1        {qXj0,qXj1}, [pSrc @64]!
    819         VLD1        {qXj2,qXj3}, [pSrc @64]!
    820         VLD1        {qXj4,qXj5}, [pSrc @64]!
    821         VLD1        {qXj6,qXj7}, [pSrc @64]!
    822 
    823         ;// Load PreScale and multiply with Src
    824         ;// IStage 4
    825 
    826         IF "$inscale"="s16"                         ;// 16X16 Mul
    827             M_IDCT_PRESCALE16
    828         ENDIF
    829 
    830         IF "$inscale"="s32"                         ;// 32X32 ,ul
    831             M_IDCT_PRESCALE32
    832         ENDIF
    833 
    834         ;// IStage 3
    835         VQDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
    836         VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
    837         VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
    838         VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
    839         VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
    840         VQDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
    841         VSUB        qXh2, qXi2, qXi3                ;// h2, h3
    842 
    843         VMULL       qXt0, dXi4lo, C                 ;// c*i4
    844         VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
    845         VMULL       qXt1, dXi4hi, C
    846         VMLAL       qXt1, dXi6hi, S
    847         VSHRN       dXh4lo, qXt0, #16               ;// h4
    848         VSHRN       dXh4hi, qXt1, #16
    849 
    850         VMULL       qXt0, dXi6lo, C                 ;// c*i6
    851         VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*h6
    852         VMULL       qXt1, dXi6hi, C
    853         VMLSL       qXt1, dXi4hi, S
    854         VSHRN       dXh6lo, qXt0, #16               ;// h6
    855         VSHRN       dXh6hi, qXt1, #16
    856 
    857         ;// IStage 2
    858         VSUB        qXg6, qXh6, qXh7
    859         VSUB        qXg5, qXh5, qXg6
    860         VSUB        qXg4, qXh4, qXg5
    861         VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
    862         VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
    863         VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
    864         VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
    865 
    866         ;// IStage 1 all rows
    867         VADD        qXf3, qXg3, qXg4
    868         VSUB        qXf4, qXg3, qXg4
    869         VADD        qXf2, qXg2, qXg5
    870         VSUB        qXf5, qXg2, qXg5
    871         VADD        qXf1, qXg1, qXg6
    872         VSUB        qXf6, qXg1, qXg6
    873         VADD        qXf0, qXg0, qXg7
    874         VSUB        qXf7, qXg0, qXg7
    875 
    876         ;// Transpose, store and loop
    877 XTR0            EQU Src5
    878 XTR1            EQU Tmp
    879 XTR2            EQU Src6
    880 XTR3            EQU Src7
    881 XTR4            EQU Src3
    882 XTR5            EQU Src0
    883 XTR6            EQU Src1
    884 XTR7            EQU Src2
    885 XTRt            EQU Src4
    886 
    887 qA0             QN  XTR0.S32  ;// for XTRpose
    888 qA1             QN  XTR1.S32
    889 qA2             QN  XTR2.S32
    890 qA3             QN  XTR3.S32
    891 qA4             QN  XTR4.S32
    892 qA5             QN  XTR5.S32
    893 qA6             QN  XTR6.S32
    894 qA7             QN  XTR7.S32
    895 
    896 dB0             DN  XTR0*2+1      ;// for using VSWP
    897 dB1             DN  XTR1*2+1
    898 dB2             DN  XTR2*2+1
    899 dB3             DN  XTR3*2+1
    900 dB4             DN  XTR4*2
    901 dB5             DN  XTR5*2
    902 dB6             DN  XTR6*2
    903 dB7             DN  XTR7*2
    904 
    905 
    906         VTRN        qXf0, qXf1
    907         VTRN        qXf2, qXf3
    908         VTRN        qXf4, qXf5
    909         VTRN        qXf6, qXf7
    910         VTRN        qA0, qA2
    911         VTRN        qA1, qA3
    912         VTRN        qA4, qA6
    913         VTRN        qA5, qA7
    914         VSWP        dB0, dB4
    915         VSWP        dB1, dB5
    916         VSWP        dB2, dB6
    917         VSWP        dB3, dB7
    918 
    919 
    920 qYj0            QN qXf0
    921 qYj1            QN qXf1
    922 qYj2            QN qXf2
    923 qYj3            QN qXf3
    924 qYj4            QN qXf4
    925 qYj5            QN qXf5
    926 qYj6            QN qXf6
    927 qYj7            QN qXf7
    928 qYjt            QN qXft
    929 
    930 dYj0lo          DN (XTR0*2).S16
    931 dYj0hi          DN (XTR0*2+1).S16
    932 dYj1lo          DN (XTR1*2).S16
    933 dYj1hi          DN (XTR1*2+1).S16
    934 dYj2lo          DN (XTR2*2).S16
    935 dYj2hi          DN (XTR2*2+1).S16
    936 dYj3lo          DN (XTR3*2).S16
    937 dYj3hi          DN (XTR3*2+1).S16
    938 dYj4lo          DN (XTR4*2).S16
    939 dYj4hi          DN (XTR4*2+1).S16
    940 dYj5lo          DN (XTR5*2).S16
    941 dYj5hi          DN (XTR5*2+1).S16
    942 dYj6lo          DN (XTR6*2).S16
    943 dYj6hi          DN (XTR6*2+1).S16
    944 dYj7lo          DN (XTR7*2).S16
    945 dYj7hi          DN (XTR7*2+1).S16
    946 dYjtlo          DN (XTRt*2).S16
    947 dYjthi          DN (XTRt*2+1).S16
    948 
    949 qYi0            QN qYj0
    950 qYi1            QN qYj4
    951 qYi2            QN qYj2
    952 qYi3            QN qYj7
    953 qYi4            QN qYj5
    954 qYi5            QN qYjt
    955 qYi6            QN qYj1
    956 qYi7            QN qYj6
    957 qYit            QN qYj3
    958 
    959 dYi0lo          DN dYj0lo
    960 dYi0hi          DN dYj0hi
    961 dYi1lo          DN dYj4lo
    962 dYi1hi          DN dYj4hi
    963 dYi2lo          DN dYj2lo
    964 dYi2hi          DN dYj2hi
    965 dYi3lo          DN dYj7lo
    966 dYi3hi          DN dYj7hi
    967 dYi4lo          DN dYj5lo
    968 dYi4hi          DN dYj5hi
    969 dYi5lo          DN dYjtlo
    970 dYi5hi          DN dYjthi
    971 dYi6lo          DN dYj1lo
    972 dYi6hi          DN dYj1hi
    973 dYi7lo          DN dYj6lo
    974 dYi7hi          DN dYj6hi
    975 dYitlo          DN dYj3lo
    976 dYithi          DN dYj3hi
    977 
    978 qYh0            QN qYit
    979 qYh1            QN qYi0
    980 qYh2            QN qYi2
    981 qYh3            QN qYi3
    982 qYh4            QN qYi7
    983 qYh5            QN qYi5
    984 qYh6            QN qYi4
    985 qYh7            QN qYi1
    986 qYht            QN qYi6
    987 
    988 dYh0lo          DN dYitlo
    989 dYh0hi          DN dYithi
    990 dYh1lo          DN dYi0lo
    991 dYh1hi          DN dYi0hi
    992 dYh2lo          DN dYi2lo
    993 dYh2hi          DN dYi2hi
    994 dYh3lo          DN dYi3lo
    995 dYh3hi          DN dYi3hi
    996 dYh4lo          DN dYi7lo
    997 dYh4hi          DN dYi7hi
    998 dYh5lo          DN dYi5lo
    999 dYh5hi          DN dYi5hi
   1000 dYh6lo          DN dYi4lo
   1001 dYh6hi          DN dYi4hi
   1002 dYh7lo          DN dYi1lo
   1003 dYh7hi          DN dYi1hi
   1004 dYhtlo          DN dYi6lo
   1005 dYhthi          DN dYi6hi
   1006 
   1007 qYg0            QN qYh2
   1008 qYg1            QN qYht
   1009 qYg2            QN qYh1
   1010 qYg3            QN qYh0
   1011 qYg4            QN qYh4
   1012 qYg5            QN qYh5
   1013 qYg6            QN qYh6
   1014 qYg7            QN qYh7
   1015 qYgt            QN qYh3
   1016 
   1017 qYf0            QN qYg6
   1018 qYf1            QN qYg5
   1019 qYf2            QN qYg4
   1020 qYf3            QN qYgt
   1021 qYf4            QN qYg3
   1022 qYf5            QN qYg2
   1023 qYf6            QN qYg1
   1024 qYf7            QN qYg0
   1025 qYft            QN qYg7
   1026 
   1027         VRSHR       qYj7, qYj7, #2
   1028         VRSHR       qYj6, qYj6, #1
   1029 
   1030         VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
   1031         VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
   1032         VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
   1033         VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
   1034         VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
   1035         VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
   1036 
   1037         VQDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
   1038         ;// IStage 4,3 rows 0to1 x 1/2
   1039 
   1040         MOV         pTemp, #0x4             ;// ensure correct round
   1041         VDUP        qScale1, pTemp           ;// of DC result
   1042         VADD        qYi0, qYi0, qScale1
   1043 
   1044         VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
   1045         VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
   1046 
   1047         VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
   1048         VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
   1049         VSUB        qYh2, qYi2, qYi3        ;// h2, h3
   1050         VQDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
   1051 
   1052         VMULL       qXt0, dYi4lo, C         ;// c*i4
   1053         VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
   1054         VMULL       qXt1, dYi4hi, C
   1055         VMLAL       qXt1, dYi6hi, S
   1056         VSHRN       dYh4lo, qXt0, #16       ;// h4
   1057         VSHRN       dYh4hi, qXt1, #16
   1058 
   1059         VMULL       qXt0, dYi6lo, C         ;// c*i6
   1060         VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*h6
   1061         VMULL       qXt1, dYi6hi, C
   1062         VMLSL       qXt1, dYi4hi, S
   1063         VSHRN       dYh6lo, qXt0, #16       ;// h6
   1064         VSHRN       dYh6hi, qXt1, #16
   1065 
   1066         VSUB        qYg6, qYh6, qYh7
   1067         VSUB        qYg5, qYh5, qYg6
   1068         VSUB        qYg4, qYh4, qYg5
   1069 
   1070         ;// IStage 2 rows 0to3 x 1/2
   1071         VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
   1072         VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
   1073         VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
   1074         VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
   1075 
   1076 
   1077         ;// IStage 1 all rows
   1078         VHADD        qYf3, qYg3, qYg4
   1079         VHSUB        qYf4, qYg3, qYg4
   1080         VHADD        qYf2, qYg2, qYg5
   1081         VHSUB        qYf5, qYg2, qYg5
   1082         VHADD        qYf1, qYg1, qYg6
   1083         VHSUB        qYf6, qYg1, qYg6
   1084         VHADD        qYf0, qYg0, qYg7
   1085         VHSUB        qYf7, qYg0, qYg7
   1086 
   1087 YTR0            EQU Src0
   1088 YTR1            EQU Src4
   1089 YTR2            EQU Src1
   1090 YTR3            EQU Src2
   1091 YTR4            EQU Src7
   1092 YTR5            EQU Src5
   1093 YTR6            EQU Tmp
   1094 YTR7            EQU Src6
   1095 YTRt            EQU Src3
   1096 
   1097 qC0             QN  YTR0.S32                ;// for YTRpose
   1098 qC1             QN  YTR1.S32
   1099 qC2             QN  YTR2.S32
   1100 qC3             QN  YTR3.S32
   1101 qC4             QN  YTR4.S32
   1102 qC5             QN  YTR5.S32
   1103 qC6             QN  YTR6.S32
   1104 qC7             QN  YTR7.S32
   1105 
   1106 dD0             DN  YTR0*2+1                ;// for using VSWP
   1107 dD1             DN  YTR1*2+1
   1108 dD2             DN  YTR2*2+1
   1109 dD3             DN  YTR3*2+1
   1110 dD4             DN  YTR4*2
   1111 dD5             DN  YTR5*2
   1112 dD6             DN  YTR6*2
   1113 dD7             DN  YTR7*2
   1114 
   1115         VTRN        qYf0, qYf1
   1116         VTRN        qYf2, qYf3
   1117         VTRN        qYf4, qYf5
   1118         VTRN        qYf6, qYf7
   1119         VTRN        qC0, qC2
   1120         VTRN        qC1, qC3
   1121         VTRN        qC4, qC6
   1122         VTRN        qC5, qC7
   1123         VSWP        dD0, dD4
   1124         VSWP        dD1, dD5
   1125         VSWP        dD2, dD6
   1126         VSWP        dD3, dD7
   1127 
   1128 
   1129 dYf0U8          DN YTR0*2.U8
   1130 dYf1U8          DN YTR1*2.U8
   1131 dYf2U8          DN YTR2*2.U8
   1132 dYf3U8          DN YTR3*2.U8
   1133 dYf4U8          DN YTR4*2.U8
   1134 dYf5U8          DN YTR5*2.U8
   1135 dYf6U8          DN YTR6*2.U8
   1136 dYf7U8          DN YTR7*2.U8
   1137 
   1138         ;//
   1139         ;// Do saturation if outsize is other than S16
   1140         ;//
   1141 
   1142         IF ("$outsize"="u8")
   1143             ;// Output range [0-255]
   1144             VQMOVN            dYf0U8, qYf0
   1145             VQMOVN            dYf1U8, qYf1
   1146             VQMOVN            dYf2U8, qYf2
   1147             VQMOVN            dYf3U8, qYf3
   1148             VQMOVN            dYf4U8, qYf4
   1149             VQMOVN            dYf5U8, qYf5
   1150             VQMOVN            dYf6U8, qYf6
   1151             VQMOVN            dYf7U8, qYf7
   1152         ENDIF
   1153 
   1154         IF ("$outsize"="s9")
   1155             ;// Output range [-256 to +255]
   1156             VQSHL            qYf0, qYf0, #16-9
   1157             VQSHL            qYf1, qYf1, #16-9
   1158             VQSHL            qYf2, qYf2, #16-9
   1159             VQSHL            qYf3, qYf3, #16-9
   1160             VQSHL            qYf4, qYf4, #16-9
   1161             VQSHL            qYf5, qYf5, #16-9
   1162             VQSHL            qYf6, qYf6, #16-9
   1163             VQSHL            qYf7, qYf7, #16-9
   1164 
   1165             VSHR             qYf0, qYf0, #16-9
   1166             VSHR             qYf1, qYf1, #16-9
   1167             VSHR             qYf2, qYf2, #16-9
   1168             VSHR             qYf3, qYf3, #16-9
   1169             VSHR             qYf4, qYf4, #16-9
   1170             VSHR             qYf5, qYf5, #16-9
   1171             VSHR             qYf6, qYf6, #16-9
   1172             VSHR             qYf7, qYf7, #16-9
   1173         ENDIF
   1174 
   1175         ;// Store output depending on the Stride size
   1176         IF "$stride"="s"
   1177             VST1        qYf0, [pDest @64], Stride
   1178             VST1        qYf1, [pDest @64], Stride
   1179             VST1        qYf2, [pDest @64], Stride
   1180             VST1        qYf3, [pDest @64], Stride
   1181             VST1        qYf4, [pDest @64], Stride
   1182             VST1        qYf5, [pDest @64], Stride
   1183             VST1        qYf6, [pDest @64], Stride
   1184             VST1        qYf7, [pDest @64]
   1185         ELSE
   1186             IF ("$outsize"="u8")
   1187                 VST1        dYf0U8, [pDest @64], #8
   1188                 VST1        dYf1U8, [pDest @64], #8
   1189                 VST1        dYf2U8, [pDest @64], #8
   1190                 VST1        dYf3U8, [pDest @64], #8
   1191                 VST1        dYf4U8, [pDest @64], #8
   1192                 VST1        dYf5U8, [pDest @64], #8
   1193                 VST1        dYf6U8, [pDest @64], #8
   1194                 VST1        dYf7U8, [pDest @64]
   1195             ELSE
   1196                 ;// ("$outsize"="s9") or ("$outsize"="s16")
   1197                 VST1        qYf0, [pDest @64], #16
   1198                 VST1        qYf1, [pDest @64], #16
   1199                 VST1        qYf2, [pDest @64], #16
   1200                 VST1        qYf3, [pDest @64], #16
   1201                 VST1        qYf4, [pDest @64], #16
   1202                 VST1        qYf5, [pDest @64], #16
   1203                 VST1        qYf6, [pDest @64], #16
   1204                 VST1        qYf7, [pDest @64]
   1205             ENDIF
   1206 
   1207         ENDIF
   1208 
   1209 
   1210 
   1211         ENDIF ;// CortexA8
   1212 
   1213 
   1214 
   1215         MEND
   1216 
   1217         ;// Scale TWO input rows with TWO rows of 16 bit scale values
   1218         ;//
   1219         ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
   1220         ;// input (eight S16 values each) with two rows of scale values. It also
   1221         ;// loads the next two rows of scale values from pScale when $LastRow is "0".
   1222         ;// Each output is the widened product shifted right by 12, rounded and saturated.
   1223         ;// Input Registers:
   1224         ;//
   1225         ;// $dAlo           - Input D register with first four S16 values of row n
   1226         ;// $dAhi           - Input D register with next four S16 values of row n
   1227         ;// $dBlo           - Input D register with first four S16 values of row n+1
   1228         ;// $dBhi           - Input D register with next four S16 values of row n+1
   1229         ;// pScale          - Pointer to the scale values of row n+2 (read only when $LastRow is "0")
   1230         ;// qT0lo           - Temporary scratch register (products of $dAlo)
   1231         ;// qT0hi           - Temporary scratch register (products of $dAhi)
   1232         ;// qT1lo           - Temporary scratch register (products of $dBlo)
   1233         ;// qT1hi           - Temporary scratch register (products of $dBhi)
   1234         ;// dScale1lo       - Scale values for the first four elements of row n
   1235         ;// dScale1hi       - Scale values for the next four elements of row n
   1236         ;// dScale2lo       - Scale values for the first four elements of row n+1
   1237         ;// dScale2hi       - Scale values for the next four elements of row n+1
   1238         ;//
   1239         ;// Input Flag
   1240         ;//
   1241         ;// $LastRow        - "0" if more rows follow (next scale rows are loaded); any other value skips the load
   1242         ;//
   1243         ;// Output Registers:
   1244         ;//
   1245         ;// $dAlo           - Scaled output values (first four S16 of row n)
   1246         ;// $dAhi           - Scaled output values (next four S16 of row n)
   1247         ;// $dBlo           - Scaled output values (first four S16 of row n+1)
   1248         ;// $dBhi           - Scaled output values (next four S16 of row n+1)
   1249         ;// qScale1         - Scale values for row n+2 (only when $LastRow is "0", else unchanged)
   1250         ;// qScale2         - Scale values for row n+3 (only when $LastRow is "0", else unchanged)
   1251         ;// pScale          - Advanced by 32 bytes when the two scale rows were loaded, else unchanged
   1252         ;//
   1253         MACRO
   1254         M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
   1255         VMULL       qT0lo, $dAlo, dScale1lo             ;// row n,   elements 0-3: widening multiply
   1256         VMULL       qT0hi, $dAhi, dScale1hi             ;// row n,   elements 4-7
   1257         VMULL       qT1lo, $dBlo, dScale2lo             ;// row n+1, elements 0-3
   1258         VMULL       qT1hi, $dBhi, dScale2hi             ;// row n+1, elements 4-7
   1259         IF "$LastRow"="0"                               ;// current scales are consumed, safe to overwrite them
   1260             VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2
   1261             VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
   1262         ENDIF
   1263         VQRSHRN       $dAlo, qT0lo, #12                 ;// round, >>12, saturate and narrow back in place
   1264         VQRSHRN       $dAhi, qT0hi, #12
   1265         VQRSHRN       $dBlo, qT1lo, #12
   1266         VQRSHRN       $dBhi, qT1hi, #12
   1267         MEND
   1268 
   1269         ;// Scale 8x8 block input values with 16 bit scale values
   1270         ;//
   1271         ;// This macro is used to pre-scale a block of 8x8 input values.
   1272         ;// It also does the first executed stage of the IDCT (the add/subtract pairs below).
   1273         ;// Interleaved with that, it points pSrc at armCOMM_IDCTCoef and loads dCoefs from it.
   1274         ;// Input Registers:
   1275         ;//
   1276         ;// dXjnlo          - n th input D register with first four S16 values
   1277         ;// dXjnhi          - n th input D register with next four S16 values
   1278         ;// qXjn            - n th input Q register with eight S16 values (scaled in place)
   1279         ;// pScale          - Pointer to scale values (eight rows of eight 16 bit values are read)
   1280         ;//
   1281         ;// Output Registers:
   1282         ;//
   1283         ;// qXin            - n th output Q register with eight S16 output values of 1st stage (n = 2..7 are written here)
   1284         ;// pScale is advanced by 8 x 16 bytes; pSrc and dCoefs are overwritten; qScale1/qScale2 and qT0lo..qT1hi are clobbered
   1285         MACRO
   1286         M_IDCT_PRESCALE16
   1287         VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
   1288         VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
   1289         M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1, load scales for row 2 & 3
   1290         M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3, load scales for row 4 & 5
   1291         M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5, load scales for row 6 & 7
   1292         M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7, no further scale load
   1293         VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
   1294         VSUB        qXi6, qXj1, qXj7            ;// j1-j7
   1295         LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants (previous pSrc value is lost)
   1296         VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
   1297         VSUB        qXi2, qXj2, qXj6            ;// j2-j6
   1298         VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
   1299         VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
   1300         VSUB        qXi4, qXj5, qXj3            ;// j5-j3
   1301         MEND
   1302 
   1303 
   1304         ;// Scale 8x8 block input values with 32 bit scale values
   1305         ;//
   1306         ;// This macro is used to pre-scale a block of 8x8 input values.
   1307         ;// It also does the first executed stage of the IDCT (the add/subtract pairs below).
   1308         ;// Each input is widened and shifted left by 11, then VQRDMULH by its scale: net in*scale/2^20, rounded.
   1309         ;// Input Registers:
   1310         ;//
   1311         ;// dXjnlo          - n th input D register with first four S16 values
   1312         ;// dXjnhi          - n th input D register with next four S16 values
   1313         ;// qXjn            - n th input Q register with eight S16 values
   1314         ;// pScale          - Pointer to 32bit scale values in Q23 format
   1315         ;// pSrc            - Used to reload input row 4; expected 128 bytes past the start of the input block
   1316         ;// Output Registers:
   1317         ;//
   1318         ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
   1319         ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
   1320         ;// On exit pScale points at scale row 4, pTemp at scale row 6, pSrc at armCOMM_IDCTCoef; dCoefs is loaded
   1321         MACRO
   1322         M_IDCT_PRESCALE32
   1323 qScale0lo       QN 0.S32                ;// Q0/Q1: scale row 0, later reloaded with rows 7, 6, 5
   1324 qScale0hi       QN 1.S32
   1325 qScale1lo       QN 2.S32                ;// Q2/Q3: scale row 1, later reloaded with rows 2, 3, 4
   1326 qScale1hi       QN 3.S32
   1327 qScale2lo       QN qScale1lo            ;// rows 2..4 reuse the row 1 registers
   1328 qScale2hi       QN qScale1hi
   1329 qScale3lo       QN qScale1lo
   1330 qScale3hi       QN qScale1hi
   1331 qScale4lo       QN qScale1lo
   1332 qScale4hi       QN qScale1hi
   1333 qScale5lo       QN qScale0lo            ;// rows 5..7 reuse the row 0 registers
   1334 qScale5hi       QN qScale0hi
   1335 qScale6lo       QN qScale0lo
   1336 qScale6hi       QN qScale0hi
   1337 qScale7lo       QN qScale0lo
   1338 qScale7hi       QN qScale0hi
   1339 
   1340 qSrc0lo         QN 4.S32                ;// Q4/Q5: widened input row 0, later rows 7, 2, 3, 4
   1341 qSrc0hi         QN 5.S32
   1342 qSrc1lo         QN 6.S32                ;// widened input row 1, later rows 6, 5
   1343 qSrc1hi         QN Src4.S32             ;// register number taken from the Src4 symbol (defined outside this macro)
   1344 qSrc2lo         QN qSrc0lo
   1345 qSrc2hi         QN qSrc0hi
   1346 qSrc3lo         QN qSrc0lo
   1347 qSrc3hi         QN qSrc0hi
   1348 qSrc4lo         QN qSrc0lo
   1349 qSrc4hi         QN qSrc0hi
   1350 qSrc5lo         QN qSrc1lo
   1351 qSrc5hi         QN qSrc1hi
   1352 qSrc6lo         QN qSrc1lo
   1353 qSrc6hi         QN qSrc1hi
   1354 qSrc7lo         QN qSrc0lo
   1355 qSrc7hi         QN qSrc0hi
   1356 
   1357 qRes17lo        QN qScale0lo            ;// sum/difference temporaries reuse Q0/Q1 once the scale row
   1358 qRes17hi        QN qScale0hi            ;// held there has been consumed by VQRDMULH
   1359 qRes26lo        QN qScale0lo
   1360 qRes26hi        QN qScale0hi
   1361 qRes53lo        QN qScale0lo
   1362 qRes53hi        QN qScale0hi
   1363 
   1364             ADD         pTemp, pScale, #4*8*7           ;// Address of scale row 7 (8 x 4 bytes per row)
   1365 
   1366             ;// Row 0 (rows 1 & 7 are started here as well)
   1367             VLD1        {qScale0lo, qScale0hi}, [pScale]!       ;// Scale row 0, pScale -> row 1
   1368             VSHLL       qSrc0lo, dXj0lo, #(12-1)                ;// widen j0 to 32 bit, << 11
   1369             VSHLL       qSrc0hi, dXj0hi, #(12-1)
   1370             VLD1        {qScale1lo, qScale1hi}, [pScale]!       ;// Scale row 1, pScale -> row 2
   1371             VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo             ;// high half of doubled product, rounded and saturated
   1372             VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
   1373             VLD1        {qScale7lo, qScale7hi}, [pTemp]!        ;// Scale row 7 (overwrites row 0 scales, already used)
   1374             VSHLL       qSrc1lo, dXj1lo, #(12-1)
   1375             VSHLL       qSrc1hi, dXj1hi, #(12-1)
   1376             VMOVN       dXi0lo, qSrc0lo                 ;// Output i0 (low 16 bits of each lane, no saturation)
   1377             VMOVN       dXi0hi, qSrc0hi
   1378             VSHLL       qSrc7lo, dXj7lo, #(12-1)                ;// reuses the qSrc0 registers, i0 is already stored
   1379             VSHLL       qSrc7hi, dXj7hi, #(12-1)
   1380             SUB         pTemp, pTemp, #((16*2)+(4*8*1))         ;// undo the 32 byte writeback and step back one row: pTemp -> row 6
   1381             VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
   1382             VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
   1383             VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
   1384             VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
   1385             VLD1        {qScale2lo, qScale2hi}, [pScale]!       ;// Scale row 2 (overwrites row 1 scales), pScale -> row 3
   1386 
   1387             ;// Row 1 & 7
   1388             VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
   1389             VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
   1390             VMOVN       dXi5lo, qRes17lo                ;// Output i5
   1391             VMOVN       dXi5hi, qRes17hi
   1392             VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
   1393             VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
   1394             VMOVN       dXi6lo, qRes17lo                ;// Output i6
   1395             VMOVN       dXi6hi, qRes17hi
   1396             VSHLL       qSrc2lo, dXj2lo, #(12-1)
   1397             VSHLL       qSrc2hi, dXj2hi, #(12-1)
   1398             VLD1        {qScale6lo, qScale6hi}, [pTemp]!        ;// Scale row 6 (overwrites qRes17, already narrowed)
   1399             VSHLL       qSrc6lo, dXj6lo, #(12-1)
   1400             VSHLL       qSrc6hi, dXj6hi, #(12-1)
   1401             SUB         pTemp, pTemp, #((16*2)+(4*8*1))         ;// undo the writeback and step back one row: pTemp -> row 5
   1402             VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
   1403             VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
   1404             VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
   1405             VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
   1406             VLD1        {qScale3lo, qScale3hi}, [pScale]!       ;// Scale row 3, pScale -> row 4
   1407 
   1408             ;// Row 2 & 6
   1409             VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
   1410             VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
   1411             VMOVN       dXi3lo, qRes26lo                ;// Output i3
   1412             VMOVN       dXi3hi, qRes26hi
   1413             VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
   1414             VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
   1415             VMOVN       dXi2lo, qRes26lo                ;// Output i2
   1416             VMOVN       dXi2hi, qRes26hi
   1417             VSHLL       qSrc3lo, dXj3lo, #(12-1)
   1418             VSHLL       qSrc3hi, dXj3hi, #(12-1)
   1419             VLD1        {qScale5lo, qScale5hi}, [pTemp]!        ;// Scale row 5; pTemp is not used again in this macro
   1420             VSHLL       qSrc5lo, dXj5lo, #(12-1)
   1421             VSHLL       qSrc5hi, dXj5hi, #(12-1)
   1422             VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
   1423             VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
   1424             VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
   1425             VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
   1426 
   1427             ;// Row 3 & 5
   1428             VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
   1429             VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
   1430             SUB         pSrc, pSrc, #16*2*2                     ;// pSrc -= 64: back to input row 4, given the four post-incremented 32 byte loads before this macro is invoked
   1431             VMOVN       dXi7lo, qRes53lo                ;// Output i7
   1432             VMOVN       dXi7hi, qRes53hi
   1433             VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
   1434             VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
   1435             VLD1        qXj4, [pSrc @64]                        ;// Reload input row 4; NOTE(review): presumably its register was reused as a temporary above - confirm against the qXj4 definition
   1436             VMOVN       dXi4lo, qRes53lo                ;// Output i4
   1437             VMOVN       dXi4hi, qRes53hi
   1438             VSHLL       qSrc4lo, dXj4lo, #(12-1)
   1439             VSHLL       qSrc4hi, dXj4hi, #(12-1)
   1440             VLD1        {qScale4lo, qScale4hi}, [pScale]        ;// Scale row 4 (no writeback, pScale stays at row 4)
   1441             LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
   1442             VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
   1443             VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
   1444             VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
   1445             ;// Row 4
   1446             VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
   1447             VMOVN       dXi1hi, qSrc4hi
   1448 
   1449         MEND
   1450 
   1451         END
   1452