Home | History | Annotate | Download | only in api
      1 ;//
      2 ;// Copyright (C) 2004 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// IDCT_s.s
     19 ;//
     20 ;// Inverse DCT module
     21 ;//
     22 ;//
     23 ;// ALGORITHM DESCRIPTION
     24 ;//
     25 ;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
     26 ;// column and then a 1D IDCT for each row.
     27 ;//
     28 ;// The 8-point 1D IDCT is defined by
     29 ;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
     30 ;//
     31 ;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
     32 ;//   c(u,x) = cos( (2x+1)*u*pi/16 )
     33 ;//
     34 ;// We compute the 8-point 1D IDCT using the reverse of
     35 ;// the Arai-Agui-Nakajima flow graph which we split into
     36 ;// 5 stages named in reverse order to identify with the
     37 ;// forward DCT. Direct inversion of the forward formulae
     38 ;// in file FDCT_s.s gives:
     39 ;//
     40 ;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
     41 ;//             [ A(0) = 2*sqrt(2)
     42 ;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
     43 ;//
     44 ;// IStage 4:   i0 = j0             i1 = j4
     45 ;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
     46 ;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
     47 ;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
     48 ;//
     49 ;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
     50 ;//             h2 = (i2*sqrt2)-i3  h3 = i3
     51 ;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
     52 ;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
     53 ;//             [ The above two lines rotate by -(pi/8) ]
     54 ;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
     55 ;//
     56 ;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
     57 ;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
     58 ;//             g7 = h7             g6 = h6 - h7
     59 ;//             g5 = h5 - g6        g4 = h4 - g5
     60 ;//
     61 ;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
     62 ;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
     63 ;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
     64 ;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
     65 ;//
     66 ;// Note that most coefficients are halved 3 times during the
     67 ;// above calculation. We can rescale the algorithm dividing
     68 ;// the input by 8 to remove the halvings.
     69 ;//
     70 ;// IStage 5:   j(u) = T(u)*A(u)/8
     71 ;//
     72 ;// IStage 4:   i0 = j0             i1 = j4
     73 ;//             i3 = j2 + j6        i2 = j2 - j6
     74 ;//             i7 = j5 + j3        i4 = j5 - j3
     75 ;//             i5 = j1 + j7        i6 = j1 - j7
     76 ;//
     77 ;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
     78 ;//             h2 = (i2*sqrt2)-i3  h3 = i3
     79 ;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
     80 ;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
     81 ;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
     82 ;//
     83 ;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
     84 ;//             g1 = h1 + h2        g2 = h1 - h2
     85 ;//             g7 = h7             g6 = h6 - h7
     86 ;//             g5 = h5 - g6        g4 = h4 - g5
     87 ;//
     88 ;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
     89 ;//             f1 = g1 + g6        f6 = g1 - g6
     90 ;//             f2 = g2 + g5        f5 = g2 - g5
     91 ;//             f3 = g3 + g4        f4 = g3 - g4
     92 ;//
     93 ;// Note:
     94 ;// 1. The scaling by A(u)/8 can often be combined with inverse
     95 ;//    quantization. The column and row scalings can be combined.
     96 ;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
     97 ;//    to the above code but is otherwise identical.
     98 ;// 3. The rotation by -pi/8 can be performed using three multiplies
     99 ;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
    100 ;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
    101 ;// 4. If |T(u)|<=1 then from the IDCT definition,
    102 ;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
    103 ;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
    104 ;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
    105 ;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
    106 ;//            = (approx)2.64
    107 ;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
    108 ;//    The table below shows input patterns generating the maximum
    109 ;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
    110 ;//    InputPattern      Max |f(x)|
    111 ;//      PPPPPPPP        |f0| =  2.64
    112 ;//      PPPMMMMM        |f1| =  2.64
    113 ;//      PPMMMPPP        |f2| =  2.64
    114 ;//      PPMMPPMM        |f3| =  2.64
    115 ;//      PMMPPMMP        |f4| =  2.64
    116 ;//      PMMPMMPM        |f5| =  2.64
    117 ;//      PMPPMPMP        |f6| =  2.64
    118 ;//      PMPMPMPM        |f7| =  2.64
    119 ;//   Note that this input pattern is the transpose of the
    120 ;//   corresponding max input pattern for the FDCT.
    121 
    122 ;// Arguments
    123 
    124 pSrc    RN 0    ;// r0: pointer to input data (source buffer)
    125 Stride  RN 1    ;// r1: destination stride in bytes
    126 pDest   RN 2    ;// r2: pointer to output data (destination buffer)
    127 pScale  RN 3    ;// r3: pointer to aan-scale table (format given by $inscale)
    128 
    129 
    130         ;// DCT Inverse Macro
    131         ;// The DCT code should be parametrized according
    132         ;// to the following inputs:
    133         ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
    134         ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
    135         ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
    136         ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
    137         ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
    138         ;//
    139         ;// Inputs:
    140         ;// pSrc   = r0 = Pointer to input data
    141         ;//               Range is -256 to +255 (9-bit)
    142         ;// Stride = r1 = Stride between output lines, in bytes
    143         ;// pDest  = r2 = Pointer to output data
    144         ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
    145 
    146 
    147 
    148         MACRO
    149         M_IDCT  $outsize, $inscale, $stride
    150         LCLA    SHIFT
    151 
    152 
    153         IF ARM1136JS
    154 
    155 ;// REGISTER ALLOCATION
    156 ;// This is hard since we have 8 values, 9 free registers and each
    157 ;// butterfly requires a temporary register. We also want to
    158 ;// maintain register order so we can use LDM/STM. The table below
    159 ;// summarises the register allocation that meets all these criteria.
    160 ;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
    161 ;//
    162 ;// r1  a01     g0  h0
    163 ;// r4  b01 f0  g1  h1  i0
    164 ;// r5  a23 f1  g2      i1
    165 ;// r6  b23 f2  g3  h2  i2
    166 ;// r7  a45 f3      h3  i3
    167 ;// r8  b45 f4  g4  h4  i4
    168 ;// r9  a67 f5  g5  h5  i5
    169 ;// r10 b67 f6  g6  h6  i6
    170 ;// r11     f7  g7  h7  i7
    171 ;//
    172 ra01    RN 1
    173 rb01    RN 4
    174 ra23    RN 5
    175 rb23    RN 6
    176 ra45    RN 7
    177 rb45    RN 8
    178 ra67    RN 9
    179 rb67    RN 10
    180 rtmp    RN 11
    181 csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
    182 LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
    183 ;// Transpose allocation
    184 xft     RN ra01
    185 xf0     RN rb01
    186 xf1     RN ra23
    187 xf2     RN rb23
    188 xf3     RN ra45
    189 xf4     RN rb45
    190 xf5     RN ra67
    191 xf6     RN rb67
    192 xf7     RN rtmp
    193 ;// IStage 1 allocation
    194 xg0     RN xft
    195 xg1     RN xf0
    196 xg2     RN xf1
    197 xg3     RN xf2
    198 xgt     RN xf3
    199 xg4     RN xf4
    200 xg5     RN xf5
    201 xg6     RN xf6
    202 xg7     RN xf7
    203 ;// IStage 2 allocation
    204 xh0     RN xg0
    205 xh1     RN xg1
    206 xht     RN xg2
    207 xh2     RN xg3
    208 xh3     RN xgt
    209 xh4     RN xg4
    210 xh5     RN xg5
    211 xh6     RN xg6
    212 xh7     RN xg7
    213 ;// IStage 3,4 allocation
    214 xit     RN xh0
    215 xi0     RN xh1
    216 xi1     RN xht
    217 xi2     RN xh2
    218 xi3     RN xh3
    219 xi4     RN xh4
    220 xi5     RN xh5
    221 xi6     RN xh6
    222 xi7     RN xh7
    223 
    224         M_STR   pDest,  ppDest
    225         IF "$stride"="s"
    226             M_STR   Stride, pStride
    227         ENDIF
    228         M_ADR   pDest,  pBlk
    229         LDR     csPiBy8, =0x30fc7642
    230         LDR     LoopRR2, =0x00005a82
    231 
    232 v6_idct_col$_F
    233         ;// Load even values
    234         LDR     xi4, [pSrc], #4  ;// j0
    235         LDR     xi5, [pSrc, #4*16-4]  ;// j4
    236         LDR     xi6, [pSrc, #2*16-4]  ;// j2
    237         LDR     xi7, [pSrc, #6*16-4]  ;// j6
    238 
    239         ;// Scale Even Values
    240         IF "$inscale"="s16" ;// 16x16 mul
    241 SHIFT       SETA    12
    242             LDR     xi0, [pScale], #4
    243             LDR     xi1, [pScale, #4*16-4]
    244             LDR     xi2, [pScale, #2*16-4]
    245             MOV     xit, #1<<(SHIFT-1)
    246             SMLABB  xi3, xi0, xi4, xit
    247             SMLATT  xi4, xi0, xi4, xit
    248             SMLABB  xi0, xi1, xi5, xit
    249             SMLATT  xi5, xi1, xi5, xit
    250             MOV     xi3, xi3, ASR #SHIFT
    251             PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
    252             LDR     xi3, [pScale, #6*16-4]
    253             SMLABB  xi1, xi2, xi6, xit
    254             SMLATT  xi6, xi2, xi6, xit
    255             MOV     xi0, xi0, ASR #SHIFT
    256             PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
    257             SMLABB  xi2, xi3, xi7, xit
    258             SMLATT  xi7, xi3, xi7, xit
    259             MOV     xi1, xi1, ASR #SHIFT
    260             PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
    261             MOV     xi2, xi2, ASR #SHIFT
    262             PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
    263         ENDIF
    264         IF "$inscale"="s32" ;// 32x16 mul
    265 SHIFT       SETA    (12+8-16)
    266             MOV     xit, #1<<(SHIFT-1)
    267             LDR     xi0, [pScale], #8
    268             LDR     xi1, [pScale, #0*32+4-8]
    269             LDR     xi2, [pScale, #4*32-8]
    270             LDR     xi3, [pScale, #4*32+4-8]
    271             SMLAWB  xi0, xi0, xi4, xit
    272             SMLAWT  xi1, xi1, xi4, xit
    273             SMLAWB  xi2, xi2, xi5, xit
    274             SMLAWT  xi3, xi3, xi5, xit
    275             MOV     xi0, xi0, ASR #SHIFT
    276             PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
    277             MOV     xi2, xi2, ASR #SHIFT
    278             PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
    279             LDR     xi0, [pScale, #2*32-8]
    280             LDR     xi1, [pScale, #2*32+4-8]
    281             LDR     xi2, [pScale, #6*32-8]
    282             LDR     xi3, [pScale, #6*32+4-8]
    283             SMLAWB  xi0, xi0, xi6, xit
    284             SMLAWT  xi1, xi1, xi6, xit
    285             SMLAWB  xi2, xi2, xi7, xit
    286             SMLAWT  xi3, xi3, xi7, xit
    287             MOV     xi0, xi0, ASR #SHIFT
    288             PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
    289             MOV     xi2, xi2, ASR #SHIFT
    290             PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
    291         ENDIF
    292 
    293         ;// Load odd values
    294         LDR     xi0, [pSrc, #1*16-4]      ;// j1
    295         LDR     xi1, [pSrc, #7*16-4]      ;// j7
    296         LDR     xi2, [pSrc, #5*16-4]      ;// j5
    297         LDR     xi3, [pSrc, #3*16-4]      ;// j3
    298 
    299         IF  {TRUE}
    300             ;// shortcut if odd values 0
    301             TEQ     xi0, #0
    302             TEQEQ   xi1, #0
    303             TEQEQ   xi2, #0
    304             TEQEQ   xi3, #0
    305             BEQ     v6OddZero$_F
    306         ENDIF
    307 
    308         ;// Store scaled even values
    309         STMIA   pDest, {xi4, xi5, xi6, xi7}
    310 
    311         ;// Scale odd values
    312         IF "$inscale"="s16"
    313             ;// Perform AAN Scale
    314             LDR     xi4, [pScale, #1*16-4]
    315             LDR     xi5, [pScale, #7*16-4]
    316             LDR     xi6, [pScale, #5*16-4]
    317             SMLABB  xi7, xi0, xi4, xit
    318             SMLATT  xi0, xi0, xi4, xit
    319             SMLABB  xi4, xi1, xi5, xit
    320             SMLATT  xi1, xi1, xi5, xit
    321             MOV     xi7, xi7, ASR #SHIFT
    322             PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
    323             LDR     xi7, [pScale, #3*16-4]
    324             SMLABB  xi5, xi2, xi6, xit
    325             SMLATT  xi2, xi2, xi6, xit
    326             MOV     xi4, xi4, ASR #SHIFT
    327             PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
    328             SMLABB  xi6, xi3, xi7, xit
    329             SMLATT  xi3, xi3, xi7, xit
    330             MOV     xi5, xi5, ASR #SHIFT
    331             PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
    332             MOV     xi6, xi6, ASR #SHIFT
    333             PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
    334         ENDIF
    335         IF "$inscale"="s32" ;// 32x16 mul
    336             LDR     xi4, [pScale, #1*32-8]
    337             LDR     xi5, [pScale, #1*32+4-8]
    338             LDR     xi6, [pScale, #7*32-8]
    339             LDR     xi7, [pScale, #7*32+4-8]
    340             SMLAWB  xi4, xi4, xi0, xit
    341             SMLAWT  xi5, xi5, xi0, xit
    342             SMLAWB  xi6, xi6, xi1, xit
    343             SMLAWT  xi7, xi7, xi1, xit
    344             MOV     xi4, xi4, ASR #SHIFT
    345             PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
    346             MOV     xi6, xi6, ASR #SHIFT
    347             PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
    348             LDR     xi4, [pScale, #5*32-8]
    349             LDR     xi5, [pScale, #5*32+4-8]
    350             LDR     xi6, [pScale, #3*32-8]
    351             LDR     xi7, [pScale, #3*32+4-8]
    352             SMLAWB  xi4, xi4, xi2, xit
    353             SMLAWT  xi5, xi5, xi2, xit
    354             SMLAWB  xi6, xi6, xi3, xit
    355             SMLAWT  xi7, xi7, xi3, xit
    356             MOV     xi4, xi4, ASR #SHIFT
    357             PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
    358             MOV     xi6, xi6, ASR #SHIFT
    359             PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
    360         ENDIF
    361 
    362         SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
    363         SSUB16  xi6, xi0, xi1           ;// j1-j7
    364         SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
    365         SSUB16  xi4, xi2, xi3           ;// j5-j3
    366 
    367         SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
    368 
    369         PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
    370         PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
    371 
    372         SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
    373         SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
    374         SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
    375         SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
    376 
    377         SMULBB  xi1, xi3, LoopRR2
    378         SMULTB  xi3, xi3, LoopRR2
    379 
    380         PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
    381         PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
    382         SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
    383 
    384         ;// xi0,xi1,xi2,xi3 now free
    385         ;// IStage 4,3, rows 2to3 x1/2
    386 
    387         MOV     xi3, xi3, LSL #1
    388         PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
    389         LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
    390 
    391         ;// IStage 2, rows4to7
    392         SSUB16  xg6, xh6, xh7
    393         SSUB16  xg5, xh5, xg6
    394         SSUB16  xg4, xh4, xg5
    395 
    396         SSUB16  xi2, xi0, xi1           ;// (j2-j6)
    397         SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
    398 
    399         SMULBB  xi0, xi2, LoopRR2
    400         SMULTB  xi2, xi2, LoopRR2
    401 
    402         MOV     xi2, xi2, LSL #1
    403         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    404 
    405         ;// xi0, xi1 now free
    406         ;// IStage 4,3 rows 0to1 x 1/2
    407         LDRD    xi0, [pDest]            ;// j0, j4 scaled
    408         SSUB16  xh2, xh2, xi3
    409         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    410 
    411         SHADD16 xh0, xi0, xi1
    412         SHSUB16 xh1, xi0, xi1
    413 
    414         ;// IStage 2 rows 0to3 x 1/2
    415         SHSUB16 xg2, xh1, xh2
    416         SHADD16 xg1, xh1, xh2
    417         SHSUB16 xg3, xh0, xh3
    418         SHADD16 xg0, xh0, xh3
    419 
    420         ;// IStage 1 all rows
    421         SADD16  xf3, xg3, xg4
    422         SSUB16  xf4, xg3, xg4
    423         SADD16  xf2, xg2, xg5
    424         SSUB16  xf5, xg2, xg5
    425         SADD16  xf1, xg1, xg6
    426         SSUB16  xf6, xg1, xg6
    427         SADD16  xf0, xg0, xg7
    428         SSUB16  xf7, xg0, xg7
    429 
    430         ;// Transpose, store and loop
    431         PKHBT   ra01, xf0, xf1, LSL #16
    432         PKHTB   rb01, xf1, xf0, ASR #16
    433 
    434         PKHBT   ra23, xf2, xf3, LSL #16
    435         PKHTB   rb23, xf3, xf2, ASR #16
    436 
    437         PKHBT   ra45, xf4, xf5, LSL #16
    438         PKHTB   rb45, xf5, xf4, ASR #16
    439 
    440         PKHBT   ra67, xf6, xf7, LSL #16
    441         STMIA   pDest!, {ra01, ra23, ra45, ra67}
    442         PKHTB   rb67, xf7, xf6, ASR #16
    443         STMIA   pDest!, {rb01, rb23, rb45, rb67}
    444         BCC     v6_idct_col$_F
    445 
    446         SUB     pSrc, pDest, #(64*2)
    447         M_LDR   pDest, ppDest
    448         IF "$stride"="s"
    449             M_LDR   pScale, pStride
    450         ENDIF
    451         B       v6_idct_row$_F
    452 
    453 v6OddZero$_F
    454         SSUB16  xi2, xi6, xi7           ;// (j2-j6)
    455         SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
    456 
    457         SMULBB  xi0, xi2, LoopRR2
    458         SMULTB  xi2, xi2, LoopRR2
    459 
    460         MOV     xi2, xi2, LSL #1
    461         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    462         SSUB16  xh2, xh2, xi3
    463 
    464         ;// xi0, xi1 now free
    465         ;// IStage 4,3 rows 0to1 x 1/2
    466 
    467         SHADD16 xh0, xi4, xi5
    468         SHSUB16 xh1, xi4, xi5
    469 
    470         ;// IStage 2 rows 0to3 x 1/2
    471         SHSUB16 xg2, xh1, xh2
    472         SHADD16 xg1, xh1, xh2
    473         SHSUB16 xg3, xh0, xh3
    474         SHADD16 xg0, xh0, xh3
    475 
    476         ;// IStage 1 all rows
    477         MOV  xf3, xg3
    478         MOV  xf4, xg3
    479         MOV  xf2, xg2
    480         MOV  xf5, xg2
    481         MOV  xf1, xg1
    482         MOV  xf6, xg1
    483         MOV  xf0, xg0
    484         MOV  xf7, xg0
    485 
    486         ;// Transpose
    487         PKHBT   ra01, xf0, xf1, LSL #16
    488         PKHTB   rb01, xf1, xf0, ASR #16
    489 
    490         PKHBT   ra23, xf2, xf3, LSL #16
    491         PKHTB   rb23, xf3, xf2, ASR #16
    492 
    493         PKHBT   ra45, xf4, xf5, LSL #16
    494         PKHTB   rb45, xf5, xf4, ASR #16
    495 
    496         PKHBT   ra67, xf6, xf7, LSL #16
    497         PKHTB   rb67, xf7, xf6, ASR #16
    498 
    499         STMIA   pDest!, {ra01, ra23, ra45, ra67}
    500         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    501         STMIA   pDest!, {rb01, rb23, rb45, rb67}
    502 
    503         BCC     v6_idct_col$_F
    504         SUB     pSrc, pDest, #(64*2)
    505         M_LDR   pDest, ppDest
    506         IF "$stride"="s"
    507             M_LDR   pScale, pStride
    508         ENDIF
    509 
    510 
    511 v6_idct_row$_F
    512         ;// IStage 4,3, rows4to7 x1/4
    513         LDR     xit, =0x00010001        ;// rounding constant
    514         LDR     xi0, [pSrc, #1*16]      ;// j1
    515         LDR     xi1, [pSrc, #7*16]      ;// 4*j7
    516         LDR     xi2, [pSrc, #5*16]      ;// j5
    517         LDR     xi3, [pSrc, #3*16]      ;// j3
    518 
    519         SHADD16 xi1, xi1, xit           ;// 2*j7
    520         SHADD16 xi1, xi1, xit           ;// j7
    521 
    522         SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
    523         SSUB16  xi6, xi0, xi1           ;// j1-j7
    524         SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
    525         SSUB16  xi4, xi2, xi3           ;// j5-j3
    526 
    527         SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
    528 
    529         PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
    530         PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
    531 
    532         SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
    533         SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
    534         SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
    535         SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
    536 
    537         SMULBB  xi1, xi3, LoopRR2
    538         SMULTB  xi3, xi3, LoopRR2
    539 
    540         PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
    541         PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
    542         SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
    543 
    544         MOV     xi3, xi3, LSL #1
    545         PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
    546 
    547         ;// xi0,xi1,xi2,xi3 now free
    548         ;// IStage 4,3, rows 2to3 x1/2
    549 
    550         LDR     xi0, [pSrc, #2*16]      ;// j2
    551         LDR     xi1, [pSrc, #6*16]      ;// 2*j6
    552 
    553         ;// IStage 2, rows4to7
    554         SSUB16  xg6, xh6, xh7
    555         SSUB16  xg5, xh5, xg6
    556         SSUB16  xg4, xh4, xg5
    557 
    558         SHADD16 xi1, xi1, xit           ;// j6
    559         SSUB16  xi2, xi0, xi1           ;// (j2-j6)
    560         SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
    561 
    562         SMULBB  xi0, xi2, LoopRR2
    563         SMULTB  xi2, xi2, LoopRR2
    564 
    565         MOV     xi2, xi2, LSL #1
    566 
    567         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    568 
    569         ;// xi0, xi1 now free
    570         ;// IStage 4,3 rows 0to1 x 1/2
    571         LDR     xi1, [pSrc, #4*16]      ;// j4
    572         LDR     xi0, [pSrc], #4         ;// j0
    573 
    574         SSUB16  xh2, xh2, xi3
    575         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    576 
    577         ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
    578         SHADD16 xh0, xi0, xi1           ;// of DC result
    579         SHSUB16 xh1, xi0, xi1
    580 
    581         ;// IStage 2 rows 0to3 x 1/2
    582         SHSUB16 xg2, xh1, xh2
    583         SHADD16 xg1, xh1, xh2
    584         SHSUB16 xg3, xh0, xh3
    585         SHADD16 xg0, xh0, xh3
    586 
    587         ;// IStage 1 all rows
    588         SHADD16 xf3, xg3, xg4
    589         SHSUB16 xf4, xg3, xg4
    590         SHADD16 xf2, xg2, xg5
    591         SHSUB16 xf5, xg2, xg5
    592         SHADD16 xf1, xg1, xg6
    593         SHSUB16 xf6, xg1, xg6
    594         SHADD16 xf0, xg0, xg7
    595         SHSUB16 xf7, xg0, xg7
    596 
    597         ;// Saturate
    598         IF ("$outsize"="u8")
    599             USAT16  xf0, #8, xf0
    600             USAT16  xf1, #8, xf1
    601             USAT16  xf2, #8, xf2
    602             USAT16  xf3, #8, xf3
    603             USAT16  xf4, #8, xf4
    604             USAT16  xf5, #8, xf5
    605             USAT16  xf6, #8, xf6
    606             USAT16  xf7, #8, xf7
    607         ENDIF
    608         IF ("$outsize"="s9")
    609             SSAT16  xf0, #9, xf0
    610             SSAT16  xf1, #9, xf1
    611             SSAT16  xf2, #9, xf2
    612             SSAT16  xf3, #9, xf3
    613             SSAT16  xf4, #9, xf4
    614             SSAT16  xf5, #9, xf5
    615             SSAT16  xf6, #9, xf6
    616             SSAT16  xf7, #9, xf7
    617         ENDIF
    618 
    619         ;// Transpose to Row, Pack and store
    620         IF ("$outsize"="u8")
    621             ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
    622             ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
    623             ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
    624             ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
    625             PKHBT   ra01, xf0, xf2, LSL #16
    626             PKHTB   rb01, xf2, xf0, ASR #16
    627             PKHBT   ra23, xf4, xf6, LSL #16
    628             PKHTB   rb23, xf6, xf4, ASR #16
    629             STMIA   pDest, {ra01, ra23}
    630             IF "$stride"="s"
    631                 ADD     pDest, pDest, pScale
    632                 STMIA   pDest, {rb01, rb23}
    633                 ADD     pDest, pDest, pScale
    634             ELSE
    635                 ADD     pDest, pDest, #($stride)
    636                 STMIA   pDest, {rb01, rb23}
    637                 ADD     pDest, pDest, #($stride)
    638             ENDIF
    639         ENDIF
    640         IF ("$outsize"="s9"):LOR:("$outsize"="s16")
    641             PKHBT   ra01, xf0, xf1, LSL #16
    642             PKHTB   rb01, xf1, xf0, ASR #16
    643 
    644             PKHBT   ra23, xf2, xf3, LSL #16
    645             PKHTB   rb23, xf3, xf2, ASR #16
    646 
    647             PKHBT   ra45, xf4, xf5, LSL #16
    648             PKHTB   rb45, xf5, xf4, ASR #16
    649 
    650             PKHBT   ra67, xf6, xf7, LSL #16
    651             PKHTB   rb67, xf7, xf6, ASR #16
    652 
    653             STMIA   pDest, {ra01, ra23, ra45, ra67}
    654             IF "$stride"="s"
    655                 ADD     pDest, pDest, pScale
    656                 STMIA   pDest, {rb01, rb23, rb45, rb67}
    657                 ADD     pDest, pDest, pScale
    658             ELSE
    659                 ADD     pDest, pDest, #($stride)
    660                 STMIA   pDest, {rb01, rb23, rb45, rb67}
    661                 ADD     pDest, pDest, #($stride)
    662             ENDIF
    663         ENDIF
    664 
    665         BCC     v6_idct_row$_F
    666         ENDIF ;// ARM1136JS
    667 
    668 
    669         IF CortexA8
    670 
    671 Src0            EQU  7
    672 Src1            EQU  8
    673 Src2            EQU  9
    674 Src3            EQU  10
    675 Src4            EQU  11
    676 Src5            EQU  12
    677 Src6            EQU  13
    678 Src7            EQU  14
    679 Tmp             EQU  15
    680 
    681 qXj0            QN Src0.S16
    682 qXj1            QN Src1.S16
    683 qXj2            QN Src2.S16
    684 qXj3            QN Src3.S16
    685 qXj4            QN Src4.S16
    686 qXj5            QN Src5.S16
    687 qXj6            QN Src6.S16
    688 qXj7            QN Src7.S16
    689 qXjt            QN Tmp.S16
    690 
    691 dXj0lo          DN (Src0*2).S16
    692 dXj0hi          DN (Src0*2+1).S16
    693 dXj1lo          DN (Src1*2).S16
    694 dXj1hi          DN (Src1*2+1).S16
    695 dXj2lo          DN (Src2*2).S16
    696 dXj2hi          DN (Src2*2+1).S16
    697 dXj3lo          DN (Src3*2).S16
    698 dXj3hi          DN (Src3*2+1).S16
    699 dXj4lo          DN (Src4*2).S16
    700 dXj4hi          DN (Src4*2+1).S16
    701 dXj5lo          DN (Src5*2).S16
    702 dXj5hi          DN (Src5*2+1).S16
    703 dXj6lo          DN (Src6*2).S16
    704 dXj6hi          DN (Src6*2+1).S16
    705 dXj7lo          DN (Src7*2).S16
    706 dXj7hi          DN (Src7*2+1).S16
    707 dXjtlo          DN (Tmp*2).S16
    708 dXjthi          DN (Tmp*2+1).S16
    709 
    710 qXi0            QN qXj0
    711 qXi1            QN qXj4
    712 qXi2            QN qXj2
    713 qXi3            QN qXj7
    714 qXi4            QN qXj5
    715 qXi5            QN qXjt
    716 qXi6            QN qXj1
    717 qXi7            QN qXj6
    718 qXit            QN qXj3
    719 
    720 dXi0lo          DN dXj0lo
    721 dXi0hi          DN dXj0hi
    722 dXi1lo          DN dXj4lo
    723 dXi1hi          DN dXj4hi
    724 dXi2lo          DN dXj2lo
    725 dXi2hi          DN dXj2hi
    726 dXi3lo          DN dXj7lo
    727 dXi3hi          DN dXj7hi
    728 dXi4lo          DN dXj5lo
    729 dXi4hi          DN dXj5hi
    730 dXi5lo          DN dXjtlo
    731 dXi5hi          DN dXjthi
    732 dXi6lo          DN dXj1lo
    733 dXi6hi          DN dXj1hi
    734 dXi7lo          DN dXj6lo
    735 dXi7hi          DN dXj6hi
    736 dXitlo          DN dXj3lo
    737 dXithi          DN dXj3hi
    738 
    739 qXh0            QN qXit
    740 qXh1            QN qXi0
    741 qXh2            QN qXi2
    742 qXh3            QN qXi3
    743 qXh4            QN qXi7
    744 qXh5            QN qXi5
    745 qXh6            QN qXi4
    746 qXh7            QN qXi1
    747 qXht            QN qXi6
    748 
    749 dXh0lo          DN dXitlo
    750 dXh0hi          DN dXithi
    751 dXh1lo          DN dXi0lo
    752 dXh1hi          DN dXi0hi
    753 dXh2lo          DN dXi2lo
    754 dXh2hi          DN dXi2hi
    755 dXh3lo          DN dXi3lo
    756 dXh3hi          DN dXi3hi
    757 dXh4lo          DN dXi7lo
    758 dXh4hi          DN dXi7hi
    759 dXh5lo          DN dXi5lo
    760 dXh5hi          DN dXi5hi
    761 dXh6lo          DN dXi4lo
    762 dXh6hi          DN dXi4hi
    763 dXh7lo          DN dXi1lo
    764 dXh7hi          DN dXi1hi
    765 dXhtlo          DN dXi6lo
    766 dXhthi          DN dXi6hi
    767 
    768 qXg0            QN qXh2
    769 qXg1            QN qXht
    770 qXg2            QN qXh1
    771 qXg3            QN qXh0
    772 qXg4            QN qXh4
    773 qXg5            QN qXh5
    774 qXg6            QN qXh6
    775 qXg7            QN qXh7
    776 qXgt            QN qXh3
    777 
    778 qXf0            QN qXg6
    779 qXf1            QN qXg5
    780 qXf2            QN qXg4
    781 qXf3            QN qXgt
    782 qXf4            QN qXg3
    783 qXf5            QN qXg2
    784 qXf6            QN qXg1
    785 qXf7            QN qXg0
    786 qXft            QN qXg7
    787 
    788 
    789 qXt0            QN 1.S32
    790 qXt1            QN 2.S32
    791 qT0lo           QN 1.S32
    792 qT0hi           QN 2.S32
    793 qT1lo           QN 3.S32
    794 qT1hi           QN 4.S32
    795 qScalelo        QN 5.S32        ;// used to read post scale values
    796 qScalehi        QN 6.S32
    797 qTemp0          QN 5.S32
    798 qTemp1          QN 6.S32
    799 
    800 
    801 Scale1          EQU 6
    802 Scale2          EQU 15
    803 qScale1         QN Scale1.S16
    804 qScale2         QN Scale2.S16
    805 dScale1lo       DN (Scale1*2).S16
    806 dScale1hi       DN (Scale1*2+1).S16
    807 dScale2lo       DN (Scale2*2).S16
    808 dScale2hi       DN (Scale2*2+1).S16
    809 
    810 dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
    811 InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
    812 S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
    813 C               DN dCoefs[2]    ;// Cos(PI/8) in Q15
    814 
    815 pTemp           RN 12
    816 
    817 
    818         IMPORT  armCOMM_IDCTCoef
    819 
    820         VLD1        {qXj0,qXj1}, [pSrc @64]!
    821         VLD1        {qXj2,qXj3}, [pSrc @64]!
    822         VLD1        {qXj4,qXj5}, [pSrc @64]!
    823         VLD1        {qXj6,qXj7}, [pSrc @64]!
    824 
    825         ;// Load PreScale and multiply with Src
    826         ;// IStage 4
    827 
    828         IF "$inscale"="s16"                         ;// 16X16 Mul
    829             M_IDCT_PRESCALE16
    830         ENDIF
    831 
    832         IF "$inscale"="s32"                         ;// 32X32 ,ul
    833             M_IDCT_PRESCALE32
    834         ENDIF
    835 
    836         ;// IStage 3
    837         VQRDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
    838         VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
    839         VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
    840         VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
    841         VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
    842         VQRDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
    843         VSUB        qXh2, qXi2, qXi3                ;// h2, h3
    844 
    845         VMULL       qXt0, dXi4lo, C                 ;// c*i4
    846         VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
    847         VMULL       qXt1, dXi4hi, C
    848         VMLAL       qXt1, dXi6hi, S
    849         VSHRN       dXh4lo, qXt0, #16               ;// h4
    850         VSHRN       dXh4hi, qXt1, #16
    851 
    852         VMULL       qXt0, dXi6lo, C                 ;// c*i6
    853         VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*h6
    854         VMULL       qXt1, dXi6hi, C
    855         VMLSL       qXt1, dXi4hi, S
    856         VSHRN       dXh6lo, qXt0, #16               ;// h6
    857         VSHRN       dXh6hi, qXt1, #16
    858 
    859         ;// IStage 2
    860         VSUB        qXg6, qXh6, qXh7
    861         VSUB        qXg5, qXh5, qXg6
    862         VSUB        qXg4, qXh4, qXg5
    863         VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
    864         VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
    865         VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
    866         VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
    867 
    868         ;// IStage 1 all rows
    869         VADD        qXf3, qXg3, qXg4
    870         VSUB        qXf4, qXg3, qXg4
    871         VADD        qXf2, qXg2, qXg5
    872         VSUB        qXf5, qXg2, qXg5
    873         VADD        qXf1, qXg1, qXg6
    874         VSUB        qXf6, qXg1, qXg6
    875         VADD        qXf0, qXg0, qXg7
    876         VSUB        qXf7, qXg0, qXg7
    877 
    878         ;// Transpose, store and loop
    879 XTR0            EQU Src5
    880 XTR1            EQU Tmp
    881 XTR2            EQU Src6
    882 XTR3            EQU Src7
    883 XTR4            EQU Src3
    884 XTR5            EQU Src0
    885 XTR6            EQU Src1
    886 XTR7            EQU Src2
    887 XTRt            EQU Src4
    888 
    889 qA0             QN  XTR0.S32  ;// for XTRpose
    890 qA1             QN  XTR1.S32
    891 qA2             QN  XTR2.S32
    892 qA3             QN  XTR3.S32
    893 qA4             QN  XTR4.S32
    894 qA5             QN  XTR5.S32
    895 qA6             QN  XTR6.S32
    896 qA7             QN  XTR7.S32
    897 
    898 dB0             DN  XTR0*2+1      ;// for using VSWP
    899 dB1             DN  XTR1*2+1
    900 dB2             DN  XTR2*2+1
    901 dB3             DN  XTR3*2+1
    902 dB4             DN  XTR4*2
    903 dB5             DN  XTR5*2
    904 dB6             DN  XTR6*2
    905 dB7             DN  XTR7*2
    906 
    907 
    908         VTRN        qXf0, qXf1
    909         VTRN        qXf2, qXf3
    910         VTRN        qXf4, qXf5
    911         VTRN        qXf6, qXf7
    912         VTRN        qA0, qA2
    913         VTRN        qA1, qA3
    914         VTRN        qA4, qA6
    915         VTRN        qA5, qA7
    916         VSWP        dB0, dB4
    917         VSWP        dB1, dB5
    918         VSWP        dB2, dB6
    919         VSWP        dB3, dB7
    920 
    921 
    922 qYj0            QN qXf0
    923 qYj1            QN qXf1
    924 qYj2            QN qXf2
    925 qYj3            QN qXf3
    926 qYj4            QN qXf4
    927 qYj5            QN qXf5
    928 qYj6            QN qXf6
    929 qYj7            QN qXf7
    930 qYjt            QN qXft
    931 
    932 dYj0lo          DN (XTR0*2).S16
    933 dYj0hi          DN (XTR0*2+1).S16
    934 dYj1lo          DN (XTR1*2).S16
    935 dYj1hi          DN (XTR1*2+1).S16
    936 dYj2lo          DN (XTR2*2).S16
    937 dYj2hi          DN (XTR2*2+1).S16
    938 dYj3lo          DN (XTR3*2).S16
    939 dYj3hi          DN (XTR3*2+1).S16
    940 dYj4lo          DN (XTR4*2).S16
    941 dYj4hi          DN (XTR4*2+1).S16
    942 dYj5lo          DN (XTR5*2).S16
    943 dYj5hi          DN (XTR5*2+1).S16
    944 dYj6lo          DN (XTR6*2).S16
    945 dYj6hi          DN (XTR6*2+1).S16
    946 dYj7lo          DN (XTR7*2).S16
    947 dYj7hi          DN (XTR7*2+1).S16
    948 dYjtlo          DN (XTRt*2).S16
    949 dYjthi          DN (XTRt*2+1).S16
    950 
    951 qYi0            QN qYj0
    952 qYi1            QN qYj4
    953 qYi2            QN qYj2
    954 qYi3            QN qYj7
    955 qYi4            QN qYj5
    956 qYi5            QN qYjt
    957 qYi6            QN qYj1
    958 qYi7            QN qYj6
    959 qYit            QN qYj3
    960 
    961 dYi0lo          DN dYj0lo
    962 dYi0hi          DN dYj0hi
    963 dYi1lo          DN dYj4lo
    964 dYi1hi          DN dYj4hi
    965 dYi2lo          DN dYj2lo
    966 dYi2hi          DN dYj2hi
    967 dYi3lo          DN dYj7lo
    968 dYi3hi          DN dYj7hi
    969 dYi4lo          DN dYj5lo
    970 dYi4hi          DN dYj5hi
    971 dYi5lo          DN dYjtlo
    972 dYi5hi          DN dYjthi
    973 dYi6lo          DN dYj1lo
    974 dYi6hi          DN dYj1hi
    975 dYi7lo          DN dYj6lo
    976 dYi7hi          DN dYj6hi
    977 dYitlo          DN dYj3lo
    978 dYithi          DN dYj3hi
    979 
    980 qYh0            QN qYit
    981 qYh1            QN qYi0
    982 qYh2            QN qYi2
    983 qYh3            QN qYi3
    984 qYh4            QN qYi7
    985 qYh5            QN qYi5
    986 qYh6            QN qYi4
    987 qYh7            QN qYi1
    988 qYht            QN qYi6
    989 
    990 dYh0lo          DN dYitlo
    991 dYh0hi          DN dYithi
    992 dYh1lo          DN dYi0lo
    993 dYh1hi          DN dYi0hi
    994 dYh2lo          DN dYi2lo
    995 dYh2hi          DN dYi2hi
    996 dYh3lo          DN dYi3lo
    997 dYh3hi          DN dYi3hi
    998 dYh4lo          DN dYi7lo
    999 dYh4hi          DN dYi7hi
   1000 dYh5lo          DN dYi5lo
   1001 dYh5hi          DN dYi5hi
   1002 dYh6lo          DN dYi4lo
   1003 dYh6hi          DN dYi4hi
   1004 dYh7lo          DN dYi1lo
   1005 dYh7hi          DN dYi1hi
   1006 dYhtlo          DN dYi6lo
   1007 dYhthi          DN dYi6hi
   1008 
   1009 qYg0            QN qYh2
   1010 qYg1            QN qYht
   1011 qYg2            QN qYh1
   1012 qYg3            QN qYh0
   1013 qYg4            QN qYh4
   1014 qYg5            QN qYh5
   1015 qYg6            QN qYh6
   1016 qYg7            QN qYh7
   1017 qYgt            QN qYh3
   1018 
   1019 qYf0            QN qYg6
   1020 qYf1            QN qYg5
   1021 qYf2            QN qYg4
   1022 qYf3            QN qYgt
   1023 qYf4            QN qYg3
   1024 qYf5            QN qYg2
   1025 qYf6            QN qYg1
   1026 qYf7            QN qYg0
   1027 qYft            QN qYg7
   1028 
   1029         VRSHR       qYj7, qYj7, #2
   1030         VRSHR       qYj6, qYj6, #1
   1031 
   1032         VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
   1033         VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
   1034         VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
   1035         VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
   1036         VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
   1037         VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
   1038 
   1039         VQRDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
   1040         ;// IStage 4,3 rows 0to1 x 1/2
   1041 
   1042         MOV         pTemp, #0x4             ;// ensure correct round
   1043         VDUP        qScale1, pTemp           ;// of DC result
   1044         VADD        qYi0, qYi0, qScale1
   1045 
   1046         VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
   1047         VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
   1048 
   1049         VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
   1050         VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
   1051         VSUB        qYh2, qYi2, qYi3        ;// h2, h3
   1052         VQRDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
   1053 
   1054         VMULL       qXt0, dYi4lo, C         ;// c*i4
   1055         VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
   1056         VMULL       qXt1, dYi4hi, C
   1057         VMLAL       qXt1, dYi6hi, S
   1058         VSHRN       dYh4lo, qXt0, #16       ;// h4
   1059         VSHRN       dYh4hi, qXt1, #16
   1060 
   1061         VMULL       qXt0, dYi6lo, C         ;// c*i6
   1062         VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*h6
   1063         VMULL       qXt1, dYi6hi, C
   1064         VMLSL       qXt1, dYi4hi, S
   1065         VSHRN       dYh6lo, qXt0, #16       ;// h6
   1066         VSHRN       dYh6hi, qXt1, #16
   1067 
   1068         VSUB        qYg6, qYh6, qYh7
   1069         VSUB        qYg5, qYh5, qYg6
   1070         VSUB        qYg4, qYh4, qYg5
   1071 
   1072         ;// IStage 2 rows 0to3 x 1/2
   1073         VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
   1074         VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
   1075         VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
   1076         VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
   1077 
   1078 
   1079         ;// IStage 1 all rows
   1080         VHADD        qYf3, qYg3, qYg4
   1081         VHSUB        qYf4, qYg3, qYg4
   1082         VHADD        qYf2, qYg2, qYg5
   1083         VHSUB        qYf5, qYg2, qYg5
   1084         VHADD        qYf1, qYg1, qYg6
   1085         VHSUB        qYf6, qYg1, qYg6
   1086         VHADD        qYf0, qYg0, qYg7
   1087         VHSUB        qYf7, qYg0, qYg7
   1088 
   1089 YTR0            EQU Src0
   1090 YTR1            EQU Src4
   1091 YTR2            EQU Src1
   1092 YTR3            EQU Src2
   1093 YTR4            EQU Src7
   1094 YTR5            EQU Src5
   1095 YTR6            EQU Tmp
   1096 YTR7            EQU Src6
   1097 YTRt            EQU Src3
   1098 
   1099 qC0             QN  YTR0.S32                ;// for YTRpose
   1100 qC1             QN  YTR1.S32
   1101 qC2             QN  YTR2.S32
   1102 qC3             QN  YTR3.S32
   1103 qC4             QN  YTR4.S32
   1104 qC5             QN  YTR5.S32
   1105 qC6             QN  YTR6.S32
   1106 qC7             QN  YTR7.S32
   1107 
   1108 dD0             DN  YTR0*2+1                ;// for using VSWP
   1109 dD1             DN  YTR1*2+1
   1110 dD2             DN  YTR2*2+1
   1111 dD3             DN  YTR3*2+1
   1112 dD4             DN  YTR4*2
   1113 dD5             DN  YTR5*2
   1114 dD6             DN  YTR6*2
   1115 dD7             DN  YTR7*2
   1116 
   1117         VTRN        qYf0, qYf1
   1118         VTRN        qYf2, qYf3
   1119         VTRN        qYf4, qYf5
   1120         VTRN        qYf6, qYf7
   1121         VTRN        qC0, qC2
   1122         VTRN        qC1, qC3
   1123         VTRN        qC4, qC6
   1124         VTRN        qC5, qC7
   1125         VSWP        dD0, dD4
   1126         VSWP        dD1, dD5
   1127         VSWP        dD2, dD6
   1128         VSWP        dD3, dD7
   1129 
   1130 
   1131 dYf0U8          DN YTR0*2.U8
   1132 dYf1U8          DN YTR1*2.U8
   1133 dYf2U8          DN YTR2*2.U8
   1134 dYf3U8          DN YTR3*2.U8
   1135 dYf4U8          DN YTR4*2.U8
   1136 dYf5U8          DN YTR5*2.U8
   1137 dYf6U8          DN YTR6*2.U8
   1138 dYf7U8          DN YTR7*2.U8
   1139 
   1140         ;//
   1141         ;// Do saturation if outsize is other than S16
   1142         ;//
   1143 
   1144         IF ("$outsize"="u8")
   1145             ;// Output range [0-255]
   1146             VQMOVN            dYf0U8, qYf0
   1147             VQMOVN            dYf1U8, qYf1
   1148             VQMOVN            dYf2U8, qYf2
   1149             VQMOVN            dYf3U8, qYf3
   1150             VQMOVN            dYf4U8, qYf4
   1151             VQMOVN            dYf5U8, qYf5
   1152             VQMOVN            dYf6U8, qYf6
   1153             VQMOVN            dYf7U8, qYf7
   1154         ENDIF
   1155 
   1156         IF ("$outsize"="s9")
   1157             ;// Output range [-256 to +255]
   1158             VQSHL            qYf0, qYf0, #16-9
   1159             VQSHL            qYf1, qYf1, #16-9
   1160             VQSHL            qYf2, qYf2, #16-9
   1161             VQSHL            qYf3, qYf3, #16-9
   1162             VQSHL            qYf4, qYf4, #16-9
   1163             VQSHL            qYf5, qYf5, #16-9
   1164             VQSHL            qYf6, qYf6, #16-9
   1165             VQSHL            qYf7, qYf7, #16-9
   1166 
   1167             VSHR             qYf0, qYf0, #16-9
   1168             VSHR             qYf1, qYf1, #16-9
   1169             VSHR             qYf2, qYf2, #16-9
   1170             VSHR             qYf3, qYf3, #16-9
   1171             VSHR             qYf4, qYf4, #16-9
   1172             VSHR             qYf5, qYf5, #16-9
   1173             VSHR             qYf6, qYf6, #16-9
   1174             VSHR             qYf7, qYf7, #16-9
   1175         ENDIF
   1176 
   1177         ;// Store output depending on the Stride size
   1178         IF "$stride"="s"
   1179             VST1        qYf0, [pDest @64], Stride
   1180             VST1        qYf1, [pDest @64], Stride
   1181             VST1        qYf2, [pDest @64], Stride
   1182             VST1        qYf3, [pDest @64], Stride
   1183             VST1        qYf4, [pDest @64], Stride
   1184             VST1        qYf5, [pDest @64], Stride
   1185             VST1        qYf6, [pDest @64], Stride
   1186             VST1        qYf7, [pDest @64]
   1187         ELSE
   1188             IF ("$outsize"="u8")
   1189                 VST1        dYf0U8, [pDest @64], #8
   1190                 VST1        dYf1U8, [pDest @64], #8
   1191                 VST1        dYf2U8, [pDest @64], #8
   1192                 VST1        dYf3U8, [pDest @64], #8
   1193                 VST1        dYf4U8, [pDest @64], #8
   1194                 VST1        dYf5U8, [pDest @64], #8
   1195                 VST1        dYf6U8, [pDest @64], #8
   1196                 VST1        dYf7U8, [pDest @64]
   1197             ELSE
   1198                 ;// ("$outsize"="s9") or ("$outsize"="s16")
   1199                 VST1        qYf0, [pDest @64], #16
   1200                 VST1        qYf1, [pDest @64], #16
   1201                 VST1        qYf2, [pDest @64], #16
   1202                 VST1        qYf3, [pDest @64], #16
   1203                 VST1        qYf4, [pDest @64], #16
   1204                 VST1        qYf5, [pDest @64], #16
   1205                 VST1        qYf6, [pDest @64], #16
   1206                 VST1        qYf7, [pDest @64]
   1207             ENDIF
   1208 
   1209         ENDIF
   1210 
   1211 
   1212 
   1213         ENDIF ;// CortexA8
   1214 
   1215 
   1216 
   1217         MEND
   1218 
   1219         ;// Scale TWO input rows with TWO rows of 16 bit scale values
   1220         ;//
   1221         ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
   1222         ;// input (eight S16 values per row) with two rows of scale values. It also
   1223         ;// loads the next two rows of scale values from pScale when $LastRow is "0".
   1224         ;//
   1225         ;// Input Registers:
   1226         ;//
   1227         ;// $dAlo           - Input D register with first four S16 values of row n
   1228         ;// $dAhi           - Input D register with next four S16 values of row n
   1229         ;// $dBlo           - Input D register with first four S16 values of row n+1
   1230         ;// $dBhi           - Input D register with next four S16 values of row n+1
   1231         ;// pScale          - Pointer to the scale values of row n+2
   1232         ;// qT0lo           - Temporary scratch register (products of $dAlo)
   1233         ;// qT0hi           - Temporary scratch register (products of $dAhi)
   1234         ;// qT1lo           - Temporary scratch register (products of $dBlo)
   1235         ;// qT1hi           - Temporary scratch register (products of $dBhi)
   1236         ;// dScale1lo       - First four scale values of row n   (low half of qScale1)
   1237         ;// dScale1hi       - Next four scale values of row n    (high half of qScale1)
   1238         ;// dScale2lo       - First four scale values of row n+1 (low half of qScale2)
   1239         ;// dScale2hi       - Next four scale values of row n+1  (high half of qScale2)
   1240         ;//
   1241         ;// Input Flag
   1242         ;//
   1243         ;// $LastRow        - "0" to load the next two scale rows, any other value to skip the load
   1244         ;//
   1245         ;// Output Registers:
   1246         ;//
   1247         ;// $dAlo           - Scaled output values (first four S16 of row n)
   1248         ;// $dAhi           - Scaled output values (next four S16 of row n)
   1249         ;// $dBlo           - Scaled output values (first four S16 of row n+1)
   1250         ;// $dBhi           - Scaled output values (next four S16 of row n+1)
   1251         ;// qScale1         - Scale values for row n+2 (only reloaded when $LastRow is "0")
   1252         ;// qScale2         - Scale values for row n+3 (only reloaded when $LastRow is "0")
   1253         ;// pScale          - Advanced by 32 bytes when $LastRow is "0", otherwise unchanged
   1254         ;//
   1255         MACRO
   1256         M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
   1257         VMULL       qT0lo, $dAlo, dScale1lo     ;// Widening multiply: row n, first four values
   1258         VMULL       qT0hi, $dAhi, dScale1hi     ;// Widening multiply: row n, next four values
   1259         VMULL       qT1lo, $dBlo, dScale2lo     ;// Widening multiply: row n+1, first four values
   1260         VMULL       qT1hi, $dBhi, dScale2hi     ;// Widening multiply: row n+1, next four values
   1261         IF "$LastRow"="0"                       ;// qScale1/qScale2 are fully consumed above, safe to overwrite
   1262             VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2
   1263             VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
   1264         ENDIF
   1265         VQRSHRN       $dAlo, qT0lo, #12         ;// Round, shift right by 12 and saturate back to 16 bit
   1266         VQRSHRN       $dAhi, qT0hi, #12
   1267         VQRSHRN       $dBlo, qT1lo, #12
   1268         VQRSHRN       $dBhi, qT1hi, #12
   1269         MEND
   1270 
   1271         ;// Scale 8x8 block input values with 16 bit scale values
   1272         ;//
   1273         ;// This macro is used to pre-scale a block of 8x8 input values.
   1274         ;// It also performs the first-stage butterflies of the IDCT on rows 1/7, 2/6 and 5/3.
   1275         ;//
   1276         ;// Input Registers:
   1277         ;//
   1278         ;// dXjnlo          - n th input D register with first four S16 values
   1279         ;// dXjnhi          - n th input D register with next four S16 values
   1280         ;// qXjn            - n th input Q register with eight S16 values
   1281         ;// pScale          - Pointer to scale values (eight rows of eight 16 bit values are read)
   1282         ;//
   1283         ;// Output Registers:
   1284         ;//
   1285         ;// qXin            - n th output Q register with eight S16 output values of 1st stage
   1286         ;// dCoefs, pSrc    - dCoefs is loaded from armCOMM_IDCTCoef; pSrc is overwritten with that address
   1287         MACRO
   1288         M_IDCT_PRESCALE16
   1289         VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
   1290         VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
   1291         M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1, load scale for row 2 & 3
   1292         M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3, load scale for row 4 & 5
   1293         M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5, load scale for row 6 & 7
   1294         M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7, no further scale load
   1295         VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
   1296         VSUB        qXi6, qXj1, qXj7            ;// j1-j7
   1297         LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants (pSrc is overwritten)
   1298         VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
   1299         VSUB        qXi2, qXj2, qXj6            ;// j2-j6
   1300         VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
   1301         VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
   1302         VSUB        qXi4, qXj5, qXj3            ;// j5-j3
   1303         MEND
   1304 
   1305 
   1306         ;// Scale 8x8 block input values with 32 bit scale values
   1307         ;//
   1308         ;// This macro is used to pre-scale a block of 8x8 input values.
   1309         ;// It also performs the first-stage butterflies of the IDCT on rows 1/7, 2/6 and 5/3.
   1310         ;//
   1311         ;// Input Registers:
   1312         ;//
   1313         ;// dXjnlo          - n th input D register with first four S16 values
   1314         ;// dXjnhi          - n th input D register with next four S16 values
   1315         ;// qXjn            - n th input Q register with eight S16 values
   1316         ;// pScale          - Pointer to 32bit scale values in Q23 format
   1317         ;//
   1318         ;// Output Registers:
   1319         ;//
   1320         ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
   1321         ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
   1322         ;// Also modifies pScale, pTemp and pSrc, and loads dCoefs from armCOMM_IDCTCoef
   1323         MACRO
   1324         M_IDCT_PRESCALE32
   1325 qScale0lo       QN 0.S32
   1326 qScale0hi       QN 1.S32
   1327 qScale1lo       QN 2.S32
   1328 qScale1hi       QN 3.S32
   1329 qScale2lo       QN qScale1lo    ;// Scale rows 2, 3 and 4 reuse the qScale1 registers
   1330 qScale2hi       QN qScale1hi
   1331 qScale3lo       QN qScale1lo
   1332 qScale3hi       QN qScale1hi
   1333 qScale4lo       QN qScale1lo
   1334 qScale4hi       QN qScale1hi
   1335 qScale5lo       QN qScale0lo    ;// Scale rows 5, 6 and 7 reuse the qScale0 registers
   1336 qScale5hi       QN qScale0hi
   1337 qScale6lo       QN qScale0lo
   1338 qScale6hi       QN qScale0hi
   1339 qScale7lo       QN qScale0lo
   1340 qScale7hi       QN qScale0hi
   1341 
   1342 qSrc0lo         QN 4.S32        ;// Widened (32 bit) copies of the input rows
   1343 qSrc0hi         QN 5.S32
   1344 qSrc1lo         QN 6.S32
   1345 qSrc1hi         QN Src4.S32
   1346 qSrc2lo         QN qSrc0lo      ;// Rows 2, 3, 4 and 7 reuse the qSrc0 registers
   1347 qSrc2hi         QN qSrc0hi
   1348 qSrc3lo         QN qSrc0lo
   1349 qSrc3hi         QN qSrc0hi
   1350 qSrc4lo         QN qSrc0lo
   1351 qSrc4hi         QN qSrc0hi
   1352 qSrc5lo         QN qSrc1lo      ;// Rows 5 and 6 reuse the qSrc1 registers
   1353 qSrc5hi         QN qSrc1hi
   1354 qSrc6lo         QN qSrc1lo
   1355 qSrc6hi         QN qSrc1hi
   1356 qSrc7lo         QN qSrc0lo
   1357 qSrc7hi         QN qSrc0hi
   1358 
   1359 qRes17lo        QN qScale0lo    ;// Butterfly results reuse the qScale0 registers
   1360 qRes17hi        QN qScale0hi
   1361 qRes26lo        QN qScale0lo
   1362 qRes26hi        QN qScale0hi
   1363 qRes53lo        QN qScale0lo
   1364 qRes53hi        QN qScale0hi
   1365 
   1366             ADD         pTemp, pScale, #4*8*7           ;// Address of scale row 7 (4 bytes * 8 values * 7 rows)
   1367 
   1368             ;// Row 0
   1369             VLD1        {qScale0lo, qScale0hi}, [pScale]!   ;// Scale row 0
   1370             VSHLL       qSrc0lo, dXj0lo, #(12-1)            ;// Widen j0 to 32 bit, shifted left by 11
   1371             VSHLL       qSrc0hi, dXj0hi, #(12-1)
   1372             VLD1        {qScale1lo, qScale1hi}, [pScale]!   ;// Scale row 1
   1373             VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo         ;// High half of doubled product, rounded and saturated
   1374             VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
   1375             VLD1        {qScale7lo, qScale7hi}, [pTemp]!    ;// Scale row 7 (overwrites qScale0, already consumed)
   1376             VSHLL       qSrc1lo, dXj1lo, #(12-1)
   1377             VSHLL       qSrc1hi, dXj1hi, #(12-1)
   1378             VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
   1379             VMOVN       dXi0hi, qSrc0hi
   1380             VSHLL       qSrc7lo, dXj7lo, #(12-1)        ;// qSrc7 reuses qSrc0 (i0 already narrowed out)
   1381             VSHLL       qSrc7hi, dXj7hi, #(12-1)
   1382             SUB         pTemp, pTemp, #((16*2)+(4*8*1)) ;// Undo writeback (16*2) and step back one row (4*8): scale row 6
   1383             VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo     ;// Scaled j1
   1384             VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
   1385             VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo     ;// Scaled j7
   1386             VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
   1387             VLD1        {qScale2lo, qScale2hi}, [pScale]!   ;// Scale row 2 (overwrites qScale1, already consumed)
   1388 
   1389             ;// Row 1 & 7
   1390             VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
   1391             VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
   1392             VMOVN       dXi5lo, qRes17lo                ;// Output i5
   1393             VMOVN       dXi5hi, qRes17hi
   1394             VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
   1395             VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
   1396             VMOVN       dXi6lo, qRes17lo                ;// Output i6
   1397             VMOVN       dXi6hi, qRes17hi
   1398             VSHLL       qSrc2lo, dXj2lo, #(12-1)
   1399             VSHLL       qSrc2hi, dXj2hi, #(12-1)
   1400             VLD1        {qScale6lo, qScale6hi}, [pTemp]!    ;// Scale row 6 (reuses the qRes17 registers, already narrowed out)
   1401             VSHLL       qSrc6lo, dXj6lo, #(12-1)
   1402             VSHLL       qSrc6hi, dXj6hi, #(12-1)
   1403             SUB         pTemp, pTemp, #((16*2)+(4*8*1)) ;// Undo writeback and step back one row: scale row 5
   1404             VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo     ;// Scaled j2
   1405             VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
   1406             VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo     ;// Scaled j6
   1407             VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
   1408             VLD1        {qScale3lo, qScale3hi}, [pScale]!   ;// Scale row 3 (overwrites qScale2, already consumed)
   1409 
   1410             ;// Row 2 & 6
   1411             VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
   1412             VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
   1413             VMOVN       dXi3lo, qRes26lo                ;// Output i3
   1414             VMOVN       dXi3hi, qRes26hi
   1415             VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
   1416             VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
   1417             VMOVN       dXi2lo, qRes26lo                ;// Output i2
   1418             VMOVN       dXi2hi, qRes26hi
   1419             VSHLL       qSrc3lo, dXj3lo, #(12-1)
   1420             VSHLL       qSrc3hi, dXj3hi, #(12-1)
   1421             VLD1        {qScale5lo, qScale5hi}, [pTemp]!    ;// Scale row 5 (reuses the qRes26 registers, already narrowed out)
   1422             VSHLL       qSrc5lo, dXj5lo, #(12-1)
   1423             VSHLL       qSrc5hi, dXj5hi, #(12-1)
   1424             VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo     ;// Scaled j3
   1425             VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
   1426             VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo     ;// Scaled j5
   1427             VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
   1428 
   1429             ;// Row 3 & 5
   1430             VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
   1431             VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
   1432             SUB         pSrc, pSrc, #16*2*2             ;// Rewind pSrc by 64 bytes; assumes pSrc is just past the 8x8 S16 block, giving row 4 - TODO confirm
   1433             VMOVN       dXi7lo, qRes53lo                ;// Output i7
   1434             VMOVN       dXi7hi, qRes53hi
   1435             VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
   1436             VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
   1437             VLD1        qXj4, [pSrc @64]                ;// Reload qXj4 from memory (presumably its register was reused as scratch above - verify)
   1438             VMOVN       dXi4lo, qRes53lo                ;// Output i4
   1439             VMOVN       dXi4hi, qRes53hi
   1440             VSHLL       qSrc4lo, dXj4lo, #(12-1)
   1441             VSHLL       qSrc4hi, dXj4hi, #(12-1)
   1442             VLD1        {qScale4lo, qScale4hi}, [pScale]    ;// Scale row 4 (no writeback: last read through pScale)
   1443             LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants (pSrc is overwritten)
   1444             VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo     ;// Scaled j4
   1445             VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
   1446             VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
   1447             ;// Row 4
   1448             VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
   1449             VMOVN       dXi1hi, qSrc4hi
   1450 
   1451         MEND
   1452 
   1453         END
   1454