Home | History | Annotate | Download | only in api
      1 ;//
      2 ;// Copyright (C) 2004 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// IDCT_s.s
     19 ;//
     20 ;// Inverse DCT module
     21 ;//
     22 ;//
     23 ;// ALGORITHM DESCRIPTION
     24 ;//
     25 ;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
     26 ;// column and then a 1D IDCT for each row.
     27 ;//
     28 ;// The 8-point 1D IDCT is defined by
     29 ;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
     30 ;//
     31 ;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
     32 ;//   c(u,x) = cos( (2x+1)*u*pi/16 )
     33 ;//
     34 ;// We compute the 8-point 1D IDCT using the reverse of
     35 ;// the Arai-Agui-Nakajima flow graph which we split into
     36 ;// 5 stages named in reverse order to identify with the
     37 ;// forward DCT. Direct inversion of the forward formulae
     38 ;// in file FDCT_s.s gives:
     39 ;//
     40 ;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
     41 ;//             [ A(0) = 2*sqrt(2)
     42 ;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
     43 ;//
     44 ;// IStage 4:   i0 = j0             i1 = j4
     45 ;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
     46 ;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
     47 ;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
     48 ;//
     49 ;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
     50 ;//             h2 = (i2*sqrt2)-i3  h3 = i3
     51 ;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
     52 ;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
     53 ;//             [ The above two lines rotate by -(pi/8) ]
     54 ;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2
     55 ;//
     56 ;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
     57 ;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
     58 ;//             g7 = h7             g6 = h6 - h7
     59 ;//             g5 = h5 - g6        g4 = h4 - g5
     60 ;//
     61 ;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
     62 ;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
     63 ;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
     64 ;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
     65 ;//
     66 ;// Note that most coefficients are halved 3 times during the
     67 ;// above calculation. We can rescale the algorithm dividing
     68 ;// the input by 8 to remove the halvings.
     69 ;//
     70 ;// IStage 5:   j(u) = T(u)*A(u)/8
     71 ;//
     72 ;// IStage 4:   i0 = j0             i1 = j4
     73 ;//             i3 = j2 + j6        i2 = j2 - j6
     74 ;//             i7 = j5 + j3        i4 = j5 - j3
     75 ;//             i5 = j1 + j7        i6 = j1 - j7
     76 ;//
     77 ;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
     78 ;//             h2 = (i2*sqrt2)-i3  h3 = i3
     79 ;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
     80 ;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
     81 ;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7
     82 ;//
     83 ;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
     84 ;//             g1 = h1 + h2        g2 = h1 - h2
     85 ;//             g7 = h7             g6 = h6 - h7
     86 ;//             g5 = h5 - g6        g4 = h4 - g5
     87 ;//
     88 ;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
     89 ;//             f1 = g1 + g6        f6 = g1 - g6
     90 ;//             f2 = g2 + g5        f5 = g2 - g5
     91 ;//             f3 = g3 + g4        f4 = g3 - g4
     92 ;//
     93 ;// Note:
     94 ;// 1. The scaling by A(u)/8 can often be combined with inverse
     95 ;//    quantization. The column and row scalings can be combined.
     96 ;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
     97 ;//    to the above code but is otherwise identical.
     98 ;// 3. The rotation by -pi/8 can be performed using three multiplies
     99 ;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
    100 ;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
    101 ;// 4. If |T(u)|<=1 then from the IDCT definition,
    102 ;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
    103 ;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
    104 ;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
    105 ;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
    106 ;//            = (approx)2.64
    107 ;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
    108 ;//    The table below shows input patterns generating the maximum
    109 ;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
    110 ;//    InputPattern      Max |f(x)|
    111 ;//      PPPPPPPP        |f0| =  2.64
    112 ;//      PPPMMMMM        |f1| =  2.64
    113 ;//      PPMMMPPP        |f2| =  2.64
    114 ;//      PPMMPPMM        |f3| =  2.64
    115 ;//      PMMPPMMP        |f4| =  2.64
    116 ;//      PMMPMMPM        |f5| =  2.64
    117 ;//      PMPPMPMP        |f6| =  2.64
    118 ;//      PMPMPMPM        |f7| =  2.64
    119 ;//   Note that this input pattern is the transpose of the
    120 ;//   corresponding max input pattern for the FDCT.
    121 
    122 ;// Arguments
    123 
    124 pSrc    RN 0    ;// r0: pointer to the source data buffer
    125 Stride  RN 1    ;// r1: destination stride in bytes
    126 pDest   RN 2    ;// r2: pointer to the destination data buffer
    127 pScale  RN 3    ;// r3: pointer to the aan-scale table
    128 
    129 
    130         ;// DCT Inverse Macro
    131         ;// The DCT code should be parametrized according
    132         ;// to the following inputs:
    133         ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
    134         ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
    135         ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
    136         ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
    137         ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
    138         ;//
    139         ;// Inputs:
    140         ;// pSrc   = r0 = Pointer to input data
    141         ;//               Range is -256 to +255 (9-bit)
    142         ;// Stride = r1 = Stride between output lines, in bytes
    143         ;// pDest  = r2 = Pointer to output data
    144         ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
    145 
    146 
    147 
    148         MACRO
    149         M_IDCT  $outsize, $inscale, $stride
    150         LCLA    SHIFT
    151 
    152 
    153         IF ARM1136JS
    154 
    155 ;// REGISTER ALLOCATION
    156 ;// This is hard since we have 8 values, 9 free registers and each
    157 ;// butterfly requires a temporary register. We also want to
    158 ;// maintain register order so we can use LDM/STM. The table below
    159 ;// summarises the register allocation that meets all these criteria.
    160 ;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
    161 ;//
    162 ;// r1  a01     g0  h0
    163 ;// r4  b01 f0  g1  h1  i0
    164 ;// r5  a23 f1  g2      i1
    165 ;// r6  b23 f2  g3  h2  i2
    166 ;// r7  a45 f3      h3  i3
    167 ;// r8  b45 f4  g4  h4  i4
    168 ;// r9  a67 f5  g5  h5  i5
    169 ;// r10 b67 f6  g6  h6  i6
    170 ;// r11     f7  g7  h7  i7
    171 ;//
    172 ra01    RN 1
    173 rb01    RN 4
    174 ra23    RN 5
    175 rb23    RN 6
    176 ra45    RN 7
    177 rb45    RN 8
    178 ra67    RN 9
    179 rb67    RN 10
    180 rtmp    RN 11
    181 csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
    182 LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
    183 ;// Transpose allocation
    184 xft     RN ra01
    185 xf0     RN rb01
    186 xf1     RN ra23
    187 xf2     RN rb23
    188 xf3     RN ra45
    189 xf4     RN rb45
    190 xf5     RN ra67
    191 xf6     RN rb67
    192 xf7     RN rtmp
    193 ;// IStage 1 allocation
    194 xg0     RN xft
    195 xg1     RN xf0
    196 xg2     RN xf1
    197 xg3     RN xf2
    198 xgt     RN xf3
    199 xg4     RN xf4
    200 xg5     RN xf5
    201 xg6     RN xf6
    202 xg7     RN xf7
    203 ;// IStage 2 allocation
    204 xh0     RN xg0
    205 xh1     RN xg1
    206 xht     RN xg2
    207 xh2     RN xg3
    208 xh3     RN xgt
    209 xh4     RN xg4
    210 xh5     RN xg5
    211 xh6     RN xg6
    212 xh7     RN xg7
    213 ;// IStage 3,4 allocation
    214 xit     RN xh0
    215 xi0     RN xh1
    216 xi1     RN xht
    217 xi2     RN xh2
    218 xi3     RN xh3
    219 xi4     RN xh4
    220 xi5     RN xh5
    221 xi6     RN xh6
    222 xi7     RN xh7
    223 
    224         M_STR   pDest,  ppDest
    225         IF "$stride"="s"
    226             M_STR   Stride, pStride
    227         ENDIF
    228         M_ADR   pDest,  pBlk
    229         LDR     csPiBy8, =0x30fc7642
    230         LDR     LoopRR2, =0x00005a82
    231 
    232 v6_idct_col$_F
    233         ;// Load even values
    234         LDR     xi4, [pSrc], #4  ;// j0
    235         LDR     xi5, [pSrc, #4*16-4]  ;// j4
    236         LDR     xi6, [pSrc, #2*16-4]  ;// j2
    237         LDR     xi7, [pSrc, #6*16-4]  ;// j6
    238 
    239         ;// Scale Even Values
    240         IF "$inscale"="s16" ;// 16x16 mul
    241 SHIFT       SETA    12
    242             LDR     xi0, [pScale], #4
    243             LDR     xi1, [pScale, #4*16-4]
    244             LDR     xi2, [pScale, #2*16-4]
    245             MOV     xit, #1<<(SHIFT-1)
    246             SMLABB  xi3, xi0, xi4, xit
    247             SMLATT  xi4, xi0, xi4, xit
    248             SMLABB  xi0, xi1, xi5, xit
    249             SMLATT  xi5, xi1, xi5, xit
    250             MOV     xi3, xi3, ASR #SHIFT
    251             PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
    252             LDR     xi3, [pScale, #6*16-4]
    253             SMLABB  xi1, xi2, xi6, xit
    254             SMLATT  xi6, xi2, xi6, xit
    255             MOV     xi0, xi0, ASR #SHIFT
    256             PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
    257             SMLABB  xi2, xi3, xi7, xit
    258             SMLATT  xi7, xi3, xi7, xit
    259             MOV     xi1, xi1, ASR #SHIFT
    260             PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
    261             MOV     xi2, xi2, ASR #SHIFT
    262             PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
    263         ENDIF
    264         IF "$inscale"="s32" ;// 32x16 mul
    265 SHIFT       SETA    (12+8-16)
    266             MOV     xit, #1<<(SHIFT-1)
    267             LDR     xi0, [pScale], #8
    268             LDR     xi1, [pScale, #0*32+4-8]
    269             LDR     xi2, [pScale, #4*32-8]
    270             LDR     xi3, [pScale, #4*32+4-8]
    271             SMLAWB  xi0, xi0, xi4, xit
    272             SMLAWT  xi1, xi1, xi4, xit
    273             SMLAWB  xi2, xi2, xi5, xit
    274             SMLAWT  xi3, xi3, xi5, xit
    275             MOV     xi0, xi0, ASR #SHIFT
    276             PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
    277             MOV     xi2, xi2, ASR #SHIFT
    278             PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
    279             LDR     xi0, [pScale, #2*32-8]
    280             LDR     xi1, [pScale, #2*32+4-8]
    281             LDR     xi2, [pScale, #6*32-8]
    282             LDR     xi3, [pScale, #6*32+4-8]
    283             SMLAWB  xi0, xi0, xi6, xit
    284             SMLAWT  xi1, xi1, xi6, xit
    285             SMLAWB  xi2, xi2, xi7, xit
    286             SMLAWT  xi3, xi3, xi7, xit
    287             MOV     xi0, xi0, ASR #SHIFT
    288             PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
    289             MOV     xi2, xi2, ASR #SHIFT
    290             PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
    291         ENDIF
    292 
    293         ;// Load odd values
    294         LDR     xi0, [pSrc, #1*16-4]      ;// j1
    295         LDR     xi1, [pSrc, #7*16-4]      ;// j7
    296         LDR     xi2, [pSrc, #5*16-4]      ;// j5
    297         LDR     xi3, [pSrc, #3*16-4]      ;// j3
    298 
    299         IF  {TRUE}
    300             ;// shortcut if odd values 0
    301             TEQ     xi0, #0
    302             TEQEQ   xi1, #0
    303             TEQEQ   xi2, #0
    304             TEQEQ   xi3, #0
    305             BEQ     v6OddZero$_F
    306         ENDIF
    307 
    308         ;// Store scaled even values
    309         STMIA   pDest, {xi4, xi5, xi6, xi7}
    310 
    311         ;// Scale odd values
    312         IF "$inscale"="s16"
    313             ;// Perform AAN Scale
    314             LDR     xi4, [pScale, #1*16-4]
    315             LDR     xi5, [pScale, #7*16-4]
    316             LDR     xi6, [pScale, #5*16-4]
    317             SMLABB  xi7, xi0, xi4, xit
    318             SMLATT  xi0, xi0, xi4, xit
    319             SMLABB  xi4, xi1, xi5, xit
    320             SMLATT  xi1, xi1, xi5, xit
    321             MOV     xi7, xi7, ASR #SHIFT
    322             PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
    323             LDR     xi7, [pScale, #3*16-4]
    324             SMLABB  xi5, xi2, xi6, xit
    325             SMLATT  xi2, xi2, xi6, xit
    326             MOV     xi4, xi4, ASR #SHIFT
    327             PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
    328             SMLABB  xi6, xi3, xi7, xit
    329             SMLATT  xi3, xi3, xi7, xit
    330             MOV     xi5, xi5, ASR #SHIFT
    331             PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
    332             MOV     xi6, xi6, ASR #SHIFT
    333             PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
    334         ENDIF
    335         IF "$inscale"="s32" ;// 32x16 mul
    336             LDR     xi4, [pScale, #1*32-8]
    337             LDR     xi5, [pScale, #1*32+4-8]
    338             LDR     xi6, [pScale, #7*32-8]
    339             LDR     xi7, [pScale, #7*32+4-8]
    340             SMLAWB  xi4, xi4, xi0, xit
    341             SMLAWT  xi5, xi5, xi0, xit
    342             SMLAWB  xi6, xi6, xi1, xit
    343             SMLAWT  xi7, xi7, xi1, xit
    344             MOV     xi4, xi4, ASR #SHIFT
    345             PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
    346             MOV     xi6, xi6, ASR #SHIFT
    347             PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
    348             LDR     xi4, [pScale, #5*32-8]
    349             LDR     xi5, [pScale, #5*32+4-8]
    350             LDR     xi6, [pScale, #3*32-8]
    351             LDR     xi7, [pScale, #3*32+4-8]
    352             SMLAWB  xi4, xi4, xi2, xit
    353             SMLAWT  xi5, xi5, xi2, xit
    354             SMLAWB  xi6, xi6, xi3, xit
    355             SMLAWT  xi7, xi7, xi3, xit
    356             MOV     xi4, xi4, ASR #SHIFT
    357             PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
    358             MOV     xi6, xi6, ASR #SHIFT
    359             PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
    360         ENDIF
    361 
    362         LDR     xit, =0x00010001        ;// rounding constant
    363         SADD16 xi5, xi0, xi1           ;// (j1+j7)/2
    364         SHADD16 xi5, xi5, xit
    365 
    366         SSUB16  xi6, xi0, xi1           ;// j1-j7
    367         SADD16 xi7, xi2, xi3           ;// (j5+j3)/2
    368         SHADD16 xi7, xi7, xit
    369 
    370         SSUB16  xi4, xi2, xi3           ;// j5-j3
    371 
    372         SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
    373 
    374         PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
    375         PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
    376 
    377         SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
    378         SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
    379         SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
    380         SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
    381 
    382         SMULBB  xi1, xi3, LoopRR2
    383         SMULTB  xi3, xi3, LoopRR2
    384 
    385         PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
    386         PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
    387         SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
    388 
    389         ;// xi0,xi1,xi2,xi3 now free
    390         ;// IStage 4,3, rows 2to3 x1/2
    391 
    392         MOV     xi3, xi3, LSL #1
    393         PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
    394         LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
    395 
    396         ;// IStage 2, rows4to7
    397         SSUB16  xg6, xh6, xh7
    398         SSUB16  xg5, xh5, xg6
    399         SSUB16  xg4, xh4, xg5
    400 
    401         SSUB16  xi2, xi0, xi1           ;// (j2-j6)
    402 
    403         SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
    404 
    405         SMULBB  xi0, xi2, LoopRR2
    406         SMULTB  xi2, xi2, LoopRR2
    407 
    408         MOV     xi2, xi2, LSL #1
    409         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    410 
    411         ;// xi0, xi1 now free
    412         ;// IStage 4,3 rows 0to1 x 1/2
    413         LDRD    xi0, [pDest]            ;// j0, j4 scaled
    414         SSUB16  xh2, xh2, xi3
    415         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    416 
    417         SHADD16 xh0, xi0, xi1
    418         SHSUB16 xh1, xi0, xi1
    419 
    420         ;// IStage 2 rows 0to3 x 1/2
    421         SHSUB16 xg2, xh1, xh2
    422         SHADD16 xg1, xh1, xh2
    423         SHSUB16 xg3, xh0, xh3
    424         SHADD16 xg0, xh0, xh3
    425 
    426         ;// IStage 1 all rows
    427         SADD16  xf3, xg3, xg4
    428         SSUB16  xf4, xg3, xg4
    429         SADD16  xf2, xg2, xg5
    430         SSUB16  xf5, xg2, xg5
    431         SADD16  xf1, xg1, xg6
    432         SSUB16  xf6, xg1, xg6
    433         SADD16  xf0, xg0, xg7
    434         SSUB16  xf7, xg0, xg7
    435 
    436         ;// Transpose, store and loop
    437         PKHBT   ra01, xf0, xf1, LSL #16
    438         PKHTB   rb01, xf1, xf0, ASR #16
    439 
    440         PKHBT   ra23, xf2, xf3, LSL #16
    441         PKHTB   rb23, xf3, xf2, ASR #16
    442 
    443         PKHBT   ra45, xf4, xf5, LSL #16
    444         PKHTB   rb45, xf5, xf4, ASR #16
    445 
    446         PKHBT   ra67, xf6, xf7, LSL #16
    447         STMIA   pDest!, {ra01, ra23, ra45, ra67}
    448         PKHTB   rb67, xf7, xf6, ASR #16
    449         STMIA   pDest!, {rb01, rb23, rb45, rb67}
    450         BCC     v6_idct_col$_F
    451 
    452         SUB     pSrc, pDest, #(64*2)
    453         M_LDR   pDest, ppDest
    454         IF "$stride"="s"
    455             M_LDR   pScale, pStride
    456         ENDIF
    457         B       v6_idct_row$_F
    458 
    459 v6OddZero$_F
    460         SSUB16  xi2, xi6, xi7           ;// (j2-j6)
    461         SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
    462 
    463         SMULBB  xi0, xi2, LoopRR2
    464         SMULTB  xi2, xi2, LoopRR2
    465 
    466         MOV     xi2, xi2, LSL #1
    467         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    468         SSUB16  xh2, xh2, xi3
    469 
    470         ;// xi0, xi1 now free
    471         ;// IStage 4,3 rows 0to1 x 1/2
    472 
    473         SHADD16 xh0, xi4, xi5
    474         SHSUB16 xh1, xi4, xi5
    475 
    476         ;// IStage 2 rows 0to3 x 1/2
    477         SHSUB16 xg2, xh1, xh2
    478         SHADD16 xg1, xh1, xh2
    479         SHSUB16 xg3, xh0, xh3
    480         SHADD16 xg0, xh0, xh3
    481 
    482         ;// IStage 1 all rows
    483         MOV  xf3, xg3
    484         MOV  xf4, xg3
    485         MOV  xf2, xg2
    486         MOV  xf5, xg2
    487         MOV  xf1, xg1
    488         MOV  xf6, xg1
    489         MOV  xf0, xg0
    490         MOV  xf7, xg0
    491 
    492         ;// Transpose
    493         PKHBT   ra01, xf0, xf1, LSL #16
    494         PKHTB   rb01, xf1, xf0, ASR #16
    495 
    496         PKHBT   ra23, xf2, xf3, LSL #16
    497         PKHTB   rb23, xf3, xf2, ASR #16
    498 
    499         PKHBT   ra45, xf4, xf5, LSL #16
    500         PKHTB   rb45, xf5, xf4, ASR #16
    501 
    502         PKHBT   ra67, xf6, xf7, LSL #16
    503         PKHTB   rb67, xf7, xf6, ASR #16
    504 
    505         STMIA   pDest!, {ra01, ra23, ra45, ra67}
    506         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    507         STMIA   pDest!, {rb01, rb23, rb45, rb67}
    508 
    509         BCC     v6_idct_col$_F
    510         SUB     pSrc, pDest, #(64*2)
    511         M_LDR   pDest, ppDest
    512         IF "$stride"="s"
    513             M_LDR   pScale, pStride
    514         ENDIF
    515 
    516 
    517 v6_idct_row$_F
    518         ;// IStage 4,3, rows4to7 x1/4
    519         LDR     xit, =0x00010001        ;// rounding constant
    520         LDR     xi0, [pSrc, #1*16]      ;// j1
    521         LDR     xi1, [pSrc, #7*16]      ;// 4*j7
    522         LDR     xi2, [pSrc, #5*16]      ;// j5
    523         LDR     xi3, [pSrc, #3*16]      ;// j3
    524 
    525         SHADD16 xi1, xi1, xit           ;// 2*j7
    526         SHADD16 xi1, xi1, xit           ;// j7
    527 
    528         SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
    529         SSUB16  xi6, xi0, xi1           ;// j1-j7
    530         SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
    531         SSUB16  xi4, xi2, xi3           ;// j5-j3
    532 
    533         SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
    534 
    535         PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
    536         PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
    537 
    538         SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
    539         SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
    540         SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]
    541         SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
    542 
    543         SMULBB  xi1, xi3, LoopRR2
    544         SMULTB  xi3, xi3, LoopRR2
    545 
    546         PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
    547         PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
    548         SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
    549 
    550         MOV     xi3, xi3, LSL #1
    551         PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
    552 
    553         ;// xi0,xi1,xi2,xi3 now free
    554         ;// IStage 4,3, rows 2to3 x1/2
    555 
    556         LDR     xi0, [pSrc, #2*16]      ;// j2
    557         LDR     xi1, [pSrc, #6*16]      ;// 2*j6
    558 
    559         ;// IStage 2, rows4to7
    560         SSUB16  xg6, xh6, xh7
    561         SSUB16  xg5, xh5, xg6
    562         SSUB16  xg4, xh4, xg5
    563 
    564         SHADD16 xi1, xi1, xit           ;// j6
    565         SSUB16  xi2, xi0, xi1           ;// (j2-j6)
    566         SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
    567 
    568         SMULBB  xi0, xi2, LoopRR2
    569         SMULTB  xi2, xi2, LoopRR2
    570 
    571         MOV     xi2, xi2, LSL #1
    572 
    573         PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
    574 
    575         ;// xi0, xi1 now free
    576         ;// IStage 4,3 rows 0to1 x 1/2
    577         LDR     xi1, [pSrc, #4*16]      ;// j4
    578         LDR     xi0, [pSrc], #4         ;// j0
    579 
    580         SSUB16  xh2, xh2, xi3
    581         ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
    582 
    583         ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
    584         SHADD16 xh0, xi0, xi1           ;// of DC result
    585         SHSUB16 xh1, xi0, xi1
    586 
    587         ;// IStage 2 rows 0to3 x 1/2
    588         SHSUB16 xg2, xh1, xh2
    589         SHADD16 xg1, xh1, xh2
    590         SHSUB16 xg3, xh0, xh3
    591         SHADD16 xg0, xh0, xh3
    592 
    593         ;// IStage 1 all rows
    594         SHADD16 xf3, xg3, xg4
    595         SHSUB16 xf4, xg3, xg4
    596         SHADD16 xf2, xg2, xg5
    597         SHSUB16 xf5, xg2, xg5
    598         SHADD16 xf1, xg1, xg6
    599         SHSUB16 xf6, xg1, xg6
    600         SHADD16 xf0, xg0, xg7
    601         SHSUB16 xf7, xg0, xg7
    602 
    603         ;// Saturate
    604         IF ("$outsize"="u8")
    605             USAT16  xf0, #8, xf0
    606             USAT16  xf1, #8, xf1
    607             USAT16  xf2, #8, xf2
    608             USAT16  xf3, #8, xf3
    609             USAT16  xf4, #8, xf4
    610             USAT16  xf5, #8, xf5
    611             USAT16  xf6, #8, xf6
    612             USAT16  xf7, #8, xf7
    613         ENDIF
    614         IF ("$outsize"="s9")
    615             SSAT16  xf0, #9, xf0
    616             SSAT16  xf1, #9, xf1
    617             SSAT16  xf2, #9, xf2
    618             SSAT16  xf3, #9, xf3
    619             SSAT16  xf4, #9, xf4
    620             SSAT16  xf5, #9, xf5
    621             SSAT16  xf6, #9, xf6
    622             SSAT16  xf7, #9, xf7
    623         ENDIF
    624 
    625         ;// Transpose to Row, Pack and store
    626         IF ("$outsize"="u8")
    627             ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
    628             ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
    629             ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
    630             ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
    631             PKHBT   ra01, xf0, xf2, LSL #16
    632             PKHTB   rb01, xf2, xf0, ASR #16
    633             PKHBT   ra23, xf4, xf6, LSL #16
    634             PKHTB   rb23, xf6, xf4, ASR #16
    635             STMIA   pDest, {ra01, ra23}
    636             IF "$stride"="s"
    637                 ADD     pDest, pDest, pScale
    638                 STMIA   pDest, {rb01, rb23}
    639                 ADD     pDest, pDest, pScale
    640             ELSE
    641                 ADD     pDest, pDest, #($stride)
    642                 STMIA   pDest, {rb01, rb23}
    643                 ADD     pDest, pDest, #($stride)
    644             ENDIF
    645         ENDIF
    646         IF ("$outsize"="s9"):LOR:("$outsize"="s16")
    647             PKHBT   ra01, xf0, xf1, LSL #16
    648             PKHTB   rb01, xf1, xf0, ASR #16
    649 
    650             PKHBT   ra23, xf2, xf3, LSL #16
    651             PKHTB   rb23, xf3, xf2, ASR #16
    652 
    653             PKHBT   ra45, xf4, xf5, LSL #16
    654             PKHTB   rb45, xf5, xf4, ASR #16
    655 
    656             PKHBT   ra67, xf6, xf7, LSL #16
    657             PKHTB   rb67, xf7, xf6, ASR #16
    658 
    659             STMIA   pDest, {ra01, ra23, ra45, ra67}
    660             IF "$stride"="s"
    661                 ADD     pDest, pDest, pScale
    662                 STMIA   pDest, {rb01, rb23, rb45, rb67}
    663                 ADD     pDest, pDest, pScale
    664             ELSE
    665                 ADD     pDest, pDest, #($stride)
    666                 STMIA   pDest, {rb01, rb23, rb45, rb67}
    667                 ADD     pDest, pDest, #($stride)
    668             ENDIF
    669         ENDIF
    670 
    671         BCC     v6_idct_row$_F
    672         ENDIF ;// ARM1136JS
    673 
    674 
    675         IF CortexA8
    676 
    677 Src0            EQU  7
    678 Src1            EQU  8
    679 Src2            EQU  9
    680 Src3            EQU  10
    681 Src4            EQU  11
    682 Src5            EQU  12
    683 Src6            EQU  13
    684 Src7            EQU  14
    685 Tmp             EQU  15
    686 
    687 qXj0            QN Src0.S16
    688 qXj1            QN Src1.S16
    689 qXj2            QN Src2.S16
    690 qXj3            QN Src3.S16
    691 qXj4            QN Src4.S16
    692 qXj5            QN Src5.S16
    693 qXj6            QN Src6.S16
    694 qXj7            QN Src7.S16
    695 qXjt            QN Tmp.S16
    696 
    697 dXj0lo          DN (Src0*2).S16
    698 dXj0hi          DN (Src0*2+1).S16
    699 dXj1lo          DN (Src1*2).S16
    700 dXj1hi          DN (Src1*2+1).S16
    701 dXj2lo          DN (Src2*2).S16
    702 dXj2hi          DN (Src2*2+1).S16
    703 dXj3lo          DN (Src3*2).S16
    704 dXj3hi          DN (Src3*2+1).S16
    705 dXj4lo          DN (Src4*2).S16
    706 dXj4hi          DN (Src4*2+1).S16
    707 dXj5lo          DN (Src5*2).S16
    708 dXj5hi          DN (Src5*2+1).S16
    709 dXj6lo          DN (Src6*2).S16
    710 dXj6hi          DN (Src6*2+1).S16
    711 dXj7lo          DN (Src7*2).S16
    712 dXj7hi          DN (Src7*2+1).S16
    713 dXjtlo          DN (Tmp*2).S16
    714 dXjthi          DN (Tmp*2+1).S16
    715 
    716 qXi0            QN qXj0
    717 qXi1            QN qXj4
    718 qXi2            QN qXj2
    719 qXi3            QN qXj7
    720 qXi4            QN qXj5
    721 qXi5            QN qXjt
    722 qXi6            QN qXj1
    723 qXi7            QN qXj6
    724 qXit            QN qXj3
    725 
    726 dXi0lo          DN dXj0lo
    727 dXi0hi          DN dXj0hi
    728 dXi1lo          DN dXj4lo
    729 dXi1hi          DN dXj4hi
    730 dXi2lo          DN dXj2lo
    731 dXi2hi          DN dXj2hi
    732 dXi3lo          DN dXj7lo
    733 dXi3hi          DN dXj7hi
    734 dXi4lo          DN dXj5lo
    735 dXi4hi          DN dXj5hi
    736 dXi5lo          DN dXjtlo
    737 dXi5hi          DN dXjthi
    738 dXi6lo          DN dXj1lo
    739 dXi6hi          DN dXj1hi
    740 dXi7lo          DN dXj6lo
    741 dXi7hi          DN dXj6hi
    742 dXitlo          DN dXj3lo
    743 dXithi          DN dXj3hi
    744 
    745 qXh0            QN qXit
    746 qXh1            QN qXi0
    747 qXh2            QN qXi2
    748 qXh3            QN qXi3
    749 qXh4            QN qXi7
    750 qXh5            QN qXi5
    751 qXh6            QN qXi4
    752 qXh7            QN qXi1
    753 qXht            QN qXi6
    754 
    755 dXh0lo          DN dXitlo
    756 dXh0hi          DN dXithi
    757 dXh1lo          DN dXi0lo
    758 dXh1hi          DN dXi0hi
    759 dXh2lo          DN dXi2lo
    760 dXh2hi          DN dXi2hi
    761 dXh3lo          DN dXi3lo
    762 dXh3hi          DN dXi3hi
    763 dXh4lo          DN dXi7lo
    764 dXh4hi          DN dXi7hi
    765 dXh5lo          DN dXi5lo
    766 dXh5hi          DN dXi5hi
    767 dXh6lo          DN dXi4lo
    768 dXh6hi          DN dXi4hi
    769 dXh7lo          DN dXi1lo
    770 dXh7hi          DN dXi1hi
    771 dXhtlo          DN dXi6lo
    772 dXhthi          DN dXi6hi
    773 
    774 qXg0            QN qXh2
    775 qXg1            QN qXht
    776 qXg2            QN qXh1
    777 qXg3            QN qXh0
    778 qXg4            QN qXh4
    779 qXg5            QN qXh5
    780 qXg6            QN qXh6
    781 qXg7            QN qXh7
    782 qXgt            QN qXh3
    783 
    784 qXf0            QN qXg6
    785 qXf1            QN qXg5
    786 qXf2            QN qXg4
    787 qXf3            QN qXgt
    788 qXf4            QN qXg3
    789 qXf5            QN qXg2
    790 qXf6            QN qXg1
    791 qXf7            QN qXg0
    792 qXft            QN qXg7
    793 
    794 
    795 qXt0            QN 1.S32
    796 qXt1            QN 2.S32
    797 qT0lo           QN 1.S32
    798 qT0hi           QN 2.S32
    799 qT1lo           QN 3.S32
    800 qT1hi           QN 4.S32
    801 qScalelo        QN 5.S32        ;// used to read post scale values
    802 qScalehi        QN 6.S32
    803 qTemp0          QN 5.S32
    804 qTemp1          QN 6.S32
    805 
    806 
    807 Scale1          EQU 6
    808 Scale2          EQU 15
    809 qScale1         QN Scale1.S16
    810 qScale2         QN Scale2.S16
    811 dScale1lo       DN (Scale1*2).S16
    812 dScale1hi       DN (Scale1*2+1).S16
    813 dScale2lo       DN (Scale2*2).S16
    814 dScale2hi       DN (Scale2*2+1).S16
    815 
    816 dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
    817 InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
    818 S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
    819 C               DN dCoefs[2]    ;// Cos(PI/8) in Q15
    820 
    821 pTemp           RN 12
    822 
    823 
    824         IMPORT  armCOMM_IDCTCoef
    825 
    826         VLD1        {qXj0,qXj1}, [pSrc @64]!
    827         VLD1        {qXj2,qXj3}, [pSrc @64]!
    828         VLD1        {qXj4,qXj5}, [pSrc @64]!
    829         VLD1        {qXj6,qXj7}, [pSrc @64]!
    830 
    831         ;// Load PreScale and multiply with Src
    832         ;// IStage 4
    833 
    834         IF "$inscale"="s16"                         ;// 16X16 Mul
    835             M_IDCT_PRESCALE16
    836         ENDIF
    837 
     838         IF "$inscale"="s32"                         ;// 32X32 Mul
    839             M_IDCT_PRESCALE32
    840         ENDIF
    841 
    842         ;// IStage 3
    843         VQDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
    844         VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
    845         VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
    846         VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
    847         VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
    848         VQDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
    849         VSUB        qXh2, qXi2, qXi3                ;// h2, h3
    850 
    851         VMULL       qXt0, dXi4lo, C                 ;// c*i4
    852         VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
    853         VMULL       qXt1, dXi4hi, C
    854         VMLAL       qXt1, dXi6hi, S
    855         VSHRN       dXh4lo, qXt0, #16               ;// h4
    856         VSHRN       dXh4hi, qXt1, #16
    857 
    858         VMULL       qXt0, dXi6lo, C                 ;// c*i6
     859         VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*i6
    860         VMULL       qXt1, dXi6hi, C
    861         VMLSL       qXt1, dXi4hi, S
    862         VSHRN       dXh6lo, qXt0, #16               ;// h6
    863         VSHRN       dXh6hi, qXt1, #16
    864 
    865         ;// IStage 2
    866         VSUB        qXg6, qXh6, qXh7
    867         VSUB        qXg5, qXh5, qXg6
    868         VSUB        qXg4, qXh4, qXg5
    869         VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
    870         VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
    871         VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
    872         VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
    873 
    874         ;// IStage 1 all rows
    875         VADD        qXf3, qXg3, qXg4
    876         VSUB        qXf4, qXg3, qXg4
    877         VADD        qXf2, qXg2, qXg5
    878         VSUB        qXf5, qXg2, qXg5
    879         VADD        qXf1, qXg1, qXg6
    880         VSUB        qXf6, qXg1, qXg6
    881         VADD        qXf0, qXg0, qXg7
    882         VSUB        qXf7, qXg0, qXg7
    883 
    884         ;// Transpose, store and loop
    885 XTR0            EQU Src5
    886 XTR1            EQU Tmp
    887 XTR2            EQU Src6
    888 XTR3            EQU Src7
    889 XTR4            EQU Src3
    890 XTR5            EQU Src0
    891 XTR6            EQU Src1
    892 XTR7            EQU Src2
    893 XTRt            EQU Src4
    894 
    895 qA0             QN  XTR0.S32  ;// for XTRpose
    896 qA1             QN  XTR1.S32
    897 qA2             QN  XTR2.S32
    898 qA3             QN  XTR3.S32
    899 qA4             QN  XTR4.S32
    900 qA5             QN  XTR5.S32
    901 qA6             QN  XTR6.S32
    902 qA7             QN  XTR7.S32
    903 
    904 dB0             DN  XTR0*2+1      ;// for using VSWP
    905 dB1             DN  XTR1*2+1
    906 dB2             DN  XTR2*2+1
    907 dB3             DN  XTR3*2+1
    908 dB4             DN  XTR4*2
    909 dB5             DN  XTR5*2
    910 dB6             DN  XTR6*2
    911 dB7             DN  XTR7*2
    912 
    913 
    914         VTRN        qXf0, qXf1
    915         VTRN        qXf2, qXf3
    916         VTRN        qXf4, qXf5
    917         VTRN        qXf6, qXf7
    918         VTRN        qA0, qA2
    919         VTRN        qA1, qA3
    920         VTRN        qA4, qA6
    921         VTRN        qA5, qA7
    922         VSWP        dB0, dB4
    923         VSWP        dB1, dB5
    924         VSWP        dB2, dB6
    925         VSWP        dB3, dB7
    926 
    927 
    928 qYj0            QN qXf0
    929 qYj1            QN qXf1
    930 qYj2            QN qXf2
    931 qYj3            QN qXf3
    932 qYj4            QN qXf4
    933 qYj5            QN qXf5
    934 qYj6            QN qXf6
    935 qYj7            QN qXf7
    936 qYjt            QN qXft
    937 
    938 dYj0lo          DN (XTR0*2).S16
    939 dYj0hi          DN (XTR0*2+1).S16
    940 dYj1lo          DN (XTR1*2).S16
    941 dYj1hi          DN (XTR1*2+1).S16
    942 dYj2lo          DN (XTR2*2).S16
    943 dYj2hi          DN (XTR2*2+1).S16
    944 dYj3lo          DN (XTR3*2).S16
    945 dYj3hi          DN (XTR3*2+1).S16
    946 dYj4lo          DN (XTR4*2).S16
    947 dYj4hi          DN (XTR4*2+1).S16
    948 dYj5lo          DN (XTR5*2).S16
    949 dYj5hi          DN (XTR5*2+1).S16
    950 dYj6lo          DN (XTR6*2).S16
    951 dYj6hi          DN (XTR6*2+1).S16
    952 dYj7lo          DN (XTR7*2).S16
    953 dYj7hi          DN (XTR7*2+1).S16
    954 dYjtlo          DN (XTRt*2).S16
    955 dYjthi          DN (XTRt*2+1).S16
    956 
    957 qYi0            QN qYj0
    958 qYi1            QN qYj4
    959 qYi2            QN qYj2
    960 qYi3            QN qYj7
    961 qYi4            QN qYj5
    962 qYi5            QN qYjt
    963 qYi6            QN qYj1
    964 qYi7            QN qYj6
    965 qYit            QN qYj3
    966 
    967 dYi0lo          DN dYj0lo
    968 dYi0hi          DN dYj0hi
    969 dYi1lo          DN dYj4lo
    970 dYi1hi          DN dYj4hi
    971 dYi2lo          DN dYj2lo
    972 dYi2hi          DN dYj2hi
    973 dYi3lo          DN dYj7lo
    974 dYi3hi          DN dYj7hi
    975 dYi4lo          DN dYj5lo
    976 dYi4hi          DN dYj5hi
    977 dYi5lo          DN dYjtlo
    978 dYi5hi          DN dYjthi
    979 dYi6lo          DN dYj1lo
    980 dYi6hi          DN dYj1hi
    981 dYi7lo          DN dYj6lo
    982 dYi7hi          DN dYj6hi
    983 dYitlo          DN dYj3lo
    984 dYithi          DN dYj3hi
    985 
    986 qYh0            QN qYit
    987 qYh1            QN qYi0
    988 qYh2            QN qYi2
    989 qYh3            QN qYi3
    990 qYh4            QN qYi7
    991 qYh5            QN qYi5
    992 qYh6            QN qYi4
    993 qYh7            QN qYi1
    994 qYht            QN qYi6
    995 
    996 dYh0lo          DN dYitlo
    997 dYh0hi          DN dYithi
    998 dYh1lo          DN dYi0lo
    999 dYh1hi          DN dYi0hi
   1000 dYh2lo          DN dYi2lo
   1001 dYh2hi          DN dYi2hi
   1002 dYh3lo          DN dYi3lo
   1003 dYh3hi          DN dYi3hi
   1004 dYh4lo          DN dYi7lo
   1005 dYh4hi          DN dYi7hi
   1006 dYh5lo          DN dYi5lo
   1007 dYh5hi          DN dYi5hi
   1008 dYh6lo          DN dYi4lo
   1009 dYh6hi          DN dYi4hi
   1010 dYh7lo          DN dYi1lo
   1011 dYh7hi          DN dYi1hi
   1012 dYhtlo          DN dYi6lo
   1013 dYhthi          DN dYi6hi
   1014 
   1015 qYg0            QN qYh2
   1016 qYg1            QN qYht
   1017 qYg2            QN qYh1
   1018 qYg3            QN qYh0
   1019 qYg4            QN qYh4
   1020 qYg5            QN qYh5
   1021 qYg6            QN qYh6
   1022 qYg7            QN qYh7
   1023 qYgt            QN qYh3
   1024 
   1025 qYf0            QN qYg6
   1026 qYf1            QN qYg5
   1027 qYf2            QN qYg4
   1028 qYf3            QN qYgt
   1029 qYf4            QN qYg3
   1030 qYf5            QN qYg2
   1031 qYf6            QN qYg1
   1032 qYf7            QN qYg0
   1033 qYft            QN qYg7
   1034 
   1035         VRSHR       qYj7, qYj7, #2
   1036         VRSHR       qYj6, qYj6, #1
   1037 
   1038         VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
   1039         VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
   1040         VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
   1041         VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
   1042         VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
   1043         VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
   1044 
   1045         VQDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
   1046         ;// IStage 4,3 rows 0to1 x 1/2
   1047 
   1048         MOV         pTemp, #0x4             ;// ensure correct round
   1049         VDUP        qScale1, pTemp           ;// of DC result
   1050         VADD        qYi0, qYi0, qScale1
   1051 
   1052         VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
   1053         VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
   1054 
   1055         VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
   1056         VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
   1057         VSUB        qYh2, qYi2, qYi3        ;// h2, h3
   1058         VQDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
   1059 
   1060         VMULL       qXt0, dYi4lo, C         ;// c*i4
   1061         VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
   1062         VMULL       qXt1, dYi4hi, C
   1063         VMLAL       qXt1, dYi6hi, S
   1064         VSHRN       dYh4lo, qXt0, #16       ;// h4
   1065         VSHRN       dYh4hi, qXt1, #16
   1066 
   1067         VMULL       qXt0, dYi6lo, C         ;// c*i6
    1068         VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*i6
   1069         VMULL       qXt1, dYi6hi, C
   1070         VMLSL       qXt1, dYi4hi, S
   1071         VSHRN       dYh6lo, qXt0, #16       ;// h6
   1072         VSHRN       dYh6hi, qXt1, #16
   1073 
   1074         VSUB        qYg6, qYh6, qYh7
   1075         VSUB        qYg5, qYh5, qYg6
   1076         VSUB        qYg4, qYh4, qYg5
   1077 
   1078         ;// IStage 2 rows 0to3 x 1/2
   1079         VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
   1080         VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
   1081         VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
   1082         VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
   1083 
   1084 
   1085         ;// IStage 1 all rows
   1086         VHADD        qYf3, qYg3, qYg4
   1087         VHSUB        qYf4, qYg3, qYg4
   1088         VHADD        qYf2, qYg2, qYg5
   1089         VHSUB        qYf5, qYg2, qYg5
   1090         VHADD        qYf1, qYg1, qYg6
   1091         VHSUB        qYf6, qYg1, qYg6
   1092         VHADD        qYf0, qYg0, qYg7
   1093         VHSUB        qYf7, qYg0, qYg7
   1094 
   1095 YTR0            EQU Src0
   1096 YTR1            EQU Src4
   1097 YTR2            EQU Src1
   1098 YTR3            EQU Src2
   1099 YTR4            EQU Src7
   1100 YTR5            EQU Src5
   1101 YTR6            EQU Tmp
   1102 YTR7            EQU Src6
   1103 YTRt            EQU Src3
   1104 
   1105 qC0             QN  YTR0.S32                ;// for YTRpose
   1106 qC1             QN  YTR1.S32
   1107 qC2             QN  YTR2.S32
   1108 qC3             QN  YTR3.S32
   1109 qC4             QN  YTR4.S32
   1110 qC5             QN  YTR5.S32
   1111 qC6             QN  YTR6.S32
   1112 qC7             QN  YTR7.S32
   1113 
   1114 dD0             DN  YTR0*2+1                ;// for using VSWP
   1115 dD1             DN  YTR1*2+1
   1116 dD2             DN  YTR2*2+1
   1117 dD3             DN  YTR3*2+1
   1118 dD4             DN  YTR4*2
   1119 dD5             DN  YTR5*2
   1120 dD6             DN  YTR6*2
   1121 dD7             DN  YTR7*2
   1122 
   1123         VTRN        qYf0, qYf1
   1124         VTRN        qYf2, qYf3
   1125         VTRN        qYf4, qYf5
   1126         VTRN        qYf6, qYf7
   1127         VTRN        qC0, qC2
   1128         VTRN        qC1, qC3
   1129         VTRN        qC4, qC6
   1130         VTRN        qC5, qC7
   1131         VSWP        dD0, dD4
   1132         VSWP        dD1, dD5
   1133         VSWP        dD2, dD6
   1134         VSWP        dD3, dD7
   1135 
   1136 
   1137 dYf0U8          DN YTR0*2.U8
   1138 dYf1U8          DN YTR1*2.U8
   1139 dYf2U8          DN YTR2*2.U8
   1140 dYf3U8          DN YTR3*2.U8
   1141 dYf4U8          DN YTR4*2.U8
   1142 dYf5U8          DN YTR5*2.U8
   1143 dYf6U8          DN YTR6*2.U8
   1144 dYf7U8          DN YTR7*2.U8
   1145 
   1146         ;//
   1147         ;// Do saturation if outsize is other than S16
   1148         ;//
   1149 
   1150         IF ("$outsize"="u8")
   1151             ;// Output range [0-255]
   1152             VQMOVN            dYf0U8, qYf0
   1153             VQMOVN            dYf1U8, qYf1
   1154             VQMOVN            dYf2U8, qYf2
   1155             VQMOVN            dYf3U8, qYf3
   1156             VQMOVN            dYf4U8, qYf4
   1157             VQMOVN            dYf5U8, qYf5
   1158             VQMOVN            dYf6U8, qYf6
   1159             VQMOVN            dYf7U8, qYf7
   1160         ENDIF
   1161 
   1162         IF ("$outsize"="s9")
   1163             ;// Output range [-256 to +255]
   1164             VQSHL            qYf0, qYf0, #16-9
   1165             VQSHL            qYf1, qYf1, #16-9
   1166             VQSHL            qYf2, qYf2, #16-9
   1167             VQSHL            qYf3, qYf3, #16-9
   1168             VQSHL            qYf4, qYf4, #16-9
   1169             VQSHL            qYf5, qYf5, #16-9
   1170             VQSHL            qYf6, qYf6, #16-9
   1171             VQSHL            qYf7, qYf7, #16-9
   1172 
   1173             VSHR             qYf0, qYf0, #16-9
   1174             VSHR             qYf1, qYf1, #16-9
   1175             VSHR             qYf2, qYf2, #16-9
   1176             VSHR             qYf3, qYf3, #16-9
   1177             VSHR             qYf4, qYf4, #16-9
   1178             VSHR             qYf5, qYf5, #16-9
   1179             VSHR             qYf6, qYf6, #16-9
   1180             VSHR             qYf7, qYf7, #16-9
   1181         ENDIF
   1182 
   1183         ;// Store output depending on the Stride size
   1184         IF "$stride"="s"
   1185             VST1        qYf0, [pDest @64], Stride
   1186             VST1        qYf1, [pDest @64], Stride
   1187             VST1        qYf2, [pDest @64], Stride
   1188             VST1        qYf3, [pDest @64], Stride
   1189             VST1        qYf4, [pDest @64], Stride
   1190             VST1        qYf5, [pDest @64], Stride
   1191             VST1        qYf6, [pDest @64], Stride
   1192             VST1        qYf7, [pDest @64]
   1193         ELSE
   1194             IF ("$outsize"="u8")
   1195                 VST1        dYf0U8, [pDest @64], #8
   1196                 VST1        dYf1U8, [pDest @64], #8
   1197                 VST1        dYf2U8, [pDest @64], #8
   1198                 VST1        dYf3U8, [pDest @64], #8
   1199                 VST1        dYf4U8, [pDest @64], #8
   1200                 VST1        dYf5U8, [pDest @64], #8
   1201                 VST1        dYf6U8, [pDest @64], #8
   1202                 VST1        dYf7U8, [pDest @64]
   1203             ELSE
   1204                 ;// ("$outsize"="s9") or ("$outsize"="s16")
   1205                 VST1        qYf0, [pDest @64], #16
   1206                 VST1        qYf1, [pDest @64], #16
   1207                 VST1        qYf2, [pDest @64], #16
   1208                 VST1        qYf3, [pDest @64], #16
   1209                 VST1        qYf4, [pDest @64], #16
   1210                 VST1        qYf5, [pDest @64], #16
   1211                 VST1        qYf6, [pDest @64], #16
   1212                 VST1        qYf7, [pDest @64]
   1213             ENDIF
   1214 
   1215         ENDIF
   1216 
   1217 
   1218 
   1219         ENDIF ;// CortexA8
   1220 
   1221 
   1222 
   1223         MEND
   1224 
    1225         ;// Scale TWO input rows with TWO rows of 16 bit scale values
    1226         ;//
    1227         ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
    1228         ;// input (eight S16 values each) with two rows of scale values. It also
    1229         ;// loads the next two rows of scale values from pScale if $LastRow is 0.
    1230         ;//
    1231         ;// Input Registers:
    1232         ;//
    1233         ;// $dAlo           - Input D register with first four S16 values of row n
    1234         ;// $dAhi           - Input D register with next four S16 values of row n
    1235         ;// $dBlo           - Input D register with first four S16 values of row n+1
    1236         ;// $dBhi           - Input D register with next four S16 values of row n+1
    1237         ;// pScale          - Pointer to the scale values of row n+2 (only read if $LastRow is 0)
    1238         ;// qT0lo           - Temporary scratch register (S32 products of $dAlo)
    1239         ;// qT0hi           - Temporary scratch register (S32 products of $dAhi)
    1240         ;// qT1lo           - Temporary scratch register (S32 products of $dBlo)
    1241         ;// qT1hi           - Temporary scratch register (S32 products of $dBhi)
    1242         ;// dScale1lo       - Scale values for first four elements of row n   (low half of qScale1)
    1243         ;// dScale1hi       - Scale values for next four elements of row n    (high half of qScale1)
    1244         ;// dScale2lo       - Scale values for first four elements of row n+1 (low half of qScale2)
    1245         ;// dScale2hi       - Scale values for next four elements of row n+1  (high half of qScale2)
    1246         ;//
    1247         ;// Input Flag
    1248         ;//
    1249         ;// $LastRow        - 0: load scale values for rows n+2 and n+3; any other value: skip the load
    1250         ;//
    1251         ;// Output Registers:
    1252         ;//
    1253         ;// $dAlo           - Scaled output values (first four S16 of row n)
    1254         ;// $dAhi           - Scaled output values (next four S16 of row n)
    1255         ;// $dBlo           - Scaled output values (first four S16 of row n+1)
    1256         ;// $dBhi           - Scaled output values (next four S16 of row n+1)
    1257         ;// qScale1         - Scale values for row n+2 (unchanged if $LastRow is not 0)
    1258         ;// qScale2         - Scale values for row n+3 (unchanged if $LastRow is not 0)
    1259         ;// pScale          - Advanced by 32 bytes (two rows) if $LastRow is 0
    1260         ;//
    1261         MACRO
    1262         M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
    1263         VMULL       qT0lo, $dAlo, dScale1lo         ;// row n,   elements 0-3: widening multiply into S32
    1264         VMULL       qT0hi, $dAhi, dScale1hi         ;// row n,   elements 4-7
    1265         VMULL       qT1lo, $dBlo, dScale2lo         ;// row n+1, elements 0-3
    1266         VMULL       qT1hi, $dBhi, dScale2hi         ;// row n+1, elements 4-7
    1267         IF "$LastRow"="0"
    1268             VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2 (row n scale already consumed above)
    1269             VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
    1270         ENDIF
    1271         VQRSHRN       $dAlo, qT0lo, #12             ;// rounding >> 12, saturate and narrow back to 16 bit
    1272         VQRSHRN       $dAhi, qT0hi, #12
    1273         VQRSHRN       $dBlo, qT1lo, #12
    1274         VQRSHRN       $dBhi, qT1hi, #12
    1275         MEND
   1276 
    1277         ;// Scale 8x8 block input values with 16 bit scale values
    1278         ;//
    1279         ;// This macro pre-scales the 8x8 input block two rows at a time using
    1280         ;// M_IDCT_SCALE16 and then performs the first butterfly stage of the IDCT.
    1281         ;//
    1282         ;// Input Registers:
    1283         ;//
    1284         ;// dXjnlo          - n th input D register with first four S16 values
    1285         ;// dXjnhi          - n th input D register with next four S16 values
    1286         ;// qXjn            - n th input Q register with eight S16 values
    1287         ;// pScale          - Pointer to scale values (eight rows of eight S16 are read; advanced by 128 bytes)
    1288         ;//
    1289         ;// Output Registers:
    1290         ;//
    1291         ;// qXin            - n th output Q register with eight S16 output values of 1st stage
    1292         ;// dCoefs          - AAN constants loaded from armCOMM_IDCTCoef (pSrc is overwritten with that address)
    1293         MACRO
    1294         M_IDCT_PRESCALE16
    1295         VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
    1296         VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
    1297         M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1
    1298         M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3
    1299         M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5
    1300         M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7, no further scale load
    1301         VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
    1302         VSUB        qXi6, qXj1, qXj7            ;// j1-j7
    1303         LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants (pSrc no longer points at the input)
    1304         VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
    1305         VSUB        qXi2, qXj2, qXj6            ;// j2-j6
    1306         VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
    1307         VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
    1308         VSUB        qXi4, qXj5, qXj3            ;// j5-j3
    1309         MEND
   1310 
   1311 
    1312         ;// Scale 8x8 block input values with 32 bit scale values
    1313         ;//
    1314         ;// This macro pre-scales a block of 8x8 S16 input with 32 bit scale values and
    1315         ;// also does the first butterfly stage of the IDCT (sums halved, differences not).
    1316         ;// Rows are combined in the pairs 1&7, 2&6, 3&5; rows 0 and 4 are only scaled (i0, i1).
    1317         ;// Input Registers:
    1318         ;//
    1319         ;// dXjnlo          - n th input D register with first four S16 values
    1320         ;// dXjnhi          - n th input D register with next four S16 values
    1321         ;// qXjn            - n th input Q register with eight S16 values
    1322         ;// pScale          - Pointer to 32bit scale values in Q23 format
    1323         ;// pSrc            - Row 4 is reloaded from pSrc-64, so pSrc is expected to point 128 bytes past the input start
    1324         ;// Output Registers:
    1325         ;//
    1326         ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
    1327         ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
    1328         ;// Also modified: pScale, pTemp, pSrc (ends as address of armCOMM_IDCTCoef), dCoefs (AAN constants)
    1329         MACRO
    1330         M_IDCT_PRESCALE32
    1331 qScale0lo       QN 0.S32        ;// Q0/Q1: scale row 0, then reused for scale rows 7, 6, 5
    1332 qScale0hi       QN 1.S32        ;//        and for the butterfly temporaries qRes17/26/53
    1333 qScale1lo       QN 2.S32        ;// Q2/Q3: scale row 1, then reused for scale rows 2, 3, 4
    1334 qScale1hi       QN 3.S32
    1335 qScale2lo       QN qScale1lo
    1336 qScale2hi       QN qScale1hi
    1337 qScale3lo       QN qScale1lo
    1338 qScale3hi       QN qScale1hi
    1339 qScale4lo       QN qScale1lo
    1340 qScale4hi       QN qScale1hi
    1341 qScale5lo       QN qScale0lo
    1342 qScale5hi       QN qScale0hi
    1343 qScale6lo       QN qScale0lo
    1344 qScale6hi       QN qScale0hi
    1345 qScale7lo       QN qScale0lo
    1346 qScale7hi       QN qScale0hi
    1347 
    1348 qSrc0lo         QN 4.S32        ;// Q4/Q5: widened input row 0, then reused for rows 7, 2, 3, 4
    1349 qSrc0hi         QN 5.S32
    1350 qSrc1lo         QN 6.S32        ;// widened input row 1, then reused for rows 6, 5
    1351 qSrc1hi         QN Src4.S32     ;// register number taken from the Src4 symbol defined outside this macro
    1352 qSrc2lo         QN qSrc0lo
    1353 qSrc2hi         QN qSrc0hi
    1354 qSrc3lo         QN qSrc0lo
    1355 qSrc3hi         QN qSrc0hi
    1356 qSrc4lo         QN qSrc0lo
    1357 qSrc4hi         QN qSrc0hi
    1358 qSrc5lo         QN qSrc1lo
    1359 qSrc5hi         QN qSrc1hi
    1360 qSrc6lo         QN qSrc1lo
    1361 qSrc6hi         QN qSrc1hi
    1362 qSrc7lo         QN qSrc0lo
    1363 qSrc7hi         QN qSrc0hi
    1364 
    1365 qRes17lo        QN qScale0lo    ;// butterfly results share Q0/Q1 with the scale rows already consumed
    1366 qRes17hi        QN qScale0hi
    1367 qRes26lo        QN qScale0lo
    1368 qRes26hi        QN qScale0hi
    1369 qRes53lo        QN qScale0lo
    1370 qRes53hi        QN qScale0hi
    1371 
    1372             ADD         pTemp, pScale, #4*8*7           ;// Address of scale row 7 (7 rows * 8 values * 4 bytes)
    1373 
    1374             ;// Row 0 (widening and scaling of rows 1 & 7 is interleaved here)
    1375             VLD1        {qScale0lo, qScale0hi}, [pScale]!   ;// scale row 0, pScale advances 32 bytes
    1376             VSHLL       qSrc0lo, dXj0lo, #(12-1)            ;// widen j0 to S32 and shift left by 11
    1377             VSHLL       qSrc0hi, dXj0hi, #(12-1)
    1378             VLD1        {qScale1lo, qScale1hi}, [pScale]!   ;// scale row 1
    1379             VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo         ;// j0*scale0: high half of the rounded doubled product
    1380             VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
    1381             VLD1        {qScale7lo, qScale7hi}, [pTemp]!    ;// scale row 7 replaces scale row 0 in Q0/Q1
    1382             VSHLL       qSrc1lo, dXj1lo, #(12-1)            ;// widen j1
    1383             VSHLL       qSrc1hi, dXj1hi, #(12-1)
    1384             VMOVN       dXi0lo, qSrc0lo                 ;// Output i0 (narrow keeps the low 16 bits)
    1385             VMOVN       dXi0hi, qSrc0hi
    1386             VSHLL       qSrc7lo, dXj7lo, #(12-1)            ;// widen j7 into Q4/Q5, free now that i0 is written
    1387             VSHLL       qSrc7hi, dXj7hi, #(12-1)
    1388             SUB         pTemp, pTemp, #((16*2)+(4*8*1))     ;// undo the 32 byte writeback and step back one row: scale row 6
    1389             VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo         ;// j1*scale1
    1390             VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
    1391             VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo         ;// j7*scale7
    1392             VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
    1393             VLD1        {qScale2lo, qScale2hi}, [pScale]!   ;// scale row 2 replaces scale row 1 in Q2/Q3
    1394 
    1395             ;// Row 1 & 7 (widening and scaling of rows 2 & 6 is interleaved here)
    1396             VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
    1397             VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
    1398             VMOVN       dXi5lo, qRes17lo                ;// Output i5
    1399             VMOVN       dXi5hi, qRes17hi
    1400             VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
    1401             VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
    1402             VMOVN       dXi6lo, qRes17lo                ;// Output i6
    1403             VMOVN       dXi6hi, qRes17hi
    1404             VSHLL       qSrc2lo, dXj2lo, #(12-1)            ;// widen j2
    1405             VSHLL       qSrc2hi, dXj2hi, #(12-1)
    1406             VLD1        {qScale6lo, qScale6hi}, [pTemp]!    ;// scale row 6
    1407             VSHLL       qSrc6lo, dXj6lo, #(12-1)            ;// widen j6
    1408             VSHLL       qSrc6hi, dXj6hi, #(12-1)
    1409             SUB         pTemp, pTemp, #((16*2)+(4*8*1))     ;// step back to scale row 5
    1410             VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo         ;// j2*scale2
    1411             VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
    1412             VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo         ;// j6*scale6
    1413             VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
    1414             VLD1        {qScale3lo, qScale3hi}, [pScale]!   ;// scale row 3
    1415 
    1416             ;// Row 2 & 6 (widening and scaling of rows 3 & 5 is interleaved here)
    1417             VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
    1418             VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
    1419             VMOVN       dXi3lo, qRes26lo                ;// Output i3
    1420             VMOVN       dXi3hi, qRes26hi
    1421             VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
    1422             VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
    1423             VMOVN       dXi2lo, qRes26lo                ;// Output i2
    1424             VMOVN       dXi2hi, qRes26hi
    1425             VSHLL       qSrc3lo, dXj3lo, #(12-1)            ;// widen j3
    1426             VSHLL       qSrc3hi, dXj3hi, #(12-1)
    1427             VLD1        {qScale5lo, qScale5hi}, [pTemp]!    ;// scale row 5 (last use of pTemp)
    1428             VSHLL       qSrc5lo, dXj5lo, #(12-1)            ;// widen j5
    1429             VSHLL       qSrc5hi, dXj5hi, #(12-1)
    1430             VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo         ;// j3*scale3
    1431             VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
    1432             VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo         ;// j5*scale5
    1433             VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
    1434 
    1435             ;// Row 3 & 5 (reload and scaling of row 4 is interleaved here)
    1436             VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
    1437             VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
    1438             SUB         pSrc, pSrc, #16*2*2                 ;// rewind pSrc by 64 bytes (four 16 byte rows)
    1439             VMOVN       dXi7lo, qRes53lo                ;// Output i7
    1440             VMOVN       dXi7hi, qRes53hi
    1441             VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
    1442             VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
    1443             VLD1        qXj4, [pSrc @64]                    ;// reload input row 4 (presumably its register was reused above - confirm)
    1444             VMOVN       dXi4lo, qRes53lo                ;// Output i4
    1445             VMOVN       dXi4hi, qRes53hi
    1446             VSHLL       qSrc4lo, dXj4lo, #(12-1)            ;// widen j4
    1447             VSHLL       qSrc4hi, dXj4hi, #(12-1)
    1448             VLD1        {qScale4lo, qScale4hi}, [pScale]    ;// scale row 4, no writeback
    1449             LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
    1450             VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo         ;// j4*scale4
    1451             VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
    1452             VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
    1453             ;// Row 4
    1454             VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
    1455             VMOVN       dXi1hi, qSrc4hi
    1456 
    1457         MEND
   1458 
   1459         END
   1460