@ ixheaacd_complex_ifft_p2_asm - ARMv7 hand-written complex IFFT kernel (radix-4 stages with a radix-2 tail). NOTE(review): this copy is a partial extract - the RADIX2_BFLY loop and the function epilogue continue past the end of this chunk.
      1 .text
      2 .p2align 2
      3 .global ixheaacd_complex_ifft_p2_asm
      4 
      5 ixheaacd_complex_ifft_p2_asm:
      6     STMFD           sp!, {r0-r12, lr}
      7     SUB             sp, sp, #0x44
      8     LDR             r0, [sp, #0x48]
      9     EOR             r0, r0, r0, ASR #31
     10     CLZ             r0, r0
     11     SUB             r12, r0, #16        @dig_rev_shift = norm32(npoints) + 1 -16@
     12     SUB             r0, r0, #1
     13     RSB             r0, r0, #0x1e
     14     AND             r1, r0, #1
     15     STR             r1, [sp, #0x30]
     16     MOV             r1, r0, ASR #1
     17     LDR             r0, [sp, #0x48]     @npoints
     18     STR             r1, [sp, #0x18]
     19     MOV             lr, r0, LSL #1      @(npoints >>1) * 4
     20     MOV             r0, #0
     21 
     22 FIRST_STAGE_R4:
     23     MOVW            r4, #0x3333
     24     MOVT            r4, #0x3333
     25     MOVW            r5, #0x0F0F
     26     MOVT            r5, #0x0F0F
     27     AND             r6, r4, r0
     28     AND             r7, r4, r0, LSR #2
     29     ORR             r4, r7, r6, LSL #2
     30     AND             r6, r5, r4
     31     AND             r7, r5, r4, LSR #4
     32     ORR             r4, r7, r6, LSL #4
     33     BIC             r6, r4, #0x0000FF00
     34     BIC             r7, r4, #0x00FF0000
     35     MOV             r7, r7, LSR #8
     36     ORR             r4, r7, r6, LSL #8
     37     LDR             r5, [sp, #0x30]
     38     MOV             r10, r4, LSR r12
     39     CMP             r5, #0
     40     ADDNE           r10, r10, #1
     41     BICNE           r10, r10, #1
     42 
     43     ADD             r1, r2, r10, LSL #2
     44     LDRD            r4, [r1]            @r4=x0r,  r5=x0i
     45     ADD             r1, r1, lr
     46     LDRD            r8, [r1]            @r8=x1r,  r9=x1i
     47     ADD             r1, r1, lr
     48     LDRD            r6, [r1]            @r6=x2r,  r7=x2i
     49     ADD             r1, r1, lr
     50     LDRD            r10, [r1]           @r10=x3r, r11=x3i
     51     ADD             r0, r0, #4
     52     CMP             r0, lr, ASR #1
     53 
     54     ADD             r4, r4, r6          @x0r = x0r + x2r@
     55     ADD             r5, r5, r7          @x0i = x0i + x2i@
     56     SUB             r6, r4, r6, lsl#1   @x2r = x0r - (x2r << 1)@
     57     SUB             r7, r5, r7, lsl#1   @x2i = x0i - (x2i << 1)@
     58     ADD             r8, r8, r10         @x1r = x1r + x3r@
     59     ADD             r9, r9, r11         @x1i = x1i + x3i@
     60     SUB             r1, r8, r10, lsl#1  @x3r = x1r - (x3r << 1)@
     61     SUB             r11, r9, r11, lsl#1 @x3i = x1i - (x3i << 1)@
     62 
     63     ADD             r4, r4, r8          @x0r = x0r + x1r@
     64     ADD             r5, r5, r9          @x0i = x0i + x1i@
     65     SUB             r8, r4, r8, lsl#1   @x1r = x0r - (x1r << 1)@
     66     SUB             r9, r5, r9, lsl#1   @x1i = x0i - (x1i << 1)
     67     SUB             r6, r6, r11         @x2r = x2r - x3i@
     68     ADD             r7, r7, r1          @x2i = x2i + x3r@
     69     ADD             r10, r6, r11, lsl#1 @x3i = x2r + (x3i << 1)@
     70     SUB             r11, r7, r1, lsl#1  @x3r = x2i - (x3r << 1)@
     71 
     72     STMIA           r3!, {r4-r11}
     73     BLT             FIRST_STAGE_R4
     74     LDR             r1, [sp, #0x18]
     75     LDR             r0, [sp, #0x48]
     76     MOV             r12, #0x40          @nodespacing = 64@
     77     STR             r12, [sp, #0x38]
     78     LDR             r12, [sp, #0x48]
     79     SUB             r3, r3, r0, LSL #3
     80     SUBS            r1, r1, #1
     81     STR             r3, [sp, #0x50]
     82     MOV             r4, r12, ASR #4
     83     MOV             r0, #4
     84     STR             r4, [sp, #0x34]
     85     STR             r1, [sp, #0x3c]
     86     BLE             RADIX2
     87 OUTER_LOOP:
     88     LDR             r1, [sp, #0x44]
     89     LDR             r12, [sp, #0x50]    @WORD32 *data = ptr_y@
     90     STR             r1, [sp, #0x2c]
     91     LDR             r1, [sp, #0x34]
     92 
     93     MOV             r0, r0, LSL #3      @(del<<1) * 4
     94 LOOP_TRIVIAL_TWIDDLE:
     95     LDRD            r4, [r12]           @r4=x0r,  r5=x0i
     96     ADD             r12, r12, r0
     97     LDRD            r6, [r12]           @r6=x1r,  r7=x1i
     98     ADD             r12, r12, r0
     99     LDRD            r8, [r12]           @r8=x2r,  r9=x2i
    100     ADD             r12, r12, r0
    101     LDRD            r10, [r12]          @r10=x3r, r11=x3i
    102 
    103 @MOV    r4,r4,ASR #1
    104 @MOV    r5,r5,ASR #1
    105 @MOV    r6,r6,ASR #1
    106 @MOV    r7,r7,ASR #1
    107 @MOV    r8,r8,ASR #1
    108 @MOV    r9,r9,ASR #1
    109 @MOV    r10,r10,ASR #1
    110 @MOV    r11,r11,ASR #1
    111 
    112     ADD             r4, r4, r8          @x0r = x0r + x2r@
    113     ADD             r5, r5, r9          @x0i = x0i + x2i@
    114     SUB             r8, r4, r8, lsl #1  @x2r = x0r - (x2r << 1)@
    115     SUB             r9, r5, r9, lsl #1  @x2i = x0i - (x2i << 1)@
    116     ADD             r6, r6, r10         @x1r = x1r + x3r@
    117     ADD             r7, r7, r11         @x1i = x1i + x3i@
    118     SUB             r2, r6, r10, lsl #1 @x3r = x1r - (x3r << 1)@
    119     SUB             r11, r7, r11, lsl #1 @x3i = x1i - (x3i << 1)@
    120 
    121     ADD             r4, r4, r6          @x0r = x0r + x1r@
    122     ADD             r5, r5, r7          @x0i = x0i + x1i@
    123 @MOV    r4,r4,ASR #1
    124 @MOV    r5,r5,ASR #1
    125     SUB             r6, r4, r6, lsl #1  @x1r = x0r - (x1r << 1)@
    126     SUB             r7, r5, r7, lsl #1  @x1i = x0i - (x1i << 1)
    127     SUB             r8, r8, r11         @x2r = x2r - x3i@
    128     ADD             r9, r9, r2          @x2i = x2i + x3r@
    129     ADD             r10, r8, r11, lsl#1 @x3i = x2r + (x3i << 1)@
    130     SUB             r11, r9, r2, lsl#1  @x3r = x2i - (x3r << 1)
    131 
    132     STRD            r10, [r12]          @r10=x3r, r11=x3i
    133     SUB             r12, r12, r0
    134     STRD            r6, [r12]           @r6=x1r,  r7=x1i
    135     SUB             r12, r12, r0
    136     STRD            r8, [r12]           @r8=x2r,  r9=x2i
    137     SUB             r12, r12, r0
    138     STRD            r4, [r12]           @r4=x0r,  r5=x0i
    139     ADD             r12, r12, r0, lsl #2
    140 
    141     SUBS            r1, r1, #1
    142     BNE             LOOP_TRIVIAL_TWIDDLE
    143 
    144     MOV             r0, r0, ASR #3
    145     LDR             r4, [sp, #0x38]
    146     LDR             r3, [sp, #0x50]
    147     MUL             r1, r0, r4
    148     ADD             r12, r3, #8
    149     STR             r1, [sp, #0x40]
    150     MOV             r3, r1, ASR #2
    151     ADD             r3, r3, r1, ASR #3
    152     SUB             r3, r3, r1, ASR #4
    153     ADD             r3, r3, r1, ASR #5
    154     SUB             r3, r3, r1, ASR #6
    155     ADD             r3, r3, r1, ASR #7
    156     SUB             r3, r3, r1, ASR #8
    157     STR             r3, [sp, #0x18]
    158 SECOND_LOOP:
    159     LDR             r3, [sp, #0x2c]
    160     LDR             r14, [sp, #0x34]
    161     MOV             r0, r0, LSL #3      @(del<<1) * 4
    162     LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    163     LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    164     LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    165     LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    166     LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    167     LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@
    168 
    169     STR             r4, [sp, #0x24]
    170     STR             r1, [sp, #0x14]
    171     STR             r2, [sp, #0x10]
    172     STR             r5, [sp, #0x0c]
    173     STR             r6, [sp, #0x08]
    174     STR             r7, [sp, #0x04]
    175     STR             r8, [sp]
    176 
    177 RADIX4_BFLY:
    178 
    179     LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    180     LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    181     LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    182     SUBS            r14, r14, #1
    183 
    184     LDR             r1, [sp, #0x14]
    185     LDR             r2, [sp, #0x10]
    186 
    187     SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    188     LSR             r3, r3, #31
    189     ORR             r4, r3, r4, LSL#1
    190     SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    191     LSR             r3, r3, #31
    192     ORR             r6, r3, r6, LSL#1
    193     SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    194     LSR             r3, r3, #31
    195     ORR             r5, r3, r5, LSL#1
    196     SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    197     LSR             r3, r3, #31
    198     ORR             r7, r3, r7, LSL#1
    199     SUB             r7, r7, r6
    200     ADD             r6, r4, r5          @
    201 
    202     LDR             r1, [sp, #0x0c]
    203     LDR             r2, [sp, #0x08]
    204 
    205     SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    206     LSR             r3, r3, #31
    207     ORR             r4, r3, r4, LSL#1
    208     SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    209     LSR             r3, r3, #31
    210     ORR             r8, r3, r8, LSL#1
    211     SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    212     LSR             r3, r3, #31
    213     ORR             r5, r3, r5, LSL#1
    214     SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheacd_mult32(x1r,w1h) ,x1i,w1l)
    215     LSR             r3, r3, #31
    216     ORR             r9, r3, r9, LSL#1
    217     SUB             r9, r9, r8
    218     ADD             r8, r4, r5          @
    219 
    220     LDR             r1, [sp, #0x04]
    221     LDR             r2, [sp]
    222 
    223     SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    224     LSR             r3, r3, #31
    225     ORR             r4, r3, r4, LSL#1
    226     SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    227     LSR             r3, r3, #31
    228     ORR             r10, r3, r10, LSL#1
    229     SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    230     LSR             r3, r3, #31
    231     ORR             r5, r3, r5, LSL#1
    232     SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheacd_mult32(x3r,w3h) ,x3i,w3l)
    233     LSR             r3, r3, #31
    234     ORR             r11, r3, r11, LSL#1
    235     SUB             r11, r11, r10
    236     ADD             r10, r4, r5         @
    237 
    238     @SUB   r12,r12,r0,lsl #1
    239     @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    240     LDR             r4, [r12, -r0, lsl #1]! @
    241     LDR             r5, [r12, #0x04]
    242 
    243 
    244     ADD             r4, r8, r4          @x0r = x0r + x2r@
    245     ADD             r5, r9, r5          @x0i = x0i + x2i@
    246     SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    247     SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    248     ADD             r6, r6, r10         @x1r = x1r + x3r@
    249     ADD             r7, r7, r11         @x1i = x1i + x3i@
    250     SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    251     SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@
    252 
    253     ADD             r4, r4, r6          @x0r = x0r + x1r@
    254     ADD             r5, r5, r7          @x0i = x0i + x1i@
    255     SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    256     SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)
    257     STRD            r4, [r12]           @r4=x0r,  r5=x0i
    258     ADD             r12, r12, r0
    259 
    260     SUB             r8, r8, r11         @x2r = x2r - x3i@
    261     ADD             r9, r9, r10         @x2i = x2i + x3r@
    262     ADD             r4, r8, r11, lsl#1  @x3i = x2r + (x3i << 1)@
    263     SUB             r5, r9, r10, lsl#1  @x3r = x2i - (x3r << 1)
    264 
    265     STRD            r8, [r12]           @r8=x2r,  r9=x2i
    266     ADD             r12, r12, r0
    267     STRD            r6, [r12]           @r6=x1r,  r7=x1i
    268     ADD             r12, r12, r0
    269     STRD            r4, [r12]           @r10=x3r, r11=x3i
    270     ADD             r12, r12, r0
    271 
    272     BNE             RADIX4_BFLY
    273     MOV             r0, r0, ASR #3
    274 
    275     LDR             r1, [sp, #0x48]
    276     LDR             r4, [sp, #0x24]
    277     SUB             r1, r12, r1, LSL #3
    278     LDR             r6, [sp, #0x38]
    279     ADD             r12, r1, #8
    280     LDR             r7, [sp, #0x18]
    281     ADD             r4, r4, r6
    282     CMP             r4, r7
    283     BLE             SECOND_LOOP
    284 
    285 SECOND_LOOP_2:
    286     LDR             r3, [sp, #0x2c]
    287     LDR             r14, [sp, #0x34]
    288     MOV             r0, r0, LSL #3      @(del<<1) * 4
    289 
    290     LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    291     LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    292     LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    293     LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    294     SUB             r3, r3, #2048       @ 512 *4
    295     LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    296     LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@
    297 
    298     STR             r4, [sp, #0x24]
    299 
    300     STR             r1, [sp, #0x14]
    301     STR             r2, [sp, #0x10]
    302     STR             r5, [sp, #0x0c]
    303     STR             r6, [sp, #0x08]
    304     STR             r7, [sp, #0x04]
    305     STR             r8, [sp]
    306 
    307 RADIX4_BFLY_2:
    308     LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    309     LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    310     LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    311     SUBS            r14, r14, #1
    312     LDR             r1, [sp, #0x14]
    313     LDR             r2, [sp, #0x10]
    314 
    315     SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    316     LSR             r3, r3, #31
    317     ORR             r4, r3, r4, LSL#1
    318     SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    319     LSR             r3, r3, #31
    320     ORR             r6, r3, r6, LSL#1
    321     SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    322     LSR             r3, r3, #31
    323     ORR             r5, r3, r5, LSL#1
    324     SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    325     LSR             r3, r3, #31
    326     ORR             r7, r3, r7, LSL#1
    327     SUB             r7, r7, r6
    328     ADD             r6, r4, r5          @
    329 
    330     LDR             r1, [sp, #0x0c]
    331     LDR             r2, [sp, #0x08]
    332 
    333     SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    334     LSR             r3, r3, #31
    335     ORR             r4, r3, r4, LSL#1
    336     SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    337     LSR             r3, r3, #31
    338     ORR             r8, r3, r8, LSL#1
    339     SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    340     LSR             r3, r3, #31
    341     ORR             r5, r3, r5, LSL#1
    342     SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheacd_mult32(x1r,w1h) ,x1i,w1l)
    343     LSR             r3, r3, #31
    344     ORR             r9, r3, r9, LSL#1
    345     SUB             r9, r9, r8
    346     ADD             r8, r4, r5          @
    347 
    348     LDR             r1, [sp, #0x04]
    349     LDR             r2, [sp]
    350 
    351     SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    352     LSR             r3, r3, #31
    353     ORR             r4, r3, r4, LSL#1
    354     SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    355     LSR             r3, r3, #31
    356     ORR             r10, r3, r10, LSL#1
    357     SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    358     LSR             r3, r3, #31
    359     ORR             r5, r3, r5, LSL#1
    360     SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheacd_mult32(x3r,w3h) ,x3i,w3l)
    361     LSR             r3, r3, #31
    362     ORR             r11, r3, r11, LSL#1
    363     SUB             r10, r10, r11
    364     ADD             r11, r5, r4         @
    365 
    366     @SUB    r12,r12,r0,lsl #1
    367     @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    368     LDR             r4, [r12, -r0, lsl #1]! @
    369     LDR             r5, [r12, #0x04]
    370 
    371 
    372     ADD             r4, r8, r4          @x0r = x0r + x2r@
    373     ADD             r5, r9, r5          @x0i = x0i + x2i@
    374     SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    375     SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    376     ADD             r6, r6, r10         @x1r = x1r + x3r@
    377     ADD             r7, r7, r11         @x1i = x1i + x3i@
    378     SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    379     SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@
    380 
    381     ADD             r4, r4, r6          @x0r = x0r + x1r@
    382     ADD             r5, r5, r7          @x0i = x0i + x1i@
    383     SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    384     SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)
    385     STRD            r4, [r12]           @r4=x0r,  r5=x0i
    386     ADD             r12, r12, r0
    387 
    388     SUB             r8, r8, r11         @x2r = x2r - x3i@
    389     ADD             r9, r9, r10         @x2i = x2i + x3r@
    390     ADD             r4, r8, r11, lsl#1  @x3i = x2r + (x3i << 1)@
    391     SUB             r5, r9, r10, lsl#1  @x3r = x2i - (x3r << 1)
    392 
    393     STRD            r8, [r12]           @r8=x2r,  r9=x2i
    394     ADD             r12, r12, r0
    395     STRD            r6, [r12]           @r6=x1r,  r7=x1i
    396     ADD             r12, r12, r0
    397     STRD            r4, [r12]           @r10=x3r, r11=x3i
    398     ADD             r12, r12, r0
    399 
    400     BNE             RADIX4_BFLY_2
    401     MOV             r0, r0, ASR #3
    402 
    403     LDR             r1, [sp, #0x48]
    404     LDR             r4, [sp, #0x24]
    405     SUB             r1, r12, r1, LSL #3
    406     LDR             r6, [sp, #0x38]
    407     ADD             r12, r1, #8
    408     LDR             r7, [sp, #0x40]
    409     ADD             r4, r4, r6
    410     CMP             r4, r7, ASR #1
    411     BLE             SECOND_LOOP_2
    412     LDR             r7, [sp, #0x18]
    413     CMP             r4, r7, LSL #1
    414     BGT             SECOND_LOOP_4
    415 
    416 SECOND_LOOP_3:
    417     LDR             r3, [sp, #0x2c]
    418     LDR             r14, [sp, #0x34]
    419     MOV             r0, r0, LSL #3      @(del<<1) * 4
    420 
    421     LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    422     LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    423     SUB             r3, r3, #2048       @ 512 *4
    424     LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    425     LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    426     LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    427     LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@
    428 
    429     STR             r4, [sp, #0x24]
    430     STR             r1, [sp, #0x14]
    431     STR             r2, [sp, #0x10]
    432     STR             r5, [sp, #0x0c]
    433     STR             r6, [sp, #0x08]
    434     STR             r7, [sp, #0x04]
    435     STR             r8, [sp]
    436 
    437 
    438 RADIX4_BFLY_3:
    439     LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    440     LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    441     LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    442     SUBS            r14, r14, #1
    443 
    444     LDR             r1, [sp, #0x14]
    445     LDR             r2, [sp, #0x10]
    446 
    447     SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    448     LSR             r3, r3, #31
    449     ORR             r4, r3, r4, LSL#1
    450     SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    451     LSR             r3, r3, #31
    452     ORR             r6, r3, r6, LSL#1
    453     SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    454     LSR             r3, r3, #31
    455     ORR             r5, r3, r5, LSL#1
    456     SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    457     LSR             r3, r3, #31
    458     ORR             r7, r3, r7, LSL#1
    459     SUB             r7, r7, r6
    460     ADD             r6, r4, r5          @
    461 
    462     LDR             r1, [sp, #0x0c]
    463     LDR             r2, [sp, #0x08]
    464 
    465     SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    466     LSR             r3, r3, #31
    467     ORR             r4, r3, r4, LSL#1
    468     SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    469     LSR             r3, r3, #31
    470     ORR             r8, r3, r8, LSL#1
    471     SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    472     LSR             r3, r3, #31
    473     ORR             r5, r3, r5, LSL#1
    474     SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheacd_mult32(x1r,w1h) ,x1i,w1l)
    475     LSR             r3, r3, #31
    476     ORR             r9, r3, r9, LSL#1
    477     SUB             r8, r8, r9
    478     ADD             r9, r5, r4          @
    479 
    480     LDR             r1, [sp, #0x04]
    481     LDR             r2, [sp]
    482 
    483     SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    484     LSR             r3, r3, #31
    485     ORR             r4, r3, r4, LSL#1
    486     SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    487     LSR             r3, r3, #31
    488     ORR             r10, r3, r10, LSL#1
    489     SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    490     LSR             r3, r3, #31
    491     ORR             r5, r3, r5, LSL#1
    492     SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheacd_mult32(x3r,w3h) ,x3i,w3l)
    493     LSR             r3, r3, #31
    494     ORR             r11, r3, r11, LSL#1
    495     SUB             r10, r10, r11
    496     ADD             r11, r5, r4         @
    497 
    498     @SUB    r12,r12,r0,lsl #1
    499     @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    500     LDR             r4, [r12, -r0, lsl #1]! @
    501     LDR             r5, [r12, #0x04]
    502 
    503 
    504     ADD             r4, r8, r4          @x0r = x0r + x2r@
    505     ADD             r5, r9, r5          @x0i = x0i + x2i@
    506     SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    507     SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    508     ADD             r6, r6, r10         @x1r = x1r + x3r@
    509     ADD             r7, r7, r11         @x1i = x1i + x3i@
    510     SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    511     SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@
    512 
    513     ADD             r4, r4, r6          @x0r = x0r + x1r@
    514     ADD             r5, r5, r7          @x0i = x0i + x1i@
    515     SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    516     SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)
    517     STRD            r4, [r12]           @r4=x0r,  r5=x0i
    518     ADD             r12, r12, r0
    519 
    520     SUB             r8, r8, r11         @x2r = x2r - x3i@
    521     ADD             r9, r9, r10         @x2i = x2i + x3r@
    522     ADD             r4, r8, r11, lsl#1  @x3i = x2r + (x3i << 1)@
    523     SUB             r5, r9, r10, lsl#1  @x3r = x2i - (x3r << 1)
    524 
    525     STRD            r8, [r12]           @r8=x2r,  r9=x2i
    526     ADD             r12, r12, r0
    527     STRD            r6, [r12]           @r6=x1r,  r7=x1i
    528     ADD             r12, r12, r0
    529     STRD            r4, [r12]           @r10=x3r, r11=x3i
    530     ADD             r12, r12, r0
    531 
    532     BNE             RADIX4_BFLY_3
    533     MOV             r0, r0, ASR #3
    534 
    535     LDR             r1, [sp, #0x48]
    536     LDR             r4, [sp, #0x24]
    537     SUB             r1, r12, r1, LSL #3
    538     LDR             r6, [sp, #0x38]
    539     ADD             r12, r1, #8
    540     LDR             r7, [sp, #0x18]
    541     ADD             r4, r4, r6
    542     CMP             r4, r7, LSL #1
    543     BLE             SECOND_LOOP_3
    544 
    545 SECOND_LOOP_4:
    546     LDR             r3, [sp, #0x2c]
    547     LDR             r14, [sp, #0x34]
    548     MOV             r0, r0, LSL #3      @(del<<1) * 4
    549 
    550     LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    551     LDR             r2, [r3, #0x04]     @w1l = *(twiddles + 2*j + 1)@
    552     SUB             r3, r3, #2048       @ 512 *4
    553     LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    554     LDR             r6, [r3, #0x04]     @w2l = *(twiddles + 2*(j<<1) + 1)@
    555     SUB             r3, r3, #2048       @ 512 *4
    556     LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    557     LDR             r8, [r3, #0x04]     @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@
    558 
    559 
    560     STR             r4, [sp, #0x24]
    561     STR             r1, [sp, #0x14]
    562     STR             r2, [sp, #0x10]
    563     STR             r5, [sp, #0x0c]
    564     STR             r6, [sp, #0x08]
    565     STR             r7, [sp, #0x04]
    566     STR             r8, [sp]
    567 
    568 RADIX4_BFLY_4:
    569     LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    570     LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    571     LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    572     SUBS            r14, r14, #1
    573 
    574     LDR             r1, [sp, #0x14]
    575     LDR             r2, [sp, #0x10]
    576 
    577     SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    578     LSR             r3, r3, #31
    579     ORR             r4, r3, r4, LSL#1
    580     SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    581     LSR             r3, r3, #31
    582     ORR             r6, r3, r6, LSL#1
    583     SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    584     LSR             r3, r3, #31
    585     ORR             r5, r3, r5, LSL#1
    586     SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    587     LSR             r3, r3, #31
    588     ORR             r7, r3, r7, LSL#1
    589     SUB             r7, r7, r6
    590     ADD             r6, r4, r5          @
    591 
    592     LDR             r1, [sp, #0x0c]
    593     LDR             r2, [sp, #0x08]
    594 
    595     SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    596     LSR             r3, r3, #31
    597     ORR             r4, r3, r4, LSL#1
    598     SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    599     LSR             r3, r3, #31
    600     ORR             r8, r3, r8, LSL#1
    601     SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    602     LSR             r3, r3, #31
    603     ORR             r5, r3, r5, LSL#1
    604     SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheacd_mult32(x1r,w1h) ,x1i,w1l)
    605     LSR             r3, r3, #31
    606     ORR             r9, r3, r9, LSL#1
    607     SUB             r8, r8, r9
    608     ADD             r9, r5, r4          @
    609 
    610     LDR             r1, [sp, #0x04]
    611     LDR             r2, [sp]
    612 
    613     SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    614     LSR             r3, r3, #31
    615     ORR             r4, r3, r4, LSL#1
    616     SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    617     LSR             r3, r3, #31
    618     ORR             r10, r3, r10, LSL#1
    619     SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    620     LSR             r3, r3, #31
    621     ORR             r5, r3, r5, LSL#1
    622     SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheacd_mult32(x3r,w3h) ,x3i,w3l)
    623     LSR             r3, r3, #31
    624     ORR             r11, r3, r11, LSL#1
    625     SUB             r11, r11, r10
    626     ADD             r10, r5, r4         @
    627     RSB             r10, r10, #0
    628 
    629     @SUB    r12,r12,r0,lsl #1
    630     @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    @ ------------------------------------------------------------------
    @ Tail of the RADIX4_BFLY_4 loop.  NOTE(review): the loop label and
    @ the loads of r6..r11 (= x1r,x1i,x3r,x3i) are above this view.
    @ r12 walks the in-place data buffer; r0 is the row stride in bytes.
    @ ------------------------------------------------------------------
    LDR             r4, [r12, -r0, lsl #1]! @ rewind r12 by 2*stride, load x2r
    LDR             r5, [r12, #0x04]        @ load x2i


    ADD             r4, r8, r4          @ x0r = x0r + x2r
    ADD             r5, r9, r5          @ x0i = x0i + x2i
    SUB             r8, r4, r8, lsl#1   @ x2r = x0r - (x2r << 1)
    SUB             r9, r5, r9, lsl#1   @ x2i = x0i - (x2i << 1)
    ADD             r6, r6, r10         @ x1r = x1r + x3r
    SUB             r7, r7, r11         @ x1i = x1i - x3i
    SUB             r10, r6, r10, lsl#1 @ x3r = x1r - (x3r << 1)
    ADD             r11, r7, r11, lsl#1 @ x3i = x1i + (x3i << 1)

    ADD             r4, r4, r6          @ x0r = x0r + x1r
    ADD             r5, r5, r7          @ x0i = x0i + x1i
    SUB             r6, r4, r6, lsl#1   @ x1r = x0r - (x1r << 1)
    SUB             r7, r5, r7, lsl#1   @ x1i = x0i - (x1i << 1)
    STRD            r4, [r12]           @ store x0r (r4) / x0i (r5)
    ADD             r12, r12, r0

    SUB             r8, r8, r11         @ x2r = x2r - x3i
    ADD             r9, r9, r10         @ x2i = x2i + x3r
    ADD             r4, r8, r11, lsl#1  @ x3i' = x2r + (x3i << 1)
    SUB             r5, r9, r10, lsl#1  @ x3r' = x2i - (x3r << 1)

    STRD            r8, [r12]           @ store x2r (r8) / x2i (r9)
    ADD             r12, r12, r0
    STRD            r6, [r12]           @ store x1r (r6) / x1i (r7)
    ADD             r12, r12, r0
    STRD            r4, [r12]           @ store x3 pair (r4/r5)
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_4       @ loop flags set by a SUBS above this view
    MOV             r0, r0, ASR #3

    @ End of this stage's butterfly pass: rewind r12 to the start of the
    @ data block and step to the next twiddle group.
    LDR             r1, [sp, #0x48]     @ npoints (see prologue)
    LDR             r4, [sp, #0x24]
    SUB             r1, r12, r1, LSL #3 @ r12 advanced npoints*8 bytes; undo it
    LDR             r6, [sp, #0x38]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x40]
    ADD             r4, r4, r6
    CMP             r4, r7              @ more twiddle groups in this stage?
    BLT             SECOND_LOOP_4
    675 
    @ One radix-4 stage done: rescale the per-stage parameters for the
    @ next pass of OUTER_LOOP.
    LDR             r1, [sp, #0x38]
    MOV             r0, r0, LSL #2      @ stride/del grows 4x per radix-4 stage
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x38]     @ [sp,#0x38] /= 4
    LDR             r1, [sp, #0x34]
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x34]     @ [sp,#0x34] /= 4
    LDR             r1, [sp, #0x3c]
    SUBS            r1, r1, #1
    STR             r1, [sp, #0x3c]     @ decrement remaining stage count
    BGT             OUTER_LOOP

RADIX2:
    @ Optional final radix-2 stage.  [sp,#0x30] holds the parity of
    @ log2(npoints), computed in the prologue; when it is 0 the radix-4
    @ stages covered everything and we exit directly.
    LDR             r1, [sp, #0x30]
    CMP             r1, #0
    BEQ             EXIT
    LDR             r12, [sp, #0x38]
    LDR             r1, [sp, #0x44]     @ coefficient table base (used as W1h/W1l pairs below)
    CMP             r12, #0
    MOVEQ           r4, #1              @ coefficient step r4 = max(1, 2*[sp,#0x38])
    MOVNE           r4, r12, LSL #1
    MOVS            r3, r0              @ r3 = butterfly count; zero => nothing to do
    BEQ             EXIT

    MOV             r3, r3, ASR #1      @ first loop handles half the butterflies
    LDR             r5, [sp, #0x50]     @ r5 = data pointer (LDRD/STRD below)
    MOV             r0, r0, LSL #3      @ (del<<1) * 4 : byte offset from x0 to x1
    STR             r1, [sp, #0x18]     @ [sp,#0x18] = running coefficient pointer
RADIX2_BFLY:
    @ Radix-2 butterfly with coefficient multiply.
    @ Registers: r6/r7 = x0r/x0i, r8/r9 = x1r/x1i, r2 = coefficient word.
    @ Each SMULL / LSR #31 / ORR ..,LSL #1 triple is a Q31 fractional
    @ multiply: hi:lo = a*b (64-bit), result = (a*b) >> 31.
    LDR             r1, [sp, #0x18]     @ current coefficient pointer
    LDRD            r6, [r5]            @ r6 = x0r, r7 = x0i
    ADD             r5, r5, r0
    LDRD            r8, [r5]            @ r8 = x1r, r9 = x1i

    LDR             r2, [r1]            @ first coefficient word (W1h)
    SUBS            r3, r3, #1          @ loop counter; no flag-setters below, so
                                        @ these flags reach the BNE at loop end

    SMULL           r1, r11, r8, r2     @ r11 = (x1r * W1h) >> 31
    LSR             r1, r1, #31
    ORR             r11, r1, r11, LSL#1
    SMULL           r1, r10, r9, r2     @ r10 = (x1i * W1h) >> 31
    LSR             r1, r1, #31
    ORR             r10, r1, r10, LSL#1


    LDR             r1, [sp, #0x18]
    LDR             r2, [r1, #0x04]     @ second coefficient word (W1l)
    ADD             r1, r1, r4, LSL #3  @ advance coefficient pointer by r4*8 bytes
    STR             r1, [sp, #0x18]

    SMULL           r1, r8, r8, r2      @ r8 = (x1r * W1l) >> 31
    LSR             r1, r1, #31
    ORR             r8, r1, r8, LSL#1
    SMULL           r1, r9, r9, r2      @ r9 = (x1i * W1l) >> 31
    LSR             r1, r1, #31
    ORR             r9, r1, r9, LSL#1

    ADD             r8, r8, r10         @ x1r' = x1r*W1l + x1i*W1h
    SUB             r9, r9, r11         @ x1i' = x1i*W1l - x1r*W1h

    @ Halve both operands before add/sub to keep the sums in 32 bits.
    ASR             r8, r8, #1
    ASR             r6, r6, #1
    ASR             r9, r9, #1
    ASR             r7, r7, #1
    ADD             r10, r8, r6         @ (x0r/2) + (x1r'/2)
    ADD             r11, r9, r7         @ (x0i/2) + (x1i'/2)
    SUB             r8, r6, r8          @ (x0r/2) - (x1r'/2)
    SUB             r9, r7, r9          @ (x0i/2) - (x1i'/2)

    STRD            r8, [r5]            @ x1 slot <- difference pair
    SUB             r5, r5, r0
    STRD            r10, [r5], #8       @ x0 slot <- sum pair, step to next x0

    BNE             RADIX2_BFLY

    @ Prepare the second half: same butterflies, complementary
    @ coefficient combination (RADIX2_BFLY_2 below).
    LDR             r1, [sp, #0x44]     @ reset coefficient pointer to table base
    MOV             r3, r0, ASR #4      @ butterfly count for the second half
    STR             r1, [sp, #0x18]
RADIX2_BFLY_2:
    @ Second-half radix-2 butterfly: identical multiply pattern to
    @ RADIX2_BFLY, but the four Q31 products are combined with the
    @ complementary signs (see the three lines after the multiplies).
    LDR             r1, [sp, #0x18]     @ current coefficient pointer
    LDRD            r6, [r5]            @ r6 = x0r, r7 = x0i
    ADD             r5, r5, r0
    LDRD            r8, [r5]            @ r8 = x1r, r9 = x1i

    LDR             r2, [r1]            @ first coefficient word (W1h)
    SUBS            r3, r3, #1          @ loop counter; flags survive to BNE below



    SMULL           r1, r11, r8, r2     @ r11 = (x1r * W1h) >> 31
    LSR             r1, r1, #31
    ORR             r11, r1, r11, LSL#1
    SMULL           r1, r10, r9, r2     @ r10 = (x1i * W1h) >> 31
    LSR             r1, r1, #31
    ORR             r10, r1, r10, LSL#1


    LDR             r1, [sp, #0x18]
    LDR             r2, [r1, #0x04]     @ second coefficient word (W1l)
    ADD             r1, r1, r4, LSL #3  @ advance coefficient pointer by r4*8 bytes
    STR             r1, [sp, #0x18]

    SMULL           r1, r8, r8, r2      @ r8 = (x1r * W1l) >> 31
    LSR             r1, r1, #31
    ORR             r8, r1, r8, LSL#1
    SMULL           r1, r9, r9, r2      @ r9 = (x1i * W1l) >> 31
    LSR             r1, r1, #31
    ORR             r9, r1, r9, LSL#1

    SUB             r11, r11, r9        @ x1r' = x1r*W1h - x1i*W1l
    ADD             r9, r10, r8         @ x1i' = x1i*W1h + x1r*W1l
    MOV             r8, r11

    @ Halve both operands before add/sub to keep the sums in 32 bits.
    ASR             r8, r8, #1
    ASR             r6, r6, #1
    ASR             r9, r9, #1
    ASR             r7, r7, #1
    ADD             r10, r8, r6         @ (x0r/2) + (x1r'/2)
    ADD             r11, r9, r7         @ (x0i/2) + (x1i'/2)
    SUB             r8, r6, r8          @ (x0r/2) - (x1r'/2)
    SUB             r9, r7, r9          @ (x0i/2) - (x1i'/2)

    STRD            r8, [r5]            @ x1 slot <- difference pair
    SUB             r5, r5, r0
    STRD            r10, [r5], #8       @ x0 slot <- sum pair, step to next x0

    BNE             RADIX2_BFLY_2

EXIT:
    @ Unwind: 0x44 bytes of locals plus the 16 bytes of pushed r0-r3
    @ (prologue did STMFD {r0-r12,lr} then SUB sp,#0x44; 0x44+0x10=0x54),
    @ then restore callee-saved r4-r12 and return by loading pc.
    ADD             sp, sp, #0x54
    LDMFD           sp!, {r4-r12, pc}
    808 
    809