; celt_pitch_xcorr ARM assembly (armasm/RVCT syntax)
      1 ; Copyright (c) 2007-2008 CSIRO
      2 ; Copyright (c) 2007-2009 Xiph.Org Foundation
      3 ; Copyright (c) 2013      Parrot
      4 ; Written by Aurélien Zanelli
      5 ;
      6 ; Redistribution and use in source and binary forms, with or without
      7 ; modification, are permitted provided that the following conditions
      8 ; are met:
      9 ;
     10 ; - Redistributions of source code must retain the above copyright
     11 ; notice, this list of conditions and the following disclaimer.
     12 ;
     13 ; - Redistributions in binary form must reproduce the above copyright
     14 ; notice, this list of conditions and the following disclaimer in the
     15 ; documentation and/or other materials provided with the distribution.
     16 ;
     17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     18 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     21 ; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     22 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     23 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     24 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     25 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     26 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     27 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     28 
     29   AREA  |.text|, CODE, READONLY
     30 
     31   GET    celt/arm/armopts.s
     32 
     33 IF OPUS_ARM_MAY_HAVE_EDSP
     34   EXPORT celt_pitch_xcorr_edsp
     35 ENDIF
     36 
     37 IF OPUS_ARM_MAY_HAVE_NEON
     38   EXPORT celt_pitch_xcorr_neon
     39 ENDIF
     40 
     41 IF OPUS_ARM_MAY_HAVE_NEON
     42 
     43 ; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
      44 xcorr_kernel_neon PROC
      45   ; input:
      46   ;   r3     = int         len
      47   ;   r4     = opus_val16 *x
      48   ;   r5     = opus_val16 *y
      49   ;   q0     = opus_val32  sum[4]
      50   ; output:
      51   ;   q0     = opus_val32  sum[4]
      52   ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
      53   ; internal usage:
      54   ;   r12 = int j
      55   ;   d3  = y_3|y_2|y_1|y_0
      56   ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
      57   ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
      58   ;   q8  = scratch
      59   ;
         ; NOTE: this kernel uses no stack space and returns with "MOV pc, lr",
         ; so it must be reached via BL (lr holds the return address).
      60   ; Load y[0...3]
      61   ; This requires len>0 to always be valid (which we assert in the C code).
      62   VLD1.16      {d5}, [r5]!
      63   SUBS         r12, r3, #8
      64   BLE xcorr_kernel_neon_process4
      65 ; Process 8 samples at a time.
      66 ; This loop loads one y value more than we actually need. Therefore we have to
      67 ; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
      68 ; reading past the end of the array.
      69 xcorr_kernel_neon_process8
      70   ; This loop has 19 total instructions (10 cycles to issue, minimum), with
      71   ; - 2 cycles of ARM instructions,
      72   ; - 10 cycles of load/store/byte permute instructions, and
      73   ; - 9 cycles of data processing instructions.
      74   ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
      75   ; latter two categories, meaning the whole loop should run in 10 cycles per
      76   ; iteration, barring cache misses.
      77   ;
      78   ; Load x[0...7]
      79   VLD1.16      {d6, d7}, [r4]!
      80   ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
      81   ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
      82   VAND         d3, d5, d5
      83   SUBS         r12, r12, #8
      84   ; Load y[4...11]
      85   VLD1.16      {d4, d5}, [r5]!
      86   VMLAL.S16    q0, d3, d6[0]
      87   VEXT.16      d16, d3, d4, #1
      88   VMLAL.S16    q0, d4, d7[0]
      89   VEXT.16      d17, d4, d5, #1
      90   VMLAL.S16    q0, d16, d6[1]
      91   VEXT.16      d16, d3, d4, #2
      92   VMLAL.S16    q0, d17, d7[1]
      93   VEXT.16      d17, d4, d5, #2
      94   VMLAL.S16    q0, d16, d6[2]
      95   VEXT.16      d16, d3, d4, #3
      96   VMLAL.S16    q0, d17, d7[2]
      97   VEXT.16      d17, d4, d5, #3
      98   VMLAL.S16    q0, d16, d6[3]
      99   VMLAL.S16    q0, d17, d7[3]
     100   BGT xcorr_kernel_neon_process8
     101 ; Process 4 samples here if we have > 4 left (still reading one extra y value).
     102 xcorr_kernel_neon_process4
     103   ADDS         r12, r12, #4
     104   BLE xcorr_kernel_neon_process2
     105   ; Load x[0...3]
     106   VLD1.16      d6, [r4]!
     107   ; Use VAND since it's a data processing instruction again.
     108   VAND         d4, d5, d5
     109   SUB          r12, r12, #4
     110   ; Load y[4...7]
     111   VLD1.16      d5, [r5]!
     112   VMLAL.S16    q0, d4, d6[0]
     113   VEXT.16      d16, d4, d5, #1
     114   VMLAL.S16    q0, d16, d6[1]
     115   VEXT.16      d16, d4, d5, #2
     116   VMLAL.S16    q0, d16, d6[2]
     117   VEXT.16      d16, d4, d5, #3
     118   VMLAL.S16    q0, d16, d6[3]
     119 ; Process 2 samples here if we have > 2 left (still reading one extra y value).
     120 xcorr_kernel_neon_process2
     121   ADDS         r12, r12, #2
     122   BLE xcorr_kernel_neon_process1
     123   ; Load x[0...1]
     124   VLD2.16      {d6[],d7[]}, [r4]!
     125   ; Use VAND since it's a data processing instruction again.
     126   VAND         d4, d5, d5
     127   SUB          r12, r12, #2
     128   ; Load y[4...5]
     129   VLD1.32      {d5[]}, [r5]!
     130   VMLAL.S16    q0, d4, d6
     131   VEXT.16      d16, d4, d5, #1
     132   ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
     133   ; instead of VEXT, since it's a data-processing instruction.
     134   VSRI.64      d5, d4, #32
     135   VMLAL.S16    q0, d16, d7
     136 ; Process 1 sample using the extra y value we loaded above.
     137 xcorr_kernel_neon_process1
     138   ; Load next *x
     139   VLD1.16      {d6[]}, [r4]!
     140   ADDS         r12, r12, #1
     141   ; y[0...3] are left in d5 from prior iteration(s) (if any)
     142   VMLAL.S16    q0, d5, d6
         ; If nothing remains after the increment (r12 <= 0), return here.
     143   MOVLE        pc, lr
     144 ; Now process 1 last sample, not reading ahead.
     145   ; Load last *y
     146   VLD1.16      {d4[]}, [r5]!
     147   VSRI.64      d4, d5, #16
     148   ; Load last *x
     149   VLD1.16      {d6[]}, [r4]!
     150   VMLAL.S16    q0, d4, d6
     151   MOV          pc, lr
     152   ENDP
    153 
    154 ; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
    155 ;  opus_val32 *xcorr, int len, int max_pitch)
     156 celt_pitch_xcorr_neon PROC
     157   ; input:
     158   ;   r0  = opus_val16 *_x
     159   ;   r1  = opus_val16 *_y
     160   ;   r2  = opus_val32 *xcorr
     161   ;   r3  = int         len
     162   ; output:
     163   ;   r0  = int         maxcorr
     164   ; internal usage:
     165   ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
     166   ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
     167   ;   r6  = int         max_pitch
     168   ;   r12 = int         j
     169   ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
     170   STMFD        sp!, {r4-r6, lr}
           ; The 5th argument (max_pitch) was passed on the stack; the 4 registers
           ; just pushed occupy 16 bytes, so it now sits at [sp, #16].
     171   LDR          r6, [sp, #16]
           ; maxcorr = {1, 1, 1, 1}: the reduced maxcorr returned is at least 1.
     172   VMOV.S32     q15, #1
     173   ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
     174   SUBS         r6, r6, #4
     175   BLT celt_pitch_xcorr_neon_process4_done
     176 celt_pitch_xcorr_neon_process4
     177   ; xcorr_kernel_neon parameters:
     178   ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
     179   MOV          r4, r0
     180   MOV          r5, r1
     181   VEOR         q0, q0, q0
     182   ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
     183   ; So we don't save/restore any other registers.
     184   BL xcorr_kernel_neon
     185   SUBS         r6, r6, #4
           ; Store the 4 sums just computed to xcorr and advance the pointer.
     186   VST1.32      {q0}, [r2]!
     187   ; _y += 4
     188   ADD          r1, r1, #8
     189   VMAX.S32     q15, q15, q0
     190   ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
     191   BGE celt_pitch_xcorr_neon_process4
     192 ; We have less than 4 sums left to compute.
     193 celt_pitch_xcorr_neon_process4_done
     194   ADDS         r6, r6, #4
     195   ; Reduce maxcorr to a single value
     196   VMAX.S32     d30, d30, d31
     197   VPMAX.S32    d30, d30, d30
     198   ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
     199   BLE celt_pitch_xcorr_neon_done
     200 ; Now compute each remaining sum one at a time.
     201 celt_pitch_xcorr_neon_process_remaining
     202   MOV          r4, r0
     203   MOV          r5, r1
     204   VMOV.I32     q0, #0
     205   SUBS         r12, r3, #8
     206   BLT celt_pitch_xcorr_neon_process_remaining4
     207 ; Sum terms 8 at a time.
     208 celt_pitch_xcorr_neon_process_remaining_loop8
     209   ; Load x[0...7]
     210   VLD1.16      {q1}, [r4]!
     211   ; Load y[0...7]
     212   VLD1.16      {q2}, [r5]!
     213   SUBS         r12, r12, #8
     214   VMLAL.S16    q0, d4, d2
     215   VMLAL.S16    q0, d5, d3
     216   BGE celt_pitch_xcorr_neon_process_remaining_loop8
     217 ; Sum terms 4 at a time.
     218 celt_pitch_xcorr_neon_process_remaining4
     219   ADDS         r12, r12, #4
     220   BLT celt_pitch_xcorr_neon_process_remaining4_done
     221   ; Load x[0...3]
     222   VLD1.16      {d2}, [r4]!
     223   ; Load y[0...3]
     224   VLD1.16      {d3}, [r5]!
     225   SUB          r12, r12, #4
     226   VMLAL.S16    q0, d3, d2
     227 celt_pitch_xcorr_neon_process_remaining4_done
     228   ; Reduce the sum to a single value.
     229   VADD.S32     d0, d0, d1
     230   VPADDL.S32   d0, d0
     231   ADDS         r12, r12, #4
     232   BLE celt_pitch_xcorr_neon_process_remaining_loop_done
     233 ; Sum terms 1 at a time.
     234 celt_pitch_xcorr_neon_process_remaining_loop1
     235   VLD1.16      {d2[]}, [r4]!
     236   VLD1.16      {d3[]}, [r5]!
     237   SUBS         r12, r12, #1
     238   VMLAL.S16    q0, d2, d3
     239   BGT celt_pitch_xcorr_neon_process_remaining_loop1
     240 celt_pitch_xcorr_neon_process_remaining_loop_done
     241   VST1.32      {d0[0]}, [r2]!
     242   VMAX.S32     d30, d30, d0
     243   SUBS         r6, r6, #1
     244   ; _y++
     245   ADD          r1, r1, #2
     246   ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
     247   BGT celt_pitch_xcorr_neon_process_remaining
     248 celt_pitch_xcorr_neon_done
           ; Move the scalar maxcorr into the integer return register.
     249   VMOV.32      r0, d30[0]
     250   LDMFD        sp!, {r4-r6, pc}
     251   ENDP
    252 
    253 ENDIF
    254 
    255 IF OPUS_ARM_MAY_HAVE_EDSP
    256 
    257 ; This will get used on ARMv7 devices without NEON, so it has been optimized
    258 ; to take advantage of dual-issuing where possible.
     259 xcorr_kernel_edsp PROC
     260   ; input:
     261   ;   r3      = int         len
     262   ;   r4      = opus_val16 *_x (must be 32-bit aligned)
     263   ;   r5      = opus_val16 *_y (must be 32-bit aligned)
     264   ;   r6...r9 = opus_val32  sum[4]
     265   ; output:
     266   ;   r6...r9 = opus_val32  sum[4]
     267   ; preserved: r0-r5
     268   ; internal usage
     269   ;   r2      = int         j
     270   ;   r12,r14 = opus_val16  x[4]
     271   ;   r10,r11 = opus_val16  y[4]
           ; Save r2/r4/r5 (listed as preserved above but used as scratch here)
           ; and lr, so the function can return by popping straight into pc.
     272   STMFD        sp!, {r2,r4,r5,lr}
     273   LDR          r10, [r5], #4      ; Load y[0...1]
     274   SUBS         r2, r3, #4         ; j = len-4
     275   LDR          r11, [r5], #4      ; Load y[2...3]
     276   BLE xcorr_kernel_edsp_process4_done
     277   LDR          r12, [r4], #4      ; Load x[0...1]
     278   ; Stall
     279 xcorr_kernel_edsp_process4
     280   ; The multiplies must issue from pipeline 0, and can't dual-issue with each
     281   ; other. Every other instruction here dual-issues with a multiply, and is
     282   ; thus "free". There should be no stalls in the body of the loop.
     283   SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
     284   LDR          r14, [r4], #4      ; Load x[2...3]
     285   SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
     286   SUBS         r2, r2, #4         ; j-=4
     287   SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
     288   SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
     289   SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
     290   LDR          r10, [r5], #4      ; Load y[4...5]
     291   SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
     292   SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
     293   SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
     294   LDRGT        r12, [r4], #4      ; Load x[0...1]
     295   SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
     296   SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
     297   SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
     298   SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
     299   SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
     300   LDR          r11, [r5], #4      ; Load y[6...7]
     301   SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
     302   SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
     303   SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
     304   BGT xcorr_kernel_edsp_process4
         ; Tail: handle the last 0...3 samples one at a time, still using the
         ; y values already resident in r10/r11 where possible.
     305 xcorr_kernel_edsp_process4_done
     306   ADDS         r2, r2, #4
     307   BLE xcorr_kernel_edsp_done
     308   LDRH         r12, [r4], #2      ; r12 = *x++
     309   SUBS         r2, r2, #1         ; j--
     310   ; Stall
     311   SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
     312   LDRGTH       r14, [r4], #2      ; r14 = *x++
     313   SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
     314   SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
     315   SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
     316   BLE xcorr_kernel_edsp_done
     317   SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
     318   SUBS         r2, r2, #1         ; j--
     319   SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
     320   LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
     321   SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
     322   LDRGTH       r12, [r4], #2      ; r12 = *x++
     323   SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
     324   BLE xcorr_kernel_edsp_done
     325   SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
     326   CMP          r2, #1             ; j--
     327   SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
     328   LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
     329   SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
     330   LDRGTH       r14, [r4]          ; r14 = *x
     331   SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
     332   BLE xcorr_kernel_edsp_done
     333   SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
     334   LDRH         r11, [r5]          ; r11 = y_6 = *y
     335   SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
     336   SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
     337   SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
     338 xcorr_kernel_edsp_done
     339   LDMFD        sp!, {r2,r4,r5,pc}
     340   ENDP
    341 
     342 celt_pitch_xcorr_edsp PROC
     343   ; input:
     344   ;   r0  = opus_val16 *_x (must be 32-bit aligned)
     345   ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
     346   ;   r2  = opus_val32 *xcorr
     347   ;   r3  = int         len
     348   ; output:
     349   ;   r0  = maxcorr
     350   ; internal usage
     351   ;   r4  = opus_val16 *x
     352   ;   r5  = opus_val16 *y
     353   ;   r6  = opus_val32  sum0
     354   ;   r7  = opus_val32  sum1
     355   ;   r8  = opus_val32  sum2
     356   ;   r9  = opus_val32  sum3
     357   ;   r1  = int         max_pitch
     358   ;   r12 = int         j
     359   STMFD        sp!, {r4-r11, lr}
     360   MOV          r5, r1
           ; The stacked 5th argument (max_pitch) sits above the 9 registers
           ; (36 bytes) just pushed, hence the [sp, #36] offset.
     361   LDR          r1, [sp, #36]
     362   MOV          r4, r0
           ; Test the low 2 bits of _y: nonzero means y is not 32-bit aligned.
     363   TST          r5, #3
     364   ; maxcorr = 1
     365   MOV          r0, #1
     366   BEQ          celt_pitch_xcorr_edsp_process1u_done
     367 ; Compute one sum at the start to make y 32-bit aligned.
     368   SUBS         r12, r3, #4
     369   ; r14 = sum = 0
     370   MOV          r14, #0
     371   LDRH         r8, [r5], #2
     372   BLE celt_pitch_xcorr_edsp_process1u_loop4_done
     373   LDR          r6, [r4], #4
     374   MOV          r8, r8, LSL #16
     375 celt_pitch_xcorr_edsp_process1u_loop4
     376   LDR          r9, [r5], #4
     377   SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
     378   LDR          r7, [r4], #4
     379   SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
     380   LDR          r8, [r5], #4
     381   SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
     382   SUBS         r12, r12, #4         ; j-=4
     383   SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
     384   LDRGT        r6, [r4], #4
     385   BGT celt_pitch_xcorr_edsp_process1u_loop4
     386   MOV          r8, r8, LSR #16
     387 celt_pitch_xcorr_edsp_process1u_loop4_done
     388   ADDS         r12, r12, #4
     389 celt_pitch_xcorr_edsp_process1u_loop1
     390   LDRGEH       r6, [r4], #2
     391   ; Stall
     392   SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
     393   SUBGES       r12, r12, #1
     394   LDRGTH       r8, [r5], #2
     395   BGT celt_pitch_xcorr_edsp_process1u_loop1
     396   ; Restore _x
     397   SUB          r4, r4, r3, LSL #1
     398   ; Restore and advance _y
     399   SUB          r5, r5, r3, LSL #1
     400   ; maxcorr = max(maxcorr, sum)
     401   CMP          r0, r14
     402   ADD          r5, r5, #2
     403   MOVLT        r0, r14
     404   SUBS         r1, r1, #1
     405   ; xcorr[i] = sum
     406   STR          r14, [r2], #4
     407   BLE celt_pitch_xcorr_edsp_done
     408 celt_pitch_xcorr_edsp_process1u_done
     409   ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
     410   SUBS         r1, r1, #4
     411   BLT celt_pitch_xcorr_edsp_process2
     412 celt_pitch_xcorr_edsp_process4
     413   ; xcorr_kernel_edsp parameters:
     414   ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
     415   MOV          r6, #0
     416   MOV          r7, #0
     417   MOV          r8, #0
     418   MOV          r9, #0
     419   BL xcorr_kernel_edsp  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
     420   ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
     421   CMP          r0, r6
     422   ; _y+=4
     423   ADD          r5, r5, #8
     424   MOVLT        r0, r6
     425   CMP          r0, r7
     426   MOVLT        r0, r7
     427   CMP          r0, r8
     428   MOVLT        r0, r8
     429   CMP          r0, r9
     430   MOVLT        r0, r9
           ; Store the 4 sums to xcorr and advance the pointer.
     431   STMIA        r2!, {r6-r9}
     432   SUBS         r1, r1, #4
     433   BGE celt_pitch_xcorr_edsp_process4
     434 celt_pitch_xcorr_edsp_process2
     435   ADDS         r1, r1, #2
     436   BLT celt_pitch_xcorr_edsp_process1a
     437   SUBS         r12, r3, #4
     438   ; {r10, r11} = {sum0, sum1} = {0, 0}
     439   MOV          r10, #0
     440   MOV          r11, #0
     441   LDR          r8, [r5], #4
     442   BLE celt_pitch_xcorr_edsp_process2_loop_done
     443   LDR          r6, [r4], #4
     444   LDR          r9, [r5], #4
     445 celt_pitch_xcorr_edsp_process2_loop4
     446   SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
     447   LDR          r7, [r4], #4
     448   SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
     449   SUBS         r12, r12, #4         ; j-=4
     450   SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
     451   LDR          r8, [r5], #4
     452   SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
     453   LDRGT        r6, [r4], #4
     454   SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
     455   SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
     456   SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
     457   LDRGT        r9, [r5], #4
     458   SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
     459   BGT celt_pitch_xcorr_edsp_process2_loop4
     460 celt_pitch_xcorr_edsp_process2_loop_done
     461   ADDS         r12, r12, #2
     462   BLE  celt_pitch_xcorr_edsp_process2_1
     463   LDR          r6, [r4], #4
     464   ; Stall
     465   SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
     466   LDR          r9, [r5], #4
     467   SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
     468   SUB          r12, r12, #2
     469   SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
     470   MOV          r8, r9
     471   SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
     472 celt_pitch_xcorr_edsp_process2_1
     473   LDRH         r6, [r4], #2
     474   ADDS         r12, r12, #1
     475   ; Stall
     476   SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
     477   LDRGTH       r7, [r4], #2
     478   SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
     479   BLE celt_pitch_xcorr_edsp_process2_done
     480   LDRH         r9, [r5], #2
     481   SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_1)
     482   SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_0, y_2)
     483 celt_pitch_xcorr_edsp_process2_done
     484   ; Restore _x
     485   SUB          r4, r4, r3, LSL #1
     486   ; Restore and advance _y
     487   SUB          r5, r5, r3, LSL #1
     488   ; maxcorr = max(maxcorr, sum0)
     489   CMP          r0, r10
     490   ADD          r5, r5, #2
     491   MOVLT        r0, r10
     492   SUB          r1, r1, #2
     493   ; maxcorr = max(maxcorr, sum1)
     494   CMP          r0, r11
     495   ; xcorr[i] = sum
     496   STR          r10, [r2], #4
     497   MOVLT        r0, r11
     498   STR          r11, [r2], #4
     499 celt_pitch_xcorr_edsp_process1a
     500   ADDS         r1, r1, #1
     501   BLT celt_pitch_xcorr_edsp_done
     502   SUBS         r12, r3, #4
     503   ; r14 = sum = 0
     504   MOV          r14, #0
     505   BLT celt_pitch_xcorr_edsp_process1a_loop_done
     506   LDR          r6, [r4], #4
     507   LDR          r8, [r5], #4
     508   LDR          r7, [r4], #4
     509   LDR          r9, [r5], #4
     510 celt_pitch_xcorr_edsp_process1a_loop4
     511   SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
     512   SUBS         r12, r12, #4         ; j-=4
     513   SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
     514   LDRGE        r6, [r4], #4
     515   SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
     516   LDRGE        r8, [r5], #4
     517   SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
     518   LDRGE        r7, [r4], #4
     519   LDRGE        r9, [r5], #4
     520   BGE celt_pitch_xcorr_edsp_process1a_loop4
     521 celt_pitch_xcorr_edsp_process1a_loop_done
     522   ADDS         r12, r12, #2
     523   LDRGE        r6, [r4], #4
     524   LDRGE        r8, [r5], #4
     525   ; Stall
     526   SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
     527   SUBGE        r12, r12, #2
     528   SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
     529   ADDS         r12, r12, #1
     530   LDRGEH       r6, [r4], #2
     531   LDRGEH       r8, [r5], #2
     532   ; Stall
     533   SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
     534   ; maxcorr = max(maxcorr, sum)
     535   CMP          r0, r14
     536   ; xcorr[i] = sum
     537   STR          r14, [r2], #4
     538   MOVLT        r0, r14
     539 celt_pitch_xcorr_edsp_done
     540   LDMFD        sp!, {r4-r11, pc}
     541   ENDP
    542 
    543 ENDIF
    544 
    545 END
    546