; Copyright (c) 2007-2008 CSIRO
; Copyright (c) 2007-2009 Xiph.Org Foundation
; Copyright (c) 2013 Parrot
; Written by Aurélien Zanelli
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  AREA |.text|, CODE, READONLY

  GET celt/arm/armopts.s

IF OPUS_ARM_MAY_HAVE_EDSP
  EXPORT celt_pitch_xcorr_edsp
ENDIF

IF OPUS_ARM_MAY_HAVE_NEON
  EXPORT celt_pitch_xcorr_neon
ENDIF

IF OPUS_ARM_MAY_HAVE_NEON

; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
; input:
;   r3 = int len
;   r4 = opus_val16 *x
;   r5 = opus_val16 *y
;   q0 = opus_val32 sum[4]
; output:
;   q0 = opus_val32 sum[4]
; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
; internal usage:
;   r12 = int j
;   d3  = y_3|y_2|y_1|y_0
;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
;   q8  = scratch
;
; Load y[0...3]
; This requires len>0 to always be valid (which we assert in the C code).
  VLD1.16   {d5}, [r5]!
  SUBS      r12, r3, #8
  BLE       xcorr_kernel_neon_process4
; Process 8 samples at a time.
; This loop loads one y value more than we actually need. Therefore we have to
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
; reading past the end of the array.
xcorr_kernel_neon_process8
; This loop has 19 total instructions (10 cycles to issue, minimum), with
; - 2 cycles of ARM instructions,
; - 10 cycles of load/store/byte permute instructions, and
; - 9 cycles of data processing instructions.
; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
; latter two categories, meaning the whole loop should run in 10 cycles per
; iteration, barring cache misses.
;
; Load x[0...7]
  VLD1.16   {d6, d7}, [r4]!
; Unlike VMOV, VAND is a data processing instruction (and doesn't get
; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  VAND      d3, d5, d5
  SUBS      r12, r12, #8
; Load y[4...11]
  VLD1.16   {d4, d5}, [r5]!
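; Each VEXT.16 below builds a copy of the y window advanced by j samples
; (e.g. d16 = y_4|y_3|y_2|y_1 for j=1), and each VMLAL.S16 multiplies that
; window by the single x lane x_j, so lane k of q0 accumulates x[j]*y[j+k].
; An informal scalar sketch of what this unrolled block computes for the
; 8 x values just loaded:
;   for (j = 0; j < 8; j++)
;     for (k = 0; k < 4; k++)
;       sum[k] += x[j]*y[j+k];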
  VMLAL.S16 q0, d3, d6[0]
  VEXT.16   d16, d3, d4, #1
  VMLAL.S16 q0, d4, d7[0]
  VEXT.16   d17, d4, d5, #1
  VMLAL.S16 q0, d16, d6[1]
  VEXT.16   d16, d3, d4, #2
  VMLAL.S16 q0, d17, d7[1]
  VEXT.16   d17, d4, d5, #2
  VMLAL.S16 q0, d16, d6[2]
  VEXT.16   d16, d3, d4, #3
  VMLAL.S16 q0, d17, d7[2]
  VEXT.16   d17, d4, d5, #3
  VMLAL.S16 q0, d16, d6[3]
  VMLAL.S16 q0, d17, d7[3]
  BGT       xcorr_kernel_neon_process8
; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
  ADDS      r12, r12, #4
  BLE       xcorr_kernel_neon_process2
; Load x[0...3]
  VLD1.16   d6, [r4]!
; Use VAND since it's a data processing instruction again.
  VAND      d4, d5, d5
  SUB       r12, r12, #4
; Load y[4...7]
  VLD1.16   d5, [r5]!
  VMLAL.S16 q0, d4, d6[0]
  VEXT.16   d16, d4, d5, #1
  VMLAL.S16 q0, d16, d6[1]
  VEXT.16   d16, d4, d5, #2
  VMLAL.S16 q0, d16, d6[2]
  VEXT.16   d16, d4, d5, #3
  VMLAL.S16 q0, d16, d6[3]
; Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2
  ADDS      r12, r12, #2
  BLE       xcorr_kernel_neon_process1
; Load x[0...1]
  VLD2.16   {d6[],d7[]}, [r4]!
; Use VAND since it's a data processing instruction again.
  VAND      d4, d5, d5
  SUB       r12, r12, #2
; Load y[4...5]
  VLD1.32   {d5[]}, [r5]!
  VMLAL.S16 q0, d4, d6
  VEXT.16   d16, d4, d5, #1
; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
; instead of VEXT, since it's a data-processing instruction.
  VSRI.64   d5, d4, #32
  VMLAL.S16 q0, d16, d7
; Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1
; Load next *x
  VLD1.16   {d6[]}, [r4]!
  ADDS      r12, r12, #1
; y[0...3] are left in d5 from prior iteration(s) (if any)
  VMLAL.S16 q0, d5, d6
  MOVLE     pc, lr
; Now process 1 last sample, not reading ahead.
; Load last *y
  VLD1.16   {d4[]}, [r5]!
  VSRI.64   d4, d5, #16
; Load last *x
  VLD1.16   {d6[]}, [r4]!
  VMLAL.S16 q0, d4, d6
  MOV       pc, lr
  ENDP

; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
;                                  opus_val32 *xcorr, int len, int max_pitch)
celt_pitch_xcorr_neon PROC
; input:
;   r0 = opus_val16 *_x
;   r1 = opus_val16 *_y
;   r2 = opus_val32 *xcorr
;   r3 = int len
; output:
;   r0 = int maxcorr
; internal usage:
;   r4 = opus_val16 *x (for xcorr_kernel_neon())
;   r5 = opus_val16 *y (for xcorr_kernel_neon())
;   r6 = int max_pitch
;   r12 = int j
;   q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
  STMFD     sp!, {r4-r6, lr}
  LDR       r6, [sp, #16]
  VMOV.S32  q15, #1
; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  SUBS      r6, r6, #4
  BLT       celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
; xcorr_kernel_neon parameters:
;   r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
  MOV       r4, r0
  MOV       r5, r1
  VEOR      q0, q0, q0
; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
; So we don't save/restore any other registers.
  BL        xcorr_kernel_neon
  SUBS      r6, r6, #4
  VST1.32   {q0}, [r2]!
; _y += 4
  ADD       r1, r1, #8
  VMAX.S32  q15, q15, q0
; if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4
  BGE       celt_pitch_xcorr_neon_process4
; We have fewer than 4 sums left to compute.
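; q15 still holds four per-lane running maxima at this point. The VMAX/VPMAX
; pair below folds them into one scalar: VMAX.S32 d30, d30, d31 takes
; max(lane0,lane2) and max(lane1,lane3), then VPMAX.S32 d30, d30, d30 takes
; the pairwise max of those two, leaving maxcorr in d30[0].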
celt_pitch_xcorr_neon_process4_done
  ADDS      r6, r6, #4
; Reduce maxcorr to a single value
  VMAX.S32  d30, d30, d31
  VPMAX.S32 d30, d30, d30
; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
  BLE       celt_pitch_xcorr_neon_done
; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
  MOV       r4, r0
  MOV       r5, r1
  VMOV.I32  q0, #0
  SUBS      r12, r3, #8
  BLT       celt_pitch_xcorr_neon_process_remaining4
; Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8
; Load x[0...7]
  VLD1.16   {q1}, [r4]!
; Load y[0...7]
  VLD1.16   {q2}, [r5]!
  SUBS      r12, r12, #8
  VMLAL.S16 q0, d4, d2
  VMLAL.S16 q0, d5, d3
  BGE       celt_pitch_xcorr_neon_process_remaining_loop8
; Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4
  ADDS      r12, r12, #4
  BLT       celt_pitch_xcorr_neon_process_remaining4_done
; Load x[0...3]
  VLD1.16   {d2}, [r4]!
; Load y[0...3]
  VLD1.16   {d3}, [r5]!
  SUB       r12, r12, #4
  VMLAL.S16 q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done
; Reduce the sum to a single value.
  VADD.S32  d0, d0, d1
  VPADDL.S32 d0, d0
  ADDS      r12, r12, #4
  BLE       celt_pitch_xcorr_neon_process_remaining_loop_done
; Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1
  VLD1.16   {d2[]}, [r4]!
  VLD1.16   {d3[]}, [r5]!
  SUBS      r12, r12, #1
  VMLAL.S16 q0, d2, d3
  BGT       celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done
  VST1.32   {d0[0]}, [r2]!
  VMAX.S32  d30, d30, d0
  SUBS      r6, r6, #1
; _y++
  ADD       r1, r1, #2
; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
  BGT       celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
  VMOV.32   r0, d30[0]
  LDMFD     sp!, {r4-r6, pc}
  ENDP

ENDIF

IF OPUS_ARM_MAY_HAVE_EDSP

; This will get used on ARMv7 devices without NEON, so it has been optimized
; to take advantage of dual-issuing where possible.
xcorr_kernel_edsp PROC
; input:
;   r3 = int len
;   r4 = opus_val16 *_x (must be 32-bit aligned)
;   r5 = opus_val16 *_y (must be 32-bit aligned)
;   r6...r9 = opus_val32 sum[4]
; output:
;   r6...r9 = opus_val32 sum[4]
; preserved: r0-r5
; internal usage:
;   r2 = int j
;   r12,r14 = opus_val16 x[4]
;   r10,r11 = opus_val16 y[4]
  STMFD     sp!, {r2,r4,r5,lr}
  LDR       r10, [r5], #4      ; Load y[0...1]
  SUBS      r2, r3, #4         ; j = len-4
  LDR       r11, [r5], #4      ; Load y[2...3]
  BLE       xcorr_kernel_edsp_process4_done
  LDR       r12, [r4], #4      ; Load x[0...1]
; Stall
xcorr_kernel_edsp_process4
; The multiplies must issue from pipeline 0, and can't dual-issue with each
; other. Every other instruction here dual-issues with a multiply, and is
; thus "free". There should be no stalls in the body of the loop.
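; SMLA<x><y> Rd, Rn, Rm, Ra computes Rd = Ra + Rn.<x>*Rm.<y>, where <x>
; selects the bottom (B) or top (T) 16-bit half of Rn and <y> does the same
; for Rm. With two samples packed per word load (little-endian, so
; r10 = y_1|y_0 on entry), the B/T suffixes select individual samples with
; no unpacking: e.g. SMLABT r7, r12, r10, r7 is sum[1] += x_0*y_1.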
  SMLABB    r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
  LDR       r14, [r4], #4      ; Load x[2...3]
  SMLABT    r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
  SUBS      r2, r2, #4         ; j-=4
  SMLABB    r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
  SMLABT    r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
  SMLATT    r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
  LDR       r10, [r5], #4      ; Load y[4...5]
  SMLATB    r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
  SMLATT    r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
  SMLATB    r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
  LDRGT     r12, [r4], #4      ; Load x[0...1]
  SMLABB    r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
  SMLABT    r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
  SMLABB    r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
  SMLABT    r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
  SMLATT    r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
  LDR       r11, [r5], #4      ; Load y[6...7]
  SMLATB    r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
  SMLATT    r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
  SMLATB    r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
  BGT       xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done
  ADDS      r2, r2, #4
  BLE       xcorr_kernel_edsp_done
  LDRH      r12, [r4], #2      ; r12 = *x++
  SUBS      r2, r2, #1         ; j--
; Stall
  SMLABB    r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
  LDRGTH    r14, [r4], #2      ; r14 = *x++
  SMLABT    r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
  SMLABB    r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
  SMLABT    r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
  BLE       xcorr_kernel_edsp_done
  SMLABT    r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
  SUBS      r2, r2, #1         ; j--
  SMLABB    r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
  LDRH      r10, [r5], #2      ; r10 = y_4 = *y++
  SMLABT    r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
  LDRGTH    r12, [r4], #2      ; r12 = *x++
  SMLABB    r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
  BLE       xcorr_kernel_edsp_done
  SMLABB    r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
  CMP       r2, #1             ; j--
  SMLABT    r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
  LDRH      r2, [r5], #2       ; r2 = y_5 = *y++
  SMLABB    r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
  LDRGTH    r14, [r4]          ; r14 = *x
  SMLABB    r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
  BLE       xcorr_kernel_edsp_done
  SMLABT    r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
  LDRH      r11, [r5]          ; r11 = y_6 = *y
  SMLABB    r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
  SMLABB    r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
  SMLABB    r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done
  LDMFD     sp!, {r2,r4,r5,pc}
  ENDP

celt_pitch_xcorr_edsp PROC
; input:
;   r0 = opus_val16 *_x (must be 32-bit aligned)
;   r1 = opus_val16 *_y (only needs to be 16-bit aligned)
;   r2 = opus_val32 *xcorr
;   r3 = int len
; output:
;   r0 = maxcorr
; internal usage:
;   r4 = opus_val16 *x
;   r5 = opus_val16 *y
;   r6 = opus_val32 sum0
;   r7 = opus_val32 sum1
;   r8 = opus_val32 sum2
;   r9 = opus_val32 sum3
;   r1 = int max_pitch
;   r12 = int j
  STMFD     sp!, {r4-r11, lr}
  MOV       r5, r1
  LDR       r1, [sp, #36]
  MOV       r4, r0
  TST       r5, #3
; maxcorr = 1
  MOV       r0, #1
  BEQ       celt_pitch_xcorr_edsp_process1u_done
; Compute one sum at the start to make y 32-bit aligned.
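; xcorr_kernel_edsp reads y a word (two samples) at a time, and ARMv5 LDR
; requires 32-bit alignment. When _y starts on an odd 16-bit boundary,
; computing this first correlation on its own advances _y by one sample
; (2 bytes), so every subsequent y word access is aligned.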
  SUBS      r12, r3, #4
; r14 = sum = 0
  MOV       r14, #0
  LDRH      r8, [r5], #2
  BLE       celt_pitch_xcorr_edsp_process1u_loop4_done
  LDR       r6, [r4], #4
  MOV       r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4
  LDR       r9, [r5], #4
  SMLABT    r14, r6, r8, r14   ; sum = MAC16_16(sum, x_0, y_0)
  LDR       r7, [r4], #4
  SMLATB    r14, r6, r9, r14   ; sum = MAC16_16(sum, x_1, y_1)
  LDR       r8, [r5], #4
  SMLABT    r14, r7, r9, r14   ; sum = MAC16_16(sum, x_2, y_2)
  SUBS      r12, r12, #4       ; j-=4
  SMLATB    r14, r7, r8, r14   ; sum = MAC16_16(sum, x_3, y_3)
  LDRGT     r6, [r4], #4
  BGT       celt_pitch_xcorr_edsp_process1u_loop4
  MOV       r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done
  ADDS      r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1
  LDRGEH    r6, [r4], #2
; Stall
  SMLABBGE  r14, r6, r8, r14   ; sum = MAC16_16(sum, *x, *y)
  SUBGES    r12, r12, #1
  LDRGTH    r8, [r5], #2
  BGT       celt_pitch_xcorr_edsp_process1u_loop1
; Restore _x
  SUB       r4, r4, r3, LSL #1
; Restore and advance _y
  SUB       r5, r5, r3, LSL #1
; maxcorr = max(maxcorr, sum)
  CMP       r0, r14
  ADD       r5, r5, #2
  MOVLT     r0, r14
  SUBS      r1, r1, #1
; xcorr[i] = sum
  STR       r14, [r2], #4
  BLE       celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done
; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  SUBS      r1, r1, #4
  BLT       celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4
; xcorr_kernel_edsp parameters:
;   r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
  MOV       r6, #0
  MOV       r7, #0
  MOV       r8, #0
  MOV       r9, #0
  BL        xcorr_kernel_edsp  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
  CMP       r0, r6
; _y+=4
  ADD       r5, r5, #8
  MOVLT     r0, r6
  CMP       r0, r7
  MOVLT     r0, r7
  CMP       r0, r8
  MOVLT     r0, r8
  CMP       r0, r9
  MOVLT     r0, r9
  STMIA     r2!, {r6-r9}
  SUBS      r1, r1, #4
  BGE       celt_pitch_xcorr_edsp_process4
celt_pitch_xcorr_edsp_process2
  ADDS      r1, r1, #2
  BLT       celt_pitch_xcorr_edsp_process1a
  SUBS      r12, r3, #4
; {r10, r11} = {sum0, sum1} = {0, 0}
  MOV       r10, #0
  MOV       r11, #0
  LDR       r8, [r5], #4
  BLE       celt_pitch_xcorr_edsp_process2_loop_done
  LDR       r6, [r4], #4
  LDR       r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4
  SMLABB    r10, r6, r8, r10   ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR       r7, [r4], #4
  SMLABT    r11, r6, r8, r11   ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUBS      r12, r12, #4       ; j-=4
  SMLATT    r10, r6, r8, r10   ; sum0 = MAC16_16(sum0, x_1, y_1)
  LDR       r8, [r5], #4
  SMLATB    r11, r6, r9, r11   ; sum1 = MAC16_16(sum1, x_1, y_2)
  LDRGT     r6, [r4], #4
  SMLABB    r10, r7, r9, r10   ; sum0 = MAC16_16(sum0, x_2, y_2)
  SMLABT    r11, r7, r9, r11   ; sum1 = MAC16_16(sum1, x_2, y_3)
  SMLATT    r10, r7, r9, r10   ; sum0 = MAC16_16(sum0, x_3, y_3)
  LDRGT     r9, [r5], #4
  SMLATB    r11, r7, r8, r11   ; sum1 = MAC16_16(sum1, x_3, y_4)
  BGT       celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done
  ADDS      r12, r12, #2
  BLE       celt_pitch_xcorr_edsp_process2_1
  LDR       r6, [r4], #4
; Stall
  SMLABB    r10, r6, r8, r10   ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR       r9, [r5], #4
  SMLABT    r11, r6, r8, r11   ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUB       r12, r12, #2
  SMLATT    r10, r6, r8, r10   ; sum0 = MAC16_16(sum0, x_1, y_1)
  MOV       r8, r9
  SMLATB    r11, r6, r9, r11   ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1
  LDRH      r6, [r4], #2
  ADDS      r12, r12, #1
; Stall
  SMLABB    r10, r6, r8, r10   ; sum0 = MAC16_16(sum0, x_0, y_0)
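; If a second sample remains (j > 0 from the ADDS above), load the next x now
; so the load can dual-issue with the multiply below.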
  LDRGTH    r7, [r4], #2
  SMLABT    r11, r6, r8, r11   ; sum1 = MAC16_16(sum1, x_0, y_1)
  BLE       celt_pitch_xcorr_edsp_process2_done
  LDRH      r9, [r5], #2
  SMLABT    r10, r7, r8, r10   ; sum0 = MAC16_16(sum0, x_0, y_1)
  SMLABB    r11, r7, r9, r11   ; sum1 = MAC16_16(sum1, x_0, y_2)
celt_pitch_xcorr_edsp_process2_done
; Restore _x
  SUB       r4, r4, r3, LSL #1
; Restore and advance _y
  SUB       r5, r5, r3, LSL #1
; maxcorr = max(maxcorr, sum0)
  CMP       r0, r10
  ADD       r5, r5, #2
  MOVLT     r0, r10
  SUB       r1, r1, #2
; maxcorr = max(maxcorr, sum1)
  CMP       r0, r11
; xcorr[i] = sum
  STR       r10, [r2], #4
  MOVLT     r0, r11
  STR       r11, [r2], #4
celt_pitch_xcorr_edsp_process1a
  ADDS      r1, r1, #1
  BLT       celt_pitch_xcorr_edsp_done
  SUBS      r12, r3, #4
; r14 = sum = 0
  MOV       r14, #0
  BLT       celt_pitch_xcorr_edsp_process1a_loop_done
  LDR       r6, [r4], #4
  LDR       r8, [r5], #4
  LDR       r7, [r4], #4
  LDR       r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4
  SMLABB    r14, r6, r8, r14   ; sum = MAC16_16(sum, x_0, y_0)
  SUBS      r12, r12, #4       ; j-=4
  SMLATT    r14, r6, r8, r14   ; sum = MAC16_16(sum, x_1, y_1)
  LDRGE     r6, [r4], #4
  SMLABB    r14, r7, r9, r14   ; sum = MAC16_16(sum, x_2, y_2)
  LDRGE     r8, [r5], #4
  SMLATT    r14, r7, r9, r14   ; sum = MAC16_16(sum, x_3, y_3)
  LDRGE     r7, [r4], #4
  LDRGE     r9, [r5], #4
  BGE       celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done
  ADDS      r12, r12, #2
  LDRGE     r6, [r4], #4
  LDRGE     r8, [r5], #4
; Stall
  SMLABBGE  r14, r6, r8, r14   ; sum = MAC16_16(sum, x_0, y_0)
  SUBGE     r12, r12, #2
  SMLATTGE  r14, r6, r8, r14   ; sum = MAC16_16(sum, x_1, y_1)
  ADDS      r12, r12, #1
  LDRGEH    r6, [r4], #2
  LDRGEH    r8, [r5], #2
; Stall
  SMLABBGE  r14, r6, r8, r14   ; sum = MAC16_16(sum, *x, *y)
; maxcorr = max(maxcorr, sum)
  CMP       r0, r14
; xcorr[i] = sum
  STR       r14, [r2], #4
  MOVLT     r0, r14
celt_pitch_xcorr_edsp_done
  LDMFD     sp!, {r4-r11, pc}
  ENDP

ENDIF

END
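; For reference, an informal plain-C sketch of what both entry points above
; compute (the generic name celt_pitch_xcorr is used loosely for both; cf.
; the C fallback in celt/pitch.c, fixed-point details elided):
;
;   opus_val32 celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y,
;                               opus_val32 *xcorr, int len, int max_pitch)
;   {
;     int i, j;
;     opus_val32 maxcorr = 1;
;     for (i = 0; i < max_pitch; i++) {
;       opus_val32 sum = 0;
;       for (j = 0; j < len; j++)
;         sum += _x[j]*_y[i+j];     /* MAC16_16 in fixed point */
;       xcorr[i] = sum;
;       if (sum > maxcorr) maxcorr = sum;
;     }
;     return maxcorr;
;   }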