; RUN: llc < %s -mcpu=cortex-a53 -enable-post-misched=false -enable-aa-sched-mi | FileCheck %s

; Scheduling test for the NEON structured load/store intrinsics (ld2/st2).
;
; The function below performs two load -> multiply/add -> store groups. Each
; ld2 reads from field 2 of %Struct and each st2 writes to field 3, so the
; memory touched by the stores is disjoint from the memory touched by the
; loads. With alias-analysis-aware scheduling enabled, the first st2 should
; therefore not act as a barrier for the second ld2: the store may be moved
; after that load, which lets all four multiply-accumulate operations be
; issued back to back as the directives below require.


; CHECK: fmla
; CHECK-NEXT: fmla
; CHECK-NEXT: fmla
; CHECK-NEXT: fmla
target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
target triple = "aarch64--linux-gnu"

; Fields 2 and 3 are the two 16-element arrays of float pairs used as the
; load source and the store destination respectively.
%Struct = type { i64*, [9 x double], [16 x {float, float}], [16 x {float, float}], i32, i32 }

; void func(%Struct* this)
;   Loads float pairs from this->field2 starting at elements 8 and 12,
;   combines them with (undef) multiplier/addend vectors, and stores the
;   results to this->field3 at the same element indices.
define linkonce_odr void @func(%Struct* nocapture %this) unnamed_addr #0 align 2 {
entry:
  ; Operand vectors: the values are irrelevant to the test, so they are
  ; built from undef. Each step of the chain is used as a distinct operand.
  %k0 = insertelement <4 x float> undef, float undef, i32 0
  %k1 = insertelement <4 x float> %k0, float undef, i32 1
  %k2 = insertelement <4 x float> %k1, float undef, i32 2
  %k3 = insertelement <4 x float> %k2, float undef, i32 3

  ; --- Group 1: load from field 2, element 8 ---
  %src.lo.f = getelementptr %Struct, %Struct* %this, i64 0, i32 2, i64 8, i32 0
  %src.lo = bitcast float* %src.lo.f to i8*
  %ld.lo = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0i8(i8* %src.lo)
  %ld.lo.1 = extractvalue { <4 x float>, <4 x float> } %ld.lo, 1
  %mul.lo.1 = fmul <4 x float> %k0, %ld.lo.1
  %acc.lo.1 = fadd <4 x float> %k1, %mul.lo.1
  %ld.lo.0 = extractvalue { <4 x float>, <4 x float> } %ld.lo, 0
  %mul.lo.0 = fmul <4 x float> %k2, %ld.lo.0
  %acc.lo.0 = fadd <4 x float> %k3, %mul.lo.0

  ; Store group 1 to field 3, element 8. This is the store that must not
  ; block the load of group 2 below.
  %dst.lo.f = getelementptr %Struct, %Struct* %this, i64 0, i32 3, i64 8, i32 0
  %dst.lo = bitcast float* %dst.lo.f to i8*
  tail call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> %acc.lo.0, <4 x float> %acc.lo.1, i8* %dst.lo)

  ; --- Group 2: load from field 2, element 12 ---
  %src.hi.f = getelementptr %Struct, %Struct* %this, i64 0, i32 2, i64 12, i32 0
  %src.hi = bitcast float* %src.hi.f to i8*
  %ld.hi = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0i8(i8* %src.hi)
  %ld.hi.1 = extractvalue { <4 x float>, <4 x float> } %ld.hi, 1
  %mul.hi.1 = fmul <4 x float> %k0, %ld.hi.1
  %acc.hi.1 = fadd <4 x float> %k1, %mul.hi.1
  %ld.hi.0 = extractvalue { <4 x float>, <4 x float> } %ld.hi, 0
  %mul.hi.0 = fmul <4 x float> %k2, %ld.hi.0
  %acc.hi.0 = fadd <4 x float> %k3, %mul.hi.0

  ; Store group 2 to field 3, element 12.
  %dst.hi.f = getelementptr %Struct, %Struct* %this, i64 0, i32 3, i64 12, i32 0
  %dst.hi = bitcast float* %dst.hi.f to i8*
  tail call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> %acc.hi.0, <4 x float> %acc.hi.1, i8* %dst.hi)
  ret void
}

; Two-register structured load; declared nounwind readonly (group #2).
declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0i8(i8*) #2

; Two-register structured store; declared nounwind (group #1).
declare void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float>, <4 x float>, i8* nocapture) #1

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }