// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
// Test ARM64 SIMD fused multiply add intrinsics

#include <arm_neon.h>

float32x2_t test_vfma_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
  // CHECK: test_vfma_f32
  return vfma_f32(a1, a2, a3);
  // CHECK: llvm.fma.v2f32({{.*a2, .*a3, .*a1}})
  // CHECK-NEXT: ret
}

float32x4_t test_vfmaq_f32(float32x4_t a1, float32x4_t a2, float32x4_t a3) {
  // CHECK: test_vfmaq_f32
  return vfmaq_f32(a1, a2, a3);
  // CHECK: llvm.fma.v4f32({{.*a2, .*a3, .*a1}})
  // CHECK-NEXT: ret
}

float64x2_t test_vfmaq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
  // CHECK: test_vfmaq_f64
  return vfmaq_f64(a1, a2, a3);
  // CHECK: llvm.fma.v2f64({{.*a2, .*a3, .*a1}})
  // CHECK-NEXT: ret
}

float32x2_t test_vfma_lane_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
  // CHECK: test_vfma_lane_f32
  return vfma_lane_f32(a1, a2, a3, 1);
  // NB: the test below is deliberately loose, so that we don't depend too much
  // upon the exact IR used to select lane 1 (usually a shufflevector)
  // CHECK: llvm.fma.v2f32(<2 x float> %a2, <2 x float> {{.*}}, <2 x float> %a1)
  // CHECK-NEXT: ret
}

float32x4_t test_vfmaq_lane_f32(float32x4_t a1, float32x4_t a2, float32x2_t a3) {
  // CHECK: test_vfmaq_lane_f32
  return vfmaq_lane_f32(a1, a2, a3, 1);
  // NB: the test below is deliberately loose, so that we don't depend too much
  // upon the exact IR used to select lane 1 (usually a shufflevector)
  // CHECK: llvm.fma.v4f32(<4 x float> %a2, <4 x float> {{.*}}, <4 x float> %a1)
  // CHECK-NEXT: ret
}

float64x2_t test_vfmaq_lane_f64(float64x2_t a1, float64x2_t a2, float64x1_t a3) {
  // CHECK: test_vfmaq_lane_f64
  return vfmaq_lane_f64(a1, a2, a3, 0);
  // NB: the test below is deliberately loose, so that we don't depend too much
  // upon the exact IR used to select lane 0 (usually a shufflevector)
  // CHECK: llvm.fma.v2f64(<2 x double> %a2, <2 x double> {{.*}}, <2 x double> %a1)
  // CHECK-NEXT: ret
}

float32x2_t test_vfma_n_f32(float32x2_t a1, float32x2_t a2, float32_t a3) {
  // CHECK: test_vfma_n_f32
  return vfma_n_f32(a1, a2, a3);
  // NB: the test below is deliberately loose, so that we don't depend too much
  // upon the exact IR used to select lane 0 (usually two insertelements)
  // CHECK: llvm.fma.v2f32
  // CHECK-NEXT: ret
}

float32x4_t test_vfmaq_n_f32(float32x4_t a1, float32x4_t a2, float32_t a3) {
  // CHECK: test_vfmaq_n_f32
  return vfmaq_n_f32(a1, a2, a3);
  // NB: the test below is deliberately loose, so that we don't depend too much
  // upon the exact IR used to select lane 0 (usually four insertelements)
  // CHECK: llvm.fma.v4f32
  // CHECK-NEXT: ret
}

float64x2_t test_vfmaq_n_f64(float64x2_t a1, float64x2_t a2, float64_t a3) {
  // CHECK: test_vfmaq_n_f64
  return vfmaq_n_f64(a1, a2, a3);
  // NB: the test below is deliberately loose, so that we don't depend too much
  // upon the exact IR used to select lane 0 (usually two insertelements)
  // CHECK: llvm.fma.v2f64
  // CHECK-NEXT: ret
}

float32x2_t test_vfms_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
  // CHECK: test_vfms_f32
  return vfms_f32(a1, a2, a3);
  // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a2
  // CHECK: llvm.fma.v2f32(<2 x float> %a3, <2 x float> [[NEG]], <2 x float> %a1)
  // CHECK-NEXT: ret
}

float32x4_t test_vfmsq_f32(float32x4_t a1, float32x4_t a2, float32x4_t a3) {
  // CHECK: test_vfmsq_f32
  return vfmsq_f32(a1, a2, a3);
  // CHECK: [[NEG:%.*]] = fsub <4 x float> {{.*}}, %a2
  // CHECK: llvm.fma.v4f32(<4 x float> %a3, <4 x float> [[NEG]], <4 x float> %a1)
  // CHECK-NEXT: ret
}

float64x2_t test_vfmsq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
  // CHECK: test_vfmsq_f64
  return vfmsq_f64(a1, a2, a3);
  // CHECK: [[NEG:%.*]] = fsub <2 x double> {{.*}}, %a2
  // CHECK: llvm.fma.v2f64(<2 x double> %a3, <2 x double> [[NEG]], <2 x double> %a1)
  // CHECK-NEXT: ret
}

float32x2_t test_vfms_lane_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
  // CHECK: test_vfms_lane_f32
  return vfms_lane_f32(a1, a2, a3, 1);
  // NB: the test below is deliberately loose, so that we don't depend too much
  // upon the exact IR used to select lane 1 (usually a shufflevector)
  // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a3
  // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[NEG]]
  // CHECK: llvm.fma.v2f32(<2 x float> {{.*}}, <2 x float> [[LANE]], <2 x float> %a1)
  // CHECK-NEXT: ret
}

float32x4_t test_vfmsq_lane_f32(float32x4_t a1, float32x4_t a2, float32x2_t a3) {
  // CHECK: test_vfmsq_lane_f32
  return vfmsq_lane_f32(a1, a2, a3, 1);
  // NB: the test below is deliberately loose, so that we don't depend too much
  // upon the exact IR used to select lane 1 (usually a shufflevector)
  // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a3
  // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[NEG]]
  // CHECK: llvm.fma.v4f32(<4 x float> {{.*}}, <4 x float> [[LANE]], <4 x float> %a1)
  // CHECK-NEXT: ret
}

float64x2_t test_vfmsq_lane_f64(float64x2_t a1, float64x2_t a2, float64x1_t a3) {
  // CHECK: test_vfmsq_lane_f64
  return vfmsq_lane_f64(a1, a2, a3, 0);
  // NB: the test below is deliberately loose, so that we don't depend too much
  // upon the exact IR used to select lane 0 (usually a shufflevector)
  // CHECK: [[NEG:%.*]] = fsub <1 x double> {{.*}}, %a3
  // CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[NEG]]
  // CHECK: llvm.fma.v2f64(<2 x double> {{.*}}, <2 x double> [[LANE]], <2 x double> %a1)
  // CHECK-NEXT: ret
}