// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
// RUN:  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

// Test new AArch64 scalar-by-lane (indexed element) intrinsics and types

#include <arm_neon.h>


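// The block below is an illustrative usage sketch added for readers, not part
// of the checked tests: it assumes a hypothetical caller that keeps two
// per-channel gains in a float32x2_t and scales one sample by the second gain.
// It is static inline and never referenced, so no IR is emitted for it and the
// FileCheck directives below are unaffected.
static inline float32_t scale_by_gain_lane1(float32_t sample, float32x2_t gains) {
  // vmuls_lane_f32(a, v, 1) behaves like a * vget_lane_f32(v, 1); the lane
  // index must be a compile-time constant. This is the extractelement + fmul
  // pattern the first test below checks for.
  return vmuls_lane_f32(sample, gains, 1);
}
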
// CHECK-LABEL: define float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK:   [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
// CHECK:   ret float [[MUL]]
float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
  return vmuls_lane_f32(a, b, 1);
}

// CHECK-LABEL: define double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK:   [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
// CHECK:   ret double [[MUL]]
float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
  return vmuld_lane_f64(a, b, 0);
}

// CHECK-LABEL: define float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK:   [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
// CHECK:   ret float [[MUL]]
float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
  return vmuls_laneq_f32(a, b, 3);
}

// CHECK-LABEL: define double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
// CHECK:   [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
// CHECK:   ret double [[MUL]]
float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
  return vmuld_laneq_f64(a, b, 1);
}

// CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP1]] to double
// CHECK:   [[TMP3:%.*]] = fmul double [[TMP2]], %b
// CHECK:   [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
// CHECK:   ret <1 x double> [[TMP4]]
float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
  return vmul_n_f64(a, b);
}

// CHECK-LABEL: define float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]]) #2
// CHECK:   ret float [[VMULXS_F32_I]]
float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
  return vmulxs_lane_f32(a, b, 1);
}

// CHECK-LABEL: define float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]]) #2
// CHECK:   ret float [[VMULXS_F32_I]]
float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
  return vmulxs_laneq_f32(a, b, 3);
}

// CHECK-LABEL: define double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]]) #2
// CHECK:   ret double [[VMULXD_F64_I]]
float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
  return vmulxd_lane_f64(a, b, 0);
}

// CHECK-LABEL: define double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]]) #2
// CHECK:   ret double [[VMULXD_F64_I]]
float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
  return vmulxd_laneq_f64(a, b, 1);
}

// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK:   [[VGET_LANE6:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]]) #2
// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
  return vmulx_lane_f64(a, b, 0);
}


// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
  return vmulx_laneq_f64(a, b, 0);
}

// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
  return vmulx_laneq_f64(a, b, 1);
}


// CHECK-LABEL: define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
// CHECK:   ret float [[TMP2]]
float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
  return vfmas_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %c to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
// CHECK:   ret double [[TMP2]]
float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
  return vfmad_lane_f64(a, b, c, 0);
}

// CHECK-LABEL: define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %c to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
// CHECK:   ret double [[TMP2]]
float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
  return vfmad_laneq_f64(a, b, c, 1);
}

// CHECK-LABEL: define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
// CHECK:   [[SUB:%.*]] = fsub float -0.000000e+00, %b
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
// CHECK:   ret float [[TMP2]]
float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
  return vfmss_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK:   ret <1 x double> [[FMLA2]]
float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
  return vfma_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
// CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK:   ret <1 x double> [[FMLA2]]
float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
  return vfms_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
// CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
// CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
// CHECK:   ret <1 x double> [[TMP7]]
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
  return vfma_laneq_f64(a, b, v, 0);
}

// CHECK-LABEL: define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
// CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
// CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
// CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
// CHECK:   ret <1 x double> [[TMP7]]
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
  return vfms_laneq_f64(a, b, v, 0);
}

// CHECK-LABEL: define i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
// CHECK:   ret i32 [[TMP4]]
int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
  return vqdmullh_lane_s16(a, b, 3);
}

// CHECK-LABEL: define i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]]) #2
// CHECK:   ret i64 [[VQDMULLS_S32_I]]
int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
  return vqdmulls_lane_s32(a, b, 1);
}

// CHECK-LABEL: define i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
// CHECK:   ret i32 [[TMP4]]
int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
  return vqdmullh_laneq_s16(a, b, 7);
}

// CHECK-LABEL: define i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]]) #2
// CHECK:   ret i64 [[VQDMULLS_S32_I]]
int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
  return vqdmulls_laneq_s32(a, b, 3);
}

// CHECK-LABEL: define i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP4]]
int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
  return vqdmulhh_lane_s16(a, b, 3);
}

// CHECK-LABEL: define i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
// CHECK:   ret i32 [[VQDMULHS_S32_I]]
int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
  return vqdmulhs_lane_s32(a, b, 1);
}


// CHECK-LABEL: define i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP4]]
int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
  return vqdmulhh_laneq_s16(a, b, 7);
}


// CHECK-LABEL: define i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
// CHECK:   ret i32 [[VQDMULHS_S32_I]]
int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
  return vqdmulhs_laneq_s32(a, b, 3);
}

// CHECK-LABEL: define i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP4]]
int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
  return vqrdmulhh_lane_s16(a, b, 3);
}

// CHECK-LABEL: define i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
  return vqrdmulhs_lane_s32(a, b, 1);
}


// CHECK-LABEL: define i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
// CHECK:   ret i16 [[TMP4]]
int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
  return vqrdmulhh_laneq_s16(a, b, 7);
}


// CHECK-LABEL: define i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
  return vqrdmulhs_laneq_s32(a, b, 3);
}

// CHECK-LABEL: define i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
  return vqdmlalh_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
  return vqdmlals_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
  return vqdmlalh_laneq_s16(a, b, c, 7);
}

// CHECK-LABEL: define i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
  return vqdmlals_laneq_s32(a, b, c, 3);
}

// CHECK-LABEL: define i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
  return vqdmlslh_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
  return vqdmlsls_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
// CHECK:   ret i32 [[VQDMLXL1]]
int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
  return vqdmlslh_laneq_s16(a, b, c, 7);
}

// CHECK-LABEL: define i64 @test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK:   ret i64 [[VQDMLXL1]]
int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
  return vqdmlsls_laneq_s32(a, b, c, 3);
}

// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP1]] to <8 x i8>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK:   [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP5]], i32 0
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]]) #2
// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_lane_f64_0() {
      float64x1_t arg1;
      float64x1_t arg2;
      float64x1_t result;
      float64_t sarg1, sarg2, sres;
      arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
      arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
      result = vmulx_lane_f64(arg1, arg2, 0);
      return result;
}

// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[SHUFFLE_I]] to <16 x i8>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
// CHECK:   ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_2() {
      float64x1_t arg1;
      float64x1_t arg2;
      float64x2_t arg3;
      float64x1_t result;
      float64_t sarg1, sarg2, sres;
      arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
      arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
      arg3 = vcombine_f64(arg1, arg2);
      result = vmulx_laneq_f64(arg1, arg3, 1);
      return result;
}

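// A second illustrative sketch (static inline and never called, so it emits no
// IR and the CHECK lines above are unaffected): it chains the scalar fused
// multiply-add by lane tested above into a small two-tap accumulation. The
// helper name and the weighting scheme are assumptions made for this example.
static inline float32_t weighted_pair_sum(float32_t acc, float32_t s0,
                                          float32_t s1, float32x2_t weights) {
  // vfmas_lane_f32(a, b, v, n) computes a + b * v[n] as a single fused
  // multiply-add, the llvm.fma.f32 pattern checked in test_vfmas_lane_f32.
  acc = vfmas_lane_f32(acc, s0, weights, 0);
  acc = vfmas_lane_f32(acc, s1, weights, 1);
  return acc;
}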