; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
; RUN: llc -mtriple=armeb-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE

; Each vcombine function below loads two 64-bit vectors and concatenates them
; into one 128-bit vector with an identity shufflevector mask. The checks
; expect one vldr per input and a pair of vmov instructions that move the two
; halves into r0-r3, with the register pairs swapped on big-endian.

; Concatenate two <8 x i8> vectors.
define <16 x i8> @vcombine8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vcombine8
; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]

; CHECK-LE-DAG: vmov r0, r1, [[LD0]]
; CHECK-LE-DAG: vmov r2, r3, [[LD1]]

; CHECK-BE-DAG: vmov r1, r0, d16
; CHECK-BE-DAG: vmov r3, r2, d17
  %lo = load <8 x i8>, <8 x i8>* %A
  %hi = load <8 x i8>, <8 x i8>* %B
  %cat = shufflevector <8 x i8> %lo, <8 x i8> %hi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %cat
}

; Concatenate two <4 x i16> vectors.
define <8 x i16> @vcombine16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vcombine16
; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]

; CHECK-LE-DAG: vmov r0, r1, [[LD0]]
; CHECK-LE-DAG: vmov r2, r3, [[LD1]]

; CHECK-BE-DAG: vmov r1, r0, d16
; CHECK-BE-DAG: vmov r3, r2, d17
  %lo = load <4 x i16>, <4 x i16>* %A
  %hi = load <4 x i16>, <4 x i16>* %B
  %cat = shufflevector <4 x i16> %lo, <4 x i16> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %cat
}

; Concatenate two <2 x i32> vectors. The register moves are matched in order
; here rather than in any order.
define <4 x i32> @vcombine32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vcombine32

; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]

; CHECK-LE: vmov r0, r1, [[LD0]]
; CHECK-LE: vmov r2, r3, [[LD1]]

; CHECK-BE: vmov r1, r0, d16
; CHECK-BE: vmov r3, r2, d17
  %lo = load <2 x i32>, <2 x i32>* %A
  %hi = load <2 x i32>, <2 x i32>* %B
  %cat = shufflevector <2 x i32> %lo, <2 x i32> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %cat
}

; Concatenate two <2 x float> vectors.
define <4 x float> @vcombinefloat(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vcombinefloat

; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]

; CHECK-LE: vmov r0, r1, [[LD0]]
; CHECK-LE: vmov r2, r3, [[LD1]]

; CHECK-BE: vmov r1, r0, d16
; CHECK-BE: vmov r3, r2, d17
  %lo = load <2 x float>, <2 x float>* %A
  %hi = load <2 x float>, <2 x float>* %B
  %cat = shufflevector <2 x float> %lo, <2 x float> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %cat
}

; Concatenate two <1 x i64> vectors. Unlike the narrower element types, the
; big-endian checks here reuse the captured load registers.
define <2 x i64> @vcombine64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
; CHECK-LABEL: vcombine64
; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]

; CHECK-LE: vmov r0, r1, [[LD0]]
; CHECK-LE: vmov r2, r3, [[LD1]]

; CHECK-BE: vmov r1, r0, [[LD0]]
; CHECK-BE: vmov r3, r2, [[LD1]]
  %lo = load <1 x i64>, <1 x i64>* %A
  %hi = load <1 x i64>, <1 x i64>* %B
  %cat = shufflevector <1 x i64> %lo, <1 x i64> %hi, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %cat
}

; Check for vget_low and vget_high implemented with shufflevector. PR8411.
; They should not require storing to the stack.
; Take the low four i16 lanes of a 128-bit vector. No vector store may appear
; before the register move that returns the result.
define <4 x i16> @vget_low16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vget_low16
; CHECK-NOT: vst
; CHECK-LE: vmov r0, r1, d16
; CHECK-BE: vmov r1, r0, d16
  %vec = load <8 x i16>, <8 x i16>* %A
  %low = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i16> %low
}

; Take the high eight i8 lanes of a 128-bit vector. As well as forbidding a
; vector store, the little-endian checks require a 64-bit vldr from offset 8
; instead of a full 128-bit vld1.64 of the source.
define <8 x i8> @vget_high8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vget_high8
; CHECK-NOT: vst
; CHECK-LE-NOT: vld1.64 {d16, d17}, [r0]
; CHECK-LE: vldr d16, [r0, #8]
; CHECK-LE: vmov r0, r1, d16
; CHECK-BE: vmov r1, r0, d16
  %vec = load <16 x i8>, <16 x i8>* %A
  %high = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i8> %high
}

; vcombine(vld1_dup(p), vld1_dup(p2))
; Two scalar i16 loads are each splatted across a <4 x i16> and the splats are
; then concatenated. Both loads are expected to become vld1.16 all-lanes loads.
; Only the little-endian run checks the final register moves.
define <8 x i16> @vcombine_vdup(<8 x i16> %src, i16* nocapture readonly %p) {
; CHECK-LABEL: vcombine_vdup:
; CHECK: vld1.16 {d16[]},
; CHECK: vld1.16 {d17[]},
; CHECK-LE: vmov r0, r1, d16
; CHECK-LE: vmov r2, r3, d17
  ; Splat of p[0].
  %elt0 = load i16, i16* %p, align 2
  %ins0 = insertelement <4 x i16> undef, i16 %elt0, i32 0
  %dup0 = shufflevector <4 x i16> %ins0, <4 x i16> undef, <4 x i32> zeroinitializer
  ; Splat of p[1].
  %next = getelementptr inbounds i16, i16* %p, i32 1
  %elt1 = load i16, i16* %next, align 2
  %ins1 = insertelement <4 x i16> undef, i16 %elt1, i32 0
  %dup1 = shufflevector <4 x i16> %ins1, <4 x i16> undef, <4 x i32> zeroinitializer
  ; Concatenate the two splats into the 128-bit result.
  %cat = shufflevector <4 x i16> %dup0, <4 x i16> %dup1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %cat
}