; RUN: llc < %s -mattr=+neon | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"

; Loads an <8 x i8> vector, takes lane 1 and sign-extends it to i32.
define i32 @vget_lanes8(<8 x i8>* %A) nounwind {
;CHECK: vget_lanes8:
;CHECK: vmov.s8
  %vec = load <8 x i8>* %A
  %lane = extractelement <8 x i8> %vec, i32 1
  %wide = sext i8 %lane to i32
  ret i32 %wide
}

; Loads a <4 x i16> vector, takes lane 1 and sign-extends it to i32.
define i32 @vget_lanes16(<4 x i16>* %A) nounwind {
;CHECK: vget_lanes16:
;CHECK: vmov.s16
  %vec = load <4 x i16>* %A
  %lane = extractelement <4 x i16> %vec, i32 1
  %wide = sext i16 %lane to i32
  ret i32 %wide
}

; Loads an <8 x i8> vector, takes lane 1 and zero-extends it to i32.
define i32 @vget_laneu8(<8 x i8>* %A) nounwind {
;CHECK: vget_laneu8:
;CHECK: vmov.u8
  %vec = load <8 x i8>* %A
  %lane = extractelement <8 x i8> %vec, i32 1
  %wide = zext i8 %lane to i32
  ret i32 %wide
}

; Loads a <4 x i16> vector, takes lane 1 and zero-extends it to i32.
define i32 @vget_laneu16(<4 x i16>* %A) nounwind {
;CHECK: vget_laneu16:
;CHECK: vmov.u16
  %vec = load <4 x i16>* %A
  %lane = extractelement <4 x i16> %vec, i32 1
  %wide = zext i16 %lane to i32
  ret i32 %wide
}

; Do a vector add to keep the extraction from being done directly from memory.
; Loads a <2 x i32> vector, adds it to itself, and returns lane 1 of the sum.
define i32 @vget_lanei32(<2 x i32>* %A) nounwind {
;CHECK: vget_lanei32:
;CHECK: vmov.32
  %vec = load <2 x i32>* %A
  %sum = add <2 x i32> %vec, %vec
  %lane = extractelement <2 x i32> %sum, i32 1
  ret i32 %lane
}

; Loads a <16 x i8> vector, takes lane 1 and sign-extends it to i32.
define i32 @vgetQ_lanes8(<16 x i8>* %A) nounwind {
;CHECK: vgetQ_lanes8:
;CHECK: vmov.s8
  %vec = load <16 x i8>* %A
  %lane = extractelement <16 x i8> %vec, i32 1
  %wide = sext i8 %lane to i32
  ret i32 %wide
}

; Loads an <8 x i16> vector, takes lane 1 and sign-extends it to i32.
define i32 @vgetQ_lanes16(<8 x i16>* %A) nounwind {
;CHECK: vgetQ_lanes16:
;CHECK: vmov.s16
  %vec = load <8 x i16>* %A
  %lane = extractelement <8 x i16> %vec, i32 1
  %wide = sext i16 %lane to i32
  ret i32 %wide
}

; Loads a <16 x i8> vector, takes lane 1 and zero-extends it to i32.
define i32 @vgetQ_laneu8(<16 x i8>* %A) nounwind {
;CHECK: vgetQ_laneu8:
;CHECK: vmov.u8
  %vec = load <16 x i8>* %A
  %lane = extractelement <16 x i8> %vec, i32 1
  %wide = zext i8 %lane to i32
  ret i32 %wide
}

; Loads an <8 x i16> vector, takes lane 1 and zero-extends it to i32.
define i32 @vgetQ_laneu16(<8 x i16>* %A) nounwind {
;CHECK: vgetQ_laneu16:
;CHECK: vmov.u16
  %vec = load <8 x i16>* %A
  %lane = extractelement <8 x i16> %vec, i32 1
  %wide = zext i16 %lane to i32
  ret i32 %wide
}

; Do a vector add to keep the extraction from being done directly from memory.
; Loads a <4 x i32> vector, adds it to itself, and returns lane 1 of the sum.
define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind {
;CHECK: vgetQ_lanei32:
;CHECK: vmov.32
  %tmp1 = load <4 x i32>* %A
  %tmp2 = add <4 x i32> %tmp1, %tmp1
  %tmp3 = extractelement <4 x i32> %tmp2, i32 1
  ret i32 %tmp3
}

; Extracts lane 1 from a <4 x i16> loaded out of a local stack slot, doubles
; it, and stores the result to a local i16 slot. The label pattern anchors the
; instruction pattern to this function's output.
define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind {
;CHECK: test_vget_laneu16:
entry:
; CHECK: vmov.u16 r0, d{{.*}}[1]
  %arg0_uint16x4_t = alloca <4 x i16>             ; <<4 x i16>*> [#uses=1]
  %out_uint16_t = alloca i16                      ; <i16*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
  %0 = load <4 x i16>* %arg0_uint16x4_t, align 8  ; <<4 x i16>> [#uses=1]
  %1 = extractelement <4 x i16> %0, i32 1         ; <i16> [#uses=1]
  %2 = add i16 %1, %1
  store i16 %2, i16* %out_uint16_t, align 2
  br label %return

return:                                           ; preds = %entry
  ret void
}

; Extracts lane 1 from an <8 x i8> loaded out of a local stack slot, doubles
; it, and stores the result to a local i8 slot. The label pattern anchors the
; instruction pattern to this function's output.
define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind {
;CHECK: test_vget_laneu8:
entry:
; CHECK: vmov.u8 r0, d{{.*}}[1]
  %arg0_uint8x8_t = alloca <8 x i8>               ; <<8 x i8>*> [#uses=1]
  %out_uint8_t = alloca i8                        ; <i8*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
  %0 = load <8 x i8>* %arg0_uint8x8_t, align 8    ; <<8 x i8>> [#uses=1]
  %1 = extractelement <8 x i8> %0, i32 1          ; <i8> [#uses=1]
  %2 = add i8 %1, %1
  store i8 %2, i8* %out_uint8_t, align 1
  br label %return

return:                                           ; preds = %entry
  ret void
}

; Extracts lane 1 from an <8 x i16> loaded out of a local stack slot, doubles
; it, and stores the result to a local i16 slot. The label pattern anchors the
; instruction pattern to this function's output.
define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind {
;CHECK: test_vgetQ_laneu16:
entry:
; CHECK: vmov.u16 r0, d{{.*}}[1]
  %arg0_uint16x8_t = alloca <8 x i16>             ; <<8 x i16>*> [#uses=1]
  %out_uint16_t = alloca i16                      ; <i16*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
  %0 = load <8 x i16>* %arg0_uint16x8_t, align 16 ; <<8 x i16>> [#uses=1]
  %1 = extractelement <8 x i16> %0, i32 1         ; <i16> [#uses=1]
  %2 = add i16 %1, %1
  store i16 %2, i16* %out_uint16_t, align 2
  br label %return

return:                                           ; preds = %entry
  ret void
}

; Extracts lane 1 from a <16 x i8> loaded out of a local stack slot, doubles
; it, and stores the result to a local i8 slot. The label pattern anchors the
; instruction pattern to this function's output.
define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind {
;CHECK: test_vgetQ_laneu8:
entry:
; CHECK: vmov.u8 r0, d{{.*}}[1]
  %arg0_uint8x16_t = alloca <16 x i8>             ; <<16 x i8>*> [#uses=1]
  %out_uint8_t = alloca i8                        ; <i8*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
  %0 = load <16 x i8>* %arg0_uint8x16_t, align 16 ; <<16 x i8>> [#uses=1]
  %1 = extractelement <16 x i8> %0, i32 1         ; <i8> [#uses=1]
  %2 = add i8 %1, %1
  store i8 %2, i8* %out_uint8_t, align 1
  br label %return

return:                                           ; preds = %entry
  ret void
}

; Loads an <8 x i8> vector and replaces lane 1 with %B.
define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind {
;CHECK: vset_lane8:
;CHECK: vmov.8
  %tmp1 = load <8 x i8>* %A
  %tmp2 = insertelement <8 x i8> %tmp1, i8 %B, i32 1
  ret <8 x i8> %tmp2
}

; Loads a <4 x i16> vector and replaces lane 1 with %B.
define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind {
;CHECK: vset_lane16:
;CHECK: vmov.16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = insertelement <4 x i16> %tmp1, i16 %B, i32 1
  ret <4 x i16> %tmp2
}

; Loads a <2 x i32> vector and replaces lane 1 with %B.
define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind {
;CHECK: vset_lane32:
;CHECK: vmov.32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = insertelement <2 x i32> %tmp1, i32 %B, i32 1
  ret <2 x i32> %tmp2
}

; Loads a <16 x i8> vector and replaces lane 1 with %B.
define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind {
;CHECK: vsetQ_lane8:
;CHECK: vmov.8
  %tmp1 = load <16 x i8>* %A
  %tmp2 = insertelement <16 x i8> %tmp1, i8 %B, i32 1
  ret <16 x i8> %tmp2
}

; Loads an <8 x i16> vector and replaces lane 1 with %B.
define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
;CHECK: vsetQ_lane16:
;CHECK: vmov.16
  %tmp1 = load <8 x i16>* %A
  %tmp2 = insertelement <8 x i16> %tmp1, i16 %B, i32 1
  ret <8 x i16> %tmp2
}

; Loads a <4 x i32> vector and replaces lane 1 with %B.
; NOTE(review): the expected pattern here is "vmov s" rather than the
; "vmov.32" used by vset_lane32 -- presumably intentional; confirm against
; the current llc output before changing it.
define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
;CHECK: vsetQ_lane32:
;CHECK: vmov s
  %tmp1 = load <4 x i32>* %A
  %tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
  ret <4 x i32> %tmp2
}

; Replaces lane 1 of the <2 x float> argument with the scalar float argument.
define arm_aapcs_vfpcc <2 x float> @test_vset_lanef32(float %arg0_float32_t, <2 x float> %arg1_float32x2_t) nounwind {
;CHECK: test_vset_lanef32:
;CHECK: vmov.f32 s3, s0
;CHECK: vmov.f64 d0, d1
entry:
  %0 = insertelement <2 x float> %arg1_float32x2_t, float %arg0_float32_t, i32 1 ; <<2 x float>> [#uses=1]
  ret <2 x float> %0
}

; The LLVM extractelement instruction does not require that the lane number
; be an immediate constant.  Make sure a variable lane number is handled.
; No instruction pattern is pinned for these two functions; the label patterns
; only confirm that each function is emitted.

; Loads an <8 x i8> vector, takes the lane selected by %B and sign-extends it
; to i32.
define i32 @vget_variable_lanes8(<8 x i8>* %A, i32 %B) nounwind {
;CHECK: vget_variable_lanes8:
  %tmp1 = load <8 x i8>* %A
  %tmp2 = extractelement <8 x i8> %tmp1, i32 %B
  %tmp3 = sext i8 %tmp2 to i32
  ret i32 %tmp3
}

; Loads a <4 x i32> vector, adds it to itself, and returns the lane of the sum
; selected by %B.
define i32 @vgetQ_variable_lanei32(<4 x i32>* %A, i32 %B) nounwind {
;CHECK: vgetQ_variable_lanei32:
  %tmp1 = load <4 x i32>* %A
  %tmp2 = add <4 x i32> %tmp1, %tmp1
  %tmp3 = extractelement <4 x i32> %tmp2, i32 %B
  ret i32 %tmp3
}