; Tests the CodeGenPrepare vector promotion of scalar operations that sit
; between an extractelement and a store: when profitable, the operation is
; performed on the whole vector and the extractelement is moved right before
; the store.
;
; The two opt invocations below check the resulting IR in the default mode and
; with -stress-cgp-store-extract. The llc invocation checks the final assembly
; for thumbv7 with NEON.
;
; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
; RUN: llc -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s

; A single 'or' between the extract and the store is promoted to a vector 'or'.
; IR-BOTH-LABEL: @simpleOneInstructionPromotion
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 undef, i32 1>
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR]], i32 1
; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
; IR-BOTH-NEXT: ret
;
; Make sure we got rid of any expensive vmov.32 instructions.
; ASM-LABEL: simpleOneInstructionPromotion:
; ASM: vldr [[LOAD:d[0-9]+]], [r0]
; ASM-NEXT: vorr.i32 [[LOAD]], #0x1
; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1:32]
; ASM-NEXT: bx
define void @simpleOneInstructionPromotion(<2 x i32>* %addr1, i32* %dest) {
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 1
  %out = or i32 %extract, 1
  store i32 %out, i32* %dest, align 4
  ret void
}

; The icmp user is expected to stay scalar, with the extractelement left in
; front of it.
; IR-BOTH-LABEL: @unsupportedInstructionForPromotion
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
; IR-BOTH-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp eq i32 [[EXTRACT]], %in2
; IR-BOTH-NEXT: store i1 [[CMP]], i1* %dest
; IR-BOTH-NEXT: ret
;
; ASM-LABEL: unsupportedInstructionForPromotion:
; ASM: vldr [[LOAD:d[0-9]+]], [r0]
; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
; ASM: bx
define void @unsupportedInstructionForPromotion(<2 x i32>* %addr1, i32 %in2, i1* %dest) {
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 0
  %out = icmp eq i32 %extract, %in2
  store i1 %out, i1* %dest, align 4
  ret void
}


; The extractelement and its user live in different basic blocks; the checks
; expect the chain to be left untouched.
; IR-BOTH-LABEL: @unsupportedChainInDifferentBBs
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
; IR-BOTH-NEXT: br i1 %bool, label %bb2, label %end
; BB2
; IR-BOTH: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest, align 4
; IR-BOTH: ret
;
; ASM-LABEL: unsupportedChainInDifferentBBs:
; ASM: vldr [[LOAD:d[0-9]+]], [r0]
; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
; ASM: bx
define void @unsupportedChainInDifferentBBs(<2 x i32>* %addr1, i32* %dest, i1 %bool) {
bb1:
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 0
  br i1 %bool, label %bb2, label %end
bb2:
  %out = or i32 %extract, 1
  store i32 %out, i32* %dest, align 4
  br label %end
end:
  ret void
}

; A chain of seven 'or' instructions is promoted as a whole; the extract ends
; up right before the store.
; IR-BOTH-LABEL: @chainOfInstructionsToPromote
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR4:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR3]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR5:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR4]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR6:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR5]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR7:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR6]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR7]], i32 0
; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
; IR-BOTH-NEXT: ret
;
; ASM-LABEL: chainOfInstructionsToPromote:
; ASM: vldr [[LOAD:d[0-9]+]], [r0]
; ASM-NOT: vmov.32 {{r[0-9]+}}, [[LOAD]]
; ASM: bx
define void @chainOfInstructionsToPromote(<2 x i32>* %addr1, i32* %dest) {
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 0
  %out1 = or i32 %extract, 1
  %out2 = or i32 %out1, 1
  %out3 = or i32 %out2, 1
  %out4 = or i32 %out3, 1
  %out5 = or i32 %out4, 1
  %out6 = or i32 %out5, 1
  %out7 = or i32 %out6, 1
  store i32 %out7, i32* %dest, align 4
  ret void
}

; The 'or' result is both stored and returned; the checks expect it to stay
; scalar.
; IR-BOTH-LABEL: @unsupportedMultiUses
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-BOTH-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest
; IR-BOTH-NEXT: ret i32 [[OR]]
;
; ASM-LABEL: unsupportedMultiUses:
; ASM: vldr [[LOAD:d[0-9]+]], [r0]
; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
; ASM: bx
define i32 @unsupportedMultiUses(<2 x i32>* %addr1, i32* %dest) {
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 1
  %out = or i32 %extract, 1
  store i32 %out, i32* %dest, align 4
  ret i32 %out
}

; Check that we promote with a splat constant when this is a division.
; The NORMAL mode does not promote anything as divisions are not legal.
; IR-BOTH-LABEL: @udivCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 [[EXTRACT]], 7
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = udiv <2 x i32> [[LOAD]], <i32 7, i32 7>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @udivCase(<2 x i32>* %addr1, i32* %dest) {
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 1
  %out = udiv i32 %extract, 7
  store i32 %out, i32* %dest, align 4
  ret void
}

; Same as udivCase, with an unsigned remainder.
; IR-BOTH-LABEL: @uremCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = urem i32 [[EXTRACT]], 7
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = urem <2 x i32> [[LOAD]], <i32 7, i32 7>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @uremCase(<2 x i32>* %addr1, i32* %dest) {
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 1
  %out = urem i32 %extract, 7
  store i32 %out, i32* %dest, align 4
  ret void
}

; Same as udivCase, with a signed division.
; IR-BOTH-LABEL: @sdivCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sdiv i32 [[EXTRACT]], 7
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = sdiv <2 x i32> [[LOAD]], <i32 7, i32 7>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @sdivCase(<2 x i32>* %addr1, i32* %dest) {
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 1
  %out = sdiv i32 %extract, 7
  store i32 %out, i32* %dest, align 4
  ret void
}

; Same as udivCase, with a signed remainder.
; IR-BOTH-LABEL: @sremCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 [[EXTRACT]], 7
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = srem <2 x i32> [[LOAD]], <i32 7, i32 7>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @sremCase(<2 x i32>* %addr1, i32* %dest) {
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 1
  %out = srem i32 %extract, 7
  store i32 %out, i32* %dest, align 4
  ret void
}

; Same as udivCase, with a floating point division.
; IR-BOTH-LABEL: @fdivCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, <2 x float>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fdiv float [[EXTRACT]], 7.0
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fdiv <2 x float> [[LOAD]], <float 7.000000e+00, float 7.000000e+00>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store float [[RES]], float* %dest
; IR-BOTH-NEXT: ret
define void @fdivCase(<2 x float>* %addr1, float* %dest) {
  %in1 = load <2 x float>, <2 x float>* %addr1, align 8
  %extract = extractelement <2 x float> %in1, i32 1
  %out = fdiv float %extract, 7.0
  store float %out, float* %dest, align 4
  ret void
}

; Same as udivCase, with a floating point remainder.
; IR-BOTH-LABEL: @fremCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, <2 x float>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem float [[EXTRACT]], 7.0
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem <2 x float> [[LOAD]], <float 7.000000e+00, float 7.000000e+00>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store float [[RES]], float* %dest
; IR-BOTH-NEXT: ret
define void @fremCase(<2 x float>* %addr1, float* %dest) {
  %in1 = load <2 x float>, <2 x float>* %addr1, align 8
  %extract = extractelement <2 x float> %in1, i32 1
  %out = frem float %extract, 7.0
  store float %out, float* %dest, align 4
  ret void
}

; Check that we do not promote when we may introduce undefined behavior
; like division by zero.
; IR-BOTH-LABEL: @undefDivCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 7, [[EXTRACT]]
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @undefDivCase(<2 x i32>* %addr1, i32* %dest) {
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 1
  %out = udiv i32 7, %extract
  store i32 %out, i32* %dest, align 4
  ret void
}


; Check that we do not promote when we may introduce undefined behavior
; like division by zero.
; IR-BOTH-LABEL: @undefRemCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 7, [[EXTRACT]]
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @undefRemCase(<2 x i32>* %addr1, i32* %dest) {
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 1
  %out = srem i32 7, %extract
  store i32 %out, i32* %dest, align 4
  ret void
}

; Check that we use an undef mask for undefined behavior if the fast-math
; flag is set.
; IR-BOTH-LABEL: @undefConstantFRemCaseWithFastMath
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, <2 x float>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float [[EXTRACT]], 7.0
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> [[LOAD]], <float undef, float 7.000000e+00>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store float [[RES]], float* %dest
; IR-BOTH-NEXT: ret
define void @undefConstantFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
  %in1 = load <2 x float>, <2 x float>* %addr1, align 8
  %extract = extractelement <2 x float> %in1, i32 1
  %out = frem nnan float %extract, 7.0
  store float %out, float* %dest, align 4
  ret void
}

; Check that we use an undef mask for undefined behavior if the fast-math
; flag is set.
; IR-BOTH-LABEL: @undefVectorFRemCaseWithFastMath
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, <2 x float>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float 7.000000e+00, [[EXTRACT]]
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> <float undef, float 7.000000e+00>, [[LOAD]]
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store float [[RES]], float* %dest
; IR-BOTH-NEXT: ret
define void @undefVectorFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
  %in1 = load <2 x float>, <2 x float>* %addr1, align 8
  %extract = extractelement <2 x float> %in1, i32 1
  %out = frem nnan float 7.0, %extract
  store float %out, float* %dest, align 4
  ret void
}

; Check that we are able to promote floating point values.
; This requires the STRESS mode, as floating point values are
; not promoted on armv7.
; IR-BOTH-LABEL: @simpleOneInstructionPromotionFloat
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, <2 x float>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd float [[EXTRACT]], 1.0
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fadd <2 x float> [[LOAD]], <float undef, float 1.000000e+00>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store float [[RES]], float* %dest
; IR-BOTH-NEXT: ret
define void @simpleOneInstructionPromotionFloat(<2 x float>* %addr1, float* %dest) {
  %in1 = load <2 x float>, <2 x float>* %addr1, align 8
  %extract = extractelement <2 x float> %in1, i32 1
  %out = fadd float %extract, 1.0
  store float %out, float* %dest, align 4
  ret void
}

; Check that we correctly use a splat constant when we cannot
; determine at compile time the index of the extract.
; This requires the STRESS mode, as variable indices are expensive
; to lower.
; IR-BOTH-LABEL: @simpleOneInstructionPromotionVariableIdx
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 %idx
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
; Vector version:
; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 1>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[OR]], i32 %idx
;
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @simpleOneInstructionPromotionVariableIdx(<2 x i32>* %addr1, i32* %dest, i32 %idx) {
  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 %idx
  %out = or i32 %extract, 1
  store i32 %out, i32* %dest, align 4
  ret void
}

; Check a vector with more than 2 elements.
; This requires the STRESS mode because currently 'or v8i8' is not marked
; as legal or custom, although the actual assembly is better if we were
; promoting it.
; IR-BOTH-LABEL: @simpleOneInstructionPromotion8x8
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <8 x i8>, <8 x i8>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i8 [[EXTRACT]], 1
; Vector version:
; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <8 x i8> [[LOAD]], <i8 undef, i8 1, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[OR]], i32 1
;
; IR-BOTH-NEXT: store i8 [[RES]], i8* %dest
; IR-BOTH-NEXT: ret
define void @simpleOneInstructionPromotion8x8(<8 x i8>* %addr1, i8* %dest) {
  %in1 = load <8 x i8>, <8 x i8>* %addr1, align 8
  %extract = extractelement <8 x i8> %in1, i32 1
  %out = or i8 %extract, 1
  store i8 %out, i8* %dest, align 4
  ret void
}

; Check that we optimized the sequence correctly when it can be
; lowered on a Q register.
; IR-BOTH-LABEL: @simpleOneInstructionPromotion4x32
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, <4 x i32>* %addr1
; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], <i32 undef, i32 1, i32 undef, i32 undef>
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[VECTOR_OR]], i32 1
; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
; IR-BOTH-NEXT: ret
;
; Make sure we got rid of any expensive vmov.32 instructions.
; ASM-LABEL: simpleOneInstructionPromotion4x32:
; ASM: vld1.64 {[[LOAD:d[0-9]+]], d{{[0-9]+}}}, [r0]
; The Q register used here must be [[LOAD]] / 2, but we cannot express that.
; ASM-NEXT: vorr.i32 q{{[0-9]+}}, #0x1
; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1]
; ASM-NEXT: bx
define void @simpleOneInstructionPromotion4x32(<4 x i32>* %addr1, i32* %dest) {
  %in1 = load <4 x i32>, <4 x i32>* %addr1, align 8
  %extract = extractelement <4 x i32> %in1, i32 1
  %out = or i32 %extract, 1
  store i32 %out, i32* %dest, align 1
  ret void
}