; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
;
; <rdar://problem/14477220>

%class.Complex = type { float, float }

; Check that independent slices lead to independent loads, and that each slice then
; lands directly in the right register file (no register bank copy).
;
; The layout is:
; LSB 0 1 2 3 | 4 5 6 7 MSB
;       Low      High
; The base address points to 0 and is 8-byte aligned.
; Low slice starts at 0 (base) and is 8-byte aligned.
; High slice starts at 4 (base + 4 bytes) and is 4-byte aligned.
;
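; As an illustrative sketch only (hypothetical value names, not part of the checked IR
; below), the load slicing conceptually replaces the single 64-bit load plus the
; trunc/lshr extraction in @t1 with one independent 32-bit load per slice:
;
;   %low.addr  = getelementptr inbounds %class.Complex, %class.Complex* %arrayidx, i64 0, i32 0
;   %low       = load float, float* %low.addr, align 8   ; low slice, bytes 0-3 (base + 0)
;   %high.addr = getelementptr inbounds %class.Complex, %class.Complex* %arrayidx, i64 0, i32 1
;   %high      = load float, float* %high.addr, align 4  ; high slice, bytes 4-7 (base + 4)
;
; Each slice can then be folded directly into its vaddss, so no GPR-to-XMM copy is needed.
;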
; STRESS-LABEL: t1:
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; STRESS: vmovss 64([[BASE:[^(]+]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Swap Imm and Real.
; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; STRESS-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
;
; Same for REGULAR: we eliminate the register bank copy for each slice.
; REGULAR-LABEL: t1:
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Swap Imm and Real.
; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; REGULAR-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
entry:
  %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
  ; Load out[out_start] as a single 64-bit chunk, then extract the two 32-bit halves.
  %tmp = bitcast %class.Complex* %arrayidx to i64*
  %tmp1 = load i64, i64* %tmp, align 8
  ; Low 32 bits: the real part.
  %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
  %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
  ; High 32 bits: the imaginary part.
  %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
  %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
  %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
  ; Add each half to the corresponding field of out[out_start + 8] and store the
  ; result back into out[out_start].
  %add = add i64 %out_start, 8
  %arrayidx2 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add
  %i.i = getelementptr inbounds %class.Complex, %class.Complex* %arrayidx2, i64 0, i32 0
  %tmp4 = load float, float* %i.i, align 4
  %add.i = fadd float %tmp4, %tmp2
  %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
  %r.i = getelementptr inbounds %class.Complex, %class.Complex* %arrayidx2, i64 0, i32 1
  %tmp5 = load float, float* %r.i, align 4
  %add5.i = fadd float %tmp5, %tmp3
  %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
  %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
  store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
  ret void
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1

; Function Attrs: nounwind
declare void @llvm.lifetime.start(i64, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture)

; Check that we do not read outside of the chunk of bits of the original load.
;
; The 64-bit load should have been split into one 32-bit slice and one 16-bit slice.
; The 16-bit slice should be zero-extended to match the final type.
;
; The memory layout is:
; LSB 0 1 2 3 | 4 5 | 6 7 MSB
;      Low            High
; The base address points to 0 and is 8-byte aligned.
; Low slice starts at 0 (base) and is 8-byte aligned.
; High slice starts at 6 (base + 6 bytes) and is 2-byte aligned.
;
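; Illustrative sketch only (hypothetical value names, not part of the checked IR below):
; after slicing, the i64 load plus the shift/trunc extraction in @t2 conceptually becomes
; a 32-bit load of bytes 0-3 and a zero-extended 16-bit load of bytes 6-7:
;
;   %low.addr  = bitcast %class.Complex* %arrayidx to i32*
;   %low       = load i32, i32* %low.addr, align 8       ; low slice, bytes 0-3
;   %base.i8   = bitcast %class.Complex* %arrayidx to i8*
;   %high.i8   = getelementptr inbounds i8, i8* %base.i8, i64 6
;   %high.addr = bitcast i8* %high.i8 to i16*
;   %high16    = load i16, i16* %high.addr, align 2      ; high slice, bytes 6-7
;   %high      = zext i16 %high16 to i32                 ; zero-extend to the final type
;   %res       = add i32 %high, %low
;
; The movzwl/addl pair checked below is the folded form of these two loads.
;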
; STRESS-LABEL: t2:
; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
; STRESS-NEXT: addl ([[BASE]]), %eax
; STRESS-NEXT: ret
;
; For the REGULAR heuristic, it is not profitable to slice things that are not
; next to each other in memory. Here we have a hole at bytes #4-5.
; REGULAR-LABEL: t2:
; REGULAR: shrq $48
define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
  %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
  %bitcast = bitcast %class.Complex* %arrayidx to i64*
  %chunk64 = load i64, i64* %bitcast, align 8
  %slice32_low = trunc i64 %chunk64 to i32
  %shift48 = lshr i64 %chunk64, 48
  %slice32_high = trunc i64 %shift48 to i32
  %res = add i32 %slice32_high, %slice32_low
  ret i32 %res
}

; Check that we do not optimize overlapping slices.
;
; The 64-bit load should NOT have been split, because the slices overlap.
; First slice uses bytes numbered 0 to 3.
; Second slice uses bytes numbered 6 and 7.
; Third slice uses bytes numbered 4 to 7.
;
; STRESS-LABEL: t3:
; STRESS: shrq $48
; STRESS: shrq $32
;
; REGULAR-LABEL: t3:
; REGULAR: shrq $48
; REGULAR: shrq $32
define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
  %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
  %bitcast = bitcast %class.Complex* %arrayidx to i64*
  %chunk64 = load i64, i64* %bitcast, align 8
  %slice32_low = trunc i64 %chunk64 to i32
  %shift48 = lshr i64 %chunk64, 48
  %slice32_high = trunc i64 %shift48 to i32
  %shift32 = lshr i64 %chunk64, 32
  %slice32_lowhigh = trunc i64 %shift32 to i32
  %tmpres = add i32 %slice32_high, %slice32_low
  %res = add i32 %slice32_lowhigh, %tmpres
  ret i32 %res
}