; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s
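; These tests check how Early CSE handles the AArch64 NEON structured
; load/store intrinsics: an ld2 whose result is already available from a
; preceding st2 or ld2 at the same address is removed, an st2 that is
; immediately overwritten by another st2 at the same address is removed,
; and an intervening unrelated store or a mismatched intrinsic pair
; (st2 vs. ld3/st3) keeps the calls in place.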

define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second, redundant @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %1 = bitcast i32* %a to i8*
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}


define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  store i32 0, i32* %b, align 4
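; The store to %b above may alias the memory written by the st2, so Early CSE
; cannot forward the stored values to the ld2 below.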
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due to
; the mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; the mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)

define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}