Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
      4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
      5 
      6 ; 'signum' test cases (PR13248)
      7 
      8 ;
      9 ; generic implementation for 128-bit vectors
     10 ;
     11 
     12 define void @signum32a(<4 x float>*) {
        ; Elementwise signum of the <4 x float> at %0, stored back in place.
        ; The IR computes (x < 0) - (x > 0); because sitofp of an i1 'true'
        ; yields -1.0, the fsub produces -1.0 / 0.0 / +1.0 (0.0 for NaN too,
        ; since both ordered compares are false).  CHECK lines below are
        ; autogenerated by update_llc_test_checks.py -- do not hand-edit.
     13 ; AVX-LABEL: signum32a:
     14 ; AVX:       # %bb.0: # %entry
     15 ; AVX-NEXT:    vmovaps (%rdi), %xmm0
     16 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
     17 ; AVX-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
     18 ; AVX-NEXT:    vcvtdq2ps %xmm2, %xmm2
     19 ; AVX-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
     20 ; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
     21 ; AVX-NEXT:    vsubps %xmm0, %xmm2, %xmm0
     22 ; AVX-NEXT:    vmovaps %xmm0, (%rdi)
     23 ; AVX-NEXT:    retq
     24 entry:
     25   %1 = load <4 x float>, <4 x float>* %0
     26   %2 = fcmp olt <4 x float> %1, zeroinitializer
     27   %3 = sitofp <4 x i1> %2 to <4 x float>
     28   %4 = fcmp ogt <4 x float> %1, zeroinitializer
     29   %5 = sitofp <4 x i1> %4 to <4 x float>
     30   %6 = fsub <4 x float> %3, %5
     31   store <4 x float> %6, <4 x float>* %0
     32   ret void
     33 }
     34 
     35 define void @signum64a(<2 x double>*) {
        ; Same (x < 0) - (x > 0) signum pattern as @signum32a, for <2 x double>.
        ; The vpermilps in the expected code narrows the 2 x i64 compare mask to
        ; the 2 low dwords so vcvtdq2pd can widen them back to doubles.
        ; CHECK lines are autogenerated -- do not hand-edit.
     36 ; AVX-LABEL: signum64a:
     37 ; AVX:       # %bb.0: # %entry
     38 ; AVX-NEXT:    vmovapd (%rdi), %xmm0
     39 ; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
     40 ; AVX-NEXT:    vcmpltpd %xmm1, %xmm0, %xmm2
     41 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3]
     42 ; AVX-NEXT:    vcvtdq2pd %xmm2, %xmm2
     43 ; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
     44 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
     45 ; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
     46 ; AVX-NEXT:    vsubpd %xmm0, %xmm2, %xmm0
     47 ; AVX-NEXT:    vmovapd %xmm0, (%rdi)
     48 ; AVX-NEXT:    retq
     49 entry:
     50   %1 = load <2 x double>, <2 x double>* %0
     51   %2 = fcmp olt <2 x double> %1, zeroinitializer
     52   %3 = sitofp <2 x i1> %2 to <2 x double>
     53   %4 = fcmp ogt <2 x double> %1, zeroinitializer
     54   %5 = sitofp <2 x i1> %4 to <2 x double>
     55   %6 = fsub <2 x double> %3, %5
     56   store <2 x double> %6, <2 x double>* %0
     57   ret void
     58 }
     59 
     60 ;
     61 ; generic implementation for 256-bit vectors
     62 ;
     63 
     64 define void @signum32b(<8 x float>*) {
        ; 256-bit version of @signum32a: (x < 0) - (x > 0) over <8 x float>.
        ; Mask element width matches float width, so the i1 masks convert with a
        ; single vcvtdq2ps and no narrowing shuffle; vzeroupper is expected
        ; before returning since ymm registers were used.
        ; CHECK lines are autogenerated -- do not hand-edit.
     65 ; AVX-LABEL: signum32b:
     66 ; AVX:       # %bb.0: # %entry
     67 ; AVX-NEXT:    vmovaps (%rdi), %ymm0
     68 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
     69 ; AVX-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
     70 ; AVX-NEXT:    vcvtdq2ps %ymm2, %ymm2
     71 ; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
     72 ; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
     73 ; AVX-NEXT:    vsubps %ymm0, %ymm2, %ymm0
     74 ; AVX-NEXT:    vmovaps %ymm0, (%rdi)
     75 ; AVX-NEXT:    vzeroupper
     76 ; AVX-NEXT:    retq
     77 entry:
     78   %1 = load <8 x float>, <8 x float>* %0
     79   %2 = fcmp olt <8 x float> %1, zeroinitializer
     80   %3 = sitofp <8 x i1> %2 to <8 x float>
     81   %4 = fcmp ogt <8 x float> %1, zeroinitializer
     82   %5 = sitofp <8 x i1> %4 to <8 x float>
     83   %6 = fsub <8 x float> %3, %5
     84   store <8 x float> %6, <8 x float>* %0
     85   ret void
     86 }
     87 
     88 define void @signum64b(<4 x double>*) {
        ; 256-bit double signum: (x < 0) - (x > 0) over <4 x double>.
        ; Codegen differs per subtarget in how the 4 x i64 compare mask is
        ; narrowed to 4 x i32 before vcvtdq2pd:
        ;   AVX1/AVX2: vextractf128 + vpackssdw of the two 128-bit halves
        ;   AVX512F:   a single vpmovqd truncate
        ; CHECK lines are autogenerated -- do not hand-edit.
     89 ; AVX1-LABEL: signum64b:
     90 ; AVX1:       # %bb.0: # %entry
     91 ; AVX1-NEXT:    vmovapd (%rdi), %ymm0
     92 ; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
     93 ; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
     94 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
     95 ; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
     96 ; AVX1-NEXT:    vcvtdq2pd %xmm2, %ymm2
     97 ; AVX1-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
     98 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     99 ; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
    100 ; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
    101 ; AVX1-NEXT:    vsubpd %ymm0, %ymm2, %ymm0
    102 ; AVX1-NEXT:    vmovapd %ymm0, (%rdi)
    103 ; AVX1-NEXT:    vzeroupper
    104 ; AVX1-NEXT:    retq
    105 ;
    106 ; AVX2-LABEL: signum64b:
    107 ; AVX2:       # %bb.0: # %entry
    108 ; AVX2-NEXT:    vmovapd (%rdi), %ymm0
    109 ; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    110 ; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
    111 ; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
    112 ; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
    113 ; AVX2-NEXT:    vcvtdq2pd %xmm2, %ymm2
    114 ; AVX2-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
    115 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
    116 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
    117 ; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
    118 ; AVX2-NEXT:    vsubpd %ymm0, %ymm2, %ymm0
    119 ; AVX2-NEXT:    vmovapd %ymm0, (%rdi)
    120 ; AVX2-NEXT:    vzeroupper
    121 ; AVX2-NEXT:    retq
    122 ;
    123 ; AVX512F-LABEL: signum64b:
    124 ; AVX512F:       # %bb.0: # %entry
    125 ; AVX512F-NEXT:    vmovapd (%rdi), %ymm0
    126 ; AVX512F-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    127 ; AVX512F-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
    128 ; AVX512F-NEXT:    vpmovqd %zmm2, %ymm2
    129 ; AVX512F-NEXT:    vcvtdq2pd %xmm2, %ymm2
    130 ; AVX512F-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
    131 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
    132 ; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
    133 ; AVX512F-NEXT:    vsubpd %ymm0, %ymm2, %ymm0
    134 ; AVX512F-NEXT:    vmovapd %ymm0, (%rdi)
    135 ; AVX512F-NEXT:    vzeroupper
    136 ; AVX512F-NEXT:    retq
    137 entry:
    138   %1 = load <4 x double>, <4 x double>* %0
    139   %2 = fcmp olt <4 x double> %1, zeroinitializer
    140   %3 = sitofp <4 x i1> %2 to <4 x double>
    141   %4 = fcmp ogt <4 x double> %1, zeroinitializer
    142   %5 = sitofp <4 x i1> %4 to <4 x double>
    143   %6 = fsub <4 x double> %3, %5
    144   store <4 x double> %6, <4 x double>* %0
    145   ret void
    146 }
    147 
    148 ;
    149 ; implementation using AVX intrinsics for 256-bit vectors
    150 ;
    151 
    152 define void @signum32c(<8 x float>*) {
        ; Same result as @signum32b, but written with AVX compare intrinsics:
        ; cmp.ps.256 with imm8 1 is an ordered less-than compare (see the
        ; expected vcmpltps), so %2 masks x < 0 and %5 (operands swapped)
        ; masks x > 0.  The all-ones mask bitcast to i32 is -1, so sitofp and
        ; the final fsub again give -1.0 / 0.0 / +1.0.
        ; CHECK lines are autogenerated -- do not hand-edit.
    153 ; AVX-LABEL: signum32c:
    154 ; AVX:       # %bb.0: # %entry
    155 ; AVX-NEXT:    vmovaps (%rdi), %ymm0
    156 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    157 ; AVX-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
    158 ; AVX-NEXT:    vcvtdq2ps %ymm2, %ymm2
    159 ; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
    160 ; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
    161 ; AVX-NEXT:    vsubps %ymm0, %ymm2, %ymm0
    162 ; AVX-NEXT:    vmovaps %ymm0, (%rdi)
    163 ; AVX-NEXT:    vzeroupper
    164 ; AVX-NEXT:    retq
    165 entry:
    166   %1 = load <8 x float>, <8 x float>* %0
    167   %2 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %1, <8 x float> zeroinitializer, i8 1)
    168   %3 = bitcast <8 x float> %2 to <8 x i32>
    169   %4 = sitofp <8 x i32> %3 to <8 x float>
    170   %5 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %1, i8 1)
    171   %6 = bitcast <8 x float> %5 to <8 x i32>
    172   %7 = sitofp <8 x i32> %6 to <8 x float>
    173   %8 = fsub <8 x float> %4, %7
    174   store <8 x float> %8, <8 x float>* %0
    175   ret void
    176 }
    177 
    178 define void @signum64c(<4 x double>*) {
        ; Intrinsic-based double signum with the subtraction done in the integer
        ; domain: the two vcmpltpd masks are subtracted as <8 x i32>, then the
        ; low dword of each 64-bit mask lane (elements 0,2,4,6) is gathered by
        ; the shufflevector and widened with cvtdq2.pd.256.
        ; NOTE(review): the operand order makes %xgt the x < 0 mask and %xlt
        ; the x > 0 mask (imm8 1 compares first-operand < second-operand, per
        ; the expected vcmpltpd), so the value names read swapped.
        ; CHECK lines are autogenerated -- do not hand-edit.
    179 ; AVX1-LABEL: signum64c:
    180 ; AVX1:       # %bb.0: # %entry
    181 ; AVX1-NEXT:    vmovapd (%rdi), %ymm0
    182 ; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    183 ; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
    184 ; AVX1-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
    185 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    186 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
    187 ; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
    188 ; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
    189 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    190 ; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
    191 ; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
    192 ; AVX1-NEXT:    vzeroupper
    193 ; AVX1-NEXT:    retq
    194 ;
    195 ; AVX2-LABEL: signum64c:
    196 ; AVX2:       # %bb.0: # %entry
    197 ; AVX2-NEXT:    vmovapd (%rdi), %ymm0
    198 ; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    199 ; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
    200 ; AVX2-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
    201 ; AVX2-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
    202 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    203 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    204 ; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
    205 ; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
    206 ; AVX2-NEXT:    vzeroupper
    207 ; AVX2-NEXT:    retq
    208 ;
    209 ; AVX512F-LABEL: signum64c:
    210 ; AVX512F:       # %bb.0: # %entry
    211 ; AVX512F-NEXT:    vmovapd (%rdi), %ymm0
    212 ; AVX512F-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    213 ; AVX512F-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
    214 ; AVX512F-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
    215 ; AVX512F-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
    216 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    217 ; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    218 ; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
    219 ; AVX512F-NEXT:    vmovaps %ymm0, (%rdi)
    220 ; AVX512F-NEXT:    vzeroupper
    221 ; AVX512F-NEXT:    retq
    222 entry:
    223   %x = load <4 x double>, <4 x double>* %0
    224   %xgt = tail call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %x, <4 x double> zeroinitializer, i8 1)
    225   %igt = bitcast <4 x double> %xgt to <8 x i32>
    226   %xlt = tail call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %x, i8 1)
    227   %ilt = bitcast <4 x double> %xlt to <8 x i32>
    228   ; it is important to use %isign twice as the shufflevector source in order to make LLVM use a shuffle operation
    229   %isign = sub <8 x i32> %igt, %ilt
    230   %ssign = shufflevector <8 x i32> %isign, <8 x i32> %isign, <4 x i32> <i32 0, i32 2, i32 12, i32 14>
    231   %sign = tail call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %ssign)
    232   store <4 x double> %sign, <4 x double>* %0
    233   ret void
    234 }
    235 
    236 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
    237 
    238 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
    239 
    240 declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
    241