; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=AVX

declare float @fmaxf(float, float)
declare double @fmax(double, double)
declare x86_fp80 @fmaxl(x86_fp80, x86_fp80)
declare float @llvm.maxnum.f32(float, float)
declare double @llvm.maxnum.f64(double, double)
declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80)

declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>)


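; Note on the pattern checked throughout this file: fmax/llvm.maxnum have
; IEEE-754 maxNum semantics (if exactly one operand is NaN, the other operand
; is returned), while the x86 (v)max* instructions return their second source
; operand whenever the inputs are unordered. The lowering therefore places %x
; in the second-source position so a NaN %x falls out of the max, then uses an
; unordered self-compare of %x to select %y for that case. A sketch of the
; equivalent IR (illustration only, not part of the test input):
;   %xnan = fcmp uno float %x, %x
;   %max  = max(%y, %x)              ; maxss/maxps: returns %x if unordered
;   %z    = select i1 %xnan, float %y, float %max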
; CHECK-LABEL: @test_fmaxf
; SSE:         movaps %xmm0, %xmm2
; SSE-NEXT:    cmpunordss %xmm2, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    maxss %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX:         vmaxss %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
define float @test_fmaxf(float %x, float %y) {
  %z = call float @fmaxf(float %x, float %y) readnone
  ret float %z
}

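; With minsize, the inline compare-and-blend expansion is traded for a
; single tail call into libm.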
; CHECK-LABEL: @test_fmaxf_minsize
; CHECK:       jmp fmaxf
define float @test_fmaxf_minsize(float %x, float %y) minsize {
  %z = call float @fmaxf(float %x, float %y) readnone
  ret float %z
}

; FIXME: Doubles should be inlined similarly to floats.
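; An inlined f64 version would presumably mirror the f32 sequence with the
; double-precision opcodes (hypothetical: cmpunordsd / maxsd / andnpd / orpd
; for SSE, vmaxsd + vcmpunordsd + vblendvpd for AVX).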

; CHECK-LABEL: @test_fmax
; CHECK: jmp fmax
define double @test_fmax(double %x, double %y) {
  %z = call double @fmax(double %x, double %y) readnone
  ret double %z
}

; CHECK-LABEL: @test_fmaxl
; CHECK: callq fmaxl
define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) {
  %z = call x86_fp80 @fmaxl(x86_fp80 %x, x86_fp80 %y) readnone
  ret x86_fp80 %z
}

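; The llvm.maxnum.* intrinsic tests below expect exactly the same lowering as
; the recognized libm calls above, so the check lines match the earlier ones.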
; CHECK-LABEL: @test_intrinsic_fmaxf
; SSE:         movaps %xmm0, %xmm2
; SSE-NEXT:    cmpunordss %xmm2, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    maxss %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX:         vmaxss %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
define float @test_intrinsic_fmaxf(float %x, float %y) {
  %z = call float @llvm.maxnum.f32(float %x, float %y) readnone
  ret float %z
}

; FIXME: Doubles should be inlined similarly to floats.

; CHECK-LABEL: @test_intrinsic_fmax
; CHECK: jmp fmax
define double @test_intrinsic_fmax(double %x, double %y) {
  %z = call double @llvm.maxnum.f64(double %x, double %y) readnone
  ret double %z
}

; CHECK-LABEL: @test_intrinsic_fmaxl
; CHECK: callq fmaxl
define x86_fp80 @test_intrinsic_fmaxl(x86_fp80 %x, x86_fp80 %y) {
  %z = call x86_fp80 @llvm.maxnum.f80(x86_fp80 %x, x86_fp80 %y) readnone
  ret x86_fp80 %z
}

; FIXME: This should not be doing 4 scalar ops on a 2-element vector.
; FIXME: This should use vector ops (maxps / cmpps).
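; A packed lowering would need just one max/compare/blend for the whole
; vector. A hypothetical SSE sequence, mirroring the scalar pattern per lane:
;   movaps     %xmm0, %xmm2    ; copy %x
;   cmpunordps %xmm2, %xmm2    ; per-lane mask: lane of %x is NaN
;   movaps     %xmm2, %xmm3
;   andps      %xmm1, %xmm3    ; take %y in the NaN lanes
;   maxps      %xmm0, %xmm1    ; max(%y, %x); unordered lanes yield %x
;   andnps     %xmm1, %xmm2    ; take the max in the ordered lanes
;   orps       %xmm3, %xmm2    ; combine
;   movaps     %xmm2, %xmm0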

; CHECK-LABEL: @test_intrinsic_fmax_v2f32
; SSE:         movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT:    movaps %xmm3, %xmm4
; SSE-NEXT:    cmpunordss %xmm4, %xmm4
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    andps %xmm2, %xmm5
; SSE-NEXT:    maxss %xmm3, %xmm2
; SSE-NEXT:    andnps %xmm2, %xmm4
; SSE-NEXT:    orps %xmm5, %xmm4
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; SSE-NEXT:    movaps %xmm0, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1,2,3]
; SSE-NEXT:    movaps %xmm5, %xmm3
; SSE-NEXT:    cmpunordss %xmm3, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm6
; SSE-NEXT:    andps %xmm2, %xmm6
; SSE-NEXT:    maxss %xmm5, %xmm2
; SSE-NEXT:    andnps %xmm2, %xmm3
; SSE-NEXT:    orps %xmm6, %xmm3
; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    cmpunordss %xmm2, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    andps %xmm1, %xmm4
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    maxss %xmm0, %xmm5
; SSE-NEXT:    andnps %xmm5, %xmm2
; SSE-NEXT:    orps %xmm4, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
; SSE-NEXT:    movapd %xmm0, %xmm4
; SSE-NEXT:    cmpunordss %xmm4, %xmm4
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    andps %xmm1, %xmm5
; SSE-NEXT:    maxss %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm1, %xmm4
; SSE-NEXT:    orps %xmm5, %xmm4
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX:         vmaxss %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm3
; AVX-NEXT:    vblendvps %xmm3, %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm3, %xmm4, %xmm5
; AVX-NEXT:    vcmpunordss %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vblendvps %xmm3, %xmm4, %xmm5, %xmm3
; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vmaxss %xmm3, %xmm4, %xmm5
; AVX-NEXT:    vcmpunordss %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vblendvps %xmm3, %xmm4, %xmm5, %xmm3
; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm3
; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm3, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX-NEXT:    retq
define <2 x float> @test_intrinsic_fmax_v2f32(<2 x float> %x, <2 x float> %y) {
  %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) readnone
  ret <2 x float> %z
}

; FIXME: This should use vector ops (maxps / cmpps).
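; A hypothetical AVX packed version is just the scalar three-instruction
; pattern with the ps forms:
;   vmaxps      %xmm0, %xmm1, %xmm2         ; max(%y, %x); NaN lanes yield %x
;   vcmpunordps %xmm0, %xmm0, %xmm0         ; per-lane NaN mask for %x
;   vblendvps   %xmm0, %xmm1, %xmm2, %xmm0  ; pick %y in the NaN lanes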

; CHECK-LABEL: @test_intrinsic_fmax_v4f32
; SSE:         movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT:    movaps %xmm3, %xmm4
; SSE-NEXT:    cmpunordss %xmm4, %xmm4
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    andps %xmm2, %xmm5
; SSE-NEXT:    maxss %xmm3, %xmm2
; SSE-NEXT:    andnps %xmm2, %xmm4
; SSE-NEXT:    orps %xmm5, %xmm4
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; SSE-NEXT:    movaps %xmm0, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1,2,3]
; SSE-NEXT:    movaps %xmm5, %xmm3
; SSE-NEXT:    cmpunordss %xmm3, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm6
; SSE-NEXT:    andps %xmm2, %xmm6
; SSE-NEXT:    maxss %xmm5, %xmm2
; SSE-NEXT:    andnps %xmm2, %xmm3
; SSE-NEXT:    orps %xmm6, %xmm3
; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    cmpunordss %xmm2, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    andps %xmm1, %xmm4
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    maxss %xmm0, %xmm5
; SSE-NEXT:    andnps %xmm5, %xmm2
; SSE-NEXT:    orps %xmm4, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
; SSE-NEXT:    movapd %xmm0, %xmm4
; SSE-NEXT:    cmpunordss %xmm4, %xmm4
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    andps %xmm1, %xmm5
; SSE-NEXT:    maxss %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm1, %xmm4
; SSE-NEXT:    orps %xmm5, %xmm4
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX:         vmaxss %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm3
; AVX-NEXT:    vblendvps %xmm3, %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-NEXT:    vmaxss %xmm3, %xmm4, %xmm5
; AVX-NEXT:    vcmpunordss %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vblendvps %xmm3, %xmm4, %xmm5, %xmm3
; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vmaxss %xmm3, %xmm4, %xmm5
; AVX-NEXT:    vcmpunordss %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vblendvps %xmm3, %xmm4, %xmm5, %xmm3
; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm3
; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm3, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX-NEXT:    retq
define <4 x float> @test_intrinsic_fmax_v4f32(<4 x float> %x, <4 x float> %y) {
  %z = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) readnone
  ret <4 x float> %z
}

; FIXME: Vector of doubles should be inlined similarly to vector of floats.
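; A packed-double expansion would presumably follow the f32 sketches above,
; using the pd opcodes (maxpd / cmpunordpd / andnpd / orpd, or
; vmaxpd / vcmpunordpd / vblendvpd) instead of one fmax libcall per element.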

; CHECK-LABEL: @test_intrinsic_fmax_v2f64
; CHECK: callq fmax
; CHECK: callq fmax
define <2 x double> @test_intrinsic_fmax_v2f64(<2 x double> %x, <2 x double> %y) {
  %z = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> %y) readnone
  ret <2 x double> %z
}

; FIXME: Vector of doubles should be inlined similarly to vector of floats.

; CHECK-LABEL: @test_intrinsic_fmax_v4f64
; CHECK: callq fmax
; CHECK: callq fmax
; CHECK: callq fmax
; CHECK: callq fmax
define <4 x double> @test_intrinsic_fmax_v4f64(<4 x double> %x, <4 x double> %y) {
  %z = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
  ret <4 x double> %z
}

; FIXME: Vector of doubles should be inlined similarly to vector of floats.

; CHECK-LABEL: @test_intrinsic_fmax_v8f64
; CHECK: callq fmax
; CHECK: callq fmax
; CHECK: callq fmax
; CHECK: callq fmax
; CHECK: callq fmax
; CHECK: callq fmax
; CHECK: callq fmax
; CHECK: callq fmax
define <8 x double> @test_intrinsic_fmax_v8f64(<8 x double> %x, <8 x double> %y) {
  %z = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %x, <8 x double> %y) readnone
  ret <8 x double> %z
}