; (code-browser navigation header, not part of the test: Home | History | Annotate | Download | only in X86)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=X32 -check-prefix=X32-KNL
      3 ; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx | FileCheck %s -check-prefix=X32 -check-prefix=X32-SKX
      4 ; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck %s -check-prefix=WIN32 -check-prefix=WIN32-KNL
      5 ; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=skx | FileCheck %s -check-prefix=WIN32 -check-prefix=WIN32-SKX
      6 ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck %s -check-prefix=WIN64 -check-prefix=WIN64-KNL
      7 ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=skx | FileCheck %s -check-prefix=WIN64 -check-prefix=WIN64-SKX
      8 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=X64 -check-prefix=X64-KNL
      9 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s -check-prefix=X64 -check-prefix=X64-SKX
     10 
; External callees exercised by the tests below. @func_float16_ptr returns a
; vector and takes a pointer argument which the tests reload through after the
; call; @func_float16 is a plain two-vector callee. @func_int is not referenced
; in the visible portion of this file.
      11 declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
      12 declare <16 x float> @func_float16(<16 x float>, <16 x float>)
      13 declare i32 @func_int(i32, i32)
     14 
      15 ; test calling conventions - input parameters
; Input-parameter test for the intel_ocl_bicc convention: %a+%b is handed to
; @func_float16_ptr in zmm0 (on Win64 the vectors arrive by pointer in
; rcx/rdx, per the checks) together with the address of stack slot %y; the
; value reloaded from %y after the call is added to the call's return value.
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file) and must not be hand-edited.
      16 define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
      17 ; X32-LABEL: testf16_inp:
      18 ; X32:       ## %bb.0:
      19 ; X32-NEXT:    pushl %ebp
      20 ; X32-NEXT:    movl %esp, %ebp
      21 ; X32-NEXT:    andl $-64, %esp
      22 ; X32-NEXT:    subl $192, %esp
      23 ; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
      24 ; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
      25 ; X32-NEXT:    movl %eax, (%esp)
      26 ; X32-NEXT:    calll _func_float16_ptr
      27 ; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
      28 ; X32-NEXT:    movl %ebp, %esp
      29 ; X32-NEXT:    popl %ebp
      30 ; X32-NEXT:    retl
      31 ;
      32 ; WIN32-LABEL: testf16_inp:
      33 ; WIN32:       # %bb.0:
      34 ; WIN32-NEXT:    pushl %ebp
      35 ; WIN32-NEXT:    movl %esp, %ebp
      36 ; WIN32-NEXT:    andl $-64, %esp
      37 ; WIN32-NEXT:    subl $128, %esp
      38 ; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
      39 ; WIN32-NEXT:    movl %esp, %eax
      40 ; WIN32-NEXT:    pushl %eax
      41 ; WIN32-NEXT:    calll _func_float16_ptr
      42 ; WIN32-NEXT:    addl $4, %esp
      43 ; WIN32-NEXT:    vaddps (%esp), %zmm0, %zmm0
      44 ; WIN32-NEXT:    movl %ebp, %esp
      45 ; WIN32-NEXT:    popl %ebp
      46 ; WIN32-NEXT:    retl
      47 ;
      48 ; WIN64-LABEL: testf16_inp:
      49 ; WIN64:       # %bb.0:
      50 ; WIN64-NEXT:    pushq %rbp
      51 ; WIN64-NEXT:    subq $176, %rsp
      52 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
      53 ; WIN64-NEXT:    andq $-64, %rsp
      54 ; WIN64-NEXT:    vmovaps (%rcx), %zmm0
      55 ; WIN64-NEXT:    vaddps (%rdx), %zmm0, %zmm0
      56 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
      57 ; WIN64-NEXT:    callq func_float16_ptr
      58 ; WIN64-NEXT:    vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
      59 ; WIN64-NEXT:    leaq 48(%rbp), %rsp
      60 ; WIN64-NEXT:    popq %rbp
      61 ; WIN64-NEXT:    retq
      62 ;
      63 ; X64-LABEL: testf16_inp:
      64 ; X64:       ## %bb.0:
      65 ; X64-NEXT:    pushq %rbp
      66 ; X64-NEXT:    movq %rsp, %rbp
      67 ; X64-NEXT:    pushq %r13
      68 ; X64-NEXT:    pushq %r12
      69 ; X64-NEXT:    andq $-64, %rsp
      70 ; X64-NEXT:    subq $128, %rsp
      71 ; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
      72 ; X64-NEXT:    movq %rsp, %rdi
      73 ; X64-NEXT:    callq _func_float16_ptr
      74 ; X64-NEXT:    vaddps (%rsp), %zmm0, %zmm0
      75 ; X64-NEXT:    leaq -16(%rbp), %rsp
      76 ; X64-NEXT:    popq %r12
      77 ; X64-NEXT:    popq %r13
      78 ; X64-NEXT:    popq %rbp
      79 ; X64-NEXT:    retq
; IR: out-parameter slot %y, vector add, convention call, reload-from-%y + add.
      80   %y = alloca <16 x float>, align 16
      81   %x = fadd <16 x float> %a, %b
      82   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
      83   %2 = load <16 x float>, <16 x float>* %y, align 16
      84   %3 = fadd <16 x float> %2, %1
      85   ret <16 x float> %3
      86 }
     87 
      88 ; test calling conventions - preserved registers
     89 
; Preserved-register test: %b is live across the call (it feeds the fadd
; after the call returns). On the 32-bit targets it is spilled to the stack
; and reloaded ("64-byte Spill" / "Folded Reload" below); on the 64-bit
; targets it stays in zmm16 across the call, which the checks show surviving
; untouched — presumably callee-saved under this convention (the autogenerated
; output is the authority here).
      90 define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
      91 ; X32-LABEL: testf16_regs:
      92 ; X32:       ## %bb.0:
      93 ; X32-NEXT:    pushl %ebp
      94 ; X32-NEXT:    movl %esp, %ebp
      95 ; X32-NEXT:    andl $-64, %esp
      96 ; X32-NEXT:    subl $256, %esp ## imm = 0x100
      97 ; X32-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill
      98 ; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
      99 ; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
     100 ; X32-NEXT:    movl %eax, (%esp)
     101 ; X32-NEXT:    calll _func_float16_ptr
     102 ; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0 ## 64-byte Folded Reload
     103 ; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
     104 ; X32-NEXT:    movl %ebp, %esp
     105 ; X32-NEXT:    popl %ebp
     106 ; X32-NEXT:    retl
     107 ;
     108 ; WIN32-LABEL: testf16_regs:
     109 ; WIN32:       # %bb.0:
     110 ; WIN32-NEXT:    pushl %ebp
     111 ; WIN32-NEXT:    movl %esp, %ebp
     112 ; WIN32-NEXT:    andl $-64, %esp
     113 ; WIN32-NEXT:    subl $192, %esp
     114 ; WIN32-NEXT:    vmovaps %zmm1, (%esp) # 64-byte Spill
     115 ; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
     116 ; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
     117 ; WIN32-NEXT:    pushl %eax
     118 ; WIN32-NEXT:    calll _func_float16_ptr
     119 ; WIN32-NEXT:    addl $4, %esp
     120 ; WIN32-NEXT:    vaddps (%esp), %zmm0, %zmm0 # 64-byte Folded Reload
     121 ; WIN32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
     122 ; WIN32-NEXT:    movl %ebp, %esp
     123 ; WIN32-NEXT:    popl %ebp
     124 ; WIN32-NEXT:    retl
     125 ;
     126 ; WIN64-LABEL: testf16_regs:
     127 ; WIN64:       # %bb.0:
     128 ; WIN64-NEXT:    pushq %rbp
     129 ; WIN64-NEXT:    subq $176, %rsp
     130 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
     131 ; WIN64-NEXT:    andq $-64, %rsp
     132 ; WIN64-NEXT:    vmovaps (%rdx), %zmm16
     133 ; WIN64-NEXT:    vaddps (%rcx), %zmm16, %zmm0
     134 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
     135 ; WIN64-NEXT:    callq func_float16_ptr
     136 ; WIN64-NEXT:    vaddps %zmm16, %zmm0, %zmm0
     137 ; WIN64-NEXT:    vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
     138 ; WIN64-NEXT:    leaq 48(%rbp), %rsp
     139 ; WIN64-NEXT:    popq %rbp
     140 ; WIN64-NEXT:    retq
     141 ;
     142 ; X64-LABEL: testf16_regs:
     143 ; X64:       ## %bb.0:
     144 ; X64-NEXT:    pushq %rbp
     145 ; X64-NEXT:    movq %rsp, %rbp
     146 ; X64-NEXT:    pushq %r13
     147 ; X64-NEXT:    pushq %r12
     148 ; X64-NEXT:    andq $-64, %rsp
     149 ; X64-NEXT:    subq $128, %rsp
     150 ; X64-NEXT:    vmovaps %zmm1, %zmm16
     151 ; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
     152 ; X64-NEXT:    movq %rsp, %rdi
     153 ; X64-NEXT:    callq _func_float16_ptr
     154 ; X64-NEXT:    vaddps %zmm16, %zmm0, %zmm0
     155 ; X64-NEXT:    vaddps (%rsp), %zmm0, %zmm0
     156 ; X64-NEXT:    leaq -16(%rbp), %rsp
     157 ; X64-NEXT:    popq %r12
     158 ; X64-NEXT:    popq %r13
     159 ; X64-NEXT:    popq %rbp
     160 ; X64-NEXT:    retq
; IR: same shape as testf16_inp, but %b is also consumed after the call,
; forcing it to be preserved across the call boundary.
     161   %y = alloca <16 x float>, align 16
     162   %x = fadd <16 x float> %a, %b
     163   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
     164   %2 = load <16 x float>, <16 x float>* %y, align 16
     165   %3 = fadd <16 x float> %1, %b
     166   %4 = fadd <16 x float> %2, %3
     167   ret <16 x float> %4
     168 }
    169 
    170 ; test calling conventions - prolog and epilog
; Prologue/epilogue test: the function itself uses the intel_ocl_bicc
; convention, so it must save and restore everything that convention
; treats as callee-saved. The checks show mask registers k4-k7 and the
; high zmm registers being spilled before the call and reloaded after it.
; KNL spills masks with kmovw (2-byte) while SKX uses kmovq (8-byte),
; which is why the KNL and SKX check prefixes diverge here; the Win64 and
; Darwin targets also save different zmm ranges, per the checks below.
     171 define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
     172 ; X32-LABEL: test_prolog_epilog:
     173 ; X32:       ## %bb.0:
     174 ; X32-NEXT:    subl $12, %esp
     175 ; X32-NEXT:    calll _func_float16
     176 ; X32-NEXT:    addl $12, %esp
     177 ; X32-NEXT:    retl
     178 ;
     179 ; WIN32-LABEL: test_prolog_epilog:
     180 ; WIN32:       # %bb.0:
     181 ; WIN32-NEXT:    calll _func_float16
     182 ; WIN32-NEXT:    retl
     183 ;
     184 ; WIN64-KNL-LABEL: test_prolog_epilog:
     185 ; WIN64-KNL:       # %bb.0:
     186 ; WIN64-KNL-NEXT:    pushq %rbp
     187 ; WIN64-KNL-NEXT:    subq $1328, %rsp # imm = 0x530
     188 ; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
     189 ; WIN64-KNL-NEXT:    kmovw %k7, 1198(%rbp) # 2-byte Spill
     190 ; WIN64-KNL-NEXT:    kmovw %k6, 1196(%rbp) # 2-byte Spill
     191 ; WIN64-KNL-NEXT:    kmovw %k5, 1194(%rbp) # 2-byte Spill
     192 ; WIN64-KNL-NEXT:    kmovw %k4, 1192(%rbp) # 2-byte Spill
     193 ; WIN64-KNL-NEXT:    vmovaps %zmm21, 1104(%rbp) # 64-byte Spill
     194 ; WIN64-KNL-NEXT:    vmovaps %zmm20, 992(%rbp) # 64-byte Spill
     195 ; WIN64-KNL-NEXT:    vmovaps %zmm19, 896(%rbp) # 64-byte Spill
     196 ; WIN64-KNL-NEXT:    vmovaps %zmm18, 832(%rbp) # 64-byte Spill
     197 ; WIN64-KNL-NEXT:    vmovaps %zmm17, 768(%rbp) # 64-byte Spill
     198 ; WIN64-KNL-NEXT:    vmovaps %zmm16, 704(%rbp) # 64-byte Spill
     199 ; WIN64-KNL-NEXT:    vmovaps %zmm15, 640(%rbp) # 64-byte Spill
     200 ; WIN64-KNL-NEXT:    vmovaps %zmm14, 576(%rbp) # 64-byte Spill
     201 ; WIN64-KNL-NEXT:    vmovaps %zmm13, 512(%rbp) # 64-byte Spill
     202 ; WIN64-KNL-NEXT:    vmovaps %zmm12, 448(%rbp) # 64-byte Spill
     203 ; WIN64-KNL-NEXT:    vmovaps %zmm11, 384(%rbp) # 64-byte Spill
     204 ; WIN64-KNL-NEXT:    vmovaps %zmm10, 320(%rbp) # 64-byte Spill
     205 ; WIN64-KNL-NEXT:    vmovaps %zmm9, 256(%rbp) # 64-byte Spill
     206 ; WIN64-KNL-NEXT:    vmovaps %zmm8, 192(%rbp) # 64-byte Spill
     207 ; WIN64-KNL-NEXT:    vmovaps %zmm7, 128(%rbp) # 64-byte Spill
     208 ; WIN64-KNL-NEXT:    vmovaps %zmm6, 64(%rbp) # 64-byte Spill
     209 ; WIN64-KNL-NEXT:    andq $-64, %rsp
     210 ; WIN64-KNL-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
     211 ; WIN64-KNL-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
     212 ; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
     213 ; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
     214 ; WIN64-KNL-NEXT:    callq func_float16
     215 ; WIN64-KNL-NEXT:    vmovaps 64(%rbp), %zmm6 # 64-byte Reload
     216 ; WIN64-KNL-NEXT:    vmovaps 128(%rbp), %zmm7 # 64-byte Reload
     217 ; WIN64-KNL-NEXT:    vmovaps 192(%rbp), %zmm8 # 64-byte Reload
     218 ; WIN64-KNL-NEXT:    vmovaps 256(%rbp), %zmm9 # 64-byte Reload
     219 ; WIN64-KNL-NEXT:    vmovaps 320(%rbp), %zmm10 # 64-byte Reload
     220 ; WIN64-KNL-NEXT:    vmovaps 384(%rbp), %zmm11 # 64-byte Reload
     221 ; WIN64-KNL-NEXT:    vmovaps 448(%rbp), %zmm12 # 64-byte Reload
     222 ; WIN64-KNL-NEXT:    vmovaps 512(%rbp), %zmm13 # 64-byte Reload
     223 ; WIN64-KNL-NEXT:    vmovaps 576(%rbp), %zmm14 # 64-byte Reload
     224 ; WIN64-KNL-NEXT:    vmovaps 640(%rbp), %zmm15 # 64-byte Reload
     225 ; WIN64-KNL-NEXT:    vmovaps 704(%rbp), %zmm16 # 64-byte Reload
     226 ; WIN64-KNL-NEXT:    vmovaps 768(%rbp), %zmm17 # 64-byte Reload
     227 ; WIN64-KNL-NEXT:    vmovaps 832(%rbp), %zmm18 # 64-byte Reload
     228 ; WIN64-KNL-NEXT:    vmovaps 896(%rbp), %zmm19 # 64-byte Reload
     229 ; WIN64-KNL-NEXT:    vmovaps 992(%rbp), %zmm20 # 64-byte Reload
     230 ; WIN64-KNL-NEXT:    vmovaps 1104(%rbp), %zmm21 # 64-byte Reload
     231 ; WIN64-KNL-NEXT:    kmovw 1192(%rbp), %k4 # 2-byte Reload
     232 ; WIN64-KNL-NEXT:    kmovw 1194(%rbp), %k5 # 2-byte Reload
     233 ; WIN64-KNL-NEXT:    kmovw 1196(%rbp), %k6 # 2-byte Reload
     234 ; WIN64-KNL-NEXT:    kmovw 1198(%rbp), %k7 # 2-byte Reload
     235 ; WIN64-KNL-NEXT:    leaq 1200(%rbp), %rsp
     236 ; WIN64-KNL-NEXT:    popq %rbp
     237 ; WIN64-KNL-NEXT:    retq
     238 ;
     239 ; WIN64-SKX-LABEL: test_prolog_epilog:
     240 ; WIN64-SKX:       # %bb.0:
     241 ; WIN64-SKX-NEXT:    pushq %rbp
     242 ; WIN64-SKX-NEXT:    subq $1328, %rsp # imm = 0x530
     243 ; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
     244 ; WIN64-SKX-NEXT:    kmovq %k7, 1192(%rbp) # 8-byte Spill
     245 ; WIN64-SKX-NEXT:    kmovq %k6, 1184(%rbp) # 8-byte Spill
     246 ; WIN64-SKX-NEXT:    kmovq %k5, 1176(%rbp) # 8-byte Spill
     247 ; WIN64-SKX-NEXT:    kmovq %k4, 1168(%rbp) # 8-byte Spill
     248 ; WIN64-SKX-NEXT:    vmovaps %zmm21, 1056(%rbp) # 64-byte Spill
     249 ; WIN64-SKX-NEXT:    vmovaps %zmm20, 960(%rbp) # 64-byte Spill
     250 ; WIN64-SKX-NEXT:    vmovaps %zmm19, 896(%rbp) # 64-byte Spill
     251 ; WIN64-SKX-NEXT:    vmovaps %zmm18, 832(%rbp) # 64-byte Spill
     252 ; WIN64-SKX-NEXT:    vmovaps %zmm17, 768(%rbp) # 64-byte Spill
     253 ; WIN64-SKX-NEXT:    vmovaps %zmm16, 704(%rbp) # 64-byte Spill
     254 ; WIN64-SKX-NEXT:    vmovaps %zmm15, 640(%rbp) # 64-byte Spill
     255 ; WIN64-SKX-NEXT:    vmovaps %zmm14, 576(%rbp) # 64-byte Spill
     256 ; WIN64-SKX-NEXT:    vmovaps %zmm13, 512(%rbp) # 64-byte Spill
     257 ; WIN64-SKX-NEXT:    vmovaps %zmm12, 448(%rbp) # 64-byte Spill
     258 ; WIN64-SKX-NEXT:    vmovaps %zmm11, 384(%rbp) # 64-byte Spill
     259 ; WIN64-SKX-NEXT:    vmovaps %zmm10, 320(%rbp) # 64-byte Spill
     260 ; WIN64-SKX-NEXT:    vmovaps %zmm9, 256(%rbp) # 64-byte Spill
     261 ; WIN64-SKX-NEXT:    vmovaps %zmm8, 192(%rbp) # 64-byte Spill
     262 ; WIN64-SKX-NEXT:    vmovaps %zmm7, 128(%rbp) # 64-byte Spill
     263 ; WIN64-SKX-NEXT:    vmovaps %zmm6, 64(%rbp) # 64-byte Spill
     264 ; WIN64-SKX-NEXT:    andq $-64, %rsp
     265 ; WIN64-SKX-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
     266 ; WIN64-SKX-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
     267 ; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
     268 ; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
     269 ; WIN64-SKX-NEXT:    callq func_float16
     270 ; WIN64-SKX-NEXT:    vmovaps 64(%rbp), %zmm6 # 64-byte Reload
     271 ; WIN64-SKX-NEXT:    vmovaps 128(%rbp), %zmm7 # 64-byte Reload
     272 ; WIN64-SKX-NEXT:    vmovaps 192(%rbp), %zmm8 # 64-byte Reload
     273 ; WIN64-SKX-NEXT:    vmovaps 256(%rbp), %zmm9 # 64-byte Reload
     274 ; WIN64-SKX-NEXT:    vmovaps 320(%rbp), %zmm10 # 64-byte Reload
     275 ; WIN64-SKX-NEXT:    vmovaps 384(%rbp), %zmm11 # 64-byte Reload
     276 ; WIN64-SKX-NEXT:    vmovaps 448(%rbp), %zmm12 # 64-byte Reload
     277 ; WIN64-SKX-NEXT:    vmovaps 512(%rbp), %zmm13 # 64-byte Reload
     278 ; WIN64-SKX-NEXT:    vmovaps 576(%rbp), %zmm14 # 64-byte Reload
     279 ; WIN64-SKX-NEXT:    vmovaps 640(%rbp), %zmm15 # 64-byte Reload
     280 ; WIN64-SKX-NEXT:    vmovaps 704(%rbp), %zmm16 # 64-byte Reload
     281 ; WIN64-SKX-NEXT:    vmovaps 768(%rbp), %zmm17 # 64-byte Reload
     282 ; WIN64-SKX-NEXT:    vmovaps 832(%rbp), %zmm18 # 64-byte Reload
     283 ; WIN64-SKX-NEXT:    vmovaps 896(%rbp), %zmm19 # 64-byte Reload
     284 ; WIN64-SKX-NEXT:    vmovaps 960(%rbp), %zmm20 # 64-byte Reload
     285 ; WIN64-SKX-NEXT:    vmovaps 1056(%rbp), %zmm21 # 64-byte Reload
     286 ; WIN64-SKX-NEXT:    kmovq 1168(%rbp), %k4 # 8-byte Reload
     287 ; WIN64-SKX-NEXT:    kmovq 1176(%rbp), %k5 # 8-byte Reload
     288 ; WIN64-SKX-NEXT:    kmovq 1184(%rbp), %k6 # 8-byte Reload
     289 ; WIN64-SKX-NEXT:    kmovq 1192(%rbp), %k7 # 8-byte Reload
     290 ; WIN64-SKX-NEXT:    leaq 1200(%rbp), %rsp
     291 ; WIN64-SKX-NEXT:    popq %rbp
     292 ; WIN64-SKX-NEXT:    retq
     293 ;
     294 ; X64-KNL-LABEL: test_prolog_epilog:
     295 ; X64-KNL:       ## %bb.0:
     296 ; X64-KNL-NEXT:    pushq %rsi
     297 ; X64-KNL-NEXT:    pushq %rdi
     298 ; X64-KNL-NEXT:    subq $1064, %rsp ## imm = 0x428
     299 ; X64-KNL-NEXT:    kmovw %k7, {{[0-9]+}}(%rsp) ## 2-byte Spill
     300 ; X64-KNL-NEXT:    kmovw %k6, {{[0-9]+}}(%rsp) ## 2-byte Spill
     301 ; X64-KNL-NEXT:    kmovw %k5, {{[0-9]+}}(%rsp) ## 2-byte Spill
     302 ; X64-KNL-NEXT:    kmovw %k4, {{[0-9]+}}(%rsp) ## 2-byte Spill
     303 ; X64-KNL-NEXT:    vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
     304 ; X64-KNL-NEXT:    vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
     305 ; X64-KNL-NEXT:    vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
     306 ; X64-KNL-NEXT:    vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
     307 ; X64-KNL-NEXT:    vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
     308 ; X64-KNL-NEXT:    vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
     309 ; X64-KNL-NEXT:    vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
     310 ; X64-KNL-NEXT:    vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
     311 ; X64-KNL-NEXT:    vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
     312 ; X64-KNL-NEXT:    vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
     313 ; X64-KNL-NEXT:    vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
     314 ; X64-KNL-NEXT:    vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
     315 ; X64-KNL-NEXT:    vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
     316 ; X64-KNL-NEXT:    vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
     317 ; X64-KNL-NEXT:    vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
     318 ; X64-KNL-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
     319 ; X64-KNL-NEXT:    callq _func_float16
     320 ; X64-KNL-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
     321 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
     322 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
     323 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
     324 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
     325 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
     326 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
     327 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
     328 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
     329 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
     330 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
     331 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
     332 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
     333 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
     334 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
     335 ; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
     336 ; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k4 ## 2-byte Reload
     337 ; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k5 ## 2-byte Reload
     338 ; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k6 ## 2-byte Reload
     339 ; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k7 ## 2-byte Reload
     340 ; X64-KNL-NEXT:    addq $1064, %rsp ## imm = 0x428
     341 ; X64-KNL-NEXT:    popq %rdi
     342 ; X64-KNL-NEXT:    popq %rsi
     343 ; X64-KNL-NEXT:    retq
     344 ;
     345 ; X64-SKX-LABEL: test_prolog_epilog:
     346 ; X64-SKX:       ## %bb.0:
     347 ; X64-SKX-NEXT:    pushq %rsi
     348 ; X64-SKX-NEXT:    pushq %rdi
     349 ; X64-SKX-NEXT:    subq $1192, %rsp ## imm = 0x4A8
     350 ; X64-SKX-NEXT:    kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill
     351 ; X64-SKX-NEXT:    kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill
     352 ; X64-SKX-NEXT:    kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill
     353 ; X64-SKX-NEXT:    kmovq %k4, {{[0-9]+}}(%rsp) ## 8-byte Spill
     354 ; X64-SKX-NEXT:    vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
     355 ; X64-SKX-NEXT:    vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
     356 ; X64-SKX-NEXT:    vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
     357 ; X64-SKX-NEXT:    vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
     358 ; X64-SKX-NEXT:    vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
     359 ; X64-SKX-NEXT:    vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
     360 ; X64-SKX-NEXT:    vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
     361 ; X64-SKX-NEXT:    vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
     362 ; X64-SKX-NEXT:    vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
     363 ; X64-SKX-NEXT:    vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
     364 ; X64-SKX-NEXT:    vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
     365 ; X64-SKX-NEXT:    vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
     366 ; X64-SKX-NEXT:    vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
     367 ; X64-SKX-NEXT:    vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
     368 ; X64-SKX-NEXT:    vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
     369 ; X64-SKX-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
     370 ; X64-SKX-NEXT:    callq _func_float16
     371 ; X64-SKX-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
     372 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
     373 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
     374 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
     375 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
     376 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
     377 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
     378 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
     379 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
     380 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
     381 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
     382 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
     383 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
     384 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
     385 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
     386 ; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
     387 ; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k4 ## 8-byte Reload
     388 ; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload
     389 ; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte Reload
     390 ; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload
     391 ; X64-SKX-NEXT:    addq $1192, %rsp ## imm = 0x4A8
     392 ; X64-SKX-NEXT:    popq %rdi
     393 ; X64-SKX-NEXT:    popq %rsi
     394 ; X64-SKX-NEXT:    retq
; IR: a single same-convention call; all spill/reload traffic above comes from
; callee-saved register handling, not from the IR itself.
     395    %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
     396    ret <16 x float> %c
     397 }
    398 
    399 
    400 declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)
    401 
; Mask-argument test: the i16 %mask is bitcast to <16 x i1> and handed to the
; callee in k1 (kmovw on KNL, kmovd on SKX). Unlike the other tests this
; function is not marked nounwind, so unwind bookkeeping (.cfi_* on Darwin,
; .seh_* on Win64) shows up in the checks.
     402 define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask)  {
     403 ; X32-LABEL: testf16_inp_mask:
     404 ; X32:       ## %bb.0:
     405 ; X32-NEXT:    subl $12, %esp
     406 ; X32-NEXT:    .cfi_def_cfa_offset 16
     407 ; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
     408 ; X32-NEXT:    calll _func_float16_mask
     409 ; X32-NEXT:    addl $12, %esp
     410 ; X32-NEXT:    retl
     411 ;
     412 ; WIN32-LABEL: testf16_inp_mask:
     413 ; WIN32:       # %bb.0:
     414 ; WIN32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
     415 ; WIN32-NEXT:    calll _func_float16_mask
     416 ; WIN32-NEXT:    retl
     417 ;
     418 ; WIN64-KNL-LABEL: testf16_inp_mask:
     419 ; WIN64-KNL:       # %bb.0:
     420 ; WIN64-KNL-NEXT:    subq $40, %rsp
     421 ; WIN64-KNL-NEXT:    .seh_stackalloc 40
     422 ; WIN64-KNL-NEXT:    .seh_endprologue
     423 ; WIN64-KNL-NEXT:    # kill: def $dx killed $dx def $edx
     424 ; WIN64-KNL-NEXT:    vmovaps (%rcx), %zmm0
     425 ; WIN64-KNL-NEXT:    kmovw %edx, %k1
     426 ; WIN64-KNL-NEXT:    callq func_float16_mask
     427 ; WIN64-KNL-NEXT:    nop
     428 ; WIN64-KNL-NEXT:    addq $40, %rsp
     429 ; WIN64-KNL-NEXT:    retq
     430 ; WIN64-KNL-NEXT:    .seh_handlerdata
     431 ; WIN64-KNL-NEXT:    .text
     432 ; WIN64-KNL-NEXT:    .seh_endproc
     433 ;
     434 ; WIN64-SKX-LABEL: testf16_inp_mask:
     435 ; WIN64-SKX:       # %bb.0:
     436 ; WIN64-SKX-NEXT:    subq $40, %rsp
     437 ; WIN64-SKX-NEXT:    .seh_stackalloc 40
     438 ; WIN64-SKX-NEXT:    .seh_endprologue
     439 ; WIN64-SKX-NEXT:    # kill: def $dx killed $dx def $edx
     440 ; WIN64-SKX-NEXT:    vmovaps (%rcx), %zmm0
     441 ; WIN64-SKX-NEXT:    kmovd %edx, %k1
     442 ; WIN64-SKX-NEXT:    callq func_float16_mask
     443 ; WIN64-SKX-NEXT:    nop
     444 ; WIN64-SKX-NEXT:    addq $40, %rsp
     445 ; WIN64-SKX-NEXT:    retq
     446 ; WIN64-SKX-NEXT:    .seh_handlerdata
     447 ; WIN64-SKX-NEXT:    .text
     448 ; WIN64-SKX-NEXT:    .seh_endproc
     449 ;
     450 ; X64-KNL-LABEL: testf16_inp_mask:
     451 ; X64-KNL:       ## %bb.0:
     452 ; X64-KNL-NEXT:    pushq %rbp
     453 ; X64-KNL-NEXT:    .cfi_def_cfa_offset 16
     454 ; X64-KNL-NEXT:    pushq %r13
     455 ; X64-KNL-NEXT:    .cfi_def_cfa_offset 24
     456 ; X64-KNL-NEXT:    pushq %r12
     457 ; X64-KNL-NEXT:    .cfi_def_cfa_offset 32
     458 ; X64-KNL-NEXT:    .cfi_offset %r12, -32
     459 ; X64-KNL-NEXT:    .cfi_offset %r13, -24
     460 ; X64-KNL-NEXT:    .cfi_offset %rbp, -16
     461 ; X64-KNL-NEXT:    kmovw %edi, %k1
     462 ; X64-KNL-NEXT:    callq _func_float16_mask
     463 ; X64-KNL-NEXT:    popq %r12
     464 ; X64-KNL-NEXT:    popq %r13
     465 ; X64-KNL-NEXT:    popq %rbp
     466 ; X64-KNL-NEXT:    retq
     467 ;
     468 ; X64-SKX-LABEL: testf16_inp_mask:
     469 ; X64-SKX:       ## %bb.0:
     470 ; X64-SKX-NEXT:    pushq %rbp
     471 ; X64-SKX-NEXT:    .cfi_def_cfa_offset 16
     472 ; X64-SKX-NEXT:    pushq %r13
     473 ; X64-SKX-NEXT:    .cfi_def_cfa_offset 24
     474 ; X64-SKX-NEXT:    pushq %r12
     475 ; X64-SKX-NEXT:    .cfi_def_cfa_offset 32
     476 ; X64-SKX-NEXT:    .cfi_offset %r12, -32
     477 ; X64-SKX-NEXT:    .cfi_offset %r13, -24
     478 ; X64-SKX-NEXT:    .cfi_offset %rbp, -16
     479 ; X64-SKX-NEXT:    kmovd %edi, %k1
     480 ; X64-SKX-NEXT:    callq _func_float16_mask
     481 ; X64-SKX-NEXT:    popq %r12
     482 ; X64-SKX-NEXT:    popq %r13
     483 ; X64-SKX-NEXT:    popq %rbp
     484 ; X64-SKX-NEXT:    retq
; IR: scalar mask -> vector-of-i1 bitcast, then a convention call.
     485   %imask = bitcast i16 %mask to <16 x i1>
     486   %1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
     487   ret <16 x float> %1
     488 }
    489 
; Mask in/out test for the convention: the incoming <16 x i1> %mask arrives in
; k1, the vpcmpeqd compare result lands in k0, and the kxorw combines them back
; into k1 as the outgoing mask argument for the callee.
     490 define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32>%x2, <16 x i1> %mask) nounwind {
     491 ; X32-LABEL: test_prolog_epilog_with_mask:
     492 ; X32:       ## %bb.0:
     493 ; X32-NEXT:    subl $12, %esp
     494 ; X32-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
     495 ; X32-NEXT:    kxorw %k1, %k0, %k1
     496 ; X32-NEXT:    calll _func_float16_mask
     497 ; X32-NEXT:    addl $12, %esp
     498 ; X32-NEXT:    retl
     499 ;
     500 ; WIN32-LABEL: test_prolog_epilog_with_mask:
     501 ; WIN32:       # %bb.0:
     502 ; WIN32-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
     503 ; WIN32-NEXT:    kxorw %k1, %k0, %k1
     504 ; WIN32-NEXT:    calll _func_float16_mask
     505 ; WIN32-NEXT:    retl
     506 ;
     507 ; WIN64-LABEL: test_prolog_epilog_with_mask:
     508 ; WIN64:       # %bb.0:
     509 ; WIN64-NEXT:    subq $40, %rsp
     510 ; WIN64-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
     511 ; WIN64-NEXT:    kxorw %k1, %k0, %k1
     512 ; WIN64-NEXT:    callq func_float16_mask
     513 ; WIN64-NEXT:    addq $40, %rsp
     514 ; WIN64-NEXT:    retq
     515 ;
     516 ; X64-LABEL: test_prolog_epilog_with_mask:
     517 ; X64:       ## %bb.0:
     518 ; X64-NEXT:    pushq %rax
     519 ; X64-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
     520 ; X64-NEXT:    kxorw %k1, %k0, %k1
     521 ; X64-NEXT:    callq _func_float16_mask
     522 ; X64-NEXT:    popq %rax
     523 ; X64-NEXT:    retq
; IR: compare, xor with the incoming mask, forward the result to the callee.
     524    %cmp_res = icmp eq <16 x i32>%x1, %x2
     525    %mask1 = xor <16 x i1> %cmp_res, %mask
     526    %c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
     527    ret <16 x float> %c
     528 }
    529