; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=X32 -check-prefix=X32-KNL
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx | FileCheck %s -check-prefix=X32 -check-prefix=X32-SKX
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck %s -check-prefix=WIN32 -check-prefix=WIN32-KNL
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=skx | FileCheck %s -check-prefix=WIN32 -check-prefix=WIN32-SKX
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck %s -check-prefix=WIN64 -check-prefix=WIN64-KNL
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=skx | FileCheck %s -check-prefix=WIN64 -check-prefix=WIN64-SKX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=X64 -check-prefix=X64-KNL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s -check-prefix=X64 -check-prefix=X64-SKX

; This file exercises the intel_ocl_bicc calling convention with AVX-512
; vector arguments on KNL and SKX, for Darwin and Windows, 32- and 64-bit.
; NOTE(review): the CHECK lines are autogenerated -- if codegen changes,
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.

; External callees used by the tests below.
; NOTE(review): @func_int appears to be unreferenced by the visible tests.
declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
declare i32 @func_int(i32, i32)

; Test calling conventions - input parameters.
; The address of the alloca %y is passed per-target: through a stack slot on
; 32-bit (movl %eax, (%esp) / pushl %eax) and in a register on 64-bit
; (%rdi on Darwin, %rcx on Win64), per the CHECK lines below.
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: testf16_inp:
; X32: ## %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-64, %esp
; X32-NEXT: subl $192, %esp
; X32-NEXT: vaddps %zmm1, %zmm0, %zmm0
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll _func_float16_ptr
; X32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; WIN32-LABEL: testf16_inp:
; WIN32: # %bb.0:
; WIN32-NEXT: pushl %ebp
; WIN32-NEXT: movl %esp, %ebp
; WIN32-NEXT: andl $-64, %esp
; WIN32-NEXT: subl $128, %esp
; WIN32-NEXT: vaddps %zmm1, %zmm0, %zmm0
; WIN32-NEXT: movl %esp, %eax
; WIN32-NEXT: pushl %eax
; WIN32-NEXT: calll _func_float16_ptr
; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: vaddps (%esp), %zmm0, %zmm0
; WIN32-NEXT: movl %ebp, %esp
; WIN32-NEXT: popl %ebp
; WIN32-NEXT: retl
;
; WIN64-LABEL: testf16_inp:
; WIN64: # %bb.0:
; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: subq $176, %rsp
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT: andq $-64, %rsp
; WIN64-NEXT: vmovaps (%rcx), %zmm0
; WIN64-NEXT: vaddps (%rdx), %zmm0, %zmm0
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT: callq func_float16_ptr
; WIN64-NEXT: vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
; WIN64-NEXT: leaq 48(%rbp), %rsp
; WIN64-NEXT: popq %rbp
; WIN64-NEXT: retq
;
; X64-LABEL: testf16_inp:
; X64: ## %bb.0:
; X64-NEXT: pushq %rbp
; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: andq $-64, %rsp
; X64-NEXT: subq $128, %rsp
; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0
; X64-NEXT: movq %rsp, %rdi
; X64-NEXT: callq _func_float16_ptr
; X64-NEXT: vaddps (%rsp), %zmm0, %zmm0
; X64-NEXT: leaq -16(%rbp), %rsp
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
; X64-NEXT: popq %rbp
; X64-NEXT: retq
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %2, %1
  ret <16 x float> %3
}

; Test calling conventions - preserved registers.
; %b is live across the call: the 64-bit checks keep it in zmm16 (copied via
; vmovaps before the call and read directly after it), while the 32-bit
; checks spill it to the stack and fold the reload into the post-call vaddps.
define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: testf16_regs:
; X32: ## %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-64, %esp
; X32-NEXT: subl $256, %esp ## imm = 0x100
; X32-NEXT: vmovaps %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill
; X32-NEXT: vaddps %zmm1, %zmm0, %zmm0
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll _func_float16_ptr
; X32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0 ## 64-byte Folded Reload
; X32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; WIN32-LABEL: testf16_regs:
; WIN32: # %bb.0:
; WIN32-NEXT: pushl %ebp
; WIN32-NEXT: movl %esp, %ebp
; WIN32-NEXT: andl $-64, %esp
; WIN32-NEXT: subl $192, %esp
; WIN32-NEXT: vmovaps %zmm1, (%esp) # 64-byte Spill
; WIN32-NEXT: vaddps %zmm1, %zmm0, %zmm0
; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: pushl %eax
; WIN32-NEXT: calll _func_float16_ptr
; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: vaddps (%esp), %zmm0, %zmm0 # 64-byte Folded Reload
; WIN32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; WIN32-NEXT: movl %ebp, %esp
; WIN32-NEXT: popl %ebp
; WIN32-NEXT: retl
;
; WIN64-LABEL: testf16_regs:
; WIN64: # %bb.0:
; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: subq $176, %rsp
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT: andq $-64, %rsp
; WIN64-NEXT: vmovaps (%rdx), %zmm16
; WIN64-NEXT: vaddps (%rcx), %zmm16, %zmm0
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT: callq func_float16_ptr
; WIN64-NEXT: vaddps %zmm16, %zmm0, %zmm0
; WIN64-NEXT: vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
; WIN64-NEXT: leaq 48(%rbp), %rsp
; WIN64-NEXT: popq %rbp
; WIN64-NEXT: retq
;
; X64-LABEL: testf16_regs:
; X64: ## %bb.0:
; X64-NEXT: pushq %rbp
; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: andq $-64, %rsp
; X64-NEXT: subq $128, %rsp
; X64-NEXT: vmovaps %zmm1, %zmm16
; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0
; X64-NEXT: movq %rsp, %rdi
; X64-NEXT: callq _func_float16_ptr
; X64-NEXT: vaddps %zmm16, %zmm0, %zmm0
; X64-NEXT: vaddps (%rsp), %zmm0, %zmm0
; X64-NEXT: leaq -16(%rbp), %rsp
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
; X64-NEXT: popq %rbp
; X64-NEXT: retq
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %1, %b
  %4 = fadd <16 x float> %2, %3
  ret <16 x float> %4
}

; Test calling conventions - prolog and epilog.
; An intel_ocl_bicc function must itself preserve k4-k7 and the high zmm
; registers: the checks expect the prologue to spill them and the epilogue to
; reload them.  Note the KNL checks use kmovw with 2-byte slots while the SKX
; checks use kmovq with 8-byte slots for the mask registers.
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: test_prolog_epilog:
; X32: ## %bb.0:
; X32-NEXT: subl $12, %esp
; X32-NEXT: calll _func_float16
; X32-NEXT: addl $12, %esp
; X32-NEXT: retl
;
; WIN32-LABEL: test_prolog_epilog:
; WIN32: # %bb.0:
; WIN32-NEXT: calll _func_float16
; WIN32-NEXT: retl
;
; WIN64-KNL-LABEL: test_prolog_epilog:
; WIN64-KNL: # %bb.0:
; WIN64-KNL-NEXT: pushq %rbp
; WIN64-KNL-NEXT: subq $1328, %rsp # imm = 0x530
; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-KNL-NEXT: kmovw %k7, 1198(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k6, 1196(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k5, 1194(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k4, 1192(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm21, 1104(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm20, 992(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm17, 768(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm16, 704(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm15, 640(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm14, 576(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm13, 512(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm12, 448(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm11, 384(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm10, 320(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm9, 256(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm8, 192(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm7, 128(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm6, 64(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: andq $-64, %rsp
; WIN64-KNL-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-KNL-NEXT: callq func_float16
; WIN64-KNL-NEXT: vmovaps 64(%rbp), %zmm6 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 128(%rbp), %zmm7 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 192(%rbp), %zmm8 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 256(%rbp), %zmm9 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 320(%rbp), %zmm10 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 384(%rbp), %zmm11 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 448(%rbp), %zmm12 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 512(%rbp), %zmm13 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 576(%rbp), %zmm14 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 640(%rbp), %zmm15 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 704(%rbp), %zmm16 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 768(%rbp), %zmm17 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 992(%rbp), %zmm20 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 1104(%rbp), %zmm21 # 64-byte Reload
; WIN64-KNL-NEXT: kmovw 1192(%rbp), %k4 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1194(%rbp), %k5 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1196(%rbp), %k6 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1198(%rbp), %k7 # 2-byte Reload
; WIN64-KNL-NEXT: leaq 1200(%rbp), %rsp
; WIN64-KNL-NEXT: popq %rbp
; WIN64-KNL-NEXT: retq
;
; WIN64-SKX-LABEL: test_prolog_epilog:
; WIN64-SKX: # %bb.0:
; WIN64-SKX-NEXT: pushq %rbp
; WIN64-SKX-NEXT: subq $1328, %rsp # imm = 0x530
; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-SKX-NEXT: kmovq %k7, 1192(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k6, 1184(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k5, 1176(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k4, 1168(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm21, 1056(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm20, 960(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm17, 768(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm16, 704(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm15, 640(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm14, 576(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm13, 512(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm12, 448(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm11, 384(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm10, 320(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm9, 256(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm8, 192(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm7, 128(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm6, 64(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: andq $-64, %rsp
; WIN64-SKX-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-SKX-NEXT: callq func_float16
; WIN64-SKX-NEXT: vmovaps 64(%rbp), %zmm6 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 128(%rbp), %zmm7 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 192(%rbp), %zmm8 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 256(%rbp), %zmm9 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 320(%rbp), %zmm10 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 384(%rbp), %zmm11 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 448(%rbp), %zmm12 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 512(%rbp), %zmm13 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 576(%rbp), %zmm14 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 640(%rbp), %zmm15 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 704(%rbp), %zmm16 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 768(%rbp), %zmm17 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 960(%rbp), %zmm20 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 1056(%rbp), %zmm21 # 64-byte Reload
; WIN64-SKX-NEXT: kmovq 1168(%rbp), %k4 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1176(%rbp), %k5 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1184(%rbp), %k6 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1192(%rbp), %k7 # 8-byte Reload
; WIN64-SKX-NEXT: leaq 1200(%rbp), %rsp
; WIN64-SKX-NEXT: popq %rbp
; WIN64-SKX-NEXT: retq
;
; X64-KNL-LABEL: test_prolog_epilog:
; X64-KNL: ## %bb.0:
; X64-KNL-NEXT: pushq %rsi
; X64-KNL-NEXT: pushq %rdi
; X64-KNL-NEXT: subq $1064, %rsp ## imm = 0x428
; X64-KNL-NEXT: kmovw %k7, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT: kmovw %k6, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT: kmovw %k5, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT: kmovw %k4, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-KNL-NEXT: callq _func_float16
; X64-KNL-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k4 ## 2-byte Reload
; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k5 ## 2-byte Reload
; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k6 ## 2-byte Reload
; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k7 ## 2-byte Reload
; X64-KNL-NEXT: addq $1064, %rsp ## imm = 0x428
; X64-KNL-NEXT: popq %rdi
; X64-KNL-NEXT: popq %rsi
; X64-KNL-NEXT: retq
;
; X64-SKX-LABEL: test_prolog_epilog:
; X64-SKX: ## %bb.0:
; X64-SKX-NEXT: pushq %rsi
; X64-SKX-NEXT: pushq %rdi
; X64-SKX-NEXT: subq $1192, %rsp ## imm = 0x4A8
; X64-SKX-NEXT: kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT: kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT: kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT: kmovq %k4, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-SKX-NEXT: callq _func_float16
; X64-SKX-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k4 ## 8-byte Reload
; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload
; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte Reload
; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload
; X64-SKX-NEXT: addq $1192, %rsp ## imm = 0x4A8
; X64-SKX-NEXT: popq %rdi
; X64-SKX-NEXT: popq %rsi
; X64-SKX-NEXT: retq
  %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
  ret <16 x float> %c
}


declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)

; An i16 bitcast to <16 x i1> is passed as a mask argument: the checks expect
; it to arrive in %k1 (loaded via kmovw on KNL, kmovd on SKX).
define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask) {
; X32-LABEL: testf16_inp_mask:
; X32: ## %bb.0:
; X32-NEXT: subl $12, %esp
; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: calll _func_float16_mask
; X32-NEXT: addl $12, %esp
; X32-NEXT: retl
;
; WIN32-LABEL: testf16_inp_mask:
; WIN32: # %bb.0:
; WIN32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; WIN32-NEXT: calll _func_float16_mask
; WIN32-NEXT: retl
;
; WIN64-KNL-LABEL: testf16_inp_mask:
; WIN64-KNL: # %bb.0:
; WIN64-KNL-NEXT: subq $40, %rsp
; WIN64-KNL-NEXT: .seh_stackalloc 40
; WIN64-KNL-NEXT: .seh_endprologue
; WIN64-KNL-NEXT: # kill: def $dx killed $dx def $edx
; WIN64-KNL-NEXT: vmovaps (%rcx), %zmm0
; WIN64-KNL-NEXT: kmovw %edx, %k1
; WIN64-KNL-NEXT: callq func_float16_mask
; WIN64-KNL-NEXT: nop
; WIN64-KNL-NEXT: addq $40, %rsp
; WIN64-KNL-NEXT: retq
; WIN64-KNL-NEXT: .seh_handlerdata
; WIN64-KNL-NEXT: .text
; WIN64-KNL-NEXT: .seh_endproc
;
; WIN64-SKX-LABEL: testf16_inp_mask:
; WIN64-SKX: # %bb.0:
; WIN64-SKX-NEXT: subq $40, %rsp
; WIN64-SKX-NEXT: .seh_stackalloc 40
; WIN64-SKX-NEXT: .seh_endprologue
; WIN64-SKX-NEXT: # kill: def $dx killed $dx def $edx
; WIN64-SKX-NEXT: vmovaps (%rcx), %zmm0
; WIN64-SKX-NEXT: kmovd %edx, %k1
; WIN64-SKX-NEXT: callq func_float16_mask
; WIN64-SKX-NEXT: nop
; WIN64-SKX-NEXT: addq $40, %rsp
; WIN64-SKX-NEXT: retq
; WIN64-SKX-NEXT: .seh_handlerdata
; WIN64-SKX-NEXT: .text
; WIN64-SKX-NEXT: .seh_endproc
;
; X64-KNL-LABEL: testf16_inp_mask:
; X64-KNL: ## %bb.0:
; X64-KNL-NEXT: pushq %rbp
; X64-KNL-NEXT: .cfi_def_cfa_offset 16
; X64-KNL-NEXT: pushq %r13
; X64-KNL-NEXT: .cfi_def_cfa_offset 24
; X64-KNL-NEXT: pushq %r12
; X64-KNL-NEXT: .cfi_def_cfa_offset 32
; X64-KNL-NEXT: .cfi_offset %r12, -32
; X64-KNL-NEXT: .cfi_offset %r13, -24
; X64-KNL-NEXT: .cfi_offset %rbp, -16
; X64-KNL-NEXT: kmovw %edi, %k1
; X64-KNL-NEXT: callq _func_float16_mask
; X64-KNL-NEXT: popq %r12
; X64-KNL-NEXT: popq %r13
; X64-KNL-NEXT: popq %rbp
; X64-KNL-NEXT: retq
;
; X64-SKX-LABEL: testf16_inp_mask:
; X64-SKX: ## %bb.0:
; X64-SKX-NEXT: pushq %rbp
; X64-SKX-NEXT: .cfi_def_cfa_offset 16
; X64-SKX-NEXT: pushq %r13
; X64-SKX-NEXT: .cfi_def_cfa_offset 24
; X64-SKX-NEXT: pushq %r12
; X64-SKX-NEXT: .cfi_def_cfa_offset 32
; X64-SKX-NEXT: .cfi_offset %r12, -32
; X64-SKX-NEXT: .cfi_offset %r13, -24
; X64-SKX-NEXT: .cfi_offset %rbp, -16
; X64-SKX-NEXT: kmovd %edi, %k1
; X64-SKX-NEXT: callq _func_float16_mask
; X64-SKX-NEXT: popq %r12
; X64-SKX-NEXT: popq %r13
; X64-SKX-NEXT: popq %rbp
; X64-SKX-NEXT: retq
  %imask = bitcast i16 %mask to <16 x i1>
  %1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
  ret <16 x float> %1
}

; A mask computed from a compare and xor is forwarded to the callee: the
; incoming %mask and the outgoing %mask1 both live in %k1, hence the
; kxorw %k1, %k0, %k1 in every check block.
define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32>%x2, <16 x i1> %mask) nounwind {
; X32-LABEL: test_prolog_epilog_with_mask:
; X32: ## %bb.0:
; X32-NEXT: subl $12, %esp
; X32-NEXT: vpcmpeqd %zmm2, %zmm1, %k0
; X32-NEXT: kxorw %k1, %k0, %k1
; X32-NEXT: calll _func_float16_mask
; X32-NEXT: addl $12, %esp
; X32-NEXT: retl
;
; WIN32-LABEL: test_prolog_epilog_with_mask:
; WIN32: # %bb.0:
; WIN32-NEXT: vpcmpeqd %zmm2, %zmm1, %k0
; WIN32-NEXT: kxorw %k1, %k0, %k1
; WIN32-NEXT: calll _func_float16_mask
; WIN32-NEXT: retl
;
; WIN64-LABEL: test_prolog_epilog_with_mask:
; WIN64: # %bb.0:
; WIN64-NEXT: subq $40, %rsp
; WIN64-NEXT: vpcmpeqd %zmm2, %zmm1, %k0
; WIN64-NEXT: kxorw %k1, %k0, %k1
; WIN64-NEXT: callq func_float16_mask
; WIN64-NEXT: addq $40, %rsp
; WIN64-NEXT: retq
;
; X64-LABEL: test_prolog_epilog_with_mask:
; X64: ## %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: vpcmpeqd %zmm2, %zmm1, %k0
; X64-NEXT: kxorw %k1, %k0, %k1
; X64-NEXT: callq _func_float16_mask
; X64-NEXT: popq %rax
; X64-NEXT: retq
  %cmp_res = icmp eq <16 x i32>%x1, %x2
  %mask1 = xor <16 x i1> %cmp_res, %mask
  %c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
  ret <16 x float> %c
}