; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32

declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64)

define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_b_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdx, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1}
; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%eax)
; AVX512F-32-NEXT: retl
  call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2)
  call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.w.512(i8*, <32 x i16>, i32)

define void@test_int_x86_avx512_mask_storeu_w_512(i8* %ptr1, i8* %ptr2, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edx, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%ecx) {%k1}
; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%eax)
; AVX512F-32-NEXT: retl
  call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2)
  call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)
  ret void
}

declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32)

define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: kmovd %edx, %k1
; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 {%k1}
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm1 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
  %res0 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> %x1, i32 -1)
  %res = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr2, <32 x i16> %res0, i32 %mask)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> zeroinitializer, i32 %mask)
  %res2 = add <32 x i16> %res, %res1
  ret <32 x i16> %res2
}

declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8*, <64 x i8>, i64)

define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_b_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT: kmovq %rdx, %k1
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 {%k1}
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm0
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu8 (%eax), %zmm0 {%k1}
; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm1 {%k1} {z}
; AVX512F-32-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> %x1, i64 -1)
  %res = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr2, <64 x i8> %res0, i64 %mask)
  %res1 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> zeroinitializer, i64 %mask)
  %res2 = add <64 x i8> %res, %res1
  ret <64 x i8> %res2
}

declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32)

define <8 x i64>@test_int_x86_avx512_psll_dq_512(<8 x i64> %x0) {
; AVX512BW-LABEL: test_int_x86_avx512_psll_dq_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpslldq {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_psll_dq_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
  %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8)
  %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

define <8 x i64>@test_int_x86_avx512_psll_load_dq_512(<8 x i64>* %p0) {
; AVX512BW-LABEL: test_int_x86_avx512_psll_load_dq_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_psll_load_dq_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
; AVX512F-32-NEXT: retl
  %x0 = load <8 x i64>, <8 x i64> *%p0
  %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32)

define <8 x i64>@test_int_x86_avx512_psrl_dq_512(<8 x i64> %x0) {
; AVX512BW-LABEL: test_int_x86_avx512_psrl_dq_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_psrl_dq_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
  %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8)
  %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

define <8 x i64>@test_int_x86_avx512_psrl_load_dq_512(<8 x i64>* %p0) {
; AVX512BW-LABEL: test_int_x86_avx512_psrl_load_dq_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_psrl_load_dq_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
; AVX512F-32-NEXT: retl
  %x0 = load <8 x i64>, <8 x i64> *%p0
  %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
  ret <8 x i64> %res
}

declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <64 x i8>, i64)

define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_palignr_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddb %zmm3, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
  %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4)
  %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4)
  %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1)
  %res3 = add <64 x i8> %res, %res1
  %res4 = add <64 x i8> %res3, %res2
  ret <64 x i8> %res4
}

declare <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16>, i32, <32 x i16>, i32)

define <32 x i16>@test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
  %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
  %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
  %res3 = add <32 x i16> %res, %res1
  %res4 = add <32 x i16> %res3, %res2
  ret <32 x i16> %res4
}

declare <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16>, i32, <32 x i16>, i32)

define <32 x i16>@test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
  %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
  %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
  %res3 = add <32 x i16> %res, %res1
  %res4 = add <32 x i16> %res3, %res2
  ret <32 x i16> %res4
}

define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
; AVX512BW-LABEL: test_pcmpeq_b:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpeq_b:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $12, %esp
; AVX512F-32-NEXT: .Ltmp0:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
; AVX512F-32-NEXT: retl
  %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
  ret i64 %res
}

define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512BW-LABEL: test_mask_pcmpeq_b:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpeq_b:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $12, %esp
; AVX512F-32-NEXT: .Ltmp1:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
; AVX512F-32-NEXT: retl
  %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
  ret i64 %res
}

declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)

define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_pcmpeq_w:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpeq_w:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: retl
  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
  ret i32 %res
}

define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_pcmpeq_w:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpeq_w:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: retl
  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)

define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
; AVX512BW-LABEL: test_pcmpgt_b:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpgt_b:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $12, %esp
; AVX512F-32-NEXT: .Ltmp2:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
; AVX512F-32-NEXT: retl
  %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
  ret i64 %res
}

define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512BW-LABEL: test_mask_pcmpgt_b:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpgt_b:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $12, %esp
; AVX512F-32-NEXT: .Ltmp3:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
; AVX512F-32-NEXT: retl
  %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
  ret i64 %res
}

declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)

define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_pcmpgt_w:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpgt_w:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: retl
  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
  ret i32 %res
}

define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_pcmpgt_w:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpgt_w:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: retl
  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)

declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)

define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512F-32-NEXT: vpaddb %zmm3, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
  %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
  %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
  %res2 = add <64 x i8> %res, %res1
  ret <64 x i8> %res2
}

declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)

define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512F-32-NEXT: vpaddb %zmm3, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
  %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
  %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
  %res2 = add <64 x i8> %res, %res1
  ret <64 x i8> %res2
}

declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)

define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
  %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
  %res2 = add <32 x i16> %res, %res1
  ret <32 x i16> %res2
}

declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)

define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
  %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
  %res2 = add <32 x i16> %res, %res1
  ret <32 x i16> %res2
}