; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512

declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)

define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_pslldq:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_pslldq:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_psrldq:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_psrldq:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  ret <32 x i8> %2
}

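; A lane-preserving vpermd/vpermps followed by a byte shuffle should fold
; into a single in-lane vpshufb.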
define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
; X32-LABEL: combine_pshufb_vpermd:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_vpermd:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X64-NEXT:    retq
  %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
; X32-LABEL: combine_pshufb_vpermps:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_vpermps:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X64-NEXT:    retq
  %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
; X32-LABEL: combine_and_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_and_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <32 x i8> %2
}

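; The same combine with the operations in the opposite order: the pshufb
; first, then the zeroing shuffle.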
define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_and:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_and:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X64-NEXT:    retq
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %2
}

define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
; X32-LABEL: combine_permq_pshufb_as_vperm2i128:
; X32:       # %bb.0:
; X32-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X32-NEXT:    vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permq_pshufb_as_vperm2i128:
; X64:       # %bb.0:
; X64-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X64-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
  ret <4 x i64> %5
}

define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
; X32-LABEL: combine_as_vpermd:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
; X32-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_as_vpermd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
; X64-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6>)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 8, i32 9, i32 1, i32 15, i32 14, i32 4, i32 3>
  ret <8 x i32> %3
}

define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
; X32-LABEL: combine_as_vpermps:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
; X32-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_as_vpermps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
; X64-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 1, i32 undef, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 15, i32 0, i32 14, i32 1, i32 8, i32 9, i32 4, i32 3>
  ret <8 x float> %3
}

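; The byte shuffle undoes the qword permute in the low lane and zeroes the
; upper lane, so only a 128-bit blend with zero should remain.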
define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
; X32-LABEL: combine_permq_pshufb_as_vpblendd:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permq_pshufb_as_vpblendd:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X64-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  ret <32 x i8> %3
}

define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastb128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastb %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastb128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastb256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastb %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastb256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastw128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastw128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastw256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastw %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastw256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastw %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

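; Dword and qword splat masks should select the vpbroadcastd/vpbroadcastq
; forms.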
define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastd128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
; X32-NEXT:    vpaddb {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastd128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
; X64-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
  ret <16 x i8> %2
}

define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
; X32-LABEL: combine_permd_as_vpbroadcastd256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastd %xmm0, %ymm0
; X32-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastd256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastq128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastq128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %1
}

define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
; X32-LABEL: combine_permd_as_vpbroadcastq256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
; X32-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastq256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastss128:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastss128:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = bitcast <4 x float> %a to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
}

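; The 256-bit floating-point splats should select the vbroadcastss and
; vbroadcastsd forms.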
define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) {
; X32-LABEL: combine_permps_as_vpbroadcastss256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permps_as_vpbroadcastss256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) {
; X32-LABEL: combine_permps_as_vpbroadcastsd256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permps_as_vpbroadcastsd256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastb %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> zeroinitializer)
  ret <16 x i8> %2
}

define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastb %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> zeroinitializer)
  ret <32 x i8> %2
}

define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x float> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %4 = bitcast <16 x i8> %3 to <4 x float>
  ret <4 x float> %4
}

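; A splat shuffle fed through a splatting permps is still just one broadcast;
; note that the checks below currently show a redundant second broadcast.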
define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
; X32-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
; X32-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
; X32-LABEL: combine_permd_as_permq:
; X32:       # %bb.0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_as_permq:
; X64:       # %bb.0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; X64-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
  ret <8 x i32> %1
}

define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
; X32-LABEL: combine_permps_as_permpd:
; X32:       # %bb.0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permps_as_permpd:
; X64:       # %bb.0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
  ret <8 x float> %1
}

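; pshufb masks that zero all but the low bytes of each destination element
; match the zero-extension (vpmovzx) patterns.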
define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_zext:
; X32:       # %bb.0:
; X32-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_zext:
; X64:       # %bb.0:
; X64-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 10, i8 11, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_zext128:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_zext128:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
; X32-LABEL: combine_pshufb_as_vzmovl_64:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vzmovl_64:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; X64-NEXT:    retq
  %1 = bitcast <4 x double> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x double>
  ret <4 x double> %3
}

define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
; X32-LABEL: combine_pshufb_as_vzmovl_32:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vzmovl_32:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT:    retq
  %1 = bitcast <8 x float> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <8 x float>
  ret <8 x float> %3
}

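; Byte masks that only slide data towards one end of each 128-bit lane,
; filling with zeros, should lower to the immediate shift instructions
; (vpslldq/vpsrldq/vpsrlw/vpslld/vpsrlq).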
define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pslldq:
; X32:       # %bb.0:
; X32-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pslldq:
; X64:       # %bb.0:
; X64-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrldq:
; X32:       # %bb.0:
; X32-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrldq:
; X64:       # %bb.0:
; X64-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrlw:
; X32:       # %bb.0:
; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrlw:
; X64:       # %bb.0:
; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128, i8 17, i8 128, i8 19, i8 128, i8 21, i8 128, i8 23, i8 128, i8 25, i8 128, i8 27, i8 128, i8 29, i8 128, i8 31, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pslld:
; X32:       # %bb.0:
; X32-NEXT:    vpslld $24, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pslld:
; X64:       # %bb.0:
; X64-NEXT:    vpslld $24, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12, i8 128, i8 128, i8 128, i8 16, i8 128, i8 128, i8 128, i8 20, i8 128, i8 128, i8 128, i8 24, i8 128, i8 128, i8 128, i8 28>)
  ret <32 x i8> %res0
}

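; Word shuffles confined to the low (or high) half of each lane map to
; vpshuflw (or vpshufhw); a shuffle that touches both halves must stay a
; vpshufb, as the not_as_pshufw case shows.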
define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrlq:
; X32:       # %bb.0:
; X32-NEXT:    vpsrlq $40, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrlq:
; X64:       # %bb.0:
; X64-NEXT:    vpsrlq $40, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 21, i8 22, i8 23, i8 128, i8 128, i8 128, i8 128, i8 128, i8 29, i8 30, i8 31, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pshuflw:
; X32:       # %bb.0:
; X32-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pshuflw:
; X64:       # %bb.0:
; X64-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pshufhw:
; X32:       # %bb.0:
; X32-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pshufhw:
; X64:       # %bb.0:
; X64-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_not_as_pshufw:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_not_as_pshufw:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res1
}

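; Interleaving masks correspond to the unpack instructions; when the
; interleaved elements are undef the shuffle should disappear entirely.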
define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpacklo_undef:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_unpacklo_undef:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 undef, i8 0, i8 undef, i8 1, i8 undef, i8 2, i8 undef, i8 3, i8 undef, i8 4, i8 undef, i8 5, i8 undef, i8 6, i8 undef, i8 7, i8 undef, i8 16, i8 undef, i8 17, i8 undef, i8 18, i8 undef, i8 19, i8 undef, i8 20, i8 undef, i8 21, i8 undef, i8 22, i8 undef, i8 23>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpacklo_zero:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_unpacklo_zero:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 2, i8 3, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 16, i8 17, i8 -1, i8 -1, i8 18, i8 19, i8 -1, i8 -1, i8 20, i8 21, i8 -1, i8 -1, i8 22, i8 23, i8 -1, i8 -1>)
  ret <32 x i8> %1
}

define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpackhi_zero:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_unpackhi_zero:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1, i8 24, i8 -1, i8 25, i8 -1, i8 26, i8 -1, i8 27, i8 -1, i8 28, i8 -1, i8 29, i8 -1, i8 30, i8 -1, i8 31>)
  ret <32 x i8> %1
}

define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
; X32-LABEL: combine_psrlw_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_psrlw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = bitcast <16 x i16> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14, i8 17, i8 16, i8 19, i8 18, i8 21, i8 20, i8 23, i8 22, i8 25, i8 24, i8 27, i8 26, i8 29, i8 28, i8 31, i8 30>)
  ret <32 x i8> %3
}

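; Same pattern as the psrlw case above for dword and qword shifts: the shift
; plus byte swizzle should become a plain mask or a single zeroing vpshufb.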
define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
; X32-LABEL: combine_pslld_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pslld_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12, i8 19, i8 18, i8 17, i8 16, i8 23, i8 22, i8 21, i8 20, i8 27, i8 26, i8 25, i8 24, i8 31, i8 30, i8 29, i8 28>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
; X32-LABEL: combine_psrlq_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_psrlq_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X64-NEXT:    retq
  %1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23>)
  ret <32 x i8> %3
}

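; A tree of byte unpacks should re-combine into a single vpshufb.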
define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
; X32-LABEL: combine_unpack_unpack_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; X32-NEXT:    retl
;
; X64-LABEL: combine_unpack_unpack_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = shufflevector <32 x i8> %1, <32 x i8> %3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <32 x i8> %4, <32 x i8> %5, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  ret <32 x i8> %6
}

define <16 x i16> @shuffle_combine_packssdw_pshufb(<8 x i32> %a0) {
; X32-LABEL: shuffle_combine_packssdw_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpsrad $31, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packssdw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpsrad $31, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
; X64-NEXT:    retq
  %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packsswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; X32-LABEL: shuffle_combine_packsswb_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpsraw $15, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packsswb_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpsraw $15, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
; X64-NEXT:    retq
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = ashr <16 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

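; Unsigned pack variants of the two tests above.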
define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1) {
; X32-LABEL: shuffle_combine_packusdw_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packusdw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
; X64-NEXT:    retq
  %1 = lshr <8 x i32> %a0, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; X32-LABEL: shuffle_combine_packuswb_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packuswb_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
; X64-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) {
; X32-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastq {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = insertelement <2 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <2 x i64> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %3
}

define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
; X32-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <4 x i64> %1 to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  ret <8 x i32> %3
}

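; Shuffles of constant vectors with constant masks should constant-fold to a
; load of the shuffled constant pool entry.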
define <8 x i32> @constant_fold_permd() {
; X32-LABEL: constant_fold_permd:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_permd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X64-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x i32> %1
}

define <8 x float> @constant_fold_permps() {
; X32-LABEL: constant_fold_permps:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_permps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x float> %1
}

define <32 x i8> @constant_fold_pshufb_256() {
; X32-LABEL: constant_fold_pshufb_256:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_pshufb_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
  ret <32 x i8> %1
}

define <32 x i8> @PR27320(<8 x i32> %a0) {
; X32-LABEL: PR27320:
; X32:       # %bb.0:
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; X32-NEXT:    retl
;
; X64-LABEL: PR27320:
; X64:       # %bb.0:
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; X64-NEXT:    retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27>
  ret <32 x i8> %3
}

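; Reduced regression test for PR34577.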
define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
; X32-LABEL: PR34577:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; X32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; X32-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; X32-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: PR34577:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; X64-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; X64-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; X64-NEXT:    retq
entry:
  %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
  %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
  %shuf1 = shufflevector <8 x float> zeroinitializer, <8 x float> %sel, <8 x i32> <i32 6, i32 11, i32 6, i32 15, i32 12, i32 11, i32 1, i32 3>
  %shuf2 = shufflevector <8 x float> %inp1, <8 x float> %shuf1, <8 x i32> <i32 15, i32 10, i32 7, i32 2, i32 12, i32 undef, i32 3, i32 2>
  ret <8 x float> %shuf2
}