//===- X86InstrVecCompiler.td - Vector Compiler Patterns ---*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the various vector pseudo instructions used by the
// compiler, as well as Pat patterns used during instruction selection.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// No op bitconverts
//===----------------------------------------------------------------------===//

// Bitcasts between 128-bit vector types. These are free: the value stays in
// the same register, so just return the operand retyped.
def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;

// Bitcasts between 256-bit vector types. Same story as the 128-bit case:
// no instruction is needed for the conversion.
def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;

// Bitcasts between 512-bit vector types. Again, free: no instruction is
// needed for the conversion.
def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>;
def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>;
def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>;
def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>;
def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>;
def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>;
def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>;
def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>;
def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>;
def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>;
def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>;


//===----------------------------------------------------------------------===//
// Non-instruction patterns
//===----------------------------------------------------------------------===//

// A vector extract of the first f32/f64 position is a subregister copy.
def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;

// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
133 def : Pat<(v2f64 (scalar_to_vector FR64:$src)), 134 (COPY_TO_REGCLASS FR64:$src, VR128)>; 135 136 137 //===----------------------------------------------------------------------===// 138 // Subvector tricks 139 //===----------------------------------------------------------------------===// 140 141 // Patterns for insert_subvector/extract_subvector to/from index=0 142 multiclass subvector_subreg_lowering<RegisterClass subRC, ValueType subVT, 143 RegisterClass RC, ValueType VT, 144 SubRegIndex subIdx> { 145 def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))), 146 (subVT (EXTRACT_SUBREG RC:$src, subIdx))>; 147 148 def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))), 149 (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>; 150 } 151 152 // A 128-bit subvector extract from the first 256-bit vector position is a 153 // subregister copy that needs no instruction. Likewise, a 128-bit subvector 154 // insert to the first 256-bit vector position is a subregister copy that needs 155 // no instruction. 156 defm : subvector_subreg_lowering<VR128, v4i32, VR256, v8i32, sub_xmm>; 157 defm : subvector_subreg_lowering<VR128, v4f32, VR256, v8f32, sub_xmm>; 158 defm : subvector_subreg_lowering<VR128, v2i64, VR256, v4i64, sub_xmm>; 159 defm : subvector_subreg_lowering<VR128, v2f64, VR256, v4f64, sub_xmm>; 160 defm : subvector_subreg_lowering<VR128, v8i16, VR256, v16i16, sub_xmm>; 161 defm : subvector_subreg_lowering<VR128, v16i8, VR256, v32i8, sub_xmm>; 162 163 // A 128-bit subvector extract from the first 512-bit vector position is a 164 // subregister copy that needs no instruction. Likewise, a 128-bit subvector 165 // insert to the first 512-bit vector position is a subregister copy that needs 166 // no instruction. 
defm : subvector_subreg_lowering<VR128, v4i32,  VR512, v16i32, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v4f32,  VR512, v16f32, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v2i64,  VR512, v8i64,  sub_xmm>;
defm : subvector_subreg_lowering<VR128, v2f64,  VR512, v8f64,  sub_xmm>;
defm : subvector_subreg_lowering<VR128, v8i16,  VR512, v32i16, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v16i8,  VR512, v64i8,  sub_xmm>;

// A 256-bit subvector extract from the first 512-bit vector position is a
// subregister copy that needs no instruction. Likewise, a 256-bit subvector
// insert to the first 512-bit vector position is a subregister copy that needs
// no instruction.
defm : subvector_subreg_lowering<VR256, v8i32,  VR512, v16i32, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v8f32,  VR512, v16f32, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v4i64,  VR512, v8i64,  sub_ymm>;
defm : subvector_subreg_lowering<VR256, v4f64,  VR512, v8f64,  sub_ymm>;
defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v32i8,  VR512, v64i8,  sub_ymm>;


// Lower a store of the low subvector of a wide register to a plain vector
// move from the subregister: VMOV<AStr>mr for aligned stores, VMOV<UStr>mr
// otherwise.
multiclass subvector_store_lowering<string AStr, string UStr,
                                    RegisterClass RC, ValueType StoreVT,
                                    ValueType VecVT, SubRegIndex SubIdx> {
  def : Pat<(alignedstore (StoreVT (extract_subvector
                                    (VecVT RC:$src), (iPTR 0))), addr:$dst),
            (!cast<Instruction>("VMOV"#AStr#"mr") addr:$dst,
             (StoreVT (EXTRACT_SUBREG RC:$src, SubIdx)))>;

  def : Pat<(store (StoreVT (extract_subvector
                             (VecVT RC:$src), (iPTR 0))), addr:$dst),
            (!cast<Instruction>("VMOV"#UStr#"mr") addr:$dst,
             (StoreVT (EXTRACT_SUBREG RC:$src, SubIdx)))>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm : subvector_store_lowering<"APD", "UPD", VR256X, v2f64, v4f64, sub_xmm>;
  defm : subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32, sub_xmm>;
  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v2i64, v4i64, sub_xmm>;
  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v4i32, v8i32, sub_xmm>;
  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v8i16, v16i16, sub_xmm>;
  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v16i8, v32i8, sub_xmm>;
}

let Predicates = [HasVLX] in {
  // Special patterns for storing subvector extracts of lower 128-bits
  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
  defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR256X, v2f64, v4f64,
                                  sub_xmm>;
  defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32,
                                  sub_xmm>;
  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v2i64,
                                  v4i64, sub_xmm>;
  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v4i32,
                                  v8i32, sub_xmm>;
  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v8i16,
                                  v16i16, sub_xmm>;
  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v16i8,
                                  v32i8, sub_xmm>;

  // Special patterns for storing subvector extracts of lower 128-bits of 512.
  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
  defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR512, v2f64, v8f64,
                                  sub_xmm>;
  defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32,
                                  sub_xmm>;
  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v2i64,
                                  v8i64, sub_xmm>;
  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v4i32,
                                  v16i32, sub_xmm>;
  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v8i16,
                                  v32i16, sub_xmm>;
  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v16i8,
                                  v64i8, sub_xmm>;

  // Special patterns for storing subvector extracts of lower 256-bits of 512.
  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
  defm : subvector_store_lowering<"APDZ256", "UPDZ256", VR512, v4f64, v8f64,
                                  sub_ymm>;
  defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32,
                                  sub_ymm>;
  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v4i64,
                                  v8i64, sub_ymm>;
  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v8i32,
                                  v16i32, sub_ymm>;
  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v16i16,
                                  v32i16, sub_ymm>;
  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v32i8,
                                  v64i8, sub_ymm>;
}

// If we're inserting into an all zeros vector, just use a plain move which
// will zero the upper bits. A post-isel hook will take care of removing
// any moves that we can prove are unnecessary.
multiclass subvec_zero_lowering<string MovStr,
                                RegisterClass SubRC, ValueType WideVT,
                                ValueType SubVT, ValueType ZeroVT,
                                SubRegIndex SubIdx> {
  def : Pat<(WideVT (insert_subvector (bitconvert (ZeroVT immAllZerosV)),
                                      (SubVT SubRC:$src), (iPTR 0))),
            (SUBREG_TO_REG (i64 0),
             (SubVT (!cast<Instruction>("VMOV"#MovStr#"rr") SubRC:$src)),
             SubIdx)>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm : subvec_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, sub_xmm>;
  defm : subvec_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, sub_xmm>;
}

let Predicates = [HasVLX] in {
  defm : subvec_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32, sub_xmm>;
  defm : subvec_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32, sub_xmm>;

  defm : subvec_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32, sub_xmm>;
  defm : subvec_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32, sub_xmm>;

  defm : subvec_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32, sub_ymm>;
  defm : subvec_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32, sub_ymm>;
  defm : subvec_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32, sub_ymm>;
  defm : subvec_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32, sub_ymm>;
  defm : subvec_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32, sub_ymm>;
  defm : subvec_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32, sub_ymm>;
}

let Predicates = [HasAVX512, NoVLX] in {
  defm : subvec_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, sub_xmm>;
  defm : subvec_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, sub_xmm>;
  defm : subvec_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, sub_xmm>;

  defm : subvec_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32, sub_ymm>;
  defm : subvec_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32, sub_ymm>;
  defm : subvec_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32, sub_ymm>;
  defm : subvec_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32, sub_ymm>;
  defm : subvec_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32, sub_ymm>;
  defm : subvec_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32, sub_ymm>;
}

// Matches a mask-register operand whose upper bits are already known to be
// zero-extended by the producing node.
class maskzeroupper<ValueType VT, RegisterClass KRC> :
  PatLeaf<(VT KRC:$src), [{
  return isMaskZeroExtended(N);
}]>;

def maskzeroupperv1i1  : maskzeroupper<v1i1,  VK1>;
def maskzeroupperv2i1  : maskzeroupper<v2i1,  VK2>;
def maskzeroupperv4i1  : maskzeroupper<v4i1,  VK4>;
def maskzeroupperv8i1  : maskzeroupper<v8i1,  VK8>;
def maskzeroupperv16i1 : maskzeroupper<v16i1, VK16>;
def maskzeroupperv32i1 : maskzeroupper<v32i1, VK32>;

// The patterns determine if we can depend on the upper bits of a mask register
// being zeroed by the previous operation so that we can skip explicit
// zeroing.
let Predicates = [HasBWI] in {
  // Upper bits already zero: widening the mask is just a regclass copy.
  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                     maskzeroupperv1i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK1:$src, VK32)>;
  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                     maskzeroupperv8i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK8:$src, VK32)>;
  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                     maskzeroupperv16i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK16:$src, VK32)>;

  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     maskzeroupperv1i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK1:$src, VK64)>;
  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     maskzeroupperv8i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK8:$src, VK64)>;
  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     maskzeroupperv16i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK16:$src, VK64)>;
  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     maskzeroupperv32i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK32:$src, VK64)>;
}

let Predicates = [HasAVX512] in {
  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
                                     maskzeroupperv1i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK1:$src, VK16)>;
  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
                                     maskzeroupperv8i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK8:$src, VK16)>;
}

let Predicates = [HasDQI] in {
  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
                                    maskzeroupperv1i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK1:$src, VK8)>;
}

let Predicates = [HasVLX, HasDQI] in {
  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
                                    maskzeroupperv2i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK2:$src, VK8)>;
  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
                                    maskzeroupperv4i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK4:$src, VK8)>;
}

let Predicates = [HasVLX] in {
  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
                                     maskzeroupperv2i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK2:$src, VK16)>;
  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
                                     maskzeroupperv4i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK4:$src, VK16)>;
}

let Predicates = [HasBWI, HasVLX] in {
  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                     maskzeroupperv2i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK2:$src, VK32)>;
  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                     maskzeroupperv4i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK4:$src, VK32)>;
  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     maskzeroupperv2i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK2:$src, VK64)>;
  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     maskzeroupperv4i1:$src, (iPTR 0))),
            (COPY_TO_REGCLASS VK4:$src, VK64)>;
}

// If the bits are not zero we have to fall back to explicitly zeroing by
// using shifts: shift the mask all the way left, then logically right, so
// every lane above the inserted subvector becomes zero.
let Predicates = [HasAVX512] in {
  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
                                     (v1i1 VK1:$mask), (iPTR 0))),
            (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK1:$mask, VK16),
                                    (i8 15)), (i8 15))>;

  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
                                     (v2i1 VK2:$mask), (iPTR 0))),
            (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK2:$mask, VK16),
                                    (i8 14)), (i8 14))>;

  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
                                     (v4i1 VK4:$mask), (iPTR 0))),
            (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK4:$mask, VK16),
                                    (i8 12)), (i8 12))>;
}

let Predicates = [HasAVX512, NoDQI] in {
  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
                                     (v8i1 VK8:$mask), (iPTR 0))),
            (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK8:$mask, VK16),
                                    (i8 8)), (i8 8))>;
}

let Predicates = [HasDQI] in {
  // With DQI a byte kmov zeroes the upper bits directly.
  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
                                     (v8i1 VK8:$mask), (iPTR 0))),
            (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK16)>;

  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
                                    (v1i1 VK1:$mask), (iPTR 0))),
            (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK1:$mask, VK8),
                                    (i8 7)), (i8 7))>;
  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
                                    (v2i1 VK2:$mask), (iPTR 0))),
            (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK2:$mask, VK8),
                                    (i8 6)), (i8 6))>;
  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
                                    (v4i1 VK4:$mask), (iPTR 0))),
            (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK4:$mask, VK8),
                                    (i8 4)), (i8 4))>;
}

let Predicates = [HasBWI] in {
  // Word/doubleword kmovs implicitly zero the upper mask bits.
  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                     (v16i1 VK16:$mask), (iPTR 0))),
            (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK32)>;

  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     (v16i1 VK16:$mask), (iPTR 0))),
            (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK64)>;
  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     (v32i1 VK32:$mask), (iPTR 0))),
            (COPY_TO_REGCLASS (KMOVDkk VK32:$mask), VK64)>;
}

let Predicates = [HasBWI, NoDQI] in {
  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                     (v8i1 VK8:$mask), (iPTR 0))),
            (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK8:$mask, VK32),
                                    (i8 24)), (i8 24))>;

  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     (v8i1 VK8:$mask), (iPTR 0))),
            (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK8:$mask, VK64),
                                    (i8 56)), (i8 56))>;
}

let Predicates = [HasBWI, HasDQI] in {
  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                     (v8i1 VK8:$mask), (iPTR 0))),
            (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK32)>;

  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     (v8i1 VK8:$mask), (iPTR 0))),
            (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK64)>;
}

let Predicates = [HasBWI, HasVLX] in {
  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                     (v1i1 VK1:$mask), (iPTR 0))),
            (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK1:$mask, VK32),
                                    (i8 31)), (i8 31))>;
  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                     (v2i1 VK2:$mask), (iPTR 0))),
            (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK2:$mask, VK32),
                                    (i8 30)), (i8 30))>;
  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                     (v4i1 VK4:$mask), (iPTR 0))),
            (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK4:$mask, VK32),
                                    (i8 28)), (i8 28))>;

  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     (v1i1 VK1:$mask), (iPTR 0))),
            (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK1:$mask, VK64),
                                    (i8 63)), (i8 63))>;
  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     (v2i1 VK2:$mask), (iPTR 0))),
            (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK2:$mask, VK64),
                                    (i8 62)), (i8 62))>;
  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                     (v4i1 VK4:$mask), (iPTR 0))),
            (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK4:$mask, VK64),
                                    (i8 60)), (i8 60))>;
}