Home | History | Annotate | Download | only in X86

Lines Matching refs:PSHUFB

4361   case X86ISD::PSHUFB:
4400 case X86ISD::PSHUFB:
5906 case X86ISD::PSHUFB: {
8011 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8079 Opcode = X86ISD::PSHUFB;
8085 Opcode = X86ISD::PSHUFB;
8095 Opcode = X86ISD::PSHUFB;
8141 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8142 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8268 // TODO: Utilize pshufb and zero mask blending to support more efficient
9255 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
9294 // PSHUFB can't cross lanes, ensure this doesn't happen.
9306 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10511 // pshufb when available. We can only use more than 2 unpack instructions
10512 // when zero extending i8 elements which also makes it easier to use pshufb.
10523 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12320 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
12353 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12357 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12491 // If we can't directly blend but can use PSHUFB, that will be better as it
12735 // with PSHUFB. It is important to do this before we attempt to generate any
12737 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12739 // a PSHUFB in the end. But once we start blending from multiple inputs,
12740 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12742 // PSHUFB approach because of its ability to zero lanes.
12751 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12756 // important as a single pshufb is significantly faster for that.
12769 // shuffles will both be pshufb, in which case we shouldn't bother with
12780 return PSHUFB;
14163 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14165 return PSHUFB;
14243 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14245 return PSHUFB;
14730 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14732 return PSHUFB;
14770 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14772 return PSHUFB;
15344 // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
17279 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
17283 // The PSHUFB mask:
17307 // The PSHUFB mask:
22232 // Lower CTLZ using a PSHUFB lookup table implementation.
22241 // Per-nibble leading zero PSHUFB lookup table.
22253 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
22273 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
22274 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
22334 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
24510 // masked out higher ones) for each byte. PSHUFB is used separately with both
24542 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
24544 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
24730 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
24731 // two nibbles and a PSHUFB lookup to find the bitreverse of each
24756 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
24757 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
25943 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
29621 /// for this operation, or into a PSHUFB instruction which is a fully general
29988 // mask, we can replace them with a single PSHUFB instruction profitably.
29989 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
29990 // instructions, but in practice PSHUFB tends to be *very* fast so we're
30016 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
30021 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
30022 // slower than PSHUFB on targets that support both.
30154 /// PSHUFB instruction if available. We do this as the last combining step
30155 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
30156 /// a suitable short sequence of other instructions. The PSHUFB will either
30166 /// would simplify under the threshold for PSHUFB formation because of
36650 // SSSE3's pshufb results in fewer instructions in the cases below.
39130 // is only worth it with SSSE3 (PSHUFB).
39707 case X86ISD::PSHUFB: