Home | History | Annotate | Download | only in PowerPC
      1 ; RUN: opt < %s -instcombine | \
      2 ; RUN:   llc -march=ppc32 -mcpu=g5 | not grep vperm
      3 ; RUN: llc < %s -march=ppc32 -mcpu=g5 > %t
      4 ; RUN: grep vsldoi  %t | count 2
      5 ; RUN: grep vmrgh   %t | count 7
      6 ; RUN: grep vmrgl   %t | count 6
      7 ; RUN: grep vpkuhum %t | count 1
      8 ; RUN: grep vpkuwum %t | count 1
      9 
     10 define void @VSLDOI_xy(<8 x i16>* %A, <8 x i16>* %B) {
        ; Loads the vectors at %A and %B, views each as sixteen bytes, and builds
        ; a new vector from bytes 5..15 of %A followed by bytes 0..4 of %B, i.e.
        ; a 16-byte window starting at byte 5 of the pair (%A, %B). The result is
        ; stored back through %A.
        ; NOTE(review): judging by the name, this is presumably one of the two
        ; matches expected by the `grep vsldoi ... count 2` RUN line - confirm.
     11 entry:
     12 	%tmp = load <8 x i16>* %A		; <<8 x i16>> [#uses=1]
     13 	%tmp2 = load <8 x i16>* %B		; <<8 x i16>> [#uses=1]
     14 	%tmp.upgrd.1 = bitcast <8 x i16> %tmp to <16 x i8>		; <<16 x i8>> [#uses=11]
     15 	%tmp2.upgrd.2 = bitcast <8 x i16> %tmp2 to <16 x i8>		; <<16 x i8>> [#uses=5]
        	; Source bytes: %A[5..15], then %B[0..4].
     16 	%tmp.upgrd.3 = extractelement <16 x i8> %tmp.upgrd.1, i32 5		; <i8> [#uses=1]
     17 	%tmp3 = extractelement <16 x i8> %tmp.upgrd.1, i32 6		; <i8> [#uses=1]
     18 	%tmp4 = extractelement <16 x i8> %tmp.upgrd.1, i32 7		; <i8> [#uses=1]
     19 	%tmp5 = extractelement <16 x i8> %tmp.upgrd.1, i32 8		; <i8> [#uses=1]
     20 	%tmp6 = extractelement <16 x i8> %tmp.upgrd.1, i32 9		; <i8> [#uses=1]
     21 	%tmp7 = extractelement <16 x i8> %tmp.upgrd.1, i32 10		; <i8> [#uses=1]
     22 	%tmp8 = extractelement <16 x i8> %tmp.upgrd.1, i32 11		; <i8> [#uses=1]
     23 	%tmp9 = extractelement <16 x i8> %tmp.upgrd.1, i32 12		; <i8> [#uses=1]
     24 	%tmp10 = extractelement <16 x i8> %tmp.upgrd.1, i32 13		; <i8> [#uses=1]
     25 	%tmp11 = extractelement <16 x i8> %tmp.upgrd.1, i32 14		; <i8> [#uses=1]
     26 	%tmp12 = extractelement <16 x i8> %tmp.upgrd.1, i32 15		; <i8> [#uses=1]
     27 	%tmp13 = extractelement <16 x i8> %tmp2.upgrd.2, i32 0		; <i8> [#uses=1]
     28 	%tmp14 = extractelement <16 x i8> %tmp2.upgrd.2, i32 1		; <i8> [#uses=1]
     29 	%tmp15 = extractelement <16 x i8> %tmp2.upgrd.2, i32 2		; <i8> [#uses=1]
     30 	%tmp16 = extractelement <16 x i8> %tmp2.upgrd.2, i32 3		; <i8> [#uses=1]
     31 	%tmp17 = extractelement <16 x i8> %tmp2.upgrd.2, i32 4		; <i8> [#uses=1]
        	; Place the sixteen extracted bytes into result lanes 0..15, in order.
     32 	%tmp18 = insertelement <16 x i8> undef, i8 %tmp.upgrd.3, i32 0		; <<16 x i8>> [#uses=1]
     33 	%tmp19 = insertelement <16 x i8> %tmp18, i8 %tmp3, i32 1		; <<16 x i8>> [#uses=1]
     34 	%tmp20 = insertelement <16 x i8> %tmp19, i8 %tmp4, i32 2		; <<16 x i8>> [#uses=1]
     35 	%tmp21 = insertelement <16 x i8> %tmp20, i8 %tmp5, i32 3		; <<16 x i8>> [#uses=1]
     36 	%tmp22 = insertelement <16 x i8> %tmp21, i8 %tmp6, i32 4		; <<16 x i8>> [#uses=1]
     37 	%tmp23 = insertelement <16 x i8> %tmp22, i8 %tmp7, i32 5		; <<16 x i8>> [#uses=1]
     38 	%tmp24 = insertelement <16 x i8> %tmp23, i8 %tmp8, i32 6		; <<16 x i8>> [#uses=1]
     39 	%tmp25 = insertelement <16 x i8> %tmp24, i8 %tmp9, i32 7		; <<16 x i8>> [#uses=1]
     40 	%tmp26 = insertelement <16 x i8> %tmp25, i8 %tmp10, i32 8		; <<16 x i8>> [#uses=1]
     41 	%tmp27 = insertelement <16 x i8> %tmp26, i8 %tmp11, i32 9		; <<16 x i8>> [#uses=1]
     42 	%tmp28 = insertelement <16 x i8> %tmp27, i8 %tmp12, i32 10		; <<16 x i8>> [#uses=1]
     43 	%tmp29 = insertelement <16 x i8> %tmp28, i8 %tmp13, i32 11		; <<16 x i8>> [#uses=1]
     44 	%tmp30 = insertelement <16 x i8> %tmp29, i8 %tmp14, i32 12		; <<16 x i8>> [#uses=1]
     45 	%tmp31 = insertelement <16 x i8> %tmp30, i8 %tmp15, i32 13		; <<16 x i8>> [#uses=1]
     46 	%tmp32 = insertelement <16 x i8> %tmp31, i8 %tmp16, i32 14		; <<16 x i8>> [#uses=1]
     47 	%tmp33 = insertelement <16 x i8> %tmp32, i8 %tmp17, i32 15		; <<16 x i8>> [#uses=1]
        	; Return to the <8 x i16> view and overwrite the vector at %A.
     48 	%tmp33.upgrd.4 = bitcast <16 x i8> %tmp33 to <8 x i16>		; <<8 x i16>> [#uses=1]
     49 	store <8 x i16> %tmp33.upgrd.4, <8 x i16>* %A
     50 	ret void
     51 }
     52 
     53 define void @VSLDOI_xx(<8 x i16>* %A, <8 x i16>* %B) {
        ; Single-input variant of the byte-window pattern: both loads read %A
        ; (%B is never used). The result is bytes 5..15 of %A followed by bytes
        ; 0..4 of %A, i.e. the sixteen bytes of %A rotated so that byte 5 comes
        ; first. The result is stored back through %A.
        ; NOTE(review): judging by the name, this is presumably the second match
        ; expected by the `grep vsldoi ... count 2` RUN line - confirm.
     54 	%tmp = load <8 x i16>* %A		; <<8 x i16>> [#uses=1]
     55 	%tmp2 = load <8 x i16>* %A		; <<8 x i16>> [#uses=1]
     56 	%tmp.upgrd.5 = bitcast <8 x i16> %tmp to <16 x i8>		; <<16 x i8>> [#uses=11]
     57 	%tmp2.upgrd.6 = bitcast <8 x i16> %tmp2 to <16 x i8>		; <<16 x i8>> [#uses=5]
        	; Source bytes: [5..15] from the first load, [0..4] from the second.
     58 	%tmp.upgrd.7 = extractelement <16 x i8> %tmp.upgrd.5, i32 5		; <i8> [#uses=1]
     59 	%tmp3 = extractelement <16 x i8> %tmp.upgrd.5, i32 6		; <i8> [#uses=1]
     60 	%tmp4 = extractelement <16 x i8> %tmp.upgrd.5, i32 7		; <i8> [#uses=1]
     61 	%tmp5 = extractelement <16 x i8> %tmp.upgrd.5, i32 8		; <i8> [#uses=1]
     62 	%tmp6 = extractelement <16 x i8> %tmp.upgrd.5, i32 9		; <i8> [#uses=1]
     63 	%tmp7 = extractelement <16 x i8> %tmp.upgrd.5, i32 10		; <i8> [#uses=1]
     64 	%tmp8 = extractelement <16 x i8> %tmp.upgrd.5, i32 11		; <i8> [#uses=1]
     65 	%tmp9 = extractelement <16 x i8> %tmp.upgrd.5, i32 12		; <i8> [#uses=1]
     66 	%tmp10 = extractelement <16 x i8> %tmp.upgrd.5, i32 13		; <i8> [#uses=1]
     67 	%tmp11 = extractelement <16 x i8> %tmp.upgrd.5, i32 14		; <i8> [#uses=1]
     68 	%tmp12 = extractelement <16 x i8> %tmp.upgrd.5, i32 15		; <i8> [#uses=1]
     69 	%tmp13 = extractelement <16 x i8> %tmp2.upgrd.6, i32 0		; <i8> [#uses=1]
     70 	%tmp14 = extractelement <16 x i8> %tmp2.upgrd.6, i32 1		; <i8> [#uses=1]
     71 	%tmp15 = extractelement <16 x i8> %tmp2.upgrd.6, i32 2		; <i8> [#uses=1]
     72 	%tmp16 = extractelement <16 x i8> %tmp2.upgrd.6, i32 3		; <i8> [#uses=1]
     73 	%tmp17 = extractelement <16 x i8> %tmp2.upgrd.6, i32 4		; <i8> [#uses=1]
        	; Place the sixteen extracted bytes into result lanes 0..15, in order.
     74 	%tmp18 = insertelement <16 x i8> undef, i8 %tmp.upgrd.7, i32 0		; <<16 x i8>> [#uses=1]
     75 	%tmp19 = insertelement <16 x i8> %tmp18, i8 %tmp3, i32 1		; <<16 x i8>> [#uses=1]
     76 	%tmp20 = insertelement <16 x i8> %tmp19, i8 %tmp4, i32 2		; <<16 x i8>> [#uses=1]
     77 	%tmp21 = insertelement <16 x i8> %tmp20, i8 %tmp5, i32 3		; <<16 x i8>> [#uses=1]
     78 	%tmp22 = insertelement <16 x i8> %tmp21, i8 %tmp6, i32 4		; <<16 x i8>> [#uses=1]
     79 	%tmp23 = insertelement <16 x i8> %tmp22, i8 %tmp7, i32 5		; <<16 x i8>> [#uses=1]
     80 	%tmp24 = insertelement <16 x i8> %tmp23, i8 %tmp8, i32 6		; <<16 x i8>> [#uses=1]
     81 	%tmp25 = insertelement <16 x i8> %tmp24, i8 %tmp9, i32 7		; <<16 x i8>> [#uses=1]
     82 	%tmp26 = insertelement <16 x i8> %tmp25, i8 %tmp10, i32 8		; <<16 x i8>> [#uses=1]
     83 	%tmp27 = insertelement <16 x i8> %tmp26, i8 %tmp11, i32 9		; <<16 x i8>> [#uses=1]
     84 	%tmp28 = insertelement <16 x i8> %tmp27, i8 %tmp12, i32 10		; <<16 x i8>> [#uses=1]
     85 	%tmp29 = insertelement <16 x i8> %tmp28, i8 %tmp13, i32 11		; <<16 x i8>> [#uses=1]
     86 	%tmp30 = insertelement <16 x i8> %tmp29, i8 %tmp14, i32 12		; <<16 x i8>> [#uses=1]
     87 	%tmp31 = insertelement <16 x i8> %tmp30, i8 %tmp15, i32 13		; <<16 x i8>> [#uses=1]
     88 	%tmp32 = insertelement <16 x i8> %tmp31, i8 %tmp16, i32 14		; <<16 x i8>> [#uses=1]
     89 	%tmp33 = insertelement <16 x i8> %tmp32, i8 %tmp17, i32 15		; <<16 x i8>> [#uses=1]
        	; Return to the <8 x i16> view and overwrite the vector at %A.
     90 	%tmp33.upgrd.8 = bitcast <16 x i8> %tmp33 to <8 x i16>		; <<8 x i16>> [#uses=1]
     91 	store <8 x i16> %tmp33.upgrd.8, <8 x i16>* %A
     92 	ret void
     93 }
     94 
     95 define void @VPERM_promote(<8 x i16>* %A, <8 x i16>* %B) {
        ; Loads the vectors at %A and %B, bitcasts both to <4 x i32>, and calls
        ; the llvm.ppc.altivec.vperm intrinsic with a constant control vector
        ; whose sixteen bytes are all 14. The result is bitcast back to
        ; <8 x i16> and stored through %A.
        ; The first RUN line pipes this file through `opt -instcombine` and
        ; `llc` and requires that no `vperm` appears in the output.
        ; NOTE(review): presumably this function is the reason for that check,
        ; i.e. the constant-mask call is expected to become something that
        ; lowers without vperm - confirm.
     96 entry:
     97 	%tmp = load <8 x i16>* %A		; <<8 x i16>> [#uses=1]
     98 	%tmp.upgrd.9 = bitcast <8 x i16> %tmp to <4 x i32>		; <<4 x i32>> [#uses=1]
     99 	%tmp2 = load <8 x i16>* %B		; <<8 x i16>> [#uses=1]
    100 	%tmp2.upgrd.10 = bitcast <8 x i16> %tmp2 to <4 x i32>		; <<4 x i32>> [#uses=1]
    101 	%tmp3 = call <4 x i32> @llvm.ppc.altivec.vperm( <4 x i32> %tmp.upgrd.9, <4 x i32> %tmp2.upgrd.10, <16 x i8> < i8 14, i8 14, i8 14, i8 14, i8 14, i8 14, i8 14, i8 14, i8 14, i8 14, i8 14, i8 14, i8 14, i8 14, i8 14, i8 14 > )		; <<4 x i32>> [#uses=1]
    102 	%tmp3.upgrd.11 = bitcast <4 x i32> %tmp3 to <8 x i16>		; <<8 x i16>> [#uses=1]
    103 	store <8 x i16> %tmp3.upgrd.11, <8 x i16>* %A
    104 	ret void
    105 }
    106 
    107 declare <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32>, <4 x i32>, <16 x i8>)		; intrinsic called by @VPERM_promote: two <4 x i32> inputs plus a <16 x i8> control vector
    108 
    109 define void @tb_l(<16 x i8>* %A, <16 x i8>* %B) {
        ; Two-input byte interleave of elements 8..15: the result lanes are
        ; A[8], B[8], A[9], B[9], ..., A[15], B[15], where A and B are the
        ; vectors loaded from %A and %B. The result is stored through %A.
        ; NOTE(review): the "_l" suffix suggests this is one of the six matches
        ; expected by the `grep vmrgl ... count 6` RUN line - confirm.
    110 entry:
    111 	%tmp = load <16 x i8>* %A		; <<16 x i8>> [#uses=8]
    112 	%tmp2 = load <16 x i8>* %B		; <<16 x i8>> [#uses=8]
        	; Extract element i from A, then element i from B, for i = 8..15.
    113 	%tmp.upgrd.12 = extractelement <16 x i8> %tmp, i32 8		; <i8> [#uses=1]
    114 	%tmp3 = extractelement <16 x i8> %tmp2, i32 8		; <i8> [#uses=1]
    115 	%tmp4 = extractelement <16 x i8> %tmp, i32 9		; <i8> [#uses=1]
    116 	%tmp5 = extractelement <16 x i8> %tmp2, i32 9		; <i8> [#uses=1]
    117 	%tmp6 = extractelement <16 x i8> %tmp, i32 10		; <i8> [#uses=1]
    118 	%tmp7 = extractelement <16 x i8> %tmp2, i32 10		; <i8> [#uses=1]
    119 	%tmp8 = extractelement <16 x i8> %tmp, i32 11		; <i8> [#uses=1]
    120 	%tmp9 = extractelement <16 x i8> %tmp2, i32 11		; <i8> [#uses=1]
    121 	%tmp10 = extractelement <16 x i8> %tmp, i32 12		; <i8> [#uses=1]
    122 	%tmp11 = extractelement <16 x i8> %tmp2, i32 12		; <i8> [#uses=1]
    123 	%tmp12 = extractelement <16 x i8> %tmp, i32 13		; <i8> [#uses=1]
    124 	%tmp13 = extractelement <16 x i8> %tmp2, i32 13		; <i8> [#uses=1]
    125 	%tmp14 = extractelement <16 x i8> %tmp, i32 14		; <i8> [#uses=1]
    126 	%tmp15 = extractelement <16 x i8> %tmp2, i32 14		; <i8> [#uses=1]
    127 	%tmp16 = extractelement <16 x i8> %tmp, i32 15		; <i8> [#uses=1]
    128 	%tmp17 = extractelement <16 x i8> %tmp2, i32 15		; <i8> [#uses=1]
        	; Place the extracted values into result lanes 0..15 in extraction order.
    129 	%tmp18 = insertelement <16 x i8> undef, i8 %tmp.upgrd.12, i32 0		; <<16 x i8>> [#uses=1]
    130 	%tmp19 = insertelement <16 x i8> %tmp18, i8 %tmp3, i32 1		; <<16 x i8>> [#uses=1]
    131 	%tmp20 = insertelement <16 x i8> %tmp19, i8 %tmp4, i32 2		; <<16 x i8>> [#uses=1]
    132 	%tmp21 = insertelement <16 x i8> %tmp20, i8 %tmp5, i32 3		; <<16 x i8>> [#uses=1]
    133 	%tmp22 = insertelement <16 x i8> %tmp21, i8 %tmp6, i32 4		; <<16 x i8>> [#uses=1]
    134 	%tmp23 = insertelement <16 x i8> %tmp22, i8 %tmp7, i32 5		; <<16 x i8>> [#uses=1]
    135 	%tmp24 = insertelement <16 x i8> %tmp23, i8 %tmp8, i32 6		; <<16 x i8>> [#uses=1]
    136 	%tmp25 = insertelement <16 x i8> %tmp24, i8 %tmp9, i32 7		; <<16 x i8>> [#uses=1]
    137 	%tmp26 = insertelement <16 x i8> %tmp25, i8 %tmp10, i32 8		; <<16 x i8>> [#uses=1]
    138 	%tmp27 = insertelement <16 x i8> %tmp26, i8 %tmp11, i32 9		; <<16 x i8>> [#uses=1]
    139 	%tmp28 = insertelement <16 x i8> %tmp27, i8 %tmp12, i32 10		; <<16 x i8>> [#uses=1]
    140 	%tmp29 = insertelement <16 x i8> %tmp28, i8 %tmp13, i32 11		; <<16 x i8>> [#uses=1]
    141 	%tmp30 = insertelement <16 x i8> %tmp29, i8 %tmp14, i32 12		; <<16 x i8>> [#uses=1]
    142 	%tmp31 = insertelement <16 x i8> %tmp30, i8 %tmp15, i32 13		; <<16 x i8>> [#uses=1]
    143 	%tmp32 = insertelement <16 x i8> %tmp31, i8 %tmp16, i32 14		; <<16 x i8>> [#uses=1]
    144 	%tmp33 = insertelement <16 x i8> %tmp32, i8 %tmp17, i32 15		; <<16 x i8>> [#uses=1]
    145 	store <16 x i8> %tmp33, <16 x i8>* %A
    146 	ret void
    147 }
    148 
    149 define void @th_l(<8 x i16>* %A, <8 x i16>* %B) {
        ; Two-input halfword interleave of elements 4..7: the result lanes are
        ; A[4], B[4], A[5], B[5], A[6], B[6], A[7], B[7], where A and B are the
        ; vectors loaded from %A and %B. The result is stored through %A.
        ; NOTE(review): the "_l" suffix suggests this is one of the six matches
        ; expected by the `grep vmrgl ... count 6` RUN line - confirm.
    150 entry:
    151 	%tmp = load <8 x i16>* %A		; <<8 x i16>> [#uses=4]
    152 	%tmp2 = load <8 x i16>* %B		; <<8 x i16>> [#uses=4]
        	; Extract element i from A, then element i from B, for i = 4..7.
    153 	%tmp.upgrd.13 = extractelement <8 x i16> %tmp, i32 4		; <i16> [#uses=1]
    154 	%tmp3 = extractelement <8 x i16> %tmp2, i32 4		; <i16> [#uses=1]
    155 	%tmp4 = extractelement <8 x i16> %tmp, i32 5		; <i16> [#uses=1]
    156 	%tmp5 = extractelement <8 x i16> %tmp2, i32 5		; <i16> [#uses=1]
    157 	%tmp6 = extractelement <8 x i16> %tmp, i32 6		; <i16> [#uses=1]
    158 	%tmp7 = extractelement <8 x i16> %tmp2, i32 6		; <i16> [#uses=1]
    159 	%tmp8 = extractelement <8 x i16> %tmp, i32 7		; <i16> [#uses=1]
    160 	%tmp9 = extractelement <8 x i16> %tmp2, i32 7		; <i16> [#uses=1]
        	; Place the extracted values into result lanes 0..7 in extraction order.
    161 	%tmp10 = insertelement <8 x i16> undef, i16 %tmp.upgrd.13, i32 0		; <<8 x i16>> [#uses=1]
    162 	%tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 1		; <<8 x i16>> [#uses=1]
    163 	%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp4, i32 2		; <<8 x i16>> [#uses=1]
    164 	%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 3		; <<8 x i16>> [#uses=1]
    165 	%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp6, i32 4		; <<8 x i16>> [#uses=1]
    166 	%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 5		; <<8 x i16>> [#uses=1]
    167 	%tmp16 = insertelement <8 x i16> %tmp15, i16 %tmp8, i32 6		; <<8 x i16>> [#uses=1]
    168 	%tmp17 = insertelement <8 x i16> %tmp16, i16 %tmp9, i32 7		; <<8 x i16>> [#uses=1]
    169 	store <8 x i16> %tmp17, <8 x i16>* %A
    170 	ret void
    171 }
    172 
    173 define void @tw_l(<4 x i32>* %A, <4 x i32>* %B) {
        ; Two-input word interleave of elements 2..3: the result lanes are
        ; A[2], B[2], A[3], B[3], where A and B are the vectors loaded from %A
        ; and %B. The result is stored through %A.
        ; NOTE(review): the "_l" suffix suggests this is one of the six matches
        ; expected by the `grep vmrgl ... count 6` RUN line - confirm.
    174 entry:
    175 	%tmp = load <4 x i32>* %A		; <<4 x i32>> [#uses=2]
    176 	%tmp2 = load <4 x i32>* %B		; <<4 x i32>> [#uses=2]
        	; Extract element i from A, then element i from B, for i = 2..3.
    177 	%tmp.upgrd.14 = extractelement <4 x i32> %tmp, i32 2		; <i32> [#uses=1]
    178 	%tmp3 = extractelement <4 x i32> %tmp2, i32 2		; <i32> [#uses=1]
    179 	%tmp4 = extractelement <4 x i32> %tmp, i32 3		; <i32> [#uses=1]
    180 	%tmp5 = extractelement <4 x i32> %tmp2, i32 3		; <i32> [#uses=1]
        	; Place the extracted values into result lanes 0..3 in extraction order.
    181 	%tmp6 = insertelement <4 x i32> undef, i32 %tmp.upgrd.14, i32 0		; <<4 x i32>> [#uses=1]
    182 	%tmp7 = insertelement <4 x i32> %tmp6, i32 %tmp3, i32 1		; <<4 x i32>> [#uses=1]
    183 	%tmp8 = insertelement <4 x i32> %tmp7, i32 %tmp4, i32 2		; <<4 x i32>> [#uses=1]
    184 	%tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp5, i32 3		; <<4 x i32>> [#uses=1]
    185 	store <4 x i32> %tmp9, <4 x i32>* %A
    186 	ret void
    187 }
    188 
    189 define void @tb_h(<16 x i8>* %A, <16 x i8>* %B) {
        ; Two-input byte interleave of elements 0..7: the result lanes are
        ; A[0], B[0], A[1], B[1], ..., A[7], B[7], where A and B are the
        ; vectors loaded from %A and %B. The result is stored through %A.
        ; NOTE(review): the "_h" suffix suggests this is one of the seven matches
        ; expected by the `grep vmrgh ... count 7` RUN line - confirm.
    190 entry:
    191 	%tmp = load <16 x i8>* %A		; <<16 x i8>> [#uses=8]
    192 	%tmp2 = load <16 x i8>* %B		; <<16 x i8>> [#uses=8]
        	; Extract element i from A, then element i from B, for i = 0..7.
    193 	%tmp.upgrd.15 = extractelement <16 x i8> %tmp, i32 0		; <i8> [#uses=1]
    194 	%tmp3 = extractelement <16 x i8> %tmp2, i32 0		; <i8> [#uses=1]
    195 	%tmp4 = extractelement <16 x i8> %tmp, i32 1		; <i8> [#uses=1]
    196 	%tmp5 = extractelement <16 x i8> %tmp2, i32 1		; <i8> [#uses=1]
    197 	%tmp6 = extractelement <16 x i8> %tmp, i32 2		; <i8> [#uses=1]
    198 	%tmp7 = extractelement <16 x i8> %tmp2, i32 2		; <i8> [#uses=1]
    199 	%tmp8 = extractelement <16 x i8> %tmp, i32 3		; <i8> [#uses=1]
    200 	%tmp9 = extractelement <16 x i8> %tmp2, i32 3		; <i8> [#uses=1]
    201 	%tmp10 = extractelement <16 x i8> %tmp, i32 4		; <i8> [#uses=1]
    202 	%tmp11 = extractelement <16 x i8> %tmp2, i32 4		; <i8> [#uses=1]
    203 	%tmp12 = extractelement <16 x i8> %tmp, i32 5		; <i8> [#uses=1]
    204 	%tmp13 = extractelement <16 x i8> %tmp2, i32 5		; <i8> [#uses=1]
    205 	%tmp14 = extractelement <16 x i8> %tmp, i32 6		; <i8> [#uses=1]
    206 	%tmp15 = extractelement <16 x i8> %tmp2, i32 6		; <i8> [#uses=1]
    207 	%tmp16 = extractelement <16 x i8> %tmp, i32 7		; <i8> [#uses=1]
    208 	%tmp17 = extractelement <16 x i8> %tmp2, i32 7		; <i8> [#uses=1]
        	; Place the extracted values into result lanes 0..15 in extraction order.
    209 	%tmp18 = insertelement <16 x i8> undef, i8 %tmp.upgrd.15, i32 0		; <<16 x i8>> [#uses=1]
    210 	%tmp19 = insertelement <16 x i8> %tmp18, i8 %tmp3, i32 1		; <<16 x i8>> [#uses=1]
    211 	%tmp20 = insertelement <16 x i8> %tmp19, i8 %tmp4, i32 2		; <<16 x i8>> [#uses=1]
    212 	%tmp21 = insertelement <16 x i8> %tmp20, i8 %tmp5, i32 3		; <<16 x i8>> [#uses=1]
    213 	%tmp22 = insertelement <16 x i8> %tmp21, i8 %tmp6, i32 4		; <<16 x i8>> [#uses=1]
    214 	%tmp23 = insertelement <16 x i8> %tmp22, i8 %tmp7, i32 5		; <<16 x i8>> [#uses=1]
    215 	%tmp24 = insertelement <16 x i8> %tmp23, i8 %tmp8, i32 6		; <<16 x i8>> [#uses=1]
    216 	%tmp25 = insertelement <16 x i8> %tmp24, i8 %tmp9, i32 7		; <<16 x i8>> [#uses=1]
    217 	%tmp26 = insertelement <16 x i8> %tmp25, i8 %tmp10, i32 8		; <<16 x i8>> [#uses=1]
    218 	%tmp27 = insertelement <16 x i8> %tmp26, i8 %tmp11, i32 9		; <<16 x i8>> [#uses=1]
    219 	%tmp28 = insertelement <16 x i8> %tmp27, i8 %tmp12, i32 10		; <<16 x i8>> [#uses=1]
    220 	%tmp29 = insertelement <16 x i8> %tmp28, i8 %tmp13, i32 11		; <<16 x i8>> [#uses=1]
    221 	%tmp30 = insertelement <16 x i8> %tmp29, i8 %tmp14, i32 12		; <<16 x i8>> [#uses=1]
    222 	%tmp31 = insertelement <16 x i8> %tmp30, i8 %tmp15, i32 13		; <<16 x i8>> [#uses=1]
    223 	%tmp32 = insertelement <16 x i8> %tmp31, i8 %tmp16, i32 14		; <<16 x i8>> [#uses=1]
    224 	%tmp33 = insertelement <16 x i8> %tmp32, i8 %tmp17, i32 15		; <<16 x i8>> [#uses=1]
    225 	store <16 x i8> %tmp33, <16 x i8>* %A
    226 	ret void
    227 }
    228 
    229 define void @th_h(<8 x i16>* %A, <8 x i16>* %B) {
        ; Two-input halfword interleave of elements 0..3: the result lanes are
        ; A[0], B[0], A[1], B[1], A[2], B[2], A[3], B[3], where A and B are the
        ; vectors loaded from %A and %B. The result is stored through %A.
        ; NOTE(review): the "_h" suffix suggests this is one of the seven matches
        ; expected by the `grep vmrgh ... count 7` RUN line - confirm.
    230 entry:
    231 	%tmp = load <8 x i16>* %A		; <<8 x i16>> [#uses=4]
    232 	%tmp2 = load <8 x i16>* %B		; <<8 x i16>> [#uses=4]
        	; Extract element i from A, then element i from B, for i = 0..3.
    233 	%tmp.upgrd.16 = extractelement <8 x i16> %tmp, i32 0		; <i16> [#uses=1]
    234 	%tmp3 = extractelement <8 x i16> %tmp2, i32 0		; <i16> [#uses=1]
    235 	%tmp4 = extractelement <8 x i16> %tmp, i32 1		; <i16> [#uses=1]
    236 	%tmp5 = extractelement <8 x i16> %tmp2, i32 1		; <i16> [#uses=1]
    237 	%tmp6 = extractelement <8 x i16> %tmp, i32 2		; <i16> [#uses=1]
    238 	%tmp7 = extractelement <8 x i16> %tmp2, i32 2		; <i16> [#uses=1]
    239 	%tmp8 = extractelement <8 x i16> %tmp, i32 3		; <i16> [#uses=1]
    240 	%tmp9 = extractelement <8 x i16> %tmp2, i32 3		; <i16> [#uses=1]
        	; Place the extracted values into result lanes 0..7 in extraction order.
    241 	%tmp10 = insertelement <8 x i16> undef, i16 %tmp.upgrd.16, i32 0		; <<8 x i16>> [#uses=1]
    242 	%tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 1		; <<8 x i16>> [#uses=1]
    243 	%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp4, i32 2		; <<8 x i16>> [#uses=1]
    244 	%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 3		; <<8 x i16>> [#uses=1]
    245 	%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp6, i32 4		; <<8 x i16>> [#uses=1]
    246 	%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 5		; <<8 x i16>> [#uses=1]
    247 	%tmp16 = insertelement <8 x i16> %tmp15, i16 %tmp8, i32 6		; <<8 x i16>> [#uses=1]
    248 	%tmp17 = insertelement <8 x i16> %tmp16, i16 %tmp9, i32 7		; <<8 x i16>> [#uses=1]
    249 	store <8 x i16> %tmp17, <8 x i16>* %A
    250 	ret void
    251 }
    252 
    253 define void @tw_h(<4 x i32>* %A, <4 x i32>* %B) {
        ; Two-input word interleave of elements 0..1 with the %B vector first:
        ; the result lanes are B[0], A[0], B[1], A[1], where A and B are the
        ; vectors loaded from %A and %B. The result is stored through %A.
        ; Note the operand order is the reverse of @tw_h_flop, which produces
        ; A[0], B[0], A[1], B[1].
        ; NOTE(review): the "_h" suffix suggests this is one of the seven matches
        ; expected by the `grep vmrgh ... count 7` RUN line - confirm.
    254 entry:
    255 	%tmp = load <4 x i32>* %A		; <<4 x i32>> [#uses=2]
    256 	%tmp2 = load <4 x i32>* %B		; <<4 x i32>> [#uses=2]
        	; Extract element i from B (%tmp2), then element i from A (%tmp), for i = 0..1.
    257 	%tmp.upgrd.17 = extractelement <4 x i32> %tmp2, i32 0		; <i32> [#uses=1]
    258 	%tmp3 = extractelement <4 x i32> %tmp, i32 0		; <i32> [#uses=1]
    259 	%tmp4 = extractelement <4 x i32> %tmp2, i32 1		; <i32> [#uses=1]
    260 	%tmp5 = extractelement <4 x i32> %tmp, i32 1		; <i32> [#uses=1]
        	; Place the extracted values into result lanes 0..3 in extraction order.
    261 	%tmp6 = insertelement <4 x i32> undef, i32 %tmp.upgrd.17, i32 0		; <<4 x i32>> [#uses=1]
    262 	%tmp7 = insertelement <4 x i32> %tmp6, i32 %tmp3, i32 1		; <<4 x i32>> [#uses=1]
    263 	%tmp8 = insertelement <4 x i32> %tmp7, i32 %tmp4, i32 2		; <<4 x i32>> [#uses=1]
    264 	%tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp5, i32 3		; <<4 x i32>> [#uses=1]
    265 	store <4 x i32> %tmp9, <4 x i32>* %A
    266 	ret void
    267 }
    268 
    269 define void @tw_h_flop(<4 x i32>* %A, <4 x i32>* %B) {
        ; Two-input word interleave of elements 0..1 with the %A vector first:
        ; the result lanes are A[0], B[0], A[1], B[1], where A and B are the
        ; vectors loaded from %A and %B. The result is stored through %A.
        ; This is the same pattern as @tw_h with the two sources swapped.
        ; NOTE(review): presumably one of the seven matches expected by the
        ; `grep vmrgh ... count 7` RUN line - confirm.
    270 	%tmp = load <4 x i32>* %A		; <<4 x i32>> [#uses=2]
    271 	%tmp2 = load <4 x i32>* %B		; <<4 x i32>> [#uses=2]
        	; Extract element i from A, then element i from B, for i = 0..1.
    272 	%tmp.upgrd.18 = extractelement <4 x i32> %tmp, i32 0		; <i32> [#uses=1]
    273 	%tmp3 = extractelement <4 x i32> %tmp2, i32 0		; <i32> [#uses=1]
    274 	%tmp4 = extractelement <4 x i32> %tmp, i32 1		; <i32> [#uses=1]
    275 	%tmp5 = extractelement <4 x i32> %tmp2, i32 1		; <i32> [#uses=1]
        	; Place the extracted values into result lanes 0..3 in extraction order.
    276 	%tmp6 = insertelement <4 x i32> undef, i32 %tmp.upgrd.18, i32 0		; <<4 x i32>> [#uses=1]
    277 	%tmp7 = insertelement <4 x i32> %tmp6, i32 %tmp3, i32 1		; <<4 x i32>> [#uses=1]
    278 	%tmp8 = insertelement <4 x i32> %tmp7, i32 %tmp4, i32 2		; <<4 x i32>> [#uses=1]
    279 	%tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp5, i32 3		; <<4 x i32>> [#uses=1]
    280 	store <4 x i32> %tmp9, <4 x i32>* %A
    281 	ret void
    282 }
    283 
    284 define void @VMRG_UNARY_tb_l(<16 x i8>* %A, <16 x i8>* %B) {
        ; Single-input form of the byte interleave of elements 8..15: only %A
        ; is loaded (%B is never used) and each element is taken twice, so the
        ; result lanes are A[8], A[8], A[9], A[9], ..., A[15], A[15]. The
        ; result is stored through %A.
        ; NOTE(review): the "_l" suffix suggests this is one of the six matches
        ; expected by the `grep vmrgl ... count 6` RUN line - confirm.
    285 entry:
    286 	%tmp = load <16 x i8>* %A		; <<16 x i8>> [#uses=16]
        	; Extract each of elements 8..15 twice.
    287 	%tmp.upgrd.19 = extractelement <16 x i8> %tmp, i32 8		; <i8> [#uses=1]
    288 	%tmp3 = extractelement <16 x i8> %tmp, i32 8		; <i8> [#uses=1]
    289 	%tmp4 = extractelement <16 x i8> %tmp, i32 9		; <i8> [#uses=1]
    290 	%tmp5 = extractelement <16 x i8> %tmp, i32 9		; <i8> [#uses=1]
    291 	%tmp6 = extractelement <16 x i8> %tmp, i32 10		; <i8> [#uses=1]
    292 	%tmp7 = extractelement <16 x i8> %tmp, i32 10		; <i8> [#uses=1]
    293 	%tmp8 = extractelement <16 x i8> %tmp, i32 11		; <i8> [#uses=1]
    294 	%tmp9 = extractelement <16 x i8> %tmp, i32 11		; <i8> [#uses=1]
    295 	%tmp10 = extractelement <16 x i8> %tmp, i32 12		; <i8> [#uses=1]
    296 	%tmp11 = extractelement <16 x i8> %tmp, i32 12		; <i8> [#uses=1]
    297 	%tmp12 = extractelement <16 x i8> %tmp, i32 13		; <i8> [#uses=1]
    298 	%tmp13 = extractelement <16 x i8> %tmp, i32 13		; <i8> [#uses=1]
    299 	%tmp14 = extractelement <16 x i8> %tmp, i32 14		; <i8> [#uses=1]
    300 	%tmp15 = extractelement <16 x i8> %tmp, i32 14		; <i8> [#uses=1]
    301 	%tmp16 = extractelement <16 x i8> %tmp, i32 15		; <i8> [#uses=1]
    302 	%tmp17 = extractelement <16 x i8> %tmp, i32 15		; <i8> [#uses=1]
        	; Place the extracted values into result lanes 0..15 in extraction order.
    303 	%tmp18 = insertelement <16 x i8> undef, i8 %tmp.upgrd.19, i32 0		; <<16 x i8>> [#uses=1]
    304 	%tmp19 = insertelement <16 x i8> %tmp18, i8 %tmp3, i32 1		; <<16 x i8>> [#uses=1]
    305 	%tmp20 = insertelement <16 x i8> %tmp19, i8 %tmp4, i32 2		; <<16 x i8>> [#uses=1]
    306 	%tmp21 = insertelement <16 x i8> %tmp20, i8 %tmp5, i32 3		; <<16 x i8>> [#uses=1]
    307 	%tmp22 = insertelement <16 x i8> %tmp21, i8 %tmp6, i32 4		; <<16 x i8>> [#uses=1]
    308 	%tmp23 = insertelement <16 x i8> %tmp22, i8 %tmp7, i32 5		; <<16 x i8>> [#uses=1]
    309 	%tmp24 = insertelement <16 x i8> %tmp23, i8 %tmp8, i32 6		; <<16 x i8>> [#uses=1]
    310 	%tmp25 = insertelement <16 x i8> %tmp24, i8 %tmp9, i32 7		; <<16 x i8>> [#uses=1]
    311 	%tmp26 = insertelement <16 x i8> %tmp25, i8 %tmp10, i32 8		; <<16 x i8>> [#uses=1]
    312 	%tmp27 = insertelement <16 x i8> %tmp26, i8 %tmp11, i32 9		; <<16 x i8>> [#uses=1]
    313 	%tmp28 = insertelement <16 x i8> %tmp27, i8 %tmp12, i32 10		; <<16 x i8>> [#uses=1]
    314 	%tmp29 = insertelement <16 x i8> %tmp28, i8 %tmp13, i32 11		; <<16 x i8>> [#uses=1]
    315 	%tmp30 = insertelement <16 x i8> %tmp29, i8 %tmp14, i32 12		; <<16 x i8>> [#uses=1]
    316 	%tmp31 = insertelement <16 x i8> %tmp30, i8 %tmp15, i32 13		; <<16 x i8>> [#uses=1]
    317 	%tmp32 = insertelement <16 x i8> %tmp31, i8 %tmp16, i32 14		; <<16 x i8>> [#uses=1]
    318 	%tmp33 = insertelement <16 x i8> %tmp32, i8 %tmp17, i32 15		; <<16 x i8>> [#uses=1]
    319 	store <16 x i8> %tmp33, <16 x i8>* %A
    320 	ret void
    321 }
    322 
    323 define void @VMRG_UNARY_th_l(<8 x i16>* %A, <8 x i16>* %B) {
        ; Single-input form of the halfword interleave of elements 4..7: only
        ; %A is loaded (%B is never used) and each element is taken twice, so
        ; the result lanes are A[4], A[4], A[5], A[5], A[6], A[6], A[7], A[7].
        ; The result is stored through %A.
        ; NOTE(review): the "_l" suffix suggests this is one of the six matches
        ; expected by the `grep vmrgl ... count 6` RUN line - confirm.
    324 entry:
    325 	%tmp = load <8 x i16>* %A		; <<8 x i16>> [#uses=8]
        	; Extract each of elements 4..7 twice.
    326 	%tmp.upgrd.20 = extractelement <8 x i16> %tmp, i32 4		; <i16> [#uses=1]
    327 	%tmp3 = extractelement <8 x i16> %tmp, i32 4		; <i16> [#uses=1]
    328 	%tmp4 = extractelement <8 x i16> %tmp, i32 5		; <i16> [#uses=1]
    329 	%tmp5 = extractelement <8 x i16> %tmp, i32 5		; <i16> [#uses=1]
    330 	%tmp6 = extractelement <8 x i16> %tmp, i32 6		; <i16> [#uses=1]
    331 	%tmp7 = extractelement <8 x i16> %tmp, i32 6		; <i16> [#uses=1]
    332 	%tmp8 = extractelement <8 x i16> %tmp, i32 7		; <i16> [#uses=1]
    333 	%tmp9 = extractelement <8 x i16> %tmp, i32 7		; <i16> [#uses=1]
        	; Place the extracted values into result lanes 0..7 in extraction order.
    334 	%tmp10 = insertelement <8 x i16> undef, i16 %tmp.upgrd.20, i32 0		; <<8 x i16>> [#uses=1]
    335 	%tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 1		; <<8 x i16>> [#uses=1]
    336 	%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp4, i32 2		; <<8 x i16>> [#uses=1]
    337 	%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 3		; <<8 x i16>> [#uses=1]
    338 	%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp6, i32 4		; <<8 x i16>> [#uses=1]
    339 	%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 5		; <<8 x i16>> [#uses=1]
    340 	%tmp16 = insertelement <8 x i16> %tmp15, i16 %tmp8, i32 6		; <<8 x i16>> [#uses=1]
    341 	%tmp17 = insertelement <8 x i16> %tmp16, i16 %tmp9, i32 7		; <<8 x i16>> [#uses=1]
    342 	store <8 x i16> %tmp17, <8 x i16>* %A
    343 	ret void
    344 }
    345 
    346 define void @VMRG_UNARY_tw_l(<4 x i32>* %A, <4 x i32>* %B) {
        ; Single-input form of the word interleave of elements 2..3: only %A is
        ; loaded (%B is never used) and each element is taken twice, so the
        ; result lanes are A[2], A[2], A[3], A[3]. The result is stored
        ; through %A.
        ; NOTE(review): the "_l" suffix suggests this is one of the six matches
        ; expected by the `grep vmrgl ... count 6` RUN line - confirm.
    347 entry:
    348 	%tmp = load <4 x i32>* %A		; <<4 x i32>> [#uses=4]
        	; Extract each of elements 2..3 twice.
    349 	%tmp.upgrd.21 = extractelement <4 x i32> %tmp, i32 2		; <i32> [#uses=1]
    350 	%tmp3 = extractelement <4 x i32> %tmp, i32 2		; <i32> [#uses=1]
    351 	%tmp4 = extractelement <4 x i32> %tmp, i32 3		; <i32> [#uses=1]
    352 	%tmp5 = extractelement <4 x i32> %tmp, i32 3		; <i32> [#uses=1]
        	; Place the extracted values into result lanes 0..3 in extraction order.
    353 	%tmp6 = insertelement <4 x i32> undef, i32 %tmp.upgrd.21, i32 0		; <<4 x i32>> [#uses=1]
    354 	%tmp7 = insertelement <4 x i32> %tmp6, i32 %tmp3, i32 1		; <<4 x i32>> [#uses=1]
    355 	%tmp8 = insertelement <4 x i32> %tmp7, i32 %tmp4, i32 2		; <<4 x i32>> [#uses=1]
    356 	%tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp5, i32 3		; <<4 x i32>> [#uses=1]
    357 	store <4 x i32> %tmp9, <4 x i32>* %A
    358 	ret void
    359 }
    360 
    361 define void @VMRG_UNARY_tb_h(<16 x i8>* %A, <16 x i8>* %B) {
        ; Single-input form of the byte interleave of elements 0..7: only %A is
        ; loaded (%B is never used) and each element is taken twice, so the
        ; result lanes are A[0], A[0], A[1], A[1], ..., A[7], A[7]. The result
        ; is stored through %A.
        ; NOTE(review): the "_h" suffix suggests this is one of the seven matches
        ; expected by the `grep vmrgh ... count 7` RUN line - confirm.
    362 entry:
    363 	%tmp = load <16 x i8>* %A		; <<16 x i8>> [#uses=16]
        	; Extract each of elements 0..7 twice.
    364 	%tmp.upgrd.22 = extractelement <16 x i8> %tmp, i32 0		; <i8> [#uses=1]
    365 	%tmp3 = extractelement <16 x i8> %tmp, i32 0		; <i8> [#uses=1]
    366 	%tmp4 = extractelement <16 x i8> %tmp, i32 1		; <i8> [#uses=1]
    367 	%tmp5 = extractelement <16 x i8> %tmp, i32 1		; <i8> [#uses=1]
    368 	%tmp6 = extractelement <16 x i8> %tmp, i32 2		; <i8> [#uses=1]
    369 	%tmp7 = extractelement <16 x i8> %tmp, i32 2		; <i8> [#uses=1]
    370 	%tmp8 = extractelement <16 x i8> %tmp, i32 3		; <i8> [#uses=1]
    371 	%tmp9 = extractelement <16 x i8> %tmp, i32 3		; <i8> [#uses=1]
    372 	%tmp10 = extractelement <16 x i8> %tmp, i32 4		; <i8> [#uses=1]
    373 	%tmp11 = extractelement <16 x i8> %tmp, i32 4		; <i8> [#uses=1]
    374 	%tmp12 = extractelement <16 x i8> %tmp, i32 5		; <i8> [#uses=1]
    375 	%tmp13 = extractelement <16 x i8> %tmp, i32 5		; <i8> [#uses=1]
    376 	%tmp14 = extractelement <16 x i8> %tmp, i32 6		; <i8> [#uses=1]
    377 	%tmp15 = extractelement <16 x i8> %tmp, i32 6		; <i8> [#uses=1]
    378 	%tmp16 = extractelement <16 x i8> %tmp, i32 7		; <i8> [#uses=1]
    379 	%tmp17 = extractelement <16 x i8> %tmp, i32 7		; <i8> [#uses=1]
        	; Place the extracted values into result lanes 0..15 in extraction order.
    380 	%tmp18 = insertelement <16 x i8> undef, i8 %tmp.upgrd.22, i32 0		; <<16 x i8>> [#uses=1]
    381 	%tmp19 = insertelement <16 x i8> %tmp18, i8 %tmp3, i32 1		; <<16 x i8>> [#uses=1]
    382 	%tmp20 = insertelement <16 x i8> %tmp19, i8 %tmp4, i32 2		; <<16 x i8>> [#uses=1]
    383 	%tmp21 = insertelement <16 x i8> %tmp20, i8 %tmp5, i32 3		; <<16 x i8>> [#uses=1]
    384 	%tmp22 = insertelement <16 x i8> %tmp21, i8 %tmp6, i32 4		; <<16 x i8>> [#uses=1]
    385 	%tmp23 = insertelement <16 x i8> %tmp22, i8 %tmp7, i32 5		; <<16 x i8>> [#uses=1]
    386 	%tmp24 = insertelement <16 x i8> %tmp23, i8 %tmp8, i32 6		; <<16 x i8>> [#uses=1]
    387 	%tmp25 = insertelement <16 x i8> %tmp24, i8 %tmp9, i32 7		; <<16 x i8>> [#uses=1]
    388 	%tmp26 = insertelement <16 x i8> %tmp25, i8 %tmp10, i32 8		; <<16 x i8>> [#uses=1]
    389 	%tmp27 = insertelement <16 x i8> %tmp26, i8 %tmp11, i32 9		; <<16 x i8>> [#uses=1]
    390 	%tmp28 = insertelement <16 x i8> %tmp27, i8 %tmp12, i32 10		; <<16 x i8>> [#uses=1]
    391 	%tmp29 = insertelement <16 x i8> %tmp28, i8 %tmp13, i32 11		; <<16 x i8>> [#uses=1]
    392 	%tmp30 = insertelement <16 x i8> %tmp29, i8 %tmp14, i32 12		; <<16 x i8>> [#uses=1]
    393 	%tmp31 = insertelement <16 x i8> %tmp30, i8 %tmp15, i32 13		; <<16 x i8>> [#uses=1]
    394 	%tmp32 = insertelement <16 x i8> %tmp31, i8 %tmp16, i32 14		; <<16 x i8>> [#uses=1]
    395 	%tmp33 = insertelement <16 x i8> %tmp32, i8 %tmp17, i32 15		; <<16 x i8>> [#uses=1]
    396 	store <16 x i8> %tmp33, <16 x i8>* %A
    397 	ret void
    398 }
    399 
    400 define void @VMRG_UNARY_th_h(<8 x i16>* %A, <8 x i16>* %B) {
        ; Single-input form of the halfword interleave of elements 0..3: only
        ; %A is loaded (%B is never used) and each element is taken twice, so
        ; the result lanes are A[0], A[0], A[1], A[1], A[2], A[2], A[3], A[3].
        ; The result is stored through %A.
        ; NOTE(review): the "_h" suffix suggests this is one of the seven matches
        ; expected by the `grep vmrgh ... count 7` RUN line - confirm.
    401 entry:
    402 	%tmp = load <8 x i16>* %A		; <<8 x i16>> [#uses=8]
        	; Extract each of elements 0..3 twice.
    403 	%tmp.upgrd.23 = extractelement <8 x i16> %tmp, i32 0		; <i16> [#uses=1]
    404 	%tmp3 = extractelement <8 x i16> %tmp, i32 0		; <i16> [#uses=1]
    405 	%tmp4 = extractelement <8 x i16> %tmp, i32 1		; <i16> [#uses=1]
    406 	%tmp5 = extractelement <8 x i16> %tmp, i32 1		; <i16> [#uses=1]
    407 	%tmp6 = extractelement <8 x i16> %tmp, i32 2		; <i16> [#uses=1]
    408 	%tmp7 = extractelement <8 x i16> %tmp, i32 2		; <i16> [#uses=1]
    409 	%tmp8 = extractelement <8 x i16> %tmp, i32 3		; <i16> [#uses=1]
    410 	%tmp9 = extractelement <8 x i16> %tmp, i32 3		; <i16> [#uses=1]
        	; Place the extracted values into result lanes 0..7 in extraction order.
    411 	%tmp10 = insertelement <8 x i16> undef, i16 %tmp.upgrd.23, i32 0		; <<8 x i16>> [#uses=1]
    412 	%tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 1		; <<8 x i16>> [#uses=1]
    413 	%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp4, i32 2		; <<8 x i16>> [#uses=1]
    414 	%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 3		; <<8 x i16>> [#uses=1]
    415 	%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp6, i32 4		; <<8 x i16>> [#uses=1]
    416 	%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 5		; <<8 x i16>> [#uses=1]
    417 	%tmp16 = insertelement <8 x i16> %tmp15, i16 %tmp8, i32 6		; <<8 x i16>> [#uses=1]
    418 	%tmp17 = insertelement <8 x i16> %tmp16, i16 %tmp9, i32 7		; <<8 x i16>> [#uses=1]
    419 	store <8 x i16> %tmp17, <8 x i16>* %A
    420 	ret void
    421 }
    422 
    423 define void @VMRG_UNARY_tw_h(<4 x i32>* %A, <4 x i32>* %B) {
        ; Single-input form of the word interleave of elements 0..1: only %A is
        ; loaded (%B is never used) and each element is taken twice, so the
        ; result lanes are A[0], A[0], A[1], A[1]. The result is stored
        ; through %A.
        ; NOTE(review): the "_h" suffix suggests this is one of the seven matches
        ; expected by the `grep vmrgh ... count 7` RUN line - confirm.
    424 entry:
    425 	%tmp = load <4 x i32>* %A		; <<4 x i32>> [#uses=4]
        	; Extract each of elements 0..1 twice.
    426 	%tmp.upgrd.24 = extractelement <4 x i32> %tmp, i32 0		; <i32> [#uses=1]
    427 	%tmp3 = extractelement <4 x i32> %tmp, i32 0		; <i32> [#uses=1]
    428 	%tmp4 = extractelement <4 x i32> %tmp, i32 1		; <i32> [#uses=1]
    429 	%tmp5 = extractelement <4 x i32> %tmp, i32 1		; <i32> [#uses=1]
        	; Place the extracted values into result lanes 0..3 in extraction order.
    430 	%tmp6 = insertelement <4 x i32> undef, i32 %tmp.upgrd.24, i32 0		; <<4 x i32>> [#uses=1]
    431 	%tmp7 = insertelement <4 x i32> %tmp6, i32 %tmp3, i32 1		; <<4 x i32>> [#uses=1]
    432 	%tmp8 = insertelement <4 x i32> %tmp7, i32 %tmp4, i32 2		; <<4 x i32>> [#uses=1]
    433 	%tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp5, i32 3		; <<4 x i32>> [#uses=1]
    434 	store <4 x i32> %tmp9, <4 x i32>* %A
    435 	ret void
    436 }
    437 
    438 define void @VPCKUHUM_unary(<8 x i16>* %A, <8 x i16>* %B) {
        ; Loads the <8 x i16> vector at %A (%B is never used) and bitcasts the
        ; same value to <16 x i8> twice. The odd-indexed bytes 1, 3, ..., 15 of
        ; the first view fill result lanes 0..7 and the same odd-indexed bytes
        ; of the second view fill lanes 8..15. The result is bitcast back to
        ; <8 x i16> and stored through %A.
        ; NOTE(review): judging by the name, this is presumably the single match
        ; expected by the `grep vpkuhum ... count 1` RUN line - confirm.
    439 entry:
    440 	%tmp = load <8 x i16>* %A		; <<8 x i16>> [#uses=2]
    441 	%tmp.upgrd.25 = bitcast <8 x i16> %tmp to <16 x i8>		; <<16 x i8>> [#uses=8]
    442 	%tmp3 = bitcast <8 x i16> %tmp to <16 x i8>		; <<16 x i8>> [#uses=8]
        	; Odd-indexed bytes of the first byte view.
    443 	%tmp.upgrd.26 = extractelement <16 x i8> %tmp.upgrd.25, i32 1		; <i8> [#uses=1]
    444 	%tmp4 = extractelement <16 x i8> %tmp.upgrd.25, i32 3		; <i8> [#uses=1]
    445 	%tmp5 = extractelement <16 x i8> %tmp.upgrd.25, i32 5		; <i8> [#uses=1]
    446 	%tmp6 = extractelement <16 x i8> %tmp.upgrd.25, i32 7		; <i8> [#uses=1]
    447 	%tmp7 = extractelement <16 x i8> %tmp.upgrd.25, i32 9		; <i8> [#uses=1]
    448 	%tmp8 = extractelement <16 x i8> %tmp.upgrd.25, i32 11		; <i8> [#uses=1]
    449 	%tmp9 = extractelement <16 x i8> %tmp.upgrd.25, i32 13		; <i8> [#uses=1]
    450 	%tmp10 = extractelement <16 x i8> %tmp.upgrd.25, i32 15		; <i8> [#uses=1]
        	; The same odd-indexed bytes again, taken from the second byte view.
    451 	%tmp11 = extractelement <16 x i8> %tmp3, i32 1		; <i8> [#uses=1]
    452 	%tmp12 = extractelement <16 x i8> %tmp3, i32 3		; <i8> [#uses=1]
    453 	%tmp13 = extractelement <16 x i8> %tmp3, i32 5		; <i8> [#uses=1]
    454 	%tmp14 = extractelement <16 x i8> %tmp3, i32 7		; <i8> [#uses=1]
    455 	%tmp15 = extractelement <16 x i8> %tmp3, i32 9		; <i8> [#uses=1]
    456 	%tmp16 = extractelement <16 x i8> %tmp3, i32 11		; <i8> [#uses=1]
    457 	%tmp17 = extractelement <16 x i8> %tmp3, i32 13		; <i8> [#uses=1]
    458 	%tmp18 = extractelement <16 x i8> %tmp3, i32 15		; <i8> [#uses=1]
        	; Place the extracted values into result lanes 0..15 in extraction order.
    459 	%tmp19 = insertelement <16 x i8> undef, i8 %tmp.upgrd.26, i32 0		; <<16 x i8>> [#uses=1]
    460 	%tmp20 = insertelement <16 x i8> %tmp19, i8 %tmp4, i32 1		; <<16 x i8>> [#uses=1]
    461 	%tmp21 = insertelement <16 x i8> %tmp20, i8 %tmp5, i32 2		; <<16 x i8>> [#uses=1]
    462 	%tmp22 = insertelement <16 x i8> %tmp21, i8 %tmp6, i32 3		; <<16 x i8>> [#uses=1]
    463 	%tmp23 = insertelement <16 x i8> %tmp22, i8 %tmp7, i32 4		; <<16 x i8>> [#uses=1]
    464 	%tmp24 = insertelement <16 x i8> %tmp23, i8 %tmp8, i32 5		; <<16 x i8>> [#uses=1]
    465 	%tmp25 = insertelement <16 x i8> %tmp24, i8 %tmp9, i32 6		; <<16 x i8>> [#uses=1]
    466 	%tmp26 = insertelement <16 x i8> %tmp25, i8 %tmp10, i32 7		; <<16 x i8>> [#uses=1]
    467 	%tmp27 = insertelement <16 x i8> %tmp26, i8 %tmp11, i32 8		; <<16 x i8>> [#uses=1]
    468 	%tmp28 = insertelement <16 x i8> %tmp27, i8 %tmp12, i32 9		; <<16 x i8>> [#uses=1]
    469 	%tmp29 = insertelement <16 x i8> %tmp28, i8 %tmp13, i32 10		; <<16 x i8>> [#uses=1]
    470 	%tmp30 = insertelement <16 x i8> %tmp29, i8 %tmp14, i32 11		; <<16 x i8>> [#uses=1]
    471 	%tmp31 = insertelement <16 x i8> %tmp30, i8 %tmp15, i32 12		; <<16 x i8>> [#uses=1]
    472 	%tmp32 = insertelement <16 x i8> %tmp31, i8 %tmp16, i32 13		; <<16 x i8>> [#uses=1]
    473 	%tmp33 = insertelement <16 x i8> %tmp32, i8 %tmp17, i32 14		; <<16 x i8>> [#uses=1]
    474 	%tmp34 = insertelement <16 x i8> %tmp33, i8 %tmp18, i32 15		; <<16 x i8>> [#uses=1]
        	; Return to the <8 x i16> view and overwrite the vector at %A.
    475 	%tmp34.upgrd.27 = bitcast <16 x i8> %tmp34 to <8 x i16>		; <<8 x i16>> [#uses=1]
    476 	store <8 x i16> %tmp34.upgrd.27, <8 x i16>* %A
    477 	ret void
    478 }
    479 
    480 define void @VPCKUWUM_unary(<4 x i32>* %A, <4 x i32>* %B) {
        ; Loads the <4 x i32> vector at %A (%B is never used) and bitcasts the
        ; same value to <8 x i16> twice. The odd-indexed halfwords 1, 3, 5, 7
        ; of the first view fill result lanes 0..3 and the same odd-indexed
        ; halfwords of the second view fill lanes 4..7. The result is bitcast
        ; back to <4 x i32> and stored through %A.
        ; NOTE(review): judging by the name, this is presumably the single match
        ; expected by the `grep vpkuwum ... count 1` RUN line - confirm.
    481 entry:
    482 	%tmp = load <4 x i32>* %A		; <<4 x i32>> [#uses=2]
    483 	%tmp.upgrd.28 = bitcast <4 x i32> %tmp to <8 x i16>		; <<8 x i16>> [#uses=4]
    484 	%tmp3 = bitcast <4 x i32> %tmp to <8 x i16>		; <<8 x i16>> [#uses=4]
        	; Odd-indexed halfwords of the first halfword view.
    485 	%tmp.upgrd.29 = extractelement <8 x i16> %tmp.upgrd.28, i32 1		; <i16> [#uses=1]
    486 	%tmp4 = extractelement <8 x i16> %tmp.upgrd.28, i32 3		; <i16> [#uses=1]
    487 	%tmp5 = extractelement <8 x i16> %tmp.upgrd.28, i32 5		; <i16> [#uses=1]
    488 	%tmp6 = extractelement <8 x i16> %tmp.upgrd.28, i32 7		; <i16> [#uses=1]
        	; The same odd-indexed halfwords again, taken from the second view.
    489 	%tmp7 = extractelement <8 x i16> %tmp3, i32 1		; <i16> [#uses=1]
    490 	%tmp8 = extractelement <8 x i16> %tmp3, i32 3		; <i16> [#uses=1]
    491 	%tmp9 = extractelement <8 x i16> %tmp3, i32 5		; <i16> [#uses=1]
    492 	%tmp10 = extractelement <8 x i16> %tmp3, i32 7		; <i16> [#uses=1]
        	; Place the extracted values into result lanes 0..7 in extraction order.
    493 	%tmp11 = insertelement <8 x i16> undef, i16 %tmp.upgrd.29, i32 0		; <<8 x i16>> [#uses=1]
    494 	%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp4, i32 1		; <<8 x i16>> [#uses=1]
    495 	%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 2		; <<8 x i16>> [#uses=1]
    496 	%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp6, i32 3		; <<8 x i16>> [#uses=1]
    497 	%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 4		; <<8 x i16>> [#uses=1]
    498 	%tmp16 = insertelement <8 x i16> %tmp15, i16 %tmp8, i32 5		; <<8 x i16>> [#uses=1]
    499 	%tmp17 = insertelement <8 x i16> %tmp16, i16 %tmp9, i32 6		; <<8 x i16>> [#uses=1]
    500 	%tmp18 = insertelement <8 x i16> %tmp17, i16 %tmp10, i32 7		; <<8 x i16>> [#uses=1]
        	; Return to the <4 x i32> view and overwrite the vector at %A.
    501 	%tmp18.upgrd.30 = bitcast <8 x i16> %tmp18 to <4 x i32>		; <<4 x i32>> [#uses=1]
    502 	store <4 x i32> %tmp18.upgrd.30, <4 x i32>* %A
    503 	ret void
    504 }
    505