; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck -check-prefix=X64 %s

;; A basic check that MMX arithmetic actually compiles. The first test is a
;; straight translation of the original, with bitcasts inserted as needed.

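;; The CHECK lines below were generated by the update script named in the NOTE
;; above; after a codegen change, regenerate them rather than hand-editing,
;; roughly like this (pass the path to this test file):
;;   utils/update_llc_test_checks.py <path-to-this-test>.ll
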
define void @test0(x86_mmx* %A, x86_mmx* %B) {
; X32-LABEL: test0:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %ebp
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    .cfi_offset %ebp, -8
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    .cfi_def_cfa_register %ebp
; X32-NEXT:    andl $-8, %esp
; X32-NEXT:    subl $32, %esp
; X32-NEXT:    movl 12(%ebp), %ecx
; X32-NEXT:    movl 8(%ebp), %eax
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-NEXT:    paddw %xmm0, %xmm1
; X32-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    packuswb %xmm1, %xmm1
; X32-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    paddsb (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    paddusb (%ecx), %mm0
; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X32-NEXT:    psubw %xmm2, %xmm1
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    packuswb %xmm1, %xmm1
; X32-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    psubsb (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    psubusb (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%esp)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X32-NEXT:    pmullw %xmm1, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm1
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    packuswb %xmm1, %xmm1
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-NEXT:    pand %xmm2, %xmm1
; X32-NEXT:    movdqa %xmm1, %xmm2
; X32-NEXT:    pand %xmm0, %xmm2
; X32-NEXT:    packuswb %xmm2, %xmm2
; X32-NEXT:    movq %xmm2, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X32-NEXT:    por %xmm1, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm1
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    packuswb %xmm1, %xmm1
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-NEXT:    pxor %xmm2, %xmm1
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    packuswb %xmm1, %xmm1
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    emms
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    .cfi_def_cfa %esp, 4
; X32-NEXT:    retl
;
; X64-LABEL: test0:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT:    paddw %xmm0, %xmm1
; X64-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    packuswb %xmm1, %xmm1
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    paddsb (%rsi), %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    paddusb (%rsi), %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X64-NEXT:    psubw %xmm2, %xmm1
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    packuswb %xmm1, %xmm1
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    psubsb (%rsi), %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    psubusb (%rsi), %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X64-NEXT:    pmullw %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm1
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    packuswb %xmm1, %xmm1
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT:    pand %xmm2, %xmm1
; X64-NEXT:    movdqa %xmm1, %xmm2
; X64-NEXT:    pand %xmm0, %xmm2
; X64-NEXT:    packuswb %xmm2, %xmm2
; X64-NEXT:    movq %xmm2, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; X64-NEXT:    por %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm1
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    packuswb %xmm1, %xmm1
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT:    pxor %xmm2, %xmm1
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    packuswb %xmm1, %xmm1
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %tmp1 = load x86_mmx, x86_mmx* %A
  %tmp3 = load x86_mmx, x86_mmx* %B
  %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8>
  %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8>
  %tmp4 = add <8 x i8> %tmp1a, %tmp3a
  %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx
  store x86_mmx %tmp4a, x86_mmx* %A
  %tmp7 = load x86_mmx, x86_mmx* %B
  %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %tmp4a, x86_mmx %tmp7)
  store x86_mmx %tmp12, x86_mmx* %A
  %tmp16 = load x86_mmx, x86_mmx* %B
  %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %tmp12, x86_mmx %tmp16)
  store x86_mmx %tmp21, x86_mmx* %A
  %tmp27 = load x86_mmx, x86_mmx* %B
  %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8>
  %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8>
  %tmp28 = sub <8 x i8> %tmp21a, %tmp27a
  %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx
  store x86_mmx %tmp28a, x86_mmx* %A
  %tmp31 = load x86_mmx, x86_mmx* %B
  %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %tmp28a, x86_mmx %tmp31)
  store x86_mmx %tmp36, x86_mmx* %A
  %tmp40 = load x86_mmx, x86_mmx* %B
  %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %tmp36, x86_mmx %tmp40)
  store x86_mmx %tmp45, x86_mmx* %A
  %tmp51 = load x86_mmx, x86_mmx* %B
  %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8>
  %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8>
  %tmp52 = mul <8 x i8> %tmp45a, %tmp51a
  %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx
  store x86_mmx %tmp52a, x86_mmx* %A
  %tmp57 = load x86_mmx, x86_mmx* %B
  %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8>
  %tmp58 = and <8 x i8> %tmp52, %tmp57a
  %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx
  store x86_mmx %tmp58a, x86_mmx* %A
  %tmp63 = load x86_mmx, x86_mmx* %B
  %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8>
  %tmp64 = or <8 x i8> %tmp58, %tmp63a
  %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx
  store x86_mmx %tmp64a, x86_mmx* %A
  %tmp69 = load x86_mmx, x86_mmx* %B
  %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8>
  %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8>
  %tmp70 = xor <8 x i8> %tmp64b, %tmp69a
  %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx
  store x86_mmx %tmp70a, x86_mmx* %A
  tail call void @llvm.x86.mmx.emms()
  ret void
}

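; test1 repeats the same pattern with <2 x i32> lanes; everything here stays
; in XMM registers, with only the trailing emms touching MMX state.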
define void @test1(x86_mmx* %A, x86_mmx* %B) {
; X32-LABEL: test1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X32-NEXT:    paddq %xmm0, %xmm1
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT:    movdqa %xmm1, %xmm2
; X32-NEXT:    psrlq $32, %xmm2
; X32-NEXT:    pmuludq %xmm0, %xmm2
; X32-NEXT:    movdqa %xmm0, %xmm3
; X32-NEXT:    psrlq $32, %xmm3
; X32-NEXT:    pmuludq %xmm1, %xmm3
; X32-NEXT:    paddq %xmm2, %xmm3
; X32-NEXT:    psllq $32, %xmm3
; X32-NEXT:    pmuludq %xmm1, %xmm0
; X32-NEXT:    paddq %xmm3, %xmm0
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X32-NEXT:    andps %xmm0, %xmm1
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT:    orps %xmm1, %xmm0
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X32-NEXT:    xorps %xmm0, %xmm1
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    emms
; X32-NEXT:    retl
;
; X64-LABEL: test1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X64-NEXT:    paddq %xmm0, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-NEXT:    movdqa %xmm1, %xmm2
; X64-NEXT:    psrlq $32, %xmm2
; X64-NEXT:    pmuludq %xmm0, %xmm2
; X64-NEXT:    movdqa %xmm0, %xmm3
; X64-NEXT:    psrlq $32, %xmm3
; X64-NEXT:    pmuludq %xmm1, %xmm3
; X64-NEXT:    paddq %xmm2, %xmm3
; X64-NEXT:    psllq $32, %xmm3
; X64-NEXT:    pmuludq %xmm0, %xmm1
; X64-NEXT:    paddq %xmm3, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-NEXT:    pand %xmm1, %xmm0
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X64-NEXT:    por %xmm0, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-NEXT:    pxor %xmm1, %xmm0
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %tmp1 = load x86_mmx, x86_mmx* %A
  %tmp3 = load x86_mmx, x86_mmx* %B
  %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32>
  %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32>
  %tmp4 = add <2 x i32> %tmp1a, %tmp3a
  %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx
  store x86_mmx %tmp4a, x86_mmx* %A
  %tmp9 = load x86_mmx, x86_mmx* %B
  %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32>
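  ; Note: %tmp10 below is dead; the bitcast that follows uses %tmp4, which is
  ; why no vector subtract appears in the CHECK lines above.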
  %tmp10 = sub <2 x i32> %tmp4, %tmp9a
  %tmp10a = bitcast <2 x i32> %tmp4 to x86_mmx
  store x86_mmx %tmp10a, x86_mmx* %A
  %tmp15 = load x86_mmx, x86_mmx* %B
  %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32>
  %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
  %tmp16 = mul <2 x i32> %tmp10b, %tmp15a
  %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
  store x86_mmx %tmp16a, x86_mmx* %A
  %tmp21 = load x86_mmx, x86_mmx* %B
  %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32>
  %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32>
  %tmp22 = and <2 x i32> %tmp16b, %tmp21a
  %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx
  store x86_mmx %tmp22a, x86_mmx* %A
  %tmp27 = load x86_mmx, x86_mmx* %B
  %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32>
  %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32>
  %tmp28 = or <2 x i32> %tmp22b, %tmp27a
  %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx
  store x86_mmx %tmp28a, x86_mmx* %A
  %tmp33 = load x86_mmx, x86_mmx* %B
  %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32>
  %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32>
  %tmp34 = xor <2 x i32> %tmp28b, %tmp33a
  %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx
  store x86_mmx %tmp34a, x86_mmx* %A
  tail call void @llvm.x86.mmx.emms()
  ret void
}

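; test2 covers the <4 x i16> variants, adding the saturating add/sub
; intrinsics plus pmulh.w and pmadd.wd.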
define void @test2(x86_mmx* %A, x86_mmx* %B) {
; X32-LABEL: test2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %ebp
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    .cfi_offset %ebp, -8
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    .cfi_def_cfa_register %ebp
; X32-NEXT:    andl $-8, %esp
; X32-NEXT:    subl $48, %esp
; X32-NEXT:    movl 12(%ebp), %ecx
; X32-NEXT:    movl 8(%ebp), %eax
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-NEXT:    paddd %xmm0, %xmm1
; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    paddsw (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    paddusw (%ecx), %mm0
; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-NEXT:    psubd %xmm1, %xmm0
; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    psubsw (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    psubusw (%ecx), %mm0
; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT:    pmuludq %xmm1, %xmm0
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X32-NEXT:    pmuludq %xmm2, %xmm1
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    pmulhw (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    pmaddwd (%ecx), %mm0
; X32-NEXT:    movq %mm0, (%esp)
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT:    movq %mm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-NEXT:    pand %xmm0, %xmm1
; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT:    por %xmm1, %xmm0
; X32-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-NEXT:    movq %xmm1, (%eax)
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-NEXT:    pxor %xmm0, %xmm1
; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT:    movq %xmm0, (%eax)
; X32-NEXT:    emms
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    .cfi_def_cfa %esp, 4
; X32-NEXT:    retl
;
; X64-LABEL: test2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    paddd %xmm0, %xmm1
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    paddsw (%rsi), %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    paddusw (%rsi), %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    psubd %xmm1, %xmm0
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    psubsw (%rsi), %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    psubusw (%rsi), %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT:    pmuludq %xmm1, %xmm0
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT:    pmuludq %xmm2, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    pmulhw (%rsi), %mm0
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    pmaddwd (%rsi), %mm0
; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT:    movq %mm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    pand %xmm0, %xmm1
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT:    por %xmm1, %xmm0
; X64-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT:    pxor %xmm0, %xmm1
; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    emms
; X64-NEXT:    retq
entry:
  %tmp1 = load x86_mmx, x86_mmx* %A
  %tmp3 = load x86_mmx, x86_mmx* %B
  %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16>
  %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16>
  %tmp4 = add <4 x i16> %tmp1a, %tmp3a
  %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx
  store x86_mmx %tmp4a, x86_mmx* %A
  %tmp7 = load x86_mmx, x86_mmx* %B
  %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %tmp4a, x86_mmx %tmp7)
  store x86_mmx %tmp12, x86_mmx* %A
  %tmp16 = load x86_mmx, x86_mmx* %B
  %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp12, x86_mmx %tmp16)
  store x86_mmx %tmp21, x86_mmx* %A
  %tmp27 = load x86_mmx, x86_mmx* %B
  %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16>
  %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16>
  %tmp28 = sub <4 x i16> %tmp21a, %tmp27a
  %tmp28a = bitcast <4 x i16> %tmp28 to x86_mmx
  store x86_mmx %tmp28a, x86_mmx* %A
  %tmp31 = load x86_mmx, x86_mmx* %B
  %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %tmp28a, x86_mmx %tmp31)
  store x86_mmx %tmp36, x86_mmx* %A
  %tmp40 = load x86_mmx, x86_mmx* %B
  %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %tmp36, x86_mmx %tmp40)
  store x86_mmx %tmp45, x86_mmx* %A
  %tmp51 = load x86_mmx, x86_mmx* %B
  %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16>
  %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16>
  %tmp52 = mul <4 x i16> %tmp45a, %tmp51a
  %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx
  store x86_mmx %tmp52a, x86_mmx* %A
  %tmp55 = load x86_mmx, x86_mmx* %B
  %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %tmp52a, x86_mmx %tmp55)
  store x86_mmx %tmp60, x86_mmx* %A
  %tmp64 = load x86_mmx, x86_mmx* %B
  %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %tmp60, x86_mmx %tmp64)
  %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx
  store x86_mmx %tmp70, x86_mmx* %A
  %tmp75 = load x86_mmx, x86_mmx* %B
  %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16>
  %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16>
  %tmp76 = and <4 x i16> %tmp70a, %tmp75a
  %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx
  store x86_mmx %tmp76a, x86_mmx* %A
  %tmp81 = load x86_mmx, x86_mmx* %B
  %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16>
  %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16>
  %tmp82 = or <4 x i16> %tmp76b, %tmp81a
  %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx
  store x86_mmx %tmp82a, x86_mmx* %A
  %tmp87 = load x86_mmx, x86_mmx* %B
  %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16>
  %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16>
  %tmp88 = xor <4 x i16> %tmp82b, %tmp87a
  %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx
  store x86_mmx %tmp88a, x86_mmx* %A
  tail call void @llvm.x86.mmx.emms()
  ret void
}

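; test3 sums <1 x i64> values in a loop; the adds lower to plain scalar
; integer code (addq on x86-64, addl/adcl pairs on x86-32), no MMX needed.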
define <1 x i64> @test3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind {
; X32-LABEL: test3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %ebx
; X32-NEXT:    pushl %edi
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-8, %esp
; X32-NEXT:    subl $16, %esp
; X32-NEXT:    cmpl $0, 16(%ebp)
; X32-NEXT:    je .LBB3_1
; X32-NEXT:  # %bb.2: # %bb26.preheader
; X32-NEXT:    xorl %ebx, %ebx
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    xorl %edx, %edx
; X32-NEXT:    .p2align 4, 0x90
; X32-NEXT:  .LBB3_3: # %bb26
; X32-NEXT:    # =>This Inner Loop Header: Depth=1
; X32-NEXT:    movl 8(%ebp), %ecx
; X32-NEXT:    movl %ecx, %esi
; X32-NEXT:    movl (%ecx,%ebx,8), %ecx
; X32-NEXT:    movl 4(%esi,%ebx,8), %esi
; X32-NEXT:    movl 12(%ebp), %edi
; X32-NEXT:    addl (%edi,%ebx,8), %ecx
; X32-NEXT:    adcl 4(%edi,%ebx,8), %esi
; X32-NEXT:    addl %eax, %ecx
; X32-NEXT:    movl %ecx, (%esp)
; X32-NEXT:    adcl %edx, %esi
; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    movd %xmm0, %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X32-NEXT:    movd %xmm0, %edx
; X32-NEXT:    incl %ebx
; X32-NEXT:    cmpl 16(%ebp), %ebx
; X32-NEXT:    jb .LBB3_3
; X32-NEXT:    jmp .LBB3_4
; X32-NEXT:  .LBB3_1:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    xorl %edx, %edx
; X32-NEXT:  .LBB3_4: # %bb31
; X32-NEXT:    leal -12(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %edi
; X32-NEXT:    popl %ebx
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    xorl %r8d, %r8d
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    testl %edx, %edx
; X64-NEXT:    je .LBB3_2
; X64-NEXT:    .p2align 4, 0x90
; X64-NEXT:  .LBB3_1: # %bb26
; X64-NEXT:    # =>This Inner Loop Header: Depth=1
; X64-NEXT:    movslq %r8d, %r8
; X64-NEXT:    movq (%rdi,%r8,8), %rcx
; X64-NEXT:    addq (%rsi,%r8,8), %rcx
; X64-NEXT:    addq %rcx, %rax
; X64-NEXT:    incl %r8d
; X64-NEXT:    cmpl %edx, %r8d
; X64-NEXT:    jb .LBB3_1
; X64-NEXT:  .LBB3_2: # %bb31
; X64-NEXT:    retq
entry:
  %tmp2942 = icmp eq i32 %count, 0
  br i1 %tmp2942, label %bb31, label %bb26

bb26:
  %i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ]
  %sum.035.0 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ]
  %tmp13 = getelementptr <1 x i64>, <1 x i64>* %b, i32 %i.037.0
  %tmp14 = load <1 x i64>, <1 x i64>* %tmp13
  %tmp18 = getelementptr <1 x i64>, <1 x i64>* %a, i32 %i.037.0
  %tmp19 = load <1 x i64>, <1 x i64>* %tmp18
  %tmp21 = add <1 x i64> %tmp19, %tmp14
  %tmp22 = add <1 x i64> %tmp21, %sum.035.0
  %tmp25 = add i32 %i.037.0, 1
  %tmp29 = icmp ult i32 %tmp25, %count
  br i1 %tmp29, label %bb26, label %bb31

bb31:
  %sum.035.1 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ]
  ret <1 x i64> %sum.035.1
}

; There are no MMX operations here, so we use XMM or i64.
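; Bitcasting a double to a short vector and using plain IR adds selects the
; SSE paddb/paddw/paddd forms (or add/adc register pairs for <1 x i64> on
; x86-32); nothing here forces the x86_mmx type.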
define void @ti8(double %a, double %b) nounwind {
; X32-LABEL: ti8:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    paddb %xmm0, %xmm1
; X32-NEXT:    movq %xmm1, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddb %xmm1, %xmm0
; X64-NEXT:    movq %xmm0, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to <8 x i8>
  %tmp2 = bitcast double %b to <8 x i8>
  %tmp3 = add <8 x i8> %tmp1, %tmp2
  store <8 x i8> %tmp3, <8 x i8>* null
  ret void
}

define void @ti16(double %a, double %b) nounwind {
; X32-LABEL: ti16:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    paddw %xmm0, %xmm1
; X32-NEXT:    movq %xmm1, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddw %xmm1, %xmm0
; X64-NEXT:    movq %xmm0, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to <4 x i16>
  %tmp2 = bitcast double %b to <4 x i16>
  %tmp3 = add <4 x i16> %tmp1, %tmp2
  store <4 x i16> %tmp3, <4 x i16>* null
  ret void
}

define void @ti32(double %a, double %b) nounwind {
; X32-LABEL: ti32:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    paddd %xmm0, %xmm1
; X32-NEXT:    movq %xmm1, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    paddd %xmm1, %xmm0
; X64-NEXT:    movq %xmm0, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to <2 x i32>
  %tmp2 = bitcast double %b to <2 x i32>
  %tmp3 = add <2 x i32> %tmp1, %tmp2
  store <2 x i32> %tmp3, <2 x i32>* null
  ret void
}

define void @ti64(double %a, double %b) nounwind {
; X32-LABEL: ti64:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %eax, 0
; X32-NEXT:    movl %ecx, 4
; X32-NEXT:    retl
;
; X64-LABEL: ti64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %xmm0, %rax
; X64-NEXT:    movq %xmm1, %rcx
; X64-NEXT:    addq %rax, %rcx
; X64-NEXT:    movq %rcx, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to <1 x i64>
  %tmp2 = bitcast double %b to <1 x i64>
  %tmp3 = add <1 x i64> %tmp1, %tmp2
  store <1 x i64> %tmp3, <1 x i64>* null
  ret void
}

; MMX intrinsic calls get us MMX instructions.
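; On x86-64 the double arguments arrive in XMM registers, so each operand is
; first transferred with movdq2q before the MMX paddb/paddw/paddd/paddq runs;
; on x86-32 the operands come straight off the stack (one load into %mm0, the
; other folded as a memory operand).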
define void @ti8a(double %a, double %b) nounwind {
; X32-LABEL: ti8a:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    paddb {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %mm0, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti8a:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movdq2q %xmm1, %mm1
; X64-NEXT:    paddb %mm0, %mm1
; X64-NEXT:    movq %mm1, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to x86_mmx
  %tmp2 = bitcast double %b to x86_mmx
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

define void @ti16a(double %a, double %b) nounwind {
; X32-LABEL: ti16a:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    paddw {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %mm0, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti16a:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movdq2q %xmm1, %mm1
; X64-NEXT:    paddw %mm0, %mm1
; X64-NEXT:    movq %mm1, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to x86_mmx
  %tmp2 = bitcast double %b to x86_mmx
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

define void @ti32a(double %a, double %b) nounwind {
; X32-LABEL: ti32a:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    paddd {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %mm0, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti32a:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movdq2q %xmm1, %mm1
; X64-NEXT:    paddd %mm0, %mm1
; X64-NEXT:    movq %mm1, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to x86_mmx
  %tmp2 = bitcast double %b to x86_mmx
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

define void @ti64a(double %a, double %b) nounwind {
; X32-LABEL: ti64a:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    paddq {{[0-9]+}}(%esp), %mm0
; X32-NEXT:    movq %mm0, 0
; X32-NEXT:    retl
;
; X64-LABEL: ti64a:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movdq2q %xmm1, %mm1
; X64-NEXT:    paddq %mm0, %mm1
; X64-NEXT:    movq %mm1, 0
; X64-NEXT:    retq
entry:
  %tmp1 = bitcast double %a to x86_mmx
  %tmp2 = bitcast double %b to x86_mmx
  %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2)
  store x86_mmx %tmp3, x86_mmx* null
  ret void
}

declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)

declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx)

declare void @llvm.x86.mmx.emms()

declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx)