; Home | History | Annotate | Download | only in X86  (code-browser header; kept as a comment so the file stays parseable)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41
      4 
      5 ; A single 16-bit load + a single 16-bit store
      6 define void @load_2_i8(<2 x i8>* %A)  {
; Loads <2 x i8> from %A, adds the constant <9, 7>, and stores the result back.
; The legalizer promotes the element type, so the add is emitted as paddq.
; SSE2: 16-bit scalar load (movzwl), widened with punpcklbw/punpcklwd/pshufd,
; then re-narrowed with pand + three packuswb and stored via movw.
; SSE4.1: one pmovzxbq does the widening; pshufb re-packs; pextrw stores 16 bits.
      7 ; SSE2-LABEL: load_2_i8:
      8 ; SSE2:       # %bb.0:
      9 ; SSE2-NEXT:    movzwl (%rdi), %eax
     10 ; SSE2-NEXT:    movd %eax, %xmm0
     11 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
     12 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
     13 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
     14 ; SSE2-NEXT:    paddq {{.*}}(%rip), %xmm0
     15 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
     16 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
     17 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
     18 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
     19 ; SSE2-NEXT:    movd %xmm0, %eax
     20 ; SSE2-NEXT:    movw %ax, (%rdi)
     21 ; SSE2-NEXT:    retq
     22 ;
     23 ; SSE41-LABEL: load_2_i8:
     24 ; SSE41:       # %bb.0:
     25 ; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
     26 ; SSE41-NEXT:    paddq {{.*}}(%rip), %xmm0
     27 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
     28 ; SSE41-NEXT:    pextrw $0, %xmm0, (%rdi)
     29 ; SSE41-NEXT:    retq
     30    %T = load <2 x i8>, <2 x i8>* %A
     31    %G = add <2 x i8> %T, <i8 9, i8 7>
     32    store <2 x i8> %G, <2 x i8>* %A
     33    ret void
     34 }
     35 
     36 ; Read 32-bits
     37 define void @load_2_i16(<2 x i16>* %A)  {
; Loads <2 x i16> from %A, adds <9, 7>, stores back (32-bit load/store total).
; The two i16 elements are promoted to qword lanes, hence the paddq.
; SSE2 widens with pshufd/pshufhw and narrows back with pshufd/pshuflw;
; SSE4.1 replaces the widening sequence with a single pmovzxwq.
     38 ; SSE2-LABEL: load_2_i16:
     39 ; SSE2:       # %bb.0:
     40 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
     41 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
     42 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
     43 ; SSE2-NEXT:    paddq {{.*}}(%rip), %xmm0
     44 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
     45 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
     46 ; SSE2-NEXT:    movd %xmm0, (%rdi)
     47 ; SSE2-NEXT:    retq
     48 ;
     49 ; SSE41-LABEL: load_2_i16:
     50 ; SSE41:       # %bb.0:
     51 ; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
     52 ; SSE41-NEXT:    paddq {{.*}}(%rip), %xmm0
     53 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
     54 ; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
     55 ; SSE41-NEXT:    movd %xmm0, (%rdi)
     56 ; SSE41-NEXT:    retq
     57    %T = load <2 x i16>, <2 x i16>* %A
     58    %G = add <2 x i16> %T, <i16 9, i16 7>
     59    store <2 x i16> %G, <2 x i16>* %A
     60    ret void
     61 }
     62 
     63 define void @load_2_i32(<2 x i32>* %A)  {
; Loads <2 x i32> from %A, adds <9, 7>, stores back (64-bit load/store).
; The dwords are spread into qword lanes (pshufd / pmovzxdq), added with
; paddd, then compacted back with pshufd before the movq store.
     64 ; SSE2-LABEL: load_2_i32:
     65 ; SSE2:       # %bb.0:
     66 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
     67 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
     68 ; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
     69 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
     70 ; SSE2-NEXT:    movq %xmm0, (%rdi)
     71 ; SSE2-NEXT:    retq
     72 ;
     73 ; SSE41-LABEL: load_2_i32:
     74 ; SSE41:       # %bb.0:
     75 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
     76 ; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
     77 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
     78 ; SSE41-NEXT:    movq %xmm0, (%rdi)
     79 ; SSE41-NEXT:    retq
     80    %T = load <2 x i32>, <2 x i32>* %A
     81    %G = add <2 x i32> %T, <i32 9, i32 7>
     82    store <2 x i32> %G, <2 x i32>* %A
     83    ret void
     84 }
     85 
     86 define void @load_4_i8(<4 x i8>* %A)  {
; Loads <4 x i8> from %A, adds <1, 4, 9, 7>, stores back (32-bit load/store).
; Bytes are zero-extended to dword lanes and added with paddd.
; SSE2 widens with punpcklbw/punpcklwd and narrows with pand + two packuswb;
; SSE4.1 uses pmovzxbd to widen and a single pshufb to re-pack.
     87 ; SSE2-LABEL: load_4_i8:
     88 ; SSE2:       # %bb.0:
     89 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
     90 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
     91 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
     92 ; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
     93 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
     94 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
     95 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
     96 ; SSE2-NEXT:    movd %xmm0, (%rdi)
     97 ; SSE2-NEXT:    retq
     98 ;
     99 ; SSE41-LABEL: load_4_i8:
    100 ; SSE41:       # %bb.0:
    101 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    102 ; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
    103 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
    104 ; SSE41-NEXT:    movd %xmm0, (%rdi)
    105 ; SSE41-NEXT:    retq
    106    %T = load <4 x i8>, <4 x i8>* %A
    107    %G = add <4 x i8> %T, <i8 1, i8 4, i8 9, i8 7>
    108    store <4 x i8> %G, <4 x i8>* %A
    109    ret void
    110 }
    111 
    112 define void @load_4_i16(<4 x i16>* %A)  {
; Loads <4 x i16> from %A, adds <1, 4, 9, 7>, stores back (64-bit load/store).
; Words are spread into dword lanes (punpcklwd / pmovzxwd), the add stays a
; paddw, then the low words are gathered back before the movq store
; (pshuflw/pshufhw/pshufd on SSE2, one pshufb on SSE4.1).
    113 ; SSE2-LABEL: load_4_i16:
    114 ; SSE2:       # %bb.0:
    115 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    116 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
    117 ; SSE2-NEXT:    paddw {{.*}}(%rip), %xmm0
    118 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
    119 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
    120 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    121 ; SSE2-NEXT:    movq %xmm0, (%rdi)
    122 ; SSE2-NEXT:    retq
    123 ;
    124 ; SSE41-LABEL: load_4_i16:
    125 ; SSE41:       # %bb.0:
    126 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
    127 ; SSE41-NEXT:    paddw {{.*}}(%rip), %xmm0
    128 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    129 ; SSE41-NEXT:    movq %xmm0, (%rdi)
    130 ; SSE41-NEXT:    retq
    131    %T = load <4 x i16>, <4 x i16>* %A
    132    %G = add <4 x i16> %T, <i16 1, i16 4, i16 9, i16 7>
    133    store <4 x i16> %G, <4 x i16>* %A
    134    ret void
    135 }
    136 
    137 define void @load_8_i8(<8 x i8>* %A)  {
; Loads <8 x i8> from %A, adds the vector to itself (doubling each byte),
; stores back (64-bit load/store). Bytes are zero-extended to word lanes
; (punpcklbw / pmovzxbw), doubled with paddb, and re-packed with packuswb
; (SSE2 additionally masks with pand first).
    138 ; SSE2-LABEL: load_8_i8:
    139 ; SSE2:       # %bb.0:
    140 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    141 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    142 ; SSE2-NEXT:    paddb %xmm0, %xmm0
    143 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    144 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
    145 ; SSE2-NEXT:    movq %xmm0, (%rdi)
    146 ; SSE2-NEXT:    retq
    147 ;
    148 ; SSE41-LABEL: load_8_i8:
    149 ; SSE41:       # %bb.0:
    150 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
    151 ; SSE41-NEXT:    paddb %xmm0, %xmm0
    152 ; SSE41-NEXT:    packuswb %xmm0, %xmm0
    153 ; SSE41-NEXT:    movq %xmm0, (%rdi)
    154 ; SSE41-NEXT:    retq
    155    %T = load <8 x i8>, <8 x i8>* %A
    156    %G = add <8 x i8> %T, %T
    157    store <8 x i8> %G, <8 x i8>* %A
    158    ret void
    159 }
    160