; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41

; These tests load a narrow (sub-128-bit) vector, add a constant (or the
; vector to itself), and store it back.  They check how the x86 backend
; widens/legalizes the illegal vector types under SSE2 vs SSE4.1
; (SSE4.1 can use pmovzx* for the widening load; SSE2 needs unpack shuffles).
; Do not edit the SSE2-NEXT:/SSE41-NEXT: lines by hand — regenerate them
; with utils/update_llc_test_checks.py.

; A single 16-bit load + a single 16-bit store
define void @load_2_i8(<2 x i8>* %A) {
; SSE2-LABEL: load_2_i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movzwl (%rdi), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT:    paddq {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rdi)
; SSE2-NEXT:    retq
;
; SSE41-LABEL: load_2_i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    paddq {{.*}}(%rip), %xmm0
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    pextrw $0, %xmm0, (%rdi)
; SSE41-NEXT:    retq
  %T = load <2 x i8>, <2 x i8>* %A
  %G = add <2 x i8> %T, <i8 9, i8 7>
  store <2 x i8> %G, <2 x i8>* %A
  ret void
}

; Read 32-bits
define void @load_2_i16(<2 x i16>* %A) {
; SSE2-LABEL: load_2_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
; SSE2-NEXT:    paddq {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    movd %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE41-LABEL: load_2_i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; SSE41-NEXT:    paddq {{.*}}(%rip), %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE41-NEXT:    movd %xmm0, (%rdi)
; SSE41-NEXT:    retq
  %T = load <2 x i16>, <2 x i16>* %A
  %G = add <2 x i16> %T, <i16 9, i16 7>
  store <2 x i16> %G, <2 x i16>* %A
  ret void
}

; Read/write 64 bits (<2 x i32>); elements are widened to i64 lanes for the add.
define void @load_2_i32(<2 x i32>* %A) {
; SSE2-LABEL: load_2_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE41-LABEL: load_2_i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT:    movq %xmm0, (%rdi)
; SSE41-NEXT:    retq
  %T = load <2 x i32>, <2 x i32>* %A
  %G = add <2 x i32> %T, <i32 9, i32 7>
  store <2 x i32> %G, <2 x i32>* %A
  ret void
}

; Read/write 32 bits (<4 x i8>); elements are widened to i32 lanes for the add.
define void @load_4_i8(<4 x i8>* %A) {
; SSE2-LABEL: load_4_i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE41-LABEL: load_4_i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    movd %xmm0, (%rdi)
; SSE41-NEXT:    retq
  %T = load <4 x i8>, <4 x i8>* %A
  %G = add <4 x i8> %T, <i8 1, i8 4, i8 9, i8 7>
  store <4 x i8> %G, <4 x i8>* %A
  ret void
}

; Read/write 64 bits (<4 x i16>); elements are widened to i32 lanes,
; then truncated back to i16 for the store.
define void @load_4_i16(<4 x i16>* %A) {
; SSE2-LABEL: load_4_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    paddw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE41-LABEL: load_4_i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT:    paddw {{.*}}(%rip), %xmm0
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT:    movq %xmm0, (%rdi)
; SSE41-NEXT:    retq
  %T = load <4 x i16>, <4 x i16>* %A
  %G = add <4 x i16> %T, <i16 1, i16 4, i16 9, i16 7>
  store <4 x i16> %G, <4 x i16>* %A
  ret void
}

; Read/write 64 bits (<8 x i8>); the vector is added to itself
; (doubling each lane) rather than to a constant.
define void @load_8_i8(<8 x i8>* %A) {
; SSE2-LABEL: load_8_i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE41-LABEL: load_8_i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT:    paddb %xmm0, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm0
; SSE41-NEXT:    movq %xmm0, (%rdi)
; SSE41-NEXT:    retq
  %T = load <8 x i8>, <8 x i8>* %A
  %G = add <8 x i8> %T, %T
  store <8 x i8> %G, <8 x i8>* %A
  ret void
}