; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX

; Verify we fold loads into unary sse intrinsics only when optimizing for size

define float @rcpss(float* %a) {
; SSE-LABEL: rcpss:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    rcpss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rcpss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define float @rsqrtss(float* %a) {
; SSE-LABEL: rsqrtss:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    rsqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rsqrtss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define float @sqrtss(float* %a) {
; SSE-LABEL: sqrtss:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define double @sqrtsd(double* %a) {
; SSE-LABEL: sqrtsd:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load double, double* %a
  %ins = insertelement <2 x double> undef, double %ld, i32 0
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
  %ext = extractelement <2 x double> %res, i32 0
  ret double %ext
}

define float @rcpss_size(float* %a) optsize {
; SSE-LABEL: rcpss_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rcpss_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define <4 x float> @rcpss_full_size(<4 x float>* %a) optsize {
; SSE-LABEL: rcpss_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rcpss_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, <4 x float>* %a
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define float @rsqrtss_size(float* %a) optsize {
; SSE-LABEL: rsqrtss_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rsqrtss_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define <4 x float> @rsqrtss_full_size(<4 x float>* %a) optsize {
; SSE-LABEL: rsqrtss_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rsqrtss_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, <4 x float>* %a
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define float @sqrtss_size(float* %a) optsize {
; SSE-LABEL: sqrtss_size:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define <4 x float> @sqrtss_full_size(<4 x float>* %a) optsize {
; SSE-LABEL: sqrtss_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, <4 x float>* %a
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define double @sqrtsd_size(double* %a) optsize {
; SSE-LABEL: sqrtsd_size:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load double, double* %a
  %ins = insertelement <2 x double> undef, double %ld, i32 0
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
  %ext = extractelement <2 x double> %res, i32 0
  ret double %ext
}

define <2 x double> @sqrtsd_full_size(<2 x double>* %a) optsize {
; SSE-LABEL: sqrtsd_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd (%rdi), %xmm0
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovapd (%rdi), %xmm0
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <2 x double>, <2 x double>* %a
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
  ret <2 x double> %res
}

declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone