1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 2 ; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2 3 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE41 4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 5 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 6 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 7 8 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 9 10 @src64 = common global [8 x double] zeroinitializer, align 64 11 @dst64 = common global [8 x double] zeroinitializer, align 64 12 @src32 = common global [16 x float] zeroinitializer, align 64 13 @dst32 = common global [16 x float] zeroinitializer, align 64 14 15 declare double @llvm.ceil.f64(double %p) 16 declare double @llvm.floor.f64(double %p) 17 declare double @llvm.nearbyint.f64(double %p) 18 declare double @llvm.rint.f64(double %p) 19 declare double @llvm.trunc.f64(double %p) 20 21 declare float @llvm.ceil.f32(float %p) 22 declare float @llvm.floor.f32(float %p) 23 declare float @llvm.nearbyint.f32(float %p) 24 declare float @llvm.rint.f32(float %p) 25 declare float @llvm.trunc.f32(float %p) 26 27 define void @ceil_2f64() #0 { 28 ; SSE2-LABEL: @ceil_2f64( 29 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 30 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 31 ; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) 32 ; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) 33 ; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 34 ; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 35 ; SSE2-NEXT: ret void 36 ; 37 ; SSE41-LABEL: @ceil_2f64( 38 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 39 ; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) 40 ; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 41 ; SSE41-NEXT: ret void 42 ; 43 ; AVX-LABEL: @ceil_2f64( 44 ; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 45 ; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) 46 ; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 47 ; AVX-NEXT: ret void 48 ; 49 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 50 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 51 %ceil0 = call double @llvm.ceil.f64(double %ld0) 52 %ceil1 = call double @llvm.ceil.f64(double %ld1) 53 store double %ceil0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 54 store double %ceil1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 55 ret void 56 } 57 58 define void @ceil_4f64() #0 { 59 ; SSE2-LABEL: @ceil_4f64( 60 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 61 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 62 ; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 63 ; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 64 ; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) 65 ; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) 66 ; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]]) 67 ; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]]) 68 ; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 69 ; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 70 ; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 71 ; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 72 ; SSE2-NEXT: ret void 73 ; 74 ; SSE41-LABEL: @ceil_4f64( 75 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 76 ; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 77 ; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) 78 ; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]]) 79 ; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 80 ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 81 ; SSE41-NEXT: ret void 82 ; 83 ; AVX-LABEL: @ceil_4f64( 84 ; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 85 ; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]]) 86 ; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 87 ; AVX-NEXT: ret void 88 ; 89 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 90 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 91 %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 92 %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 93 %ceil0 = call double @llvm.ceil.f64(double %ld0) 94 %ceil1 = call double @llvm.ceil.f64(double %ld1) 95 %ceil2 = call double @llvm.ceil.f64(double %ld2) 96 %ceil3 = call double @llvm.ceil.f64(double %ld3) 97 store double %ceil0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 98 store double %ceil1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 99 store double %ceil2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 100 store double %ceil3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 101 ret void 102 } 103 104 define void @ceil_8f64() #0 { 105 ; SSE2-LABEL: @ceil_8f64( 106 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 107 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 108 ; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 109 ; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 110 ; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 111 ; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 112 ; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 113 ; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 114 ; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]]) 115 ; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]]) 116 ; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]]) 117 ; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]]) 118 ; SSE2-NEXT: [[CEIL4:%.*]] = call double @llvm.ceil.f64(double [[LD4]]) 119 ; SSE2-NEXT: [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[LD5]]) 120 ; SSE2-NEXT: [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[LD6]]) 121 ; SSE2-NEXT: [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[LD7]]) 122 ; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 123 ; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 124 ; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 125 ; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 126 ; SSE2-NEXT: store double [[CEIL4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 127 ; SSE2-NEXT: store double [[CEIL5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 128 ; SSE2-NEXT: store double [[CEIL6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 129 ; SSE2-NEXT: store double [[CEIL7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 130 ; SSE2-NEXT: ret void 131 ; 132 ; SSE41-LABEL: @ceil_8f64( 133 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 134 ; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 135 ; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 136 ; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 137 ; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) 138 ; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]]) 139 ; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]]) 140 ; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP4]]) 141 ; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 142 ; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 143 ; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 144 ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 145 ; SSE41-NEXT: ret void 146 ; 147 ; AVX1-LABEL: @ceil_8f64( 148 ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 149 ; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 150 ; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]]) 151 ; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]]) 152 ; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 153 ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 154 ; AVX1-NEXT: ret void 155 ; 156 ; AVX2-LABEL: @ceil_8f64( 157 ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 158 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 159 ; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]]) 160 ; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]]) 161 ; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 162 ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 163 ; AVX2-NEXT: ret void 164 ; 165 ; AVX512-LABEL: @ceil_8f64( 166 ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 167 ; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.ceil.v8f64(<8 x double> [[TMP1]]) 168 ; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8 169 ; AVX512-NEXT: ret void 170 ; 171 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 172 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 173 %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 174 %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 175 %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 176 %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 177 %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 178 %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 179 %ceil0 = call double @llvm.ceil.f64(double %ld0) 180 %ceil1 = call double @llvm.ceil.f64(double %ld1) 181 %ceil2 = call double @llvm.ceil.f64(double %ld2) 182 %ceil3 = call double @llvm.ceil.f64(double %ld3) 183 %ceil4 = call double @llvm.ceil.f64(double %ld4) 184 %ceil5 = call double @llvm.ceil.f64(double %ld5) 185 %ceil6 = call double @llvm.ceil.f64(double %ld6) 186 %ceil7 = call double @llvm.ceil.f64(double %ld7) 187 store double %ceil0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 188 store double %ceil1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 189 store double %ceil2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 190 store double %ceil3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 191 store double %ceil4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 192 store double %ceil5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 193 store double %ceil6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 194 store double %ceil7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 195 ret void 196 } 197 198 define void @floor_2f64() #0 { 199 ; SSE2-LABEL: @floor_2f64( 200 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 201 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 202 ; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) 203 ; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) 204 ; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 205 ; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 206 ; SSE2-NEXT: ret void 207 ; 208 ; SSE41-LABEL: @floor_2f64( 209 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 210 ; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) 211 ; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 212 ; SSE41-NEXT: ret void 213 ; 214 ; AVX-LABEL: @floor_2f64( 215 ; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 216 ; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) 217 ; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 218 ; AVX-NEXT: ret void 219 ; 220 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 221 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 222 %floor0 = call double @llvm.floor.f64(double %ld0) 223 %floor1 = call double @llvm.floor.f64(double %ld1) 224 store double %floor0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 225 store double %floor1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 226 ret void 227 } 228 229 define void @floor_4f64() #0 { 230 ; SSE2-LABEL: @floor_4f64( 231 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 232 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 233 ; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 234 ; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 235 ; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) 236 ; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) 237 ; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]]) 238 ; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]]) 239 ; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 240 ; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 241 ; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 242 ; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 243 ; SSE2-NEXT: ret void 244 ; 245 ; SSE41-LABEL: @floor_4f64( 246 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 247 ; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 248 ; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) 249 ; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]]) 250 ; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 251 ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 252 ; SSE41-NEXT: ret void 253 ; 254 ; AVX-LABEL: @floor_4f64( 255 ; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 256 ; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]]) 257 ; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 258 ; AVX-NEXT: ret void 259 ; 260 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 261 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 262 %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 263 %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 264 %floor0 = call double @llvm.floor.f64(double %ld0) 265 %floor1 = call double @llvm.floor.f64(double %ld1) 266 %floor2 = call double @llvm.floor.f64(double %ld2) 267 %floor3 = call double @llvm.floor.f64(double %ld3) 268 store double %floor0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 269 store double %floor1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 270 store double %floor2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 271 store double %floor3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 272 ret void 273 } 274 275 define void @floor_8f64() #0 { 276 ; SSE2-LABEL: @floor_8f64( 277 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 278 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 279 ; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 280 ; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 281 ; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 282 ; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 283 ; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 284 ; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 285 ; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]]) 286 ; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]]) 287 ; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]]) 288 ; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]]) 289 ; SSE2-NEXT: [[FLOOR4:%.*]] = call double @llvm.floor.f64(double [[LD4]]) 290 ; SSE2-NEXT: [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[LD5]]) 291 ; SSE2-NEXT: [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[LD6]]) 292 ; SSE2-NEXT: [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[LD7]]) 293 ; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 294 ; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 295 ; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 296 ; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 297 ; SSE2-NEXT: store double [[FLOOR4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 298 ; SSE2-NEXT: store double [[FLOOR5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 299 ; SSE2-NEXT: store double [[FLOOR6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 300 ; SSE2-NEXT: store double [[FLOOR7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 301 ; SSE2-NEXT: ret void 302 ; 303 ; SSE41-LABEL: @floor_8f64( 304 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 305 ; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 306 ; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 307 ; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 308 ; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) 309 ; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]]) 310 ; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]]) 311 ; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP4]]) 312 ; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 313 ; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 314 ; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 315 ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 316 ; SSE41-NEXT: ret void 317 ; 318 ; AVX1-LABEL: @floor_8f64( 319 ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 320 ; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 321 ; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]]) 322 ; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]]) 323 ; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 324 ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 325 ; AVX1-NEXT: ret void 326 ; 327 ; AVX2-LABEL: @floor_8f64( 328 ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 329 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 330 ; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]]) 331 ; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]]) 332 ; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 333 ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 334 ; AVX2-NEXT: ret void 335 ; 336 ; AVX512-LABEL: @floor_8f64( 337 ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 338 ; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.floor.v8f64(<8 x double> [[TMP1]]) 339 ; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8 340 ; AVX512-NEXT: ret void 341 ; 342 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 343 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 344 %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 345 %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 346 %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 347 %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 348 %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 349 %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 350 %floor0 = call double @llvm.floor.f64(double %ld0) 351 %floor1 = call double @llvm.floor.f64(double %ld1) 352 %floor2 = call double @llvm.floor.f64(double %ld2) 353 %floor3 = call double @llvm.floor.f64(double %ld3) 354 %floor4 = call double @llvm.floor.f64(double %ld4) 355 %floor5 = call double @llvm.floor.f64(double %ld5) 356 %floor6 = call double @llvm.floor.f64(double %ld6) 357 %floor7 = call double @llvm.floor.f64(double %ld7) 358 store double %floor0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 359 store double %floor1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 360 store double %floor2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 361 store double %floor3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 362 store double %floor4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 363 store double %floor5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 364 store double %floor6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 365 store double %floor7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 366 ret void 367 } 368 369 define void @nearbyint_2f64() #0 { 370 ; SSE2-LABEL: @nearbyint_2f64( 371 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 372 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 373 ; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) 374 ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) 375 ; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 376 ; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 377 ; SSE2-NEXT: ret void 378 ; 379 ; SSE41-LABEL: @nearbyint_2f64( 380 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 381 ; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) 382 ; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 383 ; SSE41-NEXT: ret void 384 ; 385 ; AVX-LABEL: @nearbyint_2f64( 386 ; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 387 ; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) 388 ; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 389 ; AVX-NEXT: ret void 390 ; 391 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 392 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 393 %nearbyint0 = call double @llvm.nearbyint.f64(double %ld0) 394 %nearbyint1 = call double @llvm.nearbyint.f64(double %ld1) 395 store double %nearbyint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 396 store double %nearbyint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 397 ret void 398 } 399 400 define void @nearbyint_4f64() #0 { 401 ; SSE2-LABEL: @nearbyint_4f64( 402 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 403 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 404 ; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 405 ; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 406 ; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) 407 ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) 408 ; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]]) 409 ; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]]) 410 ; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 411 ; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 412 ; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 413 ; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 414 ; SSE2-NEXT: ret void 415 ; 416 ; SSE41-LABEL: @nearbyint_4f64( 417 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 418 ; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 419 ; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) 420 ; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]]) 421 ; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 422 ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 423 ; SSE41-NEXT: ret void 424 ; 425 ; AVX-LABEL: @nearbyint_4f64( 426 ; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 427 ; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]]) 428 ; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 429 ; AVX-NEXT: ret void 430 ; 431 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 432 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 433 %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 434 %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 435 %nearbyint0 = call double @llvm.nearbyint.f64(double %ld0) 436 %nearbyint1 = call double @llvm.nearbyint.f64(double %ld1) 437 %nearbyint2 = call double @llvm.nearbyint.f64(double %ld2) 438 %nearbyint3 = call double @llvm.nearbyint.f64(double %ld3) 439 store double %nearbyint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 440 store double %nearbyint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 441 store double %nearbyint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 442 store double %nearbyint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 443 ret void 444 } 445 446 define void @nearbyint_8f64() #0 { 447 ; SSE2-LABEL: @nearbyint_8f64( 448 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 449 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 450 ; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 451 ; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 452 ; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 453 ; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 454 ; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 455 ; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 456 ; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]]) 457 ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]]) 458 ; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]]) 459 ; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]]) 460 ; SSE2-NEXT: [[NEARBYINT4:%.*]] = call double @llvm.nearbyint.f64(double [[LD4]]) 461 ; SSE2-NEXT: [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[LD5]]) 462 ; SSE2-NEXT: [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[LD6]]) 463 ; SSE2-NEXT: [[NEARBYINT7:%.*]] = call double @llvm.nearbyint.f64(double [[LD7]]) 464 ; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 465 ; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 466 ; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 467 ; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 468 ; SSE2-NEXT: store double [[NEARBYINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 469 ; SSE2-NEXT: store double [[NEARBYINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 470 ; SSE2-NEXT: store double [[NEARBYINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 471 ; SSE2-NEXT: store double [[NEARBYINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 472 ; SSE2-NEXT: ret void 473 ; 474 ; SSE41-LABEL: @nearbyint_8f64( 475 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 476 ; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 477 ; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 478 ; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 479 ; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) 480 ; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]]) 481 ; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]]) 482 ; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP4]]) 483 ; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 484 ; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 485 ; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 486 ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 487 ; SSE41-NEXT: ret void 488 ; 489 ; AVX1-LABEL: @nearbyint_8f64( 490 ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 491 ; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 492 ; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]]) 493 ; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]]) 494 ; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 495 ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 496 ; AVX1-NEXT: ret void 497 ; 498 ; AVX2-LABEL: @nearbyint_8f64( 499 ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 500 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 501 ; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]]) 502 ; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]]) 503 ; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 504 ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 505 ; AVX2-NEXT: ret void 506 ; 507 ; AVX512-LABEL: @nearbyint_8f64( 508 ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 509 ; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> [[TMP1]]) 510 ; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8 511 ; AVX512-NEXT: ret void 512 ; 513 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 514 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 515 %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 516 %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 517 %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 518 %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 519 %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 520 %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 521 %nearbyint0 = call double @llvm.nearbyint.f64(double %ld0) 522 %nearbyint1 = call double @llvm.nearbyint.f64(double %ld1) 523 %nearbyint2 = call double @llvm.nearbyint.f64(double %ld2) 524 %nearbyint3 = call double @llvm.nearbyint.f64(double %ld3) 525 %nearbyint4 = call double @llvm.nearbyint.f64(double %ld4) 526 %nearbyint5 = call double @llvm.nearbyint.f64(double %ld5) 527 %nearbyint6 = call double @llvm.nearbyint.f64(double %ld6) 528 %nearbyint7 = call double @llvm.nearbyint.f64(double %ld7) 529 store double %nearbyint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 530 store double %nearbyint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 531 store double %nearbyint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 532 store double %nearbyint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 533 store double %nearbyint4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 534 store double %nearbyint5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 535 store double %nearbyint6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 536 store double %nearbyint7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 537 ret void 538 } 539 540 define void @rint_2f64() #0 { 541 ; SSE2-LABEL: @rint_2f64( 542 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 543 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 544 ; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) 545 ; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) 546 ; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 547 ; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 548 ; SSE2-NEXT: ret void 549 ; 550 ; SSE41-LABEL: @rint_2f64( 551 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 552 ; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) 553 ; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 554 ; SSE41-NEXT: ret void 555 ; 556 ; AVX-LABEL: @rint_2f64( 557 ; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 558 ; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) 559 ; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 560 ; AVX-NEXT: ret void 561 ; 562 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 563 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 564 %rint0 = call double @llvm.rint.f64(double %ld0) 565 %rint1 = call double @llvm.rint.f64(double %ld1) 566 store double %rint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 567 store double %rint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 568 ret void 569 } 570 571 define void @rint_4f64() #0 { 572 ; SSE2-LABEL: @rint_4f64( 573 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 574 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 575 ; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 576 ; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 577 ; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) 578 ; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) 579 ; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]]) 580 ; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]]) 581 ; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 582 ; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 583 ; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 584 ; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 585 ; SSE2-NEXT: ret void 586 ; 587 ; SSE41-LABEL: @rint_4f64( 588 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 589 ; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 590 ; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) 591 ; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]]) 592 ; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 593 ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 594 ; SSE41-NEXT: ret void 595 ; 596 ; AVX-LABEL: @rint_4f64( 597 ; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 598 ; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]]) 599 ; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 600 ; AVX-NEXT: ret void 601 ; 602 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 603 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 604 %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 605 %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 606 %rint0 = call double @llvm.rint.f64(double %ld0) 607 %rint1 = call double @llvm.rint.f64(double %ld1) 608 %rint2 = call double @llvm.rint.f64(double %ld2) 609 %rint3 = call double @llvm.rint.f64(double %ld3) 610 store double %rint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 611 store double %rint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 612 store double %rint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 613 store double %rint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 614 ret void 615 } 616 617 define void @rint_8f64() #0 { 618 ; SSE2-LABEL: @rint_8f64( 619 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 620 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 621 ; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 622 ; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 623 ; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 624 ; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 625 ; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 626 ; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 627 ; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]]) 628 ; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]]) 629 ; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]]) 630 ; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]]) 631 ; SSE2-NEXT: [[RINT4:%.*]] = call double @llvm.rint.f64(double [[LD4]]) 632 ; SSE2-NEXT: [[RINT5:%.*]] = call double @llvm.rint.f64(double [[LD5]]) 633 ; SSE2-NEXT: [[RINT6:%.*]] = call double @llvm.rint.f64(double [[LD6]]) 634 ; SSE2-NEXT: [[RINT7:%.*]] = call double @llvm.rint.f64(double [[LD7]]) 635 ; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 636 ; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 637 ; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 638 ; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 639 ; SSE2-NEXT: store double [[RINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 640 ; SSE2-NEXT: store double [[RINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 641 ; SSE2-NEXT: store double [[RINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 642 ; SSE2-NEXT: store double [[RINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 643 ; SSE2-NEXT: ret void 644 ; 645 ; SSE41-LABEL: @rint_8f64( 646 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 647 ; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 648 ; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 649 ; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 650 ; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) 651 ; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]]) 652 ; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]]) 653 ; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP4]]) 654 ; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 655 ; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 656 ; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 657 ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 658 ; SSE41-NEXT: ret void 659 ; 660 ; AVX1-LABEL: @rint_8f64( 661 ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 662 ; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 663 ; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]]) 664 ; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]]) 665 ; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 666 ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 667 ; AVX1-NEXT: ret void 668 ; 669 ; AVX2-LABEL: @rint_8f64( 670 ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 671 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 672 ; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]]) 673 ; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]]) 674 ; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 675 ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 676 ; AVX2-NEXT: ret void 677 ; 678 ; AVX512-LABEL: @rint_8f64( 679 ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 680 ; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.rint.v8f64(<8 x double> [[TMP1]]) 681 ; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8 682 ; AVX512-NEXT: ret void 683 ; 684 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 685 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 686 %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 687 %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 688 %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 689 %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 690 %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 691 %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 692 %rint0 = call double @llvm.rint.f64(double %ld0) 693 %rint1 = call double @llvm.rint.f64(double %ld1) 694 %rint2 = call double @llvm.rint.f64(double %ld2) 695 %rint3 = call double @llvm.rint.f64(double %ld3) 696 %rint4 = call double @llvm.rint.f64(double %ld4) 697 %rint5 = call double @llvm.rint.f64(double %ld5) 698 %rint6 = call double @llvm.rint.f64(double %ld6) 699 %rint7 = call double @llvm.rint.f64(double %ld7) 700 store double %rint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 701 store double %rint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 702 store double %rint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 703 store double %rint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 704 store double %rint4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 705 store double %rint5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 706 store double %rint6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 707 store double %rint7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 708 ret void 709 } 710 711 define void @trunc_2f64() #0 { 712 ; SSE2-LABEL: @trunc_2f64( 713 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 714 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 715 ; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) 716 ; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) 717 ; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 718 ; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 719 ; SSE2-NEXT: ret void 720 ; 721 ; SSE41-LABEL: @trunc_2f64( 722 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 723 ; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) 724 ; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 725 ; SSE41-NEXT: ret void 726 ; 727 ; AVX-LABEL: @trunc_2f64( 728 ; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 729 ; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) 730 ; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 731 ; AVX-NEXT: ret void 732 ; 733 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 734 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 735 %trunc0 = call double @llvm.trunc.f64(double %ld0) 736 %trunc1 = call double @llvm.trunc.f64(double %ld1) 737 store double %trunc0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 738 store double %trunc1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 739 ret void 740 } 741 742 define void @trunc_4f64() #0 { 743 ; SSE2-LABEL: @trunc_4f64( 744 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 745 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 746 ; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 747 ; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 748 ; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) 749 ; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) 750 ; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) 751 ; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) 752 ; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 753 ; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 754 ; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 755 ; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 756 ; SSE2-NEXT: ret void 757 ; 758 ; SSE41-LABEL: @trunc_4f64( 759 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 760 ; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 761 ; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) 762 ; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) 763 ; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 764 ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 765 ; SSE41-NEXT: ret void 766 ; 767 ; AVX-LABEL: @trunc_4f64( 768 ; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 769 ; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) 770 ; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 771 ; AVX-NEXT: ret void 772 ; 773 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 774 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 775 %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 776 %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 777 %trunc0 = call double @llvm.trunc.f64(double %ld0) 778 %trunc1 = call double @llvm.trunc.f64(double %ld1) 779 %trunc2 = call double @llvm.trunc.f64(double %ld2) 780 %trunc3 = call double @llvm.trunc.f64(double %ld3) 781 store double %trunc0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 782 store double %trunc1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 783 store double %trunc2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 784 store double %trunc3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 785 ret void 786 } 787 788 define void @trunc_8f64() #0 { 789 ; SSE2-LABEL: @trunc_8f64( 790 ; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 791 ; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 792 ; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 793 ; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 794 ; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 795 ; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 796 ; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 797 ; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 798 ; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) 799 ; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) 800 ; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) 801 ; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) 802 ; SSE2-NEXT: [[TRUNC4:%.*]] = call double @llvm.trunc.f64(double [[LD4]]) 803 ; SSE2-NEXT: [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[LD5]]) 804 ; SSE2-NEXT: [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[LD6]]) 805 ; SSE2-NEXT: [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[LD7]]) 806 ; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 807 ; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 808 ; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 809 ; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 810 ; SSE2-NEXT: store double [[TRUNC4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 811 ; SSE2-NEXT: store double [[TRUNC5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 812 ; SSE2-NEXT: store double [[TRUNC6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 813 ; SSE2-NEXT: store double [[TRUNC7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 814 ; SSE2-NEXT: ret void 815 ; 816 ; SSE41-LABEL: @trunc_8f64( 817 ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 818 ; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 819 ; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 820 ; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 821 ; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) 822 ; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) 823 ; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]]) 824 ; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP4]]) 825 ; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 826 ; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 827 ; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 828 ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 829 ; SSE41-NEXT: ret void 830 ; 831 ; AVX1-LABEL: @trunc_8f64( 832 ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 833 ; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 834 ; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) 835 ; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]]) 836 ; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 837 ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 838 ; AVX1-NEXT: ret void 839 ; 840 ; AVX2-LABEL: @trunc_8f64( 841 ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 842 ; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 843 ; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) 844 ; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]]) 845 ; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 846 ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 847 ; AVX2-NEXT: ret void 848 ; 849 ; AVX512-LABEL: @trunc_8f64( 850 ; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 851 ; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.trunc.v8f64(<8 x double> [[TMP1]]) 852 ; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8 853 ; AVX512-NEXT: ret void 854 ; 855 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 856 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 857 %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 858 %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 859 %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 860 %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 861 %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 862 %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 863 %trunc0 = call double @llvm.trunc.f64(double %ld0) 864 %trunc1 = call double @llvm.trunc.f64(double %ld1) 865 %trunc2 = call double @llvm.trunc.f64(double %ld2) 866 %trunc3 = call double @llvm.trunc.f64(double %ld3) 867 %trunc4 = call double @llvm.trunc.f64(double %ld4) 868 %trunc5 = call double @llvm.trunc.f64(double %ld5) 869 %trunc6 = call double @llvm.trunc.f64(double %ld6) 870 %trunc7 = call double @llvm.trunc.f64(double %ld7) 871 store double %trunc0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 872 store double %trunc1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 873 store double %trunc2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 874 store double %trunc3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 875 store double %trunc4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 876 store double %trunc5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 877 store double %trunc6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 878 store double %trunc7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 879 ret void 880 } 881 882 define void @ceil_4f32() #0 { 883 ; SSE2-LABEL: @ceil_4f32( 884 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 885 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 886 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 887 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 888 ; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) 889 ; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) 890 ; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) 891 ; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) 892 ; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 893 ; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 894 ; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 895 ; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 896 ; SSE2-NEXT: ret void 897 ; 898 ; SSE41-LABEL: @ceil_4f32( 899 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 900 ; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) 901 ; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 902 ; SSE41-NEXT: ret void 903 ; 904 ; AVX-LABEL: @ceil_4f32( 905 ; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 906 ; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) 907 ; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 908 ; AVX-NEXT: ret void 909 ; 910 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 911 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 912 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 913 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 914 %ceil0 = call float @llvm.ceil.f32(float %ld0) 915 %ceil1 = call float @llvm.ceil.f32(float %ld1) 916 %ceil2 = call float @llvm.ceil.f32(float %ld2) 917 %ceil3 = call float @llvm.ceil.f32(float %ld3) 918 store float %ceil0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 919 store float %ceil1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 920 store float %ceil2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 921 store float %ceil3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 922 ret void 923 } 924 925 define void @ceil_8f32() #0 { 926 ; SSE2-LABEL: @ceil_8f32( 927 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 928 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 929 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 930 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 931 ; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 932 ; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 933 ; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 934 ; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 935 ; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) 936 ; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) 937 ; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) 938 ; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) 939 ; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]]) 940 ; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]]) 941 ; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]]) 942 ; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]]) 943 ; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 944 ; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 945 ; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 946 ; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 947 ; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 948 ; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 949 ; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 950 ; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 951 ; SSE2-NEXT: ret void 952 ; 953 ; SSE41-LABEL: @ceil_8f32( 954 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 955 ; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 956 ; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) 957 ; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]]) 958 ; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 959 ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 960 ; SSE41-NEXT: ret void 961 ; 962 ; AVX-LABEL: @ceil_8f32( 963 ; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 964 ; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]]) 965 ; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 966 ; AVX-NEXT: ret void 967 ; 968 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 969 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 970 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 971 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 972 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 973 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 974 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 975 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 976 %ceil0 = call float @llvm.ceil.f32(float %ld0) 977 %ceil1 = call float @llvm.ceil.f32(float %ld1) 978 %ceil2 = call float @llvm.ceil.f32(float %ld2) 979 %ceil3 = call float @llvm.ceil.f32(float %ld3) 980 %ceil4 = call float @llvm.ceil.f32(float %ld4) 981 %ceil5 = call float @llvm.ceil.f32(float %ld5) 982 %ceil6 = call float @llvm.ceil.f32(float %ld6) 983 %ceil7 = call float @llvm.ceil.f32(float %ld7) 984 store float %ceil0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 985 store float %ceil1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 986 store float %ceil2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 987 store float %ceil3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 988 store float %ceil4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 989 store float %ceil5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 990 store float %ceil6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 991 store float %ceil7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 992 ret void 993 } 994 995 define void @ceil_16f32() #0 { 996 ; SSE2-LABEL: @ceil_16f32( 997 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 998 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 999 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1000 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1001 ; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1002 ; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1003 ; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1004 ; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1005 ; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 1006 ; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 1007 ; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1008 ; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1009 ; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 1010 ; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1011 ; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1012 ; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1013 ; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]]) 1014 ; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]]) 1015 ; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]]) 1016 ; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]]) 1017 ; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]]) 1018 ; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]]) 1019 ; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]]) 1020 ; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]]) 1021 ; SSE2-NEXT: [[CEIL8:%.*]] = call float @llvm.ceil.f32(float [[LD8]]) 1022 ; SSE2-NEXT: [[CEIL9:%.*]] = call float @llvm.ceil.f32(float [[LD9]]) 1023 ; SSE2-NEXT: [[CEIL10:%.*]] = call float @llvm.ceil.f32(float [[LD10]]) 1024 ; SSE2-NEXT: [[CEIL11:%.*]] = call float @llvm.ceil.f32(float [[LD11]]) 1025 ; SSE2-NEXT: [[CEIL12:%.*]] = call float @llvm.ceil.f32(float [[LD12]]) 1026 ; SSE2-NEXT: [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[LD13]]) 1027 ; SSE2-NEXT: [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[LD14]]) 1028 ; SSE2-NEXT: [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[LD15]]) 1029 ; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1030 ; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1031 ; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1032 ; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1033 ; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1034 ; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1035 ; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1036 ; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1037 ; SSE2-NEXT: store float [[CEIL8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 1038 ; SSE2-NEXT: store float [[CEIL9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 1039 ; SSE2-NEXT: store float [[CEIL10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 1040 ; SSE2-NEXT: store float [[CEIL11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 1041 ; SSE2-NEXT: store float [[CEIL12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 1042 ; SSE2-NEXT: store float [[CEIL13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 1043 ; SSE2-NEXT: store float [[CEIL14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 1044 ; SSE2-NEXT: store float [[CEIL15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 1045 ; SSE2-NEXT: ret void 1046 ; 1047 ; SSE41-LABEL: @ceil_16f32( 1048 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1049 ; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1050 ; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 1051 ; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 1052 ; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) 1053 ; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]]) 1054 ; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]]) 1055 ; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP4]]) 1056 ; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1057 ; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1058 ; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 1059 ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 1060 ; SSE41-NEXT: ret void 1061 ; 1062 ; AVX1-LABEL: @ceil_16f32( 1063 ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1064 ; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1065 ; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]]) 1066 ; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]]) 1067 ; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1068 ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1069 ; AVX1-NEXT: ret void 1070 ; 1071 ; AVX2-LABEL: @ceil_16f32( 1072 ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1073 ; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1074 ; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]]) 1075 ; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]]) 1076 ; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1077 ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1078 ; AVX2-NEXT: ret void 1079 ; 1080 ; AVX512-LABEL: @ceil_16f32( 1081 ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4 1082 ; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.ceil.v16f32(<16 x float> [[TMP1]]) 1083 ; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4 1084 ; AVX512-NEXT: ret void 1085 ; 1086 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4 1087 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4 1088 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4 1089 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4 1090 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4 1091 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4 1092 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4 1093 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4 1094 %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4 1095 %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4 1096 %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1097 %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1098 %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 1099 %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1100 %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1101 %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1102 %ceil0 = call float @llvm.ceil.f32(float %ld0 ) 1103 %ceil1 = call float @llvm.ceil.f32(float %ld1 ) 1104 %ceil2 = call float @llvm.ceil.f32(float %ld2 ) 1105 %ceil3 = call float @llvm.ceil.f32(float %ld3 ) 1106 %ceil4 = call float @llvm.ceil.f32(float %ld4 ) 1107 %ceil5 = call float @llvm.ceil.f32(float %ld5 ) 1108 %ceil6 = call float @llvm.ceil.f32(float %ld6 ) 1109 %ceil7 = call float @llvm.ceil.f32(float %ld7 ) 1110 %ceil8 = call float @llvm.ceil.f32(float %ld8 ) 1111 %ceil9 = call float @llvm.ceil.f32(float %ld9 ) 1112 %ceil10 = call float @llvm.ceil.f32(float %ld10) 1113 %ceil11 = call float @llvm.ceil.f32(float %ld11) 1114 %ceil12 = call float @llvm.ceil.f32(float %ld12) 1115 %ceil13 = call float @llvm.ceil.f32(float %ld13) 1116 %ceil14 = call float @llvm.ceil.f32(float %ld14) 1117 %ceil15 = call float @llvm.ceil.f32(float %ld15) 1118 store float %ceil0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4 1119 store float %ceil1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4 1120 store float %ceil2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4 1121 store float %ceil3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4 1122 store float %ceil4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4 1123 store float %ceil5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4 1124 store float %ceil6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4 1125 store float %ceil7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4 1126 store float %ceil8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4 1127 store float %ceil9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4 1128 store float %ceil10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 1129 store float %ceil11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 1130 store float %ceil12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 1131 store float %ceil13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 1132 store float %ceil14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 1133 store float %ceil15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 1134 ret void 1135 } 1136 1137 define void @floor_4f32() #0 { 1138 ; SSE2-LABEL: @floor_4f32( 1139 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1140 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1141 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1142 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1143 ; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) 1144 ; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) 1145 ; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) 1146 ; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) 1147 ; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1148 ; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1149 ; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1150 ; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1151 ; SSE2-NEXT: ret void 1152 ; 1153 ; SSE41-LABEL: @floor_4f32( 1154 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1155 ; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) 1156 ; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1157 ; SSE41-NEXT: ret void 1158 ; 1159 ; AVX-LABEL: @floor_4f32( 1160 ; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1161 ; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) 1162 ; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1163 ; AVX-NEXT: ret void 1164 ; 1165 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1166 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1167 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1168 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1169 %floor0 = call float @llvm.floor.f32(float %ld0) 1170 %floor1 = call float @llvm.floor.f32(float %ld1) 1171 %floor2 = call float @llvm.floor.f32(float %ld2) 1172 %floor3 = call float @llvm.floor.f32(float %ld3) 1173 store float %floor0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1174 store float %floor1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1175 store float %floor2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1176 store float %floor3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1177 ret void 1178 } 1179 1180 define void @floor_8f32() #0 { 1181 ; SSE2-LABEL: @floor_8f32( 1182 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1183 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1184 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1185 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1186 ; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1187 ; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1188 ; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1189 ; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1190 ; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) 1191 ; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) 1192 ; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) 1193 ; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) 1194 ; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]]) 1195 ; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]]) 1196 ; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]]) 1197 ; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]]) 1198 ; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1199 ; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1200 ; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1201 ; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1202 ; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1203 ; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1204 ; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1205 ; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1206 ; SSE2-NEXT: ret void 1207 ; 1208 ; SSE41-LABEL: @floor_8f32( 1209 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1210 ; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1211 ; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) 1212 ; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]]) 1213 ; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1214 ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1215 ; SSE41-NEXT: ret void 1216 ; 1217 ; AVX-LABEL: @floor_8f32( 1218 ; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1219 ; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]]) 1220 ; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1221 ; AVX-NEXT: ret void 1222 ; 1223 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1224 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1225 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1226 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1227 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1228 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1229 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1230 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1231 %floor0 = call float @llvm.floor.f32(float %ld0) 1232 %floor1 = call float @llvm.floor.f32(float %ld1) 1233 %floor2 = call float @llvm.floor.f32(float %ld2) 1234 %floor3 = call float @llvm.floor.f32(float %ld3) 1235 %floor4 = call float @llvm.floor.f32(float %ld4) 1236 %floor5 = call float @llvm.floor.f32(float %ld5) 1237 %floor6 = call float @llvm.floor.f32(float %ld6) 1238 %floor7 = call float @llvm.floor.f32(float %ld7) 1239 store float %floor0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1240 store float %floor1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1241 store float %floor2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1242 store float %floor3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1243 store float %floor4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1244 store float %floor5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1245 store float %floor6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1246 store float %floor7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1247 ret void 1248 } 1249 1250 define void @floor_16f32() #0 { 1251 ; SSE2-LABEL: @floor_16f32( 1252 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1253 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1254 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1255 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1256 ; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1257 ; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1258 ; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1259 ; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1260 ; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 1261 ; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 1262 ; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1263 ; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1264 ; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 1265 ; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1266 ; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1267 ; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1268 ; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]]) 1269 ; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]]) 1270 ; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]]) 1271 ; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]]) 1272 ; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]]) 1273 ; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]]) 1274 ; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]]) 1275 ; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]]) 1276 ; SSE2-NEXT: [[FLOOR8:%.*]] = call float @llvm.floor.f32(float [[LD8]]) 1277 ; SSE2-NEXT: [[FLOOR9:%.*]] = call float @llvm.floor.f32(float [[LD9]]) 1278 ; SSE2-NEXT: [[FLOOR10:%.*]] = call float @llvm.floor.f32(float [[LD10]]) 1279 ; SSE2-NEXT: [[FLOOR11:%.*]] = call float @llvm.floor.f32(float [[LD11]]) 1280 ; SSE2-NEXT: [[FLOOR12:%.*]] = call float @llvm.floor.f32(float [[LD12]]) 1281 ; SSE2-NEXT: [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[LD13]]) 1282 ; SSE2-NEXT: [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[LD14]]) 1283 ; SSE2-NEXT: [[FLOOR15:%.*]] = call float @llvm.floor.f32(float [[LD15]]) 1284 ; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1285 ; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1286 ; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1287 ; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1288 ; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1289 ; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1290 ; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1291 ; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1292 ; SSE2-NEXT: store float [[FLOOR8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 1293 ; SSE2-NEXT: store float [[FLOOR9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 1294 ; SSE2-NEXT: store float [[FLOOR10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 1295 ; SSE2-NEXT: store float [[FLOOR11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 1296 ; SSE2-NEXT: store float [[FLOOR12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 1297 ; SSE2-NEXT: store float [[FLOOR13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 1298 ; SSE2-NEXT: store float [[FLOOR14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 1299 ; SSE2-NEXT: store float [[FLOOR15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 1300 ; SSE2-NEXT: ret void 1301 ; 1302 ; SSE41-LABEL: @floor_16f32( 1303 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1304 ; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1305 ; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 1306 ; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 1307 ; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) 1308 ; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]]) 1309 ; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]]) 1310 ; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP4]]) 1311 ; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1312 ; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1313 ; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 1314 ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 1315 ; SSE41-NEXT: ret void 1316 ; 1317 ; AVX1-LABEL: @floor_16f32( 1318 ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1319 ; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1320 ; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]]) 1321 ; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]]) 1322 ; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1323 ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1324 ; AVX1-NEXT: ret void 1325 ; 1326 ; AVX2-LABEL: @floor_16f32( 1327 ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1328 ; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1329 ; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]]) 1330 ; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]]) 1331 ; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1332 ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1333 ; AVX2-NEXT: ret void 1334 ; 1335 ; AVX512-LABEL: @floor_16f32( 1336 ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4 1337 ; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.floor.v16f32(<16 x float> [[TMP1]]) 1338 ; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4 1339 ; AVX512-NEXT: ret void 1340 ; 1341 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4 1342 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4 1343 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4 1344 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4 1345 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4 1346 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4 1347 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4 1348 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4 1349 %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4 1350 %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4 1351 %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1352 %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1353 %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 1354 %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1355 %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1356 %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1357 %floor0 = call float @llvm.floor.f32(float %ld0 ) 1358 %floor1 = call float @llvm.floor.f32(float %ld1 ) 1359 %floor2 = call float @llvm.floor.f32(float %ld2 ) 1360 %floor3 = call float @llvm.floor.f32(float %ld3 ) 1361 %floor4 = call float @llvm.floor.f32(float %ld4 ) 1362 %floor5 = call float @llvm.floor.f32(float %ld5 ) 1363 %floor6 = call float @llvm.floor.f32(float %ld6 ) 1364 %floor7 = call float @llvm.floor.f32(float %ld7 ) 1365 %floor8 = call float @llvm.floor.f32(float %ld8 ) 1366 %floor9 = call float @llvm.floor.f32(float %ld9 ) 1367 %floor10 = call float @llvm.floor.f32(float %ld10) 1368 %floor11 = call float @llvm.floor.f32(float %ld11) 1369 %floor12 = call float @llvm.floor.f32(float %ld12) 1370 %floor13 = call float @llvm.floor.f32(float %ld13) 1371 %floor14 = call float @llvm.floor.f32(float %ld14) 1372 %floor15 = call float @llvm.floor.f32(float %ld15) 1373 store float %floor0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4 1374 store float %floor1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4 1375 store float %floor2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4 1376 store float %floor3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4 1377 store float %floor4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4 1378 store float %floor5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4 1379 store float %floor6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4 1380 store float %floor7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4 1381 store float %floor8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4 1382 store float %floor9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4 1383 store float %floor10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 1384 store float %floor11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 1385 store float %floor12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 1386 store float %floor13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 1387 store float %floor14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 1388 store float %floor15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 1389 ret void 1390 } 1391 1392 define void @nearbyint_4f32() #0 { 1393 ; SSE2-LABEL: @nearbyint_4f32( 1394 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1395 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1396 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1397 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1398 ; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) 1399 ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) 1400 ; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) 1401 ; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) 1402 ; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1403 ; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1404 ; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1405 ; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1406 ; SSE2-NEXT: ret void 1407 ; 1408 ; SSE41-LABEL: @nearbyint_4f32( 1409 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1410 ; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) 1411 ; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1412 ; SSE41-NEXT: ret void 1413 ; 1414 ; AVX-LABEL: @nearbyint_4f32( 1415 ; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1416 ; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) 1417 ; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1418 ; AVX-NEXT: ret void 1419 ; 1420 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1421 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1422 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1423 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1424 %nearbyint0 = call float @llvm.nearbyint.f32(float %ld0) 1425 %nearbyint1 = call float @llvm.nearbyint.f32(float %ld1) 1426 %nearbyint2 = call float @llvm.nearbyint.f32(float %ld2) 1427 %nearbyint3 = call float @llvm.nearbyint.f32(float %ld3) 1428 store float %nearbyint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1429 store float %nearbyint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1430 store float %nearbyint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1431 store float %nearbyint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1432 ret void 1433 } 1434 1435 define void @nearbyint_8f32() #0 { 1436 ; SSE2-LABEL: @nearbyint_8f32( 1437 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1438 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1439 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1440 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1441 ; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1442 ; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1443 ; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1444 ; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1445 ; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) 1446 ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) 1447 ; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) 1448 ; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) 1449 ; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]]) 1450 ; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) 1451 ; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) 1452 ; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) 1453 ; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1454 ; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1455 ; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1456 ; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1457 ; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1458 ; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1459 ; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1460 ; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1461 ; SSE2-NEXT: ret void 1462 ; 1463 ; SSE41-LABEL: @nearbyint_8f32( 1464 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1465 ; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1466 ; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) 1467 ; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) 1468 ; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1469 ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1470 ; SSE41-NEXT: ret void 1471 ; 1472 ; AVX-LABEL: @nearbyint_8f32( 1473 ; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1474 ; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) 1475 ; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1476 ; AVX-NEXT: ret void 1477 ; 1478 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1479 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1480 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1481 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1482 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1483 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1484 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1485 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1486 %nearbyint0 = call float @llvm.nearbyint.f32(float %ld0) 1487 %nearbyint1 = call float @llvm.nearbyint.f32(float %ld1) 1488 %nearbyint2 = call float @llvm.nearbyint.f32(float %ld2) 1489 %nearbyint3 = call float @llvm.nearbyint.f32(float %ld3) 1490 %nearbyint4 = call float @llvm.nearbyint.f32(float %ld4) 1491 %nearbyint5 = call float @llvm.nearbyint.f32(float %ld5) 1492 %nearbyint6 = call float @llvm.nearbyint.f32(float %ld6) 1493 %nearbyint7 = call float @llvm.nearbyint.f32(float %ld7) 1494 store float %nearbyint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1495 store float %nearbyint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1496 store float %nearbyint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1497 store float %nearbyint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1498 store float %nearbyint4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1499 store float %nearbyint5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1500 store float %nearbyint6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1501 store float %nearbyint7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1502 ret void 1503 } 1504 1505 define void @nearbyint_16f32() #0 { 1506 ; SSE2-LABEL: @nearbyint_16f32( 1507 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1508 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1509 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1510 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1511 ; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1512 ; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1513 ; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1514 ; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1515 ; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 1516 ; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 1517 ; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1518 ; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1519 ; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 1520 ; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1521 ; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1522 ; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1523 ; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) 1524 ; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) 1525 ; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) 1526 ; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) 1527 ; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]]) 1528 ; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) 1529 ; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) 1530 ; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) 1531 ; SSE2-NEXT: [[NEARBYINT8:%.*]] = call float @llvm.nearbyint.f32(float [[LD8]]) 1532 ; SSE2-NEXT: [[NEARBYINT9:%.*]] = call float @llvm.nearbyint.f32(float [[LD9]]) 1533 ; SSE2-NEXT: [[NEARBYINT10:%.*]] = call float @llvm.nearbyint.f32(float [[LD10]]) 1534 ; SSE2-NEXT: [[NEARBYINT11:%.*]] = call float @llvm.nearbyint.f32(float [[LD11]]) 1535 ; SSE2-NEXT: [[NEARBYINT12:%.*]] = call float @llvm.nearbyint.f32(float [[LD12]]) 1536 ; SSE2-NEXT: [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[LD13]]) 1537 ; SSE2-NEXT: [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[LD14]]) 1538 ; SSE2-NEXT: [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[LD15]]) 1539 ; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1540 ; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1541 ; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1542 ; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1543 ; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1544 ; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1545 ; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1546 ; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1547 ; SSE2-NEXT: store float [[NEARBYINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 1548 ; SSE2-NEXT: store float [[NEARBYINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 1549 ; SSE2-NEXT: store float [[NEARBYINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 1550 ; SSE2-NEXT: store float [[NEARBYINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 1551 ; SSE2-NEXT: store float [[NEARBYINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 1552 ; SSE2-NEXT: store float [[NEARBYINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 1553 ; SSE2-NEXT: store float [[NEARBYINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 1554 ; SSE2-NEXT: store float [[NEARBYINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 1555 ; SSE2-NEXT: ret void 1556 ; 1557 ; SSE41-LABEL: @nearbyint_16f32( 1558 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1559 ; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1560 ; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 1561 ; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 1562 ; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) 1563 ; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) 1564 ; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]]) 1565 ; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP4]]) 1566 ; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1567 ; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1568 ; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 1569 ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 1570 ; SSE41-NEXT: ret void 1571 ; 1572 ; AVX1-LABEL: @nearbyint_16f32( 1573 ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1574 ; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1575 ; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) 1576 ; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]]) 1577 ; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1578 ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1579 ; AVX1-NEXT: ret void 1580 ; 1581 ; AVX2-LABEL: @nearbyint_16f32( 1582 ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1583 ; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1584 ; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) 1585 ; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]]) 1586 ; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1587 ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1588 ; AVX2-NEXT: ret void 1589 ; 1590 ; AVX512-LABEL: @nearbyint_16f32( 1591 ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4 1592 ; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> [[TMP1]]) 1593 ; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4 1594 ; AVX512-NEXT: ret void 1595 ; 1596 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4 1597 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4 1598 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4 1599 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4 1600 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4 1601 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4 1602 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4 1603 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4 1604 %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4 1605 %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4 1606 %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1607 %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1608 %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 1609 %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1610 %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1611 %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1612 %nearbyint0 = call float @llvm.nearbyint.f32(float %ld0 ) 1613 %nearbyint1 = call float @llvm.nearbyint.f32(float %ld1 ) 1614 %nearbyint2 = call float @llvm.nearbyint.f32(float %ld2 ) 1615 %nearbyint3 = call float @llvm.nearbyint.f32(float %ld3 ) 1616 %nearbyint4 = call float @llvm.nearbyint.f32(float %ld4 ) 1617 %nearbyint5 = call float @llvm.nearbyint.f32(float %ld5 ) 1618 %nearbyint6 = call float @llvm.nearbyint.f32(float %ld6 ) 1619 %nearbyint7 = call float @llvm.nearbyint.f32(float %ld7 ) 1620 %nearbyint8 = call float @llvm.nearbyint.f32(float %ld8 ) 1621 %nearbyint9 = call float @llvm.nearbyint.f32(float %ld9 ) 1622 %nearbyint10 = call float @llvm.nearbyint.f32(float %ld10) 1623 %nearbyint11 = call float @llvm.nearbyint.f32(float %ld11) 1624 %nearbyint12 = call float @llvm.nearbyint.f32(float %ld12) 1625 %nearbyint13 = call float @llvm.nearbyint.f32(float %ld13) 1626 %nearbyint14 = call float @llvm.nearbyint.f32(float %ld14) 1627 %nearbyint15 = call float @llvm.nearbyint.f32(float %ld15) 1628 store float %nearbyint0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4 1629 store float %nearbyint1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4 1630 store float %nearbyint2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4 1631 store float %nearbyint3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4 1632 store float %nearbyint4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4 1633 store float %nearbyint5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4 1634 store float %nearbyint6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4 1635 store float %nearbyint7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4 1636 store float %nearbyint8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4 1637 store float %nearbyint9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4 1638 store float %nearbyint10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 1639 store float %nearbyint11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 1640 store float %nearbyint12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 1641 store float %nearbyint13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 1642 store float %nearbyint14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 1643 store float %nearbyint15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 1644 ret void 1645 } 1646 1647 define void @rint_4f32() #0 { 1648 ; SSE2-LABEL: @rint_4f32( 1649 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1650 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1651 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1652 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1653 ; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) 1654 ; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) 1655 ; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) 1656 ; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) 1657 ; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1658 ; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1659 ; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1660 ; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1661 ; SSE2-NEXT: ret void 1662 ; 1663 ; SSE41-LABEL: @rint_4f32( 1664 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1665 ; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) 1666 ; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1667 ; SSE41-NEXT: ret void 1668 ; 1669 ; AVX-LABEL: @rint_4f32( 1670 ; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1671 ; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) 1672 ; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1673 ; AVX-NEXT: ret void 1674 ; 1675 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1676 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1677 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1678 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1679 %rint0 = call float @llvm.rint.f32(float %ld0) 1680 %rint1 = call float @llvm.rint.f32(float %ld1) 1681 %rint2 = call float @llvm.rint.f32(float %ld2) 1682 %rint3 = call float @llvm.rint.f32(float %ld3) 1683 store float %rint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1684 store float %rint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1685 store float %rint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1686 store float %rint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1687 ret void 1688 } 1689 1690 define void @rint_8f32() #0 { 1691 ; SSE2-LABEL: @rint_8f32( 1692 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1693 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1694 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1695 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1696 ; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1697 ; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1698 ; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1699 ; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1700 ; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) 1701 ; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) 1702 ; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) 1703 ; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) 1704 ; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]]) 1705 ; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) 1706 ; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) 1707 ; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) 1708 ; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1709 ; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1710 ; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1711 ; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1712 ; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1713 ; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1714 ; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1715 ; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1716 ; SSE2-NEXT: ret void 1717 ; 1718 ; SSE41-LABEL: @rint_8f32( 1719 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1720 ; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1721 ; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) 1722 ; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) 1723 ; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1724 ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1725 ; SSE41-NEXT: ret void 1726 ; 1727 ; AVX-LABEL: @rint_8f32( 1728 ; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1729 ; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) 1730 ; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1731 ; AVX-NEXT: ret void 1732 ; 1733 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1734 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1735 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1736 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1737 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1738 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1739 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1740 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1741 %rint0 = call float @llvm.rint.f32(float %ld0) 1742 %rint1 = call float @llvm.rint.f32(float %ld1) 1743 %rint2 = call float @llvm.rint.f32(float %ld2) 1744 %rint3 = call float @llvm.rint.f32(float %ld3) 1745 %rint4 = call float @llvm.rint.f32(float %ld4) 1746 %rint5 = call float @llvm.rint.f32(float %ld5) 1747 %rint6 = call float @llvm.rint.f32(float %ld6) 1748 %rint7 = call float @llvm.rint.f32(float %ld7) 1749 store float %rint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1750 store float %rint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1751 store float %rint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1752 store float %rint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1753 store float %rint4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1754 store float %rint5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1755 store float %rint6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1756 store float %rint7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1757 ret void 1758 } 1759 1760 define void @rint_16f32() #0 { 1761 ; SSE2-LABEL: @rint_16f32( 1762 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1763 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1764 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1765 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1766 ; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1767 ; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1768 ; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1769 ; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1770 ; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 1771 ; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 1772 ; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1773 ; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1774 ; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 1775 ; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1776 ; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1777 ; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1778 ; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) 1779 ; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) 1780 ; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) 1781 ; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) 1782 ; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]]) 1783 ; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) 1784 ; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) 1785 ; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) 1786 ; SSE2-NEXT: [[RINT8:%.*]] = call float @llvm.rint.f32(float [[LD8]]) 1787 ; SSE2-NEXT: [[RINT9:%.*]] = call float @llvm.rint.f32(float [[LD9]]) 1788 ; SSE2-NEXT: [[RINT10:%.*]] = call float @llvm.rint.f32(float [[LD10]]) 1789 ; SSE2-NEXT: [[RINT11:%.*]] = call float @llvm.rint.f32(float [[LD11]]) 1790 ; SSE2-NEXT: [[RINT12:%.*]] = call float @llvm.rint.f32(float [[LD12]]) 1791 ; SSE2-NEXT: [[RINT13:%.*]] = call float @llvm.rint.f32(float [[LD13]]) 1792 ; SSE2-NEXT: [[RINT14:%.*]] = call float @llvm.rint.f32(float [[LD14]]) 1793 ; SSE2-NEXT: [[RINT15:%.*]] = call float @llvm.rint.f32(float [[LD15]]) 1794 ; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1795 ; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1796 ; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1797 ; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1798 ; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1799 ; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1800 ; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1801 ; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1802 ; SSE2-NEXT: store float [[RINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 1803 ; SSE2-NEXT: store float [[RINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 1804 ; SSE2-NEXT: store float [[RINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 1805 ; SSE2-NEXT: store float [[RINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 1806 ; SSE2-NEXT: store float [[RINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 1807 ; SSE2-NEXT: store float [[RINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 1808 ; SSE2-NEXT: store float [[RINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 1809 ; SSE2-NEXT: store float [[RINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 1810 ; SSE2-NEXT: ret void 1811 ; 1812 ; SSE41-LABEL: @rint_16f32( 1813 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1814 ; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1815 ; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 1816 ; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 1817 ; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) 1818 ; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) 1819 ; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]]) 1820 ; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP4]]) 1821 ; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1822 ; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1823 ; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 1824 ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 1825 ; SSE41-NEXT: ret void 1826 ; 1827 ; AVX1-LABEL: @rint_16f32( 1828 ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1829 ; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1830 ; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) 1831 ; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]]) 1832 ; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1833 ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1834 ; AVX1-NEXT: ret void 1835 ; 1836 ; AVX2-LABEL: @rint_16f32( 1837 ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1838 ; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1839 ; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) 1840 ; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]]) 1841 ; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1842 ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1843 ; AVX2-NEXT: ret void 1844 ; 1845 ; AVX512-LABEL: @rint_16f32( 1846 ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4 1847 ; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.rint.v16f32(<16 x float> [[TMP1]]) 1848 ; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4 1849 ; AVX512-NEXT: ret void 1850 ; 1851 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4 1852 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4 1853 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4 1854 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4 1855 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4 1856 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4 1857 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4 1858 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4 1859 %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4 1860 %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4 1861 %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1862 %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1863 %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 1864 %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1865 %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1866 %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1867 %rint0 = call float @llvm.rint.f32(float %ld0 ) 1868 %rint1 = call float @llvm.rint.f32(float %ld1 ) 1869 %rint2 = call float @llvm.rint.f32(float %ld2 ) 1870 %rint3 = call float @llvm.rint.f32(float %ld3 ) 1871 %rint4 = call float @llvm.rint.f32(float %ld4 ) 1872 %rint5 = call float @llvm.rint.f32(float %ld5 ) 1873 %rint6 = call float @llvm.rint.f32(float %ld6 ) 1874 %rint7 = call float @llvm.rint.f32(float %ld7 ) 1875 %rint8 = call float @llvm.rint.f32(float %ld8 ) 1876 %rint9 = call float @llvm.rint.f32(float %ld9 ) 1877 %rint10 = call float @llvm.rint.f32(float %ld10) 1878 %rint11 = call float @llvm.rint.f32(float %ld11) 1879 %rint12 = call float @llvm.rint.f32(float %ld12) 1880 %rint13 = call float @llvm.rint.f32(float %ld13) 1881 %rint14 = call float @llvm.rint.f32(float %ld14) 1882 %rint15 = call float @llvm.rint.f32(float %ld15) 1883 store float %rint0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4 1884 store float %rint1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4 1885 store float %rint2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4 1886 store float %rint3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4 1887 store float %rint4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4 1888 store float %rint5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4 1889 store float %rint6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4 1890 store float %rint7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4 1891 store float %rint8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4 1892 store float %rint9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4 1893 store float %rint10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 1894 store float %rint11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 1895 store float %rint12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 1896 store float %rint13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 1897 store float %rint14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 1898 store float %rint15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 1899 ret void 1900 } 1901 1902 define void @trunc_4f32() #0 { 1903 ; SSE2-LABEL: @trunc_4f32( 1904 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1905 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1906 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1907 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1908 ; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) 1909 ; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) 1910 ; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) 1911 ; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) 1912 ; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1913 ; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1914 ; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1915 ; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1916 ; SSE2-NEXT: ret void 1917 ; 1918 ; SSE41-LABEL: @trunc_4f32( 1919 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1920 ; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) 1921 ; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1922 ; SSE41-NEXT: ret void 1923 ; 1924 ; AVX-LABEL: @trunc_4f32( 1925 ; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1926 ; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) 1927 ; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1928 ; AVX-NEXT: ret void 1929 ; 1930 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1931 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1932 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1933 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1934 %trunc0 = call float @llvm.trunc.f32(float %ld0) 1935 %trunc1 = call float @llvm.trunc.f32(float %ld1) 1936 %trunc2 = call float @llvm.trunc.f32(float %ld2) 1937 %trunc3 = call float @llvm.trunc.f32(float %ld3) 1938 store float %trunc0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1939 store float %trunc1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1940 store float %trunc2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1941 store float %trunc3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1942 ret void 1943 } 1944 1945 define void @trunc_8f32() #0 { 1946 ; SSE2-LABEL: @trunc_8f32( 1947 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1948 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1949 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1950 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1951 ; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1952 ; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1953 ; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1954 ; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1955 ; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) 1956 ; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) 1957 ; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) 1958 ; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) 1959 ; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]]) 1960 ; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]]) 1961 ; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]]) 1962 ; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]]) 1963 ; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1964 ; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1965 ; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1966 ; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1967 ; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1968 ; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1969 ; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1970 ; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1971 ; SSE2-NEXT: ret void 1972 ; 1973 ; SSE41-LABEL: @trunc_8f32( 1974 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1975 ; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1976 ; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) 1977 ; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]]) 1978 ; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1979 ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1980 ; SSE41-NEXT: ret void 1981 ; 1982 ; AVX-LABEL: @trunc_8f32( 1983 ; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1984 ; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) 1985 ; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1986 ; AVX-NEXT: ret void 1987 ; 1988 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1989 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1990 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1991 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1992 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1993 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1994 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1995 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1996 %trunc0 = call float @llvm.trunc.f32(float %ld0) 1997 %trunc1 = call float @llvm.trunc.f32(float %ld1) 1998 %trunc2 = call float @llvm.trunc.f32(float %ld2) 1999 %trunc3 = call float @llvm.trunc.f32(float %ld3) 2000 %trunc4 = call float @llvm.trunc.f32(float %ld4) 2001 %trunc5 = call float @llvm.trunc.f32(float %ld5) 2002 %trunc6 = call float @llvm.trunc.f32(float %ld6) 2003 %trunc7 = call float @llvm.trunc.f32(float %ld7) 2004 store float %trunc0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 2005 store float %trunc1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 2006 store float %trunc2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 2007 store float %trunc3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 2008 store float %trunc4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 2009 store float %trunc5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 2010 store float %trunc6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 2011 store float %trunc7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 2012 ret void 2013 } 2014 2015 define void @trunc_16f32() #0 { 2016 ; SSE2-LABEL: @trunc_16f32( 2017 ; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 2018 ; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 2019 ; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 2020 ; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 2021 ; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 2022 ; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 2023 ; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 2024 ; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 2025 ; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 2026 ; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 2027 ; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 2028 ; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 2029 ; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 2030 ; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 2031 ; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 2032 ; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 2033 ; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) 2034 ; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) 2035 ; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) 2036 ; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) 2037 ; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]]) 2038 ; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]]) 2039 ; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]]) 2040 ; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]]) 2041 ; SSE2-NEXT: [[TRUNC8:%.*]] = call float @llvm.trunc.f32(float [[LD8]]) 2042 ; SSE2-NEXT: [[TRUNC9:%.*]] = call float @llvm.trunc.f32(float [[LD9]]) 2043 ; SSE2-NEXT: [[TRUNC10:%.*]] = call float @llvm.trunc.f32(float [[LD10]]) 2044 ; SSE2-NEXT: [[TRUNC11:%.*]] = call float @llvm.trunc.f32(float [[LD11]]) 2045 ; SSE2-NEXT: [[TRUNC12:%.*]] = call float @llvm.trunc.f32(float [[LD12]]) 2046 ; SSE2-NEXT: [[TRUNC13:%.*]] = call float @llvm.trunc.f32(float [[LD13]]) 2047 ; SSE2-NEXT: [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[LD14]]) 2048 ; SSE2-NEXT: [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[LD15]]) 2049 ; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 2050 ; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 2051 ; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 2052 ; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 2053 ; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 2054 ; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 2055 ; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 2056 ; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 2057 ; SSE2-NEXT: store float [[TRUNC8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 2058 ; SSE2-NEXT: store float [[TRUNC9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 2059 ; SSE2-NEXT: store float [[TRUNC10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 2060 ; SSE2-NEXT: store float [[TRUNC11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 2061 ; SSE2-NEXT: store float [[TRUNC12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 2062 ; SSE2-NEXT: store float [[TRUNC13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 2063 ; SSE2-NEXT: store float [[TRUNC14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 2064 ; SSE2-NEXT: store float [[TRUNC15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 2065 ; SSE2-NEXT: ret void 2066 ; 2067 ; SSE41-LABEL: @trunc_16f32( 2068 ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 2069 ; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 2070 ; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 2071 ; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 2072 ; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) 2073 ; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]]) 2074 ; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]]) 2075 ; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP4]]) 2076 ; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 2077 ; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 2078 ; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 2079 ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 2080 ; SSE41-NEXT: ret void 2081 ; 2082 ; AVX1-LABEL: @trunc_16f32( 2083 ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 2084 ; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 2085 ; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) 2086 ; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]]) 2087 ; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 2088 ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 2089 ; AVX1-NEXT: ret void 2090 ; 2091 ; AVX2-LABEL: @trunc_16f32( 2092 ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 2093 ; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 2094 ; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) 2095 ; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]]) 2096 ; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 2097 ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 2098 ; AVX2-NEXT: ret void 2099 ; 2100 ; AVX512-LABEL: @trunc_16f32( 2101 ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4 2102 ; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.trunc.v16f32(<16 x float> [[TMP1]]) 2103 ; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4 2104 ; AVX512-NEXT: ret void 2105 ; 2106 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4 2107 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4 2108 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4 2109 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4 2110 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4 2111 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4 2112 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4 2113 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4 2114 %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4 2115 %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4 2116 %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 2117 %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 2118 %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 2119 %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 2120 %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 2121 %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 2122 %trunc0 = call float @llvm.trunc.f32(float %ld0 ) 2123 %trunc1 = call float @llvm.trunc.f32(float %ld1 ) 2124 %trunc2 = call float @llvm.trunc.f32(float %ld2 ) 2125 %trunc3 = call float @llvm.trunc.f32(float %ld3 ) 2126 %trunc4 = call float @llvm.trunc.f32(float %ld4 ) 2127 %trunc5 = call float @llvm.trunc.f32(float %ld5 ) 2128 %trunc6 = call float @llvm.trunc.f32(float %ld6 ) 2129 %trunc7 = call float @llvm.trunc.f32(float %ld7 ) 2130 %trunc8 = call float @llvm.trunc.f32(float %ld8 ) 2131 %trunc9 = call float @llvm.trunc.f32(float %ld9 ) 2132 %trunc10 = call float @llvm.trunc.f32(float %ld10) 2133 %trunc11 = call float @llvm.trunc.f32(float %ld11) 2134 %trunc12 = call float @llvm.trunc.f32(float %ld12) 2135 %trunc13 = call float @llvm.trunc.f32(float %ld13) 2136 %trunc14 = call float @llvm.trunc.f32(float %ld14) 2137 %trunc15 = call float @llvm.trunc.f32(float %ld15) 2138 store float %trunc0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4 2139 store float %trunc1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4 2140 store float %trunc2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4 2141 store float %trunc3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4 2142 store float %trunc4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4 2143 store float %trunc5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4 2144 store float %trunc6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4 2145 store float %trunc7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4 2146 store float %trunc8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4 2147 store float %trunc9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4 2148 store float %trunc10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 2149 store float %trunc11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 2150 store float %trunc12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 2151 store float %trunc13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 2152 store float %trunc14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 2153 store float %trunc15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 2154 ret void 2155 } 2156 2157 attributes #0 = { nounwind } 2158 2159