target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-linux-android"

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;               INTRINSICS               ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Element-wise maximum: float, signed integer, unsigned integer.
declare <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

; Element-wise minimum: float, signed integer, unsigned integer.
declare <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

; NOTE(review): in the two groups below the operand element type is twice as
; wide as the result element type, unlike every other declaration in this
; section where operands and result share one type. Left untouched because
; code outside this view may reference these exact signatures -- confirm
; against the intrinsic definitions before relying on them.
declare <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

declare <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

; Reciprocal / reciprocal-square-root estimate and step intrinsics.
declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float>) nounwind readnone

declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone

declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;                HELPERS                 ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; The smear_* helpers broadcast ("smear") one scalar into every lane of a
; vector. They are internal and alwaysinline.

; Broadcast a float into all four lanes of a <4 x float>.
define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x float> undef, float %in, i32 0
  %2 = insertelement <4 x float> %1, float %in, i32 1
  %3 = insertelement <4 x float> %2, float %in, i32 2
  %4 = insertelement <4 x float> %3, float %in, i32 3
  ret <4 x float> %4
}

; Broadcast an i32 into all four lanes of a <4 x i32>.
define internal <4 x i32> @smear_4i(i32 %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
  %4 = insertelement <4 x i32> %3, i32 %in, i32 3
  ret <4 x i32> %4
}

; Broadcast an i16 into all four lanes of a <4 x i16>.
define internal <4 x i16> @smear_4s(i16 %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x i16> undef, i16 %in, i32 0
  %2 = insertelement <4 x i16> %1, i16 %in, i32 1
  %3 = insertelement <4 x i16> %2, i16 %in, i32 2
  %4 = insertelement <4 x i16> %3, i16 %in, i32 3
  ret <4 x i16> %4
}



; Broadcast a float into both lanes of a <2 x float>.
define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
  %1 = insertelement <2 x float> undef, float %in, i32 0
  %2 = insertelement <2 x float> %1, float %in, i32 1
  ret <2 x float> %2
}

; Broadcast an i32 into both lanes of a <2 x i32>.
define internal <2 x i32> @smear_2i(i32 %in) nounwind readnone alwaysinline {
  %1 = insertelement <2 x i32> undef, i32 %in, i32 0
  %2 = insertelement <2 x i32> %1, i32 %in, i32 1
  ret <2 x i32> %2
}

; Broadcast an i16 into both lanes of a <2 x i16>.
define internal <2 x i16> @smear_2s(i16 %in) nounwind readnone alwaysinline {
  %1 = insertelement <2 x i16> undef, i16 %in, i32 0
  %2 = insertelement <2 x i16> %1, i16 %in, i32 1
  ret <2 x i16> %2
}


; Broadcast an i32 into all four lanes of a <4 x i32>.
; NOTE(review): instruction-for-instruction identical to @smear_4i; kept as a
; separate symbol because callers outside this view may reference it.
define internal <4 x i32> @smear_4i32(i32 %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
  %4 = insertelement <4 x i32> %3, i32 %in, i32 3
  ret <4 x i32> %4
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;                 CLAMP                  ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Every clamp overload computes max(min(value, high), low).
;
; All of them operate purely on SSA values and never access memory, so they
; are marked readnone (matching the min/max family later in this file) rather
; than the weaker readonly.
;
; 3-wide overloads widen to 4 lanes with a shufflevector whose mask index 3
; selects element 0 of the undef second operand, run the 4-lane intrinsic,
; and then drop the fourth lane again.

; clamp(float4 value, float4 low, float4 high)
define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readnone {
  %1 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
  %2 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %1, <4 x float> %low) nounwind readnone
  ret <4 x float> %2
}

; clamp(float4 value, float low, float high): broadcasts the scalar bounds and
; forwards to the all-vector overload.
define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readnone {
  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %out = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %_low, <4 x float> %_high) nounwind readnone
  ret <4 x float> %out
}

; clamp(float3 value, float3 low, float3 high)
define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readnone {
  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_low = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %a = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
  %b = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; clamp(float3 value, float low, float high)
define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readnone {
  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %a = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
  %b = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; clamp(float2 value, float2 low, float2 high)
define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readnone {
  %1 = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
  %2 = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %1, <2 x float> %low) nounwind readnone
  ret <2 x float> %2
}

; clamp(float2 value, float low, float high)
define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readnone {
  %_high = tail call <2 x float> @smear_2f(float %high) nounwind readnone
  %_low = tail call <2 x float> @smear_2f(float %low) nounwind readnone
  %a = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %value, <2 x float> %_high) nounwind readnone
  %b = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %a, <2 x float> %_low) nounwind readnone
  ret <2 x float> %b
}

; clamp(float value, float low, float high), scalar compare/select form.
; Both compares are ordered, so a NaN operand makes the compare false and the
; select falls through to its second choice (%high, then %low).
define float @_Z5clampfff(float %value, float %low, float %high) nounwind readnone {
  %1 = fcmp olt float %value, %high
  %2 = select i1 %1, float %value, float %high
  %3 = fcmp ogt float %2, %low
  %4 = select i1 %3, float %2, float %low
  ret float %4
}



; ---- signed 32-bit integer overloads (smin / smax) ----

; clamp(int4 value, int4 low, int4 high)
define <4 x i32> @_Z5clampDv4_iS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %1, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(int4 value, int low, int high)
define <4 x i32> @_Z5clampDv4_iii(<4 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %1 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value, <4 x i32> %_high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %1, <4 x i32> %_low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(int3 value, int3 low, int3 high)
define <3 x i32> @_Z5clampDv3_iS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_low = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %a = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(int3 value, int low, int high)
define <3 x i32> @_Z5clampDv3_iii(<3 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %a = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(int2 value, int2 low, int2 high)
define <2 x i32> @_Z5clampDv2_iS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %2 = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %1, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %2
}

; clamp(int2 value, int low, int high)
define <2 x i32> @_Z5clampDv2_iii(<2 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %_low = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %a = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %value, <2 x i32> %_high) nounwind readnone
  %b = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %a, <2 x i32> %_low) nounwind readnone
  ret <2 x i32> %b
}

; ---- clamp, unsigned 32-bit integer overloads (umin / umax) ----
;
; Each overload computes max(min(value, high), low) with unsigned compares.
; None of these functions touches memory, so they carry readnone (like the
; min/max family in this file) rather than the weaker readonly.
;
; 3-wide overloads widen to 4 lanes with a shufflevector whose mask index 3
; selects element 0 of the undef second operand, and narrow back afterwards.

; clamp(uint4 value, uint4 low, uint4 high)
define <4 x i32> @_Z5clampDv4_jS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %1, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(uint4 value, uint low, uint high)
define <4 x i32> @_Z5clampDv4_jjj(<4 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %1 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value, <4 x i32> %_high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %1, <4 x i32> %_low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(uint3 value, uint3 low, uint3 high)
define <3 x i32> @_Z5clampDv3_jS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_low = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %a = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(uint3 value, uint low, uint high)
define <3 x i32> @_Z5clampDv3_jjj(<3 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %a = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(uint2 value, uint2 low, uint2 high)
define <2 x i32> @_Z5clampDv2_jS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %2 = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %1, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %2
}

; clamp(uint2 value, uint low, uint high)
define <2 x i32> @_Z5clampDv2_jjj(<2 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %_low = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %a = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %value, <2 x i32> %_high) nounwind readnone
  %b = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %a, <2 x i32> %_low) nounwind readnone
  ret <2 x i32> %b
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;                  FMAX                  ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; The fmax overloads are pure value computations. They are marked readnone so
; that they agree with @_Z4fminff and with the readnone max() float wrappers
; that call them.

; fmax(float4, float4)
define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %1
}

; fmax(float4, float): the scalar is broadcast to all four lanes first.
define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %2 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
  ret <4 x float> %2
}

; fmax(float3, float3): widened to 4 lanes, fourth lane discarded afterwards.
define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %4
}

; fmax(float3, float)
define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %3 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; fmax(float2, float2)
define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %1
}

; fmax(float2, float)
define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %2 = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
  ret <2 x float> %2
}

; fmax(float, float), scalar compare/select form.
; The compare is ordered: when either operand is NaN it is false and %v2 is
; returned.
define float @_Z4fmaxff(float %v1, float %v2) nounwind readnone {
  %1 = fcmp ogt float %v1, %v2
  %2 = select i1 %1, float %v1, float %v2
  ret float %2
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;                  FMIN                  ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; fmin(float4, float4)
define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %1
}

; The remaining fmin overloads. Like the rest of the min/max family they are
; pure value computations and are marked readnone.

; fmin(float4, float): the scalar is broadcast to all four lanes first.
define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %2 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
  ret <4 x float> %2
}

; fmin(float3, float3): widened to 4 lanes (mask index 3 selects element 0 of
; the undef second operand), fourth lane discarded afterwards.
define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %4
}

; fmin(float3, float)
define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %3 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; fmin(float2, float2)
define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %1
}

; fmin(float2, float)
define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %2 = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
  ret <2 x float> %2
}

; fmin(float, float), scalar compare/select form.
; The compare is ordered: when either operand is NaN it is false and %v2 is
; returned.
define float @_Z4fminff(float %v1, float %v2) nounwind readnone {
  %1 = fcmp olt float %v1, %v2
  %2 = select i1 %1, float %v1, float %v2
  ret float %2
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;                  MAX                   ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Integer max overloads.
;  * Scalars use icmp + select.
;  * i8 vectors and 2-wide i16 vectors are extended to i32 lanes (sext for
;    signed, zext for unsigned), reduced with the 32-bit NEON intrinsic, and
;    truncated back.
;  * 3- and 4-wide i16 vectors call the v4i16 intrinsic directly; the result
;    is the same as widening to i32 and truncating, without the conversions.
;  * 3-wide vectors are widened to 4 lanes with a shufflevector whose mask
;    index 3 selects element 0 of the undef second operand.

; ---- signed char ----

define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  %1 = icmp sgt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = sext <2 x i8> %v1 to <2 x i32>
  %2 = sext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

; The char3 operands arrive as i32 values and are reinterpreted as <4 x i8>.
define <3 x i8> @_Z3maxDv3_cS_(i32 %v1, i32 %v2) nounwind readnone {
  %1 = bitcast i32 %v1 to <4 x i8>
  %2 = bitcast i32 %v2 to <4 x i8>
  %3 = sext <4 x i8> %1 to <4 x i32>
  %4 = sext <4 x i8> %2 to <4 x i32>
  %5 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = sext <4 x i8> %v1 to <4 x i32>
  %2 = sext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; ---- signed short ----

define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  %1 = icmp sgt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = sext <2 x i16> %v1 to <2 x i32>
  %2 = sext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = shufflevector <3 x i16> %v1, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i16> %v2, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %1, <4 x i16> %2) nounwind readnone
  %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i16> %4
}

define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %1
}

; ---- signed int ----

define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp sgt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}

; ---- signed long ----

define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone {
  %1 = icmp sgt i64 %v1, %v2
  %2 = select i1 %1, i64 %v1, i64 %v2
  ret i64 %2
}

; TODO: long vector types

; ---- unsigned char ----

define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  %1 = icmp ugt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = zext <2 x i8> %v1 to <2 x i32>
  %2 = zext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

; The uchar3 operands arrive as i32 values and are reinterpreted as <4 x i8>.
define <3 x i8> @_Z3maxDv3_hS_(i32 %v1, i32 %v2) nounwind readnone {
  %1 = bitcast i32 %v1 to <4 x i8>
  %2 = bitcast i32 %v2 to <4 x i8>
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = zext <4 x i8> %v1 to <4 x i32>
  %2 = zext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; ---- unsigned short ----

define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  %1 = icmp ugt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = zext <2 x i16> %v1 to <2 x i32>
  %2 = zext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = shufflevector <3 x i16> %v1, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i16> %v2, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %1, <4 x i16> %2) nounwind readnone
  %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i16> %4
}

define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = tail call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %1
}

; ---- unsigned int ----

define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp ugt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}


; TODO: long vector types

; ---- float: max() forwards to the matching fmax() overload ----

define float @_Z3maxff(float %v1, float %v2) nounwind readnone {
  %1 = tail call float @_Z4fmaxff(float %v1, float %v2)
  ret float %1
}

define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %1
}

define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %1
}

define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %1
}

define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %1
}

define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %1
}

define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %1
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;                  MIN                   ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Integer min overloads; they mirror the max overloads with slt/ult compares
; and the smin/umin intrinsics.

; ---- signed char ----

define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  %1 = icmp slt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = sext <2 x i8> %v1 to <2 x i32>
  %2 = sext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

; The char3 operands arrive as i32 values and are reinterpreted as <4 x i8>.
define <3 x i8> @_Z3minDv3_cS_(i32 %v1, i32 %v2) nounwind readnone {
  %1 = bitcast i32 %v1 to <4 x i8>
  %2 = bitcast i32 %v2 to <4 x i8>
  %3 = sext <4 x i8> %1 to <4 x i32>
  %4 = sext <4 x i8> %2 to <4 x i32>
  %5 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = sext <4 x i8> %v1 to <4 x i32>
  %2 = sext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; ---- signed short ----

define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  %1 = icmp slt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = sext <2 x i16> %v1 to <2 x i32>
  %2 = sext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = shufflevector <3 x i16> %v1, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i16> %v2, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %1, <4 x i16> %2) nounwind readnone
  %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i16> %4
}

define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = tail call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %1
}

; ---- signed int ----

define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp slt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}

; ---- signed long ----

define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {
  %1 = icmp slt i64 %v1, %v2
  %2 = select i1 %1, i64 %v1, i64 %v2
  ret i64 %2
}

; TODO: long vector types

; ---- unsigned char ----

define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  %1 = icmp ult i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = zext <2 x i8> %v1 to <2 x i32>
  %2 = zext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

; The uchar3 operands arrive as i32 values and are reinterpreted as <4 x i8>.
define <3 x i8> @_Z3minDv3_hS_(i32 %v1, i32 %v2) nounwind readnone {
  %1 = bitcast i32 %v1 to <4 x i8>
  %2 = bitcast i32 %v2 to <4 x i8>
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = zext <4 x i8> %v1 to <4 x i32>
  %2 = zext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; ---- unsigned short ----

define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  %1 = icmp ult i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = zext <2 x i16> %v1 to <2 x i32>
  %2 = zext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = shufflevector <3 x i16> %v1, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i16> %v2, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %1, <4 x i16> %2) nounwind readnone
  %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i16> %4
}

define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = tail call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %1
}

; ---- unsigned int ----

define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp ult i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}


; TODO: long vector types

define float @_Z3minff(float %v1, float %v2) nounwind readnone {
  %1 = tail
call float @_Z4fminff(float %v1, float %v2) 790 ret float %1 791 } 792 793 define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone { 794 %1 = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) 795 ret <2 x float> %1 796 } 797 798 define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone { 799 %1 = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) 800 ret <2 x float> %1 801 } 802 803 define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone { 804 %1 = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) 805 ret <3 x float> %1 806 } 807 808 define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone { 809 %1 = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) 810 ret <3 x float> %1 811 } 812 813 define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone { 814 %1 = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) 815 ret <4 x float> %1 816 } 817 818 define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone { 819 %1 = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) 820 ret <4 x float> %1 821 } 822 823 824 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 825 ;;;;;;;;; YUV ;;;;;;;;;; 826 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 827 828 @yuv_U = internal constant <4 x i32> <i32 0, i32 -100, i32 516, i32 0>, align 16 829 @yuv_V = internal constant <4 x i32> <i32 409, i32 -208, i32 0, i32 0>, align 16 830 @yuv_0 = internal constant <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16 831 @yuv_255 = internal constant <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, align 16 832 833 834 define <4 x i8> @_Z18rsYuvToRGBA_uchar4hhh(i8 %pY, i8 %pU, i8 %pV) nounwind readnone alwaysinline { 835 %_sy = zext i8 %pY to i32 836 %_su = zext i8 %pU to i32 837 %_sv = zext i8 %pV to i32 838 839 %_sy2 = add 
i32 -16, %_sy 840 %_sy3 = mul i32 298, %_sy2 841 %_su2 = add i32 -128, %_su 842 %_sv2 = add i32 -128, %_sv 843 %_y = tail call <4 x i32> @smear_4i32(i32 %_sy3) nounwind readnone 844 %_u = tail call <4 x i32> @smear_4i32(i32 %_su2) nounwind readnone 845 %_v = tail call <4 x i32> @smear_4i32(i32 %_sv2) nounwind readnone 846 847 %mu = load <4 x i32>* @yuv_U, align 8 848 %mv = load <4 x i32>* @yuv_V, align 8 849 %_u2 = mul <4 x i32> %_u, %mu 850 %_v2 = mul <4 x i32> %_v, %mv 851 %_y2 = add <4 x i32> %_y, %_u2 852 %_y3 = add <4 x i32> %_y2, %_v2 853 854 ; %r1 = tail call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %_y3, <4 x i32> <i32 8, i32 8, i32 8, i32 8>) nounwind readnone 855 ; %r2 = trunc <4 x i16> %r1 to <4 x i8> 856 ; ret <4 x i8> %r2 857 858 %c0 = load <4 x i32>* @yuv_0, align 8 859 %c255 = load <4 x i32>* @yuv_255, align 8 860 %r1 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %_y3, <4 x i32> %c0) nounwind readnone 861 %r2 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %r1, <4 x i32> %c255) nounwind readnone 862 %r3 = lshr <4 x i32> %r2, <i32 8, i32 8, i32 8, i32 8> 863 %r4 = trunc <4 x i32> %r3 to <4 x i8> 864 ret <4 x i8> %r4 865 } 866 867 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 868 ;;;;;;;;; half_RECIP ;;;;;;;;;; 869 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 870 871 define <2 x float> @_Z10half_recipDv2_f(<2 x float> %v) nounwind readnone { 872 %1 = tail call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %v) nounwind readnone 873 %2 = tail call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %1, <2 x float> %v) nounwind readnone 874 %3 = fmul <2 x float> %1, %2 875 %4 = tail call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %3, <2 x float> %v) nounwind readnone 876 %5 = fmul <2 x float> %4, %3 877 ret <2 x float> %5 878 } 879 880 define <4 x float> @_Z10half_recipDv4_f(<4 x float> %v) nounwind readnone { 881 %1 = tail call <4 x float> 
@llvm.aarch64.neon.frecpe.v4f32(<4 x float> %v) nounwind readnone 882 %2 = tail call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %1, <4 x float> %v) nounwind readnone 883 %3 = fmul <4 x float> %1, %2 884 %4 = tail call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %3, <4 x float> %v) nounwind readnone 885 %5 = fmul <4 x float> %4, %3 886 ret <4 x float> %5 887 } 888 889 define <3 x float> @_Z10half_recipDv3_f(<3 x float> %v) nounwind readnone { 890 %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 891 %2 = tail call <4 x float> @_Z10half_recipDv4_f(<4 x float> %1) nounwind readnone 892 %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 893 ret <3 x float> %3 894 } 895 896 897 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 898 ;;;;;;;;; half_RSQRT ;;;;;;;;;; 899 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 900 901 define float @_Z10half_rsqrtf(float %v) { 902 %1 = insertelement <2 x float> undef, float %v, i32 0 903 %2 = tail call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %1) nounwind readnone 904 %3 = fmul <2 x float> %2, %2 905 %4 = tail call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %1, <2 x float> %3) nounwind readnone 906 %5 = fmul <2 x float> %2, %4 907 %6 = extractelement <2 x float> %5, i32 0 908 ret float %6 909 } 910 911 define <2 x float> @_Z10half_rsqrtDv2_f(<2 x float> %v) nounwind readnone { 912 %1 = tail call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %v) nounwind readnone 913 %2 = fmul <2 x float> %1, %1 914 %3 = tail call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %v, <2 x float> %2) nounwind readnone 915 %4 = fmul <2 x float> %1, %3 916 ret <2 x float> %4 917 } 918 919 define <3 x float> @_Z10half_rsqrtDv3_f(<3 x float> %v) nounwind readnone { 920 %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 921 %2 = tail call <4 x 
float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %1) nounwind readnone 922 %3 = fmul <4 x float> %2, %2 923 %4 = tail call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %1, <4 x float> %3) nounwind readnone 924 %5 = fmul <4 x float> %2, %4 925 %6 = shufflevector <4 x float> %5, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 926 ret <3 x float> %6 927 } 928 929 define <4 x float> @_Z10half_rsqrtDv4_f(<4 x float> %v) nounwind readnone { 930 %1 = tail call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %v) nounwind readnone 931 %2 = fmul <4 x float> %1, %1 932 %3 = tail call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %v, <4 x float> %2) nounwind readnone 933 %4 = fmul <4 x float> %1, %3 934 ret <4 x float> %4 935 } 936 937 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 938 ;;;;;;;;; matrix ;;;;;;;;;; 939 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 940 941 %struct.rs_matrix4x4 = type { [16 x float] } 942 %struct.rs_matrix3x3 = type { [9 x float] } 943 %struct.rs_matrix2x2 = type { [4 x float] } 944 945 define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline { 946 %1 = insertelement <4 x float> undef, float %in, i32 0 947 %2 = insertelement <4 x float> %1, float %in, i32 1 948 %3 = insertelement <4 x float> %2, float %in, i32 2 949 %4 = insertelement <4 x float> %3, float %in, i32 3 950 ret <4 x float> %4 951 } 952 953 954 define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly { 955 %x0 = extractelement <3 x float> %in, i32 0 956 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 957 %y0 = extractelement <3 x float> %in, i32 1 958 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 959 %z0 = extractelement <3 x float> %in, i32 2 960 %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone 961 962 %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, 
i32 0 963 %px2 = bitcast float* %px to <4 x float>* 964 %xm = load <4 x float>* %px2, align 4 965 966 %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3 967 %py2 = bitcast float* %py to <4 x float>* 968 ; %ym = call <4 x float> @llvm.aarch64.neon.ld4.v4f32(i8* %py2, i32 4) nounwind 969 %ym = load <4 x float>* %py2, align 4 970 971 %pz = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 5 972 %pz2 = bitcast float* %pz to <4 x float>* 973 ; %zm2 = call <4 x float> @llvm.aarch64.neon.ld4.v4f32(i8* %pz2, i32 4) nounwind 974 %zm2 = load <4 x float>* %pz2, align 4 975 %zm = shufflevector <4 x float> %zm2, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4> 976 977 %a1 = fmul <4 x float> %x, %xm 978 %a2 = fmul <4 x float> %y, %ym 979 %a3 = fadd <4 x float> %a1, %a2 980 %a4 = fmul <4 x float> %z, %zm 981 %a5 = fadd <4 x float> %a4, %a3 982 %a6 = shufflevector <4 x float> %a5, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 983 ret <3 x float> %a6 984 } 985 986 define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly { 987 %x0 = extractelement <2 x float> %in, i32 0 988 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 989 %y0 = extractelement <2 x float> %in, i32 1 990 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 991 992 %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0 993 %px2 = bitcast float* %px to <4 x float>* 994 %xm = load <4 x float>* %px2, align 4 995 %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3 996 %py2 = bitcast float* %py to <4 x float>* 997 %ym = load <4 x float>* %py2, align 4 998 999 %a1 = fmul <4 x float> %x, %xm 1000 %a2 = fmul <4 x float> %y, %ym 1001 %a3 = fadd <4 x float> %a1, %a2 1002 %a4 = shufflevector <4 x float> %a3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 1003 ret <3 x float> %a4 1004 } 1005 1006 define <4 x float> 
@_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly { 1007 %x0 = extractelement <4 x float> %in, i32 0 1008 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 1009 %y0 = extractelement <4 x float> %in, i32 1 1010 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 1011 %z0 = extractelement <4 x float> %in, i32 2 1012 %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone 1013 %w0 = extractelement <4 x float> %in, i32 3 1014 %w = tail call <4 x float> @smear_f(float %w0) nounwind readnone 1015 1016 %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0 1017 %px2 = bitcast float* %px to <4 x float>* 1018 %xm = load <4 x float>* %px2, align 4 1019 %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4 1020 %py2 = bitcast float* %py to <4 x float>* 1021 %ym = load <4 x float>* %py2, align 4 1022 %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8 1023 %pz2 = bitcast float* %pz to <4 x float>* 1024 %zm = load <4 x float>* %pz2, align 4 1025 %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12 1026 %pw2 = bitcast float* %pw to <4 x float>* 1027 %wm = load <4 x float>* %pw2, align 4 1028 1029 %a1 = fmul <4 x float> %x, %xm 1030 %a2 = fmul <4 x float> %y, %ym 1031 %a3 = fadd <4 x float> %a1, %a2 1032 %a4 = fmul <4 x float> %z, %zm 1033 %a5 = fadd <4 x float> %a3, %a4 1034 %a6 = fmul <4 x float> %w, %wm 1035 %a7 = fadd <4 x float> %a5, %a6 1036 ret <4 x float> %a7 1037 } 1038 1039 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly { 1040 %x0 = extractelement <3 x float> %in, i32 0 1041 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 1042 %y0 = extractelement <3 x float> %in, i32 1 1043 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 1044 %z0 = extractelement <3 x float> %in, i32 
2 1045 %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone 1046 1047 %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0 1048 %px2 = bitcast float* %px to <4 x float>* 1049 %xm = load <4 x float>* %px2, align 4 1050 %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4 1051 %py2 = bitcast float* %py to <4 x float>* 1052 %ym = load <4 x float>* %py2, align 4 1053 %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8 1054 %pz2 = bitcast float* %pz to <4 x float>* 1055 %zm = load <4 x float>* %pz2, align 4 1056 %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12 1057 %pw2 = bitcast float* %pw to <4 x float>* 1058 %wm = load <4 x float>* %pw2, align 4 1059 1060 %a1 = fmul <4 x float> %x, %xm 1061 %a2 = fadd <4 x float> %wm, %a1 1062 %a3 = fmul <4 x float> %y, %ym 1063 %a4 = fadd <4 x float> %a2, %a3 1064 %a5 = fmul <4 x float> %z, %zm 1065 %a6 = fadd <4 x float> %a4, %a5 1066 ret <4 x float> %a6 1067 } 1068 1069 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly { 1070 %x0 = extractelement <2 x float> %in, i32 0 1071 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 1072 %y0 = extractelement <2 x float> %in, i32 1 1073 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 1074 1075 %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0 1076 %px2 = bitcast float* %px to <4 x float>* 1077 %xm = load <4 x float>* %px2, align 4 1078 %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4 1079 %py2 = bitcast float* %py to <4 x float>* 1080 %ym = load <4 x float>* %py2, align 4 1081 %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12 1082 %pw2 = bitcast float* %pw to <4 x float>* 1083 %wm = load <4 x float>* %pw2, align 4 1084 1085 %a1 = fmul <4 x float> %x, %xm 1086 %a2 = fadd <4 x float> %wm, %a1 1087 %a3 
= fmul <4 x float> %y, %ym 1088 %a4 = fadd <4 x float> %a2, %a3 1089 ret <4 x float> %a4 1090 } 1091 1092 1093 1094 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1095 ;;;;;;;;; pixel ops ;;;;;;;;;; 1096 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1097 1098 1099 @fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16 1100 @fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16 1101 @fc_0 = internal constant <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, align 16 1102 1103 declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone 1104 declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone 1105 1106 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color) 1107 define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone { 1108 %f255 = load <4 x float>* @fc_255.0, align 16 1109 %f05 = load <4 x float>* @fc_0.5, align 16 1110 %f0 = load <4 x float>* @fc_0, align 16 1111 %v1 = fmul <4 x float> %f255, %color 1112 %v2 = fadd <4 x float> %f05, %v1 1113 %v3 = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %v2, <4 x float> %f0, <4 x float> %f255) nounwind readnone 1114 %v4 = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %v3) nounwind readnone 1115 ret <4 x i8> %v4 1116 } 1117 1118 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color) 1119 define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<4 x i32> %color) nounwind readnone { 1120 %1 = bitcast <4 x i32> %color to <4 x float> 1121 %2 = insertelement <4 x float> %1, float 1.0, i32 3 1122 %3 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %2) nounwind readnone 1123 ret <4 x i8> %3 1124 } 1125 1126 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b) 1127 define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone { 1128 %1 = 
insertelement <4 x float> undef, float %r, i32 0 1129 %2 = insertelement <4 x float> %1, float %g, i32 1 1130 %3 = insertelement <4 x float> %2, float %b, i32 2 1131 %4 = insertelement <4 x float> %3, float 1.0, i32 3 1132 %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone 1133 ret <4 x i8> %5 1134 } 1135 1136 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a) 1137 define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone { 1138 %1 = insertelement <4 x float> undef, float %r, i32 0 1139 %2 = insertelement <4 x float> %1, float %g, i32 1 1140 %3 = insertelement <4 x float> %2, float %b, i32 2 1141 %4 = insertelement <4 x float> %3, float %a, i32 3 1142 %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone 1143 ret <4 x i8> %5 1144 } 1145 1146