target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
target triple = "armv7-none-linux-gnueabi"

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; INTRINSICS ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Element-wise maximum (float, signed i32, unsigned i32).
declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; Element-wise minimum (float, signed i32, unsigned i32).
declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; Saturating narrowing shifts. Declared here; the only reference in this part
; of the file is the commented-out alternative inside the YUV conversion.
declare <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

declare <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

declare <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

; Reciprocal and reciprocal-square-root intrinsics.
declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone

declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; HELPERS ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Broadcast a scalar float into all four lanes of a <4 x float>.
define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x float> undef, float %in, i32 0
  %2 = insertelement <4 x float> %1, float %in, i32 1
  %3 = insertelement <4 x float> %2, float %in, i32 2
  %4 = insertelement <4 x float> %3, float %in, i32 3
  ret <4 x float> %4
}

; Broadcast a scalar float into both lanes of a <2 x float>.
define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
  %1 = insertelement <2 x float> undef, float %in, i32 0
  %2 = insertelement <2 x float> %1, float %in, i32 1
  ret <2 x float> %2
}

; Broadcast a scalar i32 into all four lanes of a <4 x i32>.
define internal <4 x i32> @smear_4i32(i32 %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
  %4 = insertelement <4 x i32> %3, i32 %in, i32 3
  ret <4 x i32> %4
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; CLAMP ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; All clamp variants compute max(min(value, high), low): the upper bound is
; applied first, then the lower bound.
; Scalar-bound variants broadcast %low/%high with the smear helpers.
; 3-wide variants widen to 4 lanes for the NEON op (shuffle index 3 selects
; element 0 of the undef second operand, so lane 3 is undef) and then drop
; lane 3 again.
; These functions touch no memory, so they are marked readnone like the
; max/min wrappers further down.

define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readnone {
  %1 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
  %2 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %low) nounwind readnone
  ret <4 x float> %2
}

define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readnone {
  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %out = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %_low, <4 x float> %_high) nounwind readnone
  ret <4 x float> %out
}

define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readnone {
  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_low = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
  %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readnone {
  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
  %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readnone {
  %1 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
  %2 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %1, <2 x float> %low) nounwind readnone
  ret <2 x float> %2
}

define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readnone {
  %_high = tail call <2 x float> @smear_2f(float %high) nounwind readnone
  %_low = tail call <2 x float> @smear_2f(float %low) nounwind readnone
  %a = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %_high) nounwind readnone
  %b = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %_low) nounwind readnone
  ret <2 x float> %b
}

; Scalar clamp using ordered compares + selects; when a compare is false
; (including the unordered/NaN case) the bound operand is selected.
define float @_Z5clampfff(float %value, float %low, float %high) nounwind readnone {
  %1 = fcmp olt float %value, %high
  %2 = select i1 %1, float %value, float %high
  %3 = fcmp ogt float %2, %low
  %4 = select i1 %3, float %2, float %low
  ret float %4
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; FMAX ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Element-wise float maximum. Vector/scalar overloads broadcast %v2 first;
; 3-wide overloads round-trip through 4 lanes (lane 3 undef, then dropped).

define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %1
}

define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %2 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
  ret <4 x float> %2
}

define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %4
}

define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %3 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %1
}

define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %2 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
  ret <2 x float> %2
}

; Scalar: returns %v1 only when the ordered compare %v1 > %v2 holds, else %v2.
define float @_Z4fmaxff(float %v1, float %v2) nounwind readnone {
  %1 = fcmp ogt float %v1, %v2
  %2 = select i1 %1, float %v1, float %v2
  ret float %2
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; FMIN ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Element-wise float minimum; same structure as the FMAX family.

define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %1
}

define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %2 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
  ret <4 x float> %2
}

define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %4
}

define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %3 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %1
}

define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %2 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
  ret <2 x float> %2
}

; Scalar: returns %v1 only when the ordered compare %v1 < %v2 holds, else %v2.
define float @_Z4fminff(float %v1, float %v2) nounwind readnone {
  %1 = fcmp olt float %v1, %v2
  %2 = select i1 %1, float %v1, float %v2
  ret float %2
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; MAX ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Integer max. Scalars use icmp + select. 8- and 16-bit vectors are extended
; to i32 lanes (sext for the signed overloads, zext for the unsigned ones),
; run through the 32-bit NEON max, and truncated back. 3-wide vectors are
; additionally widened to 4 lanes (lane 3 undef) around the NEON op.

; ---- signed 8-bit ----

define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  %1 = icmp sgt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = sext <2 x i8> %v1 to <2 x i32>
  %2 = sext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

define <3 x i8> @_Z3maxDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %1 = sext <3 x i8> %v1 to <3 x i32>
  %2 = sext <3 x i8> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = sext <4 x i8> %v1 to <4 x i32>
  %2 = sext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; ---- signed 16-bit ----

define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  %1 = icmp sgt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = sext <2 x i16> %v1 to <2 x i32>
  %2 = sext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = sext <3 x i16> %v1 to <3 x i32>
  %2 = sext <3 x i16> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i16>
  ret <3 x i16> %7
}

define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = sext <4 x i16> %v1 to <4 x i32>
  %2 = sext <4 x i16> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i16>
  ret <4 x i16> %4
}

; ---- signed 32-bit ----

define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp sgt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}

; ---- signed 64-bit ----

define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone {
  %1 = icmp sgt i64 %v1, %v2
  %2 = select i1 %1, i64 %v1, i64 %v2
  ret i64 %2
}

; TODO: long vector types

; ---- unsigned 8-bit ----

define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  %1 = icmp ugt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = zext <2 x i8> %v1 to <2 x i32>
  %2 = zext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

define <3 x i8> @_Z3maxDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %1 = zext <3 x i8> %v1 to <3 x i32>
  %2 = zext <3 x i8> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = zext <4 x i8> %v1 to <4 x i32>
  %2 = zext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; ---- unsigned 16-bit ----

define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  %1 = icmp ugt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = zext <2 x i16> %v1 to <2 x i32>
  %2 = zext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = zext <3 x i16> %v1 to <3 x i32>
  %2 = zext <3 x i16> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i16>
  ret <3 x i16> %7
}

define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = zext <4 x i16> %v1 to <4 x i32>
  %2 = zext <4 x i16> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i16>
  ret <4 x i16> %4
}

; ---- unsigned 32-bit ----

define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp ugt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}

; ---- unsigned 64-bit ----

define i64 @_Z3maxyy(i64 %v1, i64 %v2) nounwind readnone {
  %1 = icmp ugt i64 %v1, %v2
  %2 = select i1 %1, i64 %v1, i64 %v2
  ret i64 %2
}

; TODO: long vector types

; ---- float: thin wrappers that forward to the matching fmax overload ----

define float @_Z3maxff(float %v1, float %v2) nounwind readnone {
  %1 = tail call float @_Z4fmaxff(float %v1, float %v2)
  ret float %1
}

define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %1
}

define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %1
}

define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %1
}

define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %1
}

define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %1
}

define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %1
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; MIN ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Integer min; mirrors the MAX family (icmp slt/ult + select for scalars,
; extend -> 32-bit NEON min -> truncate for narrow vectors, 4-lane round trip
; for 3-wide vectors).

; ---- signed 8-bit ----

define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  %1 = icmp slt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = sext <2 x i8> %v1 to <2 x i32>
  %2 = sext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

define <3 x i8> @_Z3minDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %1 = sext <3 x i8> %v1 to <3 x i32>
  %2 = sext <3 x i8> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = sext <4 x i8> %v1 to <4 x i32>
  %2 = sext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; ---- signed 16-bit ----

define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  %1 = icmp slt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = sext <2 x i16> %v1 to <2 x i32>
  %2 = sext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = sext <3 x i16> %v1 to <3 x i32>
  %2 = sext <3 x i16> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i16>
  ret <3 x i16> %7
}

define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = sext <4 x i16> %v1 to <4 x i32>
  %2 = sext <4 x i16> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i16>
  ret <4 x i16> %4
}

; ---- signed 32-bit ----

define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp slt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}

; ---- signed 64-bit ----

define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {
  %1 = icmp slt i64 %v1, %v2
  %2 = select i1 %1, i64 %v1, i64 %v2
  ret i64 %2
}

; TODO: long vector types

; ---- unsigned 8-bit ----

define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  %1 = icmp ult i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = zext <2 x i8> %v1 to <2 x i32>
  %2 = zext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

define <3 x i8> @_Z3minDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %1 = zext <3 x i8> %v1 to <3 x i32>
  %2 = zext <3 x i8> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = zext <4 x i8> %v1 to <4 x i32>
  %2 = zext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; ---- unsigned 16-bit ----

define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  %1 = icmp ult i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = zext <2 x i16> %v1 to <2 x i32>
  %2 = zext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = zext <3 x i16> %v1 to <3 x i32>
  %2 = zext <3 x i16> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i16>
  ret <3 x i16> %7
}

define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = zext <4 x i16> %v1 to <4 x i32>
  %2 = zext <4 x i16> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i16>
  ret <4 x i16> %4
}

; ---- unsigned 32-bit ----

define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp ult i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}

; ---- unsigned 64-bit ----

define i64 @_Z3minyy(i64 %v1, i64 %v2) nounwind readnone {
  %1 = icmp ult i64 %v1, %v2
  %2 = select i1 %1, i64 %v1, i64 %v2
  ret i64 %2
}

; TODO: long vector types

; ---- float: thin wrappers that forward to the matching fmin overload ----

define float @_Z3minff(float %v1, float %v2) nounwind readnone {
  %1 = tail call float @_Z4fminff(float %v1, float %v2)
  ret float %1
}

define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %1
}

define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %1
}

define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %1
}

define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %1
}

define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %1
}

define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %1
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; YUV ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Per-lane multipliers applied to (U - 128) and (V - 128); lane 3 of both
; tables is 0. Values are in 8.8 fixed point (the result is shifted right by 8).
@yuv_U = internal constant <4 x i32> <i32 0, i32 -100, i32 516, i32 0>, align 16
@yuv_V = internal constant <4 x i32> <i32 409, i32 -208, i32 0, i32 0>, align 16
; Clamp bounds applied before the >> 8: 0 and 65535 (65535 >> 8 == 255).
@yuv_0 = internal constant <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
@yuv_255 = internal constant <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, align 16


; Converts one (Y, U, V) byte triple to four output bytes.
;   lane 0 = 298*(Y-16)              + 409*(V-128)
;   lane 1 = 298*(Y-16) - 100*(U-128) - 208*(V-128)
;   lane 2 = 298*(Y-16) + 516*(U-128)
; Each lane is clamped to [0, 65535], shifted right by 8 and truncated to i8.
; Lane 3 is forced to 255 (see the note at the end of the body).
define <4 x i8> @_Z18rsYuvToRGBA_uchar4hhh(i8 %pY, i8 %pU, i8 %pV) nounwind readnone alwaysinline {
  %_sy = zext i8 %pY to i32
  %_su = zext i8 %pU to i32
  %_sv = zext i8 %pV to i32

  ; Remove the offsets (16 for Y, 128 for U/V) and pre-scale Y by 298.
  %_sy2 = add i32 -16, %_sy
  %_sy3 = mul i32 298, %_sy2
  %_su2 = add i32 -128, %_su
  %_sv2 = add i32 -128, %_sv
  %_y = tail call <4 x i32> @smear_4i32(i32 %_sy3) nounwind readnone
  %_u = tail call <4 x i32> @smear_4i32(i32 %_su2) nounwind readnone
  %_v = tail call <4 x i32> @smear_4i32(i32 %_sv2) nounwind readnone

  ; Apply the per-lane chroma multipliers and accumulate onto the luma term.
  %mu = load <4 x i32>* @yuv_U, align 8
  %mv = load <4 x i32>* @yuv_V, align 8
  %_u2 = mul <4 x i32> %_u, %mu
  %_v2 = mul <4 x i32> %_v, %mv
  %_y2 = add <4 x i32> %_y, %_u2
  %_y3 = add <4 x i32> %_y2, %_v2

  ; Disabled alternative using a saturating narrowing shift:
  ; %r1 = tail call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %_y3, <4 x i32> <i32 8, i32 8, i32 8, i32 8>) nounwind readnone
  ; %r2 = trunc <4 x i16> %r1 to <4 x i8>
  ; ret <4 x i8> %r2

  ; Clamp to [0, 65535] so that the >> 8 below yields a value in [0, 255].
  %c0 = load <4 x i32>* @yuv_0, align 8
  %c255 = load <4 x i32>* @yuv_255, align 8
  %r1 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %_y3, <4 x i32> %c0) nounwind readnone
  %r2 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %r1, <4 x i32> %c255) nounwind readnone
  %r3 = lshr <4 x i32> %r2, <i32 8, i32 8, i32 8, i32 8>
  %r4 = trunc <4 x i32> %r3 to <4 x i8>

  ; Both chroma tables are 0 in lane 3, so up to here lane 3 holds only the
  ; clamped, scaled luma term. Overwrite it with 255 (i8 -1) so the fourth
  ; (alpha) component is fully opaque.
  ; NOTE(review): 255 is assumed to be the intended alpha for the "RGBA"
  ; result - confirm against the reference implementation.
  %r5 = insertelement <4 x i8> %r4, i8 -1, i32 3
  ret <4 x i8> %r5
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; half_RECIP ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; half_recip: each result lane is the output of the NEON vrecpe intrinsic.
; The scalar overload places %v in lane 0 of a 2-lane vector (lane 1 undef)
; and extracts lane 0 of the result. The 3-wide overload widens to 4 lanes
; (lane 3 undef) and drops lane 3 afterwards.
; NOTE(review): the vector symbol names below (e.g. @_Z10half_recip2Dv2_h) do
; not follow the "<name>Dv<N>_f" pattern used by the other float-vector entry
; points in this file (e.g. @_Z4fmaxDv2_fS_); confirm they match the names
; callers are actually mangled to. They are left unchanged here.

define float @_Z10half_recipf(float %v) nounwind readnone {
  %1 = insertelement <2 x float> undef, float %v, i32 0
  %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
  %3 = extractelement <2 x float> %2, i32 0
  ret float %3
}

define <2 x float> @_Z10half_recip2Dv2_h(<2 x float> %v) nounwind readnone {
  %1 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %v) nounwind readnone
  ret <2 x float> %1
}

define <3 x float> @_Z10half_recip3Dv3_h(<3 x float> %v) nounwind readnone {
  %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %3
}

define <4 x float> @_Z10half_recip4Dv4_h(<4 x float> %v) nounwind readnone {
  %1 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %v) nounwind readnone
  ret <4 x float> %1
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; half_SQRT ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; half_sqrt: computed as vrecpe(vrsqrte(v)), i.e. the reciprocal intrinsic
; applied to the output of the reciprocal-square-root intrinsic.

define float @_Z9half_sqrtf(float %v) nounwind readnone {
  %1 = insertelement <2 x float> undef, float %v, i32 0
  %2 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %1) nounwind readnone
  %3 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %2) nounwind readnone
  %4 = extractelement <2 x float> %3, i32 0
  ret float %4
}

define <2 x float> @_Z9half_sqrt2Dv2_h(<2 x float> %v) nounwind readnone {
  %1 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
  %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
  ret <2 x float> %2
}

define <3 x float> @_Z9half_sqrt3Dv3_h(<3 x float> %v) nounwind readnone {
  %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %1) nounwind readnone
  %3 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %2) nounwind readnone
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %4
}

define <4 x float> @_Z9half_sqrt4Dv4_h(<4 x float> %v) nounwind readnone {
  %1 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
  %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
  ret <4 x float> %2
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; half_RSQRT ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; half_rsqrt: each result lane is the output of the NEON vrsqrte intrinsic.

define float @_Z10half_rsqrtf(float %v) nounwind readnone {
  %1 = insertelement <2 x float> undef, float %v, i32 0
  %2 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %1) nounwind readnone
  %3 = extractelement <2 x float> %2, i32 0
  ret float %3
}

define <2 x float> @_Z10half_rsqrt2Dv2_h(<2 x float> %v) nounwind readnone {
  %1 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
  ret <2 x float> %1
}

define <3 x float> @_Z10half_rsqrt3Dv3_h(<3 x float> %v) nounwind readnone {
  %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %1) nounwind readnone
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %3
}

define <4 x float> @_Z10half_rsqrt4Dv4_h(<4 x float> %v) nounwind readnone {
  %1 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
  ret <4 x float> %1
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; matrix ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; 4-float vector load; the second operand is the alignment argument.
declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly

; Matrix types: flat float arrays of 16, 9 and 4 elements.
%struct.rs_matrix4x4 = type { [16 x float] }
%struct.rs_matrix3x3 = type { [9 x float] }
%struct.rs_matrix2x2 = type { [4 x float] }

; Broadcast a scalar float into all four lanes of a <4 x float>.
; (Same body as @smear_4f defined earlier in this file.)
define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x float> undef, float %in, i32 0
  %2 = insertelement <4 x float> %1, float %in, i32 1
  %3 = insertelement <4 x float> %2, float %in, i32 2
  %4 = insertelement <4 x float> %3, float %in, i32 3
  ret <4 x float> %4
}


define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
  %x0 = extractelement <3 x float> %in, i32 0
  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
  %y0 = extractelement <3 x float> %in, i32 1
  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
  %z0 = extractelement <3 x float> %in, i32 2
  %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone

  %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
  %px2 = bitcast float* %px to i8*
  %xm = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %px2, i32 4) nounwind

  %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
  %py2 = bitcast float* %py to i8*
  %ym = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %py2, i32 4) nounwind

  %pz = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 5
  %pz2 = bitcast float* %pz to i8*
  %zm2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %pz2, i32 4) nounwind
  %zm = shufflevector <4 x 
float> %zm2, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4> 868 869 %a1 = fmul <4 x float> %x, %xm 870 %a2 = fmul <4 x float> %y, %ym 871 %a3 = fadd <4 x float> %a1, %a2 872 %a4 = fmul <4 x float> %z, %zm 873 %a5 = fadd <4 x float> %a4, %a3 874 %a6 = shufflevector <4 x float> %a5, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 875 ret <3 x float> %a6 876 } 877 878 define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly { 879 %x0 = extractelement <2 x float> %in, i32 0 880 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 881 %y0 = extractelement <2 x float> %in, i32 1 882 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 883 884 %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0 885 %px2 = bitcast float* %px to <4 x float>* 886 %xm = load <4 x float>* %px2, align 4 887 %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3 888 %py2 = bitcast float* %py to <4 x float>* 889 %ym = load <4 x float>* %py2, align 4 890 891 %a1 = fmul <4 x float> %x, %xm 892 %a2 = fmul <4 x float> %y, %ym 893 %a3 = fadd <4 x float> %a1, %a2 894 %a4 = shufflevector <4 x float> %a3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 895 ret <3 x float> %a4 896 } 897 898 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly { 899 %x0 = extractelement <4 x float> %in, i32 0 900 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 901 %y0 = extractelement <4 x float> %in, i32 1 902 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 903 %z0 = extractelement <4 x float> %in, i32 2 904 %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone 905 %w0 = extractelement <4 x float> %in, i32 3 906 %w = tail call <4 x float> @smear_f(float %w0) nounwind readnone 907 908 %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 
0, i32 0, i32 0 909 %px2 = bitcast float* %px to <4 x float>* 910 %xm = load <4 x float>* %px2, align 4 911 %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4 912 %py2 = bitcast float* %py to <4 x float>* 913 %ym = load <4 x float>* %py2, align 4 914 %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8 915 %pz2 = bitcast float* %pz to <4 x float>* 916 %zm = load <4 x float>* %pz2, align 4 917 %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12 918 %pw2 = bitcast float* %pw to <4 x float>* 919 %wm = load <4 x float>* %pw2, align 4 920 921 %a1 = fmul <4 x float> %x, %xm 922 %a2 = fmul <4 x float> %y, %ym 923 %a3 = fadd <4 x float> %a1, %a2 924 %a4 = fmul <4 x float> %z, %zm 925 %a5 = fadd <4 x float> %a3, %a4 926 %a6 = fmul <4 x float> %w, %wm 927 %a7 = fadd <4 x float> %a5, %a6 928 ret <4 x float> %a7 929 } 930 931 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly { 932 %x0 = extractelement <3 x float> %in, i32 0 933 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 934 %y0 = extractelement <3 x float> %in, i32 1 935 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 936 %z0 = extractelement <3 x float> %in, i32 2 937 %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone 938 939 %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0 940 %px2 = bitcast float* %px to <4 x float>* 941 %xm = load <4 x float>* %px2, align 4 942 %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4 943 %py2 = bitcast float* %py to <4 x float>* 944 %ym = load <4 x float>* %py2, align 4 945 %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8 946 %pz2 = bitcast float* %pz to <4 x float>* 947 %zm = load <4 x float>* %pz2, align 4 948 %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12 949 %pw2 = bitcast float* %pw to 
<4 x float>* 950 %wm = load <4 x float>* %pw2, align 4 951 952 %a1 = fmul <4 x float> %x, %xm 953 %a2 = fadd <4 x float> %wm, %a1 954 %a3 = fmul <4 x float> %y, %ym 955 %a4 = fadd <4 x float> %a2, %a3 956 %a5 = fmul <4 x float> %z, %zm 957 %a6 = fadd <4 x float> %a4, %a5 958 ret <4 x float> %a6 959 } 960 961 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly { 962 %x0 = extractelement <2 x float> %in, i32 0 963 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 964 %y0 = extractelement <2 x float> %in, i32 1 965 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 966 967 %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0 968 %px2 = bitcast float* %px to <4 x float>* 969 %xm = load <4 x float>* %px2, align 4 970 %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4 971 %py2 = bitcast float* %py to <4 x float>* 972 %ym = load <4 x float>* %py2, align 4 973 %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12 974 %pw2 = bitcast float* %pw to <4 x float>* 975 %wm = load <4 x float>* %pw2, align 4 976 977 %a1 = fmul <4 x float> %x, %xm 978 %a2 = fadd <4 x float> %wm, %a1 979 %a3 = fmul <4 x float> %y, %ym 980 %a4 = fadd <4 x float> %a2, %a3 981 ret <4 x float> %a4 982 } 983 984 985 986 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 987 ;;;;;;;;; pixel ops ;;;;;;;;;; 988 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 989 990 991 @fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16 992 @fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16 993 @fc_0 = internal constant <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, align 16 994 995 declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone 996 declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) 
nounwind readnone 997 998 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color) 999 define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone { 1000 %f255 = load <4 x float>* @fc_255.0, align 16 1001 %f05 = load <4 x float>* @fc_0.5, align 16 1002 %f0 = load <4 x float>* @fc_0, align 16 1003 %v1 = fmul <4 x float> %f255, %color 1004 %v2 = fadd <4 x float> %f05, %v1 1005 %v3 = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %v2, <4 x float> %f0, <4 x float> %f255) nounwind readnone 1006 %v4 = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %v3) nounwind readnone 1007 ret <4 x i8> %v4 1008 } 1009 1010 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color) 1011 define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<3 x float> %color) nounwind readnone { 1012 %1 = shufflevector <3 x float> %color, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1013 %2 = insertelement <4 x float> %1, float 1.0, i32 3 1014 %3 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %2) nounwind readnone 1015 ret <4 x i8> %3 1016 } 1017 1018 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b) 1019 define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone { 1020 %1 = insertelement <4 x float> undef, float %r, i32 0 1021 %2 = insertelement <4 x float> %1, float %g, i32 1 1022 %3 = insertelement <4 x float> %2, float %b, i32 2 1023 %4 = insertelement <4 x float> %3, float 1.0, i32 3 1024 %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone 1025 ret <4 x i8> %5 1026 } 1027 1028 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a) 1029 define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone { 1030 %1 = insertelement <4 x float> undef, float %r, i32 0 1031 %2 = insertelement <4 x float> %1, float %g, i32 1 1032 %3 = insertelement <4 x float> 
%2, float %b, i32 2 1033 %4 = insertelement <4 x float> %3, float %a, i32 3 1034 %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone 1035 ret <4 x i8> %5 1036 } 1037 1038