target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
target triple = "armv7-none-linux-gnueabi"

; NEON-accelerated implementations of the clamp / fmax / fmin / max / min
; builtins. Public symbols use Itanium C++ mangling; the type codes that
; appear in this file are:
;   c/h = signed/unsigned char    s/t = signed/unsigned short
;   i/j = signed/unsigned int     x/y = signed/unsigned long long
;   f   = float                   DvN_T = N-element vector of T
;
; None of the functions below access memory, so every definition is marked
; 'readnone'.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;        INTRINSICS               ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Element-wise maximum (vmaxs = signed / floating point, vmaxu = unsigned).
declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

; Element-wise minimum (vmins = signed / floating point, vminu = unsigned).
declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

; Saturating narrowing shifts; not referenced by the functions in this part
; of the file.
declare <8 x i8>  @llvm.arm.neon.vqshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

declare <8 x i8>  @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

declare <8 x i8>  @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

; Reciprocal / reciprocal-square-root estimates; not referenced by the
; functions in this part of the file.
declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone

declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;        HELPERS                  ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; The smear_* helpers broadcast ("splat") a scalar into every lane of a
; vector. They are internal and alwaysinline, so they never survive as
; calls in the generated code.

; Broadcast a float into all four lanes of a <4 x float>.
define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x float> undef, float %in, i32 0
  %2 = insertelement <4 x float> %1, float %in, i32 1
  %3 = insertelement <4 x float> %2, float %in, i32 2
  %4 = insertelement <4 x float> %3, float %in, i32 3
  ret <4 x float> %4
}

; Broadcast an i32 into all four lanes of a <4 x i32>.
define internal <4 x i32> @smear_4i(i32 %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
  %4 = insertelement <4 x i32> %3, i32 %in, i32 3
  ret <4 x i32> %4
}

; Broadcast an i16 into all four lanes of a <4 x i16>.
define internal <4 x i16> @smear_4s(i16 %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x i16> undef, i16 %in, i32 0
  %2 = insertelement <4 x i16> %1, i16 %in, i32 1
  %3 = insertelement <4 x i16> %2, i16 %in, i32 2
  %4 = insertelement <4 x i16> %3, i16 %in, i32 3
  ret <4 x i16> %4
}



; Broadcast a float into both lanes of a <2 x float>.
define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
  %1 = insertelement <2 x float> undef, float %in, i32 0
  %2 = insertelement <2 x float> %1, float %in, i32 1
  ret <2 x float> %2
}

; Broadcast an i32 into both lanes of a <2 x i32>.
define internal <2 x i32> @smear_2i(i32 %in) nounwind readnone alwaysinline {
  %1 = insertelement <2 x i32> undef, i32 %in, i32 0
  %2 = insertelement <2 x i32> %1, i32 %in, i32 1
  ret <2 x i32> %2
}

; Broadcast an i16 into both lanes of a <2 x i16>.
define internal <2 x i16> @smear_2s(i16 %in) nounwind readnone alwaysinline {
  %1 = insertelement <2 x i16> undef, i16 %in, i32 0
  %2 = insertelement <2 x i16> %1, i16 %in, i32 1
  ret <2 x i16> %2
}


; Same operation as @smear_4i; kept under its own name because other parts
; of the file may reference it.
define internal <4 x i32> @smear_4i32(i32 %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
  %4 = insertelement <4 x i32> %3, i32 %in, i32 3
  ret <4 x i32> %4
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;        CLAMP                    ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Every clamp below is computed as max(min(value, high), low).
;
; 3-element vectors are padded to 4 lanes before the NEON operation: shuffle
; mask index 3 selects element 0 of the second (undef) operand, so lane 3 is
; undefined padding. It is dropped again by the final 3-lane shuffle.

; clamp(float4 value, float4 low, float4 high)
define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readnone {
  %1 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
  %2 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %low) nounwind readnone
  ret <4 x float> %2
}

; clamp(float4 value, float low, float high) - scalar bounds are broadcast.
; Emits the min/max pair directly, like the other scalar-bound overloads,
; instead of going through a call to the external vector overload.
define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readnone {
  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %_high) nounwind readnone
  %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
  ret <4 x float> %b
}

; clamp(float3 value, float3 low, float3 high)
define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readnone {
  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_low = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
  %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; clamp(float3 value, float low, float high)
define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readnone {
  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
  %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; clamp(float2 value, float2 low, float2 high)
define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readnone {
  %1 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
  %2 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %1, <2 x float> %low) nounwind readnone
  ret <2 x float> %2
}

; clamp(float2 value, float low, float high)
define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readnone {
  %_high = tail call <2 x float> @smear_2f(float %high) nounwind readnone
  %_low = tail call <2 x float> @smear_2f(float %low) nounwind readnone
  %a = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %_high) nounwind readnone
  %b = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %_low) nounwind readnone
  ret <2 x float> %b
}

; clamp(float value, float low, float high) - scalar, plain compare/select.
; Both compares are ordered, so an unordered (NaN) compare takes the second
; select operand (%high, then %low).
define float @_Z5clampfff(float %value, float %low, float %high) nounwind readnone {
  %1 = fcmp olt float %value, %high
  %2 = select i1 %1, float %value, float %high
  %3 = fcmp ogt float %2, %low
  %4 = select i1 %3, float %2, float %low
  ret float %4
}



; clamp(int4 value, int4 low, int4 high)
define <4 x i32> @_Z5clampDv4_iS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(int4 value, int low, int high)
define <4 x i32> @_Z5clampDv4_iii(<4 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %1 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value, <4 x i32> %_high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %_low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(int3 value, int3 low, int3 high)
define <3 x i32> @_Z5clampDv3_iS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_low = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %a = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(int3 value, int low, int high)
define <3 x i32> @_Z5clampDv3_iii(<3 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %a = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(int2 value, int2 low, int2 high)
define <2 x i32> @_Z5clampDv2_iS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %2 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %1, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %2
}

; clamp(int2 value, int low, int high)
define <2 x i32> @_Z5clampDv2_iii(<2 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %_low = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %a = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %value, <2 x i32> %_high) nounwind readnone
  %b = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a, <2 x i32> %_low) nounwind readnone
  ret <2 x i32> %b
}



; clamp(uint4 value, uint4 low, uint4 high) - unsigned compares.
define <4 x i32> @_Z5clampDv4_jS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(uint4 value, uint low, uint high)
define <4 x i32> @_Z5clampDv4_jjj(<4 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %1 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value, <4 x i32> %_high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %_low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(uint3 value, uint3 low, uint3 high)
define <3 x i32> @_Z5clampDv3_jS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_low = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %a = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(uint3 value, uint low, uint high)
define <3 x i32> @_Z5clampDv3_jjj(<3 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %a = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(uint2 value, uint2 low, uint2 high)
define <2 x i32> @_Z5clampDv2_jS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %2 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %1, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %2
}

; clamp(uint2 value, uint low, uint high)
define <2 x i32> @_Z5clampDv2_jjj(<2 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %_low = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %a = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %value, <2 x i32> %_high) nounwind readnone
  %b = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a, <2 x i32> %_low) nounwind readnone
  ret <2 x i32> %b
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;        FMAX                     ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; fmax(float4, float4)
define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %1
}

; fmax(float4, float) - the scalar is broadcast to all lanes.
define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %2 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
  ret <4 x float> %2
}

; fmax(float3, float3) - padded to 4 lanes; lane 3 is undefined and dropped.
define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %4
}

; fmax(float3, float)
define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %3 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; fmax(float2, float2)
define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %1
}

; fmax(float2, float)
define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %2 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
  ret <2 x float> %2
}

; fmax(float, float) - scalar compare/select.
; NOTE(review): 'ogt' is an ordered compare, so when either operand is NaN
; the select yields %v2. That means fmax(x, NaN) returns NaN while
; fmax(NaN, x) returns x - confirm this asymmetry is acceptable.
define float @_Z4fmaxff(float %v1, float %v2) nounwind readnone {
  %1 = fcmp ogt float %v1, %v2
  %2 = select i1 %1, float %v1, float %v2
  ret float %2
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;        FMIN                     ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; fmin(float4, float4)
define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %1
}

; fmin(float4, float) - the scalar is broadcast to all lanes.
define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %2 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
  ret <4 x float> %2
}

; fmin(float3, float3) - padded to 4 lanes; lane 3 is undefined and dropped.
define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %4
}

; fmin(float3, float)
define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %3 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; fmin(float2, float2)
define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %1
}

; fmin(float2, float)
define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %2 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
  ret <2 x float> %2
}

; fmin(float, float) - scalar compare/select.
; NOTE(review): 'olt' is an ordered compare, so when either operand is NaN
; the select yields %v2 (same asymmetry as the scalar fmax) - confirm.
define float @_Z4fminff(float %v1, float %v2) nounwind readnone {
  %1 = fcmp olt float %v1, %v2
  %2 = select i1 %1, float %v1, float %v2
  ret float %2
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;        MAX                      ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Scalar overloads use a compare/select pair. Narrow vector element types
; are widened to i32 (sext for signed, zext for unsigned), reduced with the
; 32-bit NEON intrinsic and truncated back, except for <4 x i16>, which maps
; directly onto the native 16-bit intrinsic.

; max(char, char)
define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  %1 = icmp sgt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

; max(char2, char2)
define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = sext <2 x i8> %v1 to <2 x i32>
  %2 = sext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

; max(char3, char3)
define <3 x i8> @_Z3maxDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %1 = sext <3 x i8> %v1 to <3 x i32>
  %2 = sext <3 x i8> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

; max(char4, char4)
define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = sext <4 x i8> %v1 to <4 x i32>
  %2 = sext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; max(short, short)
define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  %1 = icmp sgt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

; max(short2, short2)
define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = sext <2 x i16> %v1 to <2 x i32>
  %2 = sext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

; max(short3, short3)
define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = sext <3 x i16> %v1 to <3 x i32>
  %2 = sext <3 x i16> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i16>
  ret <3 x i16> %7
}

; max(short4, short4) - uses the native signed 16-bit NEON max; the result
; is identical to widening to i32, taking the max and truncating.
define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = tail call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %1
}

; max(int, int)
define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp sgt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

; max(int2, int2)
define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

; max(int3, int3)
define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

; max(int4, int4)
define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}

; max(long long, long long)
define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone {
  %1 = icmp sgt i64 %v1, %v2
  %2 = select i1 %1, i64 %v1, i64 %v2
  ret i64 %2
}

; TODO: long vector types

; max(uchar, uchar)
define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  %1 = icmp ugt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

; max(uchar2, uchar2)
define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = zext <2 x i8> %v1 to <2 x i32>
  %2 = zext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

; max(uchar3, uchar3)
define <3 x i8> @_Z3maxDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %1 = zext <3 x i8> %v1 to <3 x i32>
  %2 = zext <3 x i8> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

; max(uchar4, uchar4)
define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = zext <4 x i8> %v1 to <4 x i32>
  %2 = zext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; max(ushort, ushort)
define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  %1 = icmp ugt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

; max(ushort2, ushort2)
define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = zext <2 x i16> %v1 to <2 x i32>
  %2 = zext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

; max(ushort3, ushort3)
define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = zext <3 x i16> %v1 to <3 x i32>
  %2 = zext <3 x i16> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i16>
  ret <3 x i16> %7
}

; max(ushort4, ushort4) - uses the native unsigned 16-bit NEON max; the
; result is identical to zero-extending, taking the max and truncating.
define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = tail call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %1
}

; max(uint, uint)
define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp ugt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

; max(uint2, uint2)
define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

; max(uint3, uint3)
define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

; max(uint4, uint4)
define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}

; max(unsigned long long, unsigned long long)
define i64 @_Z3maxyy(i64 %v1, i64 %v2) nounwind readnone {
  %1 = icmp ugt i64 %v1, %v2
  %2 = select i1 %1, i64 %v1, i64 %v2
  ret i64 %2
}

; TODO: long vector types

; The floating-point max overloads forward to the matching fmax overload.

; max(float, float)
define float @_Z3maxff(float %v1, float %v2) nounwind readnone {
  %1 = tail call float @_Z4fmaxff(float %v1, float %v2)
  ret float %1
}

; max(float2, float2)
define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %1
}

; max(float2, float)
define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %1
}

; max(float3, float3)
define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %1
}

; max(float3, float)
define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %1
}

; max(float4, float4)
define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %1
}

; max(float4, float)
define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %1
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;        MIN                      ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Mirrors the MAX section with vmins / vminu and slt / ult compares.

; min(char, char)
define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  %1 = icmp slt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

; min(char2, char2)
define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = sext <2 x i8> %v1 to <2 x i32>
  %2 = sext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

; min(char3, char3)
define <3 x i8> @_Z3minDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %1 = sext <3 x i8> %v1 to <3 x i32>
  %2 = sext <3 x i8> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

; min(char4, char4)
define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = sext <4 x i8> %v1 to <4 x i32>
  %2 = sext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; min(short, short)
define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  %1 = icmp slt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

; min(short2, short2)
define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = sext <2 x i16> %v1 to <2 x i32>
  %2 = sext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

; min(short3, short3)
define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = sext <3 x i16> %v1 to <3 x i32>
  %2 = sext <3 x i16> %v2 to <3 x i32>
  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i16>
  ret <3 x i16> %7
}

; min(short4, short4) - uses the native signed 16-bit NEON min; the result
; is identical to widening to i32, taking the min and truncating.
define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = tail call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %1
}

; min(int, int)
define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp slt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

; min(int2, int2)
define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

; min(int3, int3)
define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

; min(int4, int4)
define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}

; Signed minimum of two i64 scalars.
define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {
  %v1_is_less = icmp slt i64 %v1, %v2
  %min = select i1 %v1_is_less, i64 %v1, i64 %v2
  ret i64 %min
}

; TODO: long vector types

; Unsigned minimum of two i8 scalars.
define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  %v1_is_less = icmp ult i8 %v1, %v2
  %min = select i1 %v1_is_less, i8 %v1, i8 %v2
  ret i8 %min
}

; Per-lane unsigned minimum of two <2 x i8> vectors.
; Lanes are zero-extended to i32, reduced with the 32-bit unsigned VMIN
; intrinsic, then truncated back to i8.
define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %a = zext <2 x i8> %v1 to <2 x i32>
  %b = zext <2 x i8> %v2 to <2 x i32>
  %min = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind readnone
  %narrow = trunc <2 x i32> %min to <2 x i8>
  ret <2 x i8> %narrow
}

; Per-lane unsigned minimum of two <3 x i8> vectors.
; Lanes are zero-extended to i32 and padded to four lanes (the fourth lane is
; undefined), reduced with the 4-lane intrinsic, then the first three lanes
; are truncated back to i8.
define <3 x i8> @_Z3minDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %a3 = zext <3 x i8> %v1 to <3 x i32>
  %b3 = zext <3 x i8> %v2 to <3 x i32>
  %a4 = shufflevector <3 x i32> %a3, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = shufflevector <3 x i32> %b3, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %narrow = trunc <3 x i32> %min3 to <3 x i8>
  ret <3 x i8> %narrow
}

; Per-lane unsigned minimum of two <4 x i8> vectors (via zero-extension to i32).
define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %a = zext <4 x i8> %v1 to <4 x i32>
  %b = zext <4 x i8> %v2 to <4 x i32>
  %min = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a, <4 x i32> %b) nounwind readnone
  %narrow = trunc <4 x i32> %min to <4 x i8>
  ret <4 x i8> %narrow
}

; Unsigned minimum of two i16 scalars.
define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  %v1_is_less = icmp ult i16 %v1, %v2
  %min = select i1 %v1_is_less, i16 %v1, i16 %v2
  ret i16 %min
}

; Per-lane unsigned minimum of two <2 x i16> vectors (via zero-extension to i32).
define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %a = zext <2 x i16> %v1 to <2 x i32>
  %b = zext <2 x i16> %v2 to <2 x i32>
  %min = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind readnone
  %narrow = trunc <2 x i32> %min to <2 x i16>
  ret <2 x i16> %narrow
}

; Per-lane unsigned minimum of two <3 x i16> vectors.
; Lanes are zero-extended to i32 and padded to four lanes (the fourth lane is
; undefined), reduced with the 4-lane intrinsic, then the first three lanes
; are truncated back to i16.
define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %a3 = zext <3 x i16> %v1 to <3 x i32>
  %b3 = zext <3 x i16> %v2 to <3 x i32>
  %a4 = shufflevector <3 x i32> %a3, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = shufflevector <3 x i32> %b3, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %narrow = trunc <3 x i32> %min3 to <3 x i16>
  ret <3 x i16> %narrow
}

; Per-lane unsigned minimum of two <4 x i16> vectors.
; <4 x i16> fits a 64-bit NEON register, so the 16-bit unsigned VMIN intrinsic
; is applied directly; zero-extending to i32, taking the min and truncating
; back produces the same lanes at the cost of three extra instructions.
define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %min = tail call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %min
}

; Unsigned minimum of two i32 scalars.
define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {
  %v1_is_less = icmp ult i32 %v1, %v2
  %min = select i1 %v1_is_less, i32 %v1, i32 %v2
  ret i32 %min
}

; Per-lane unsigned minimum of two <2 x i32> vectors.
define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %min = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %min
}

; Per-lane unsigned minimum of two <3 x i32> vectors, computed on four lanes
; (the fourth lane is undefined) and narrowed back to three.
define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %a4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %min3
}

; Per-lane unsigned minimum of two <4 x i32> vectors.
define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %min = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %min
}

; Unsigned minimum of two i64 scalars.
define i64 @_Z3minyy(i64 %v1, i64 %v2) nounwind readnone {
  %v1_is_less = icmp ult i64 %v1, %v2
  %min = select i1 %v1_is_less, i64 %v1, i64 %v2
  ret i64 %min
}

; TODO: long vector types

; The float overloads of min() below forward unchanged to the fmin() overload
; with the same parameter types.

define float @_Z3minff(float %v1, float %v2) nounwind readnone {
  %min = tail call float @_Z4fminff(float %v1, float %v2)
  ret float %min
}

define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %min = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %min
}

define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %min = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %min
}

define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %min = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %min
}

define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %min = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %min
}

define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %min = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %min
}

define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %min = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %min
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;                  YUV                   ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Per-lane multipliers applied to (U - 128) and (V - 128).
@yuv_U = internal constant <4 x i32> <i32 0, i32 -100, i32 516, i32 0>, align 16
@yuv_V = internal constant <4 x i32> <i32 409, i32 -208, i32 0, i32 0>, align 16
; Clamp bounds for the 8.8 fixed-point sums. Despite its name, @yuv_255 holds
; 65535 per lane; the value becomes 255 only after the final shift right by 8.
@yuv_0 = internal constant <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
@yuv_255 = internal constant <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, align 16


; Converts one (Y, U, V) byte triple to a <4 x i8> pixel.
; Each lane is computed in 32-bit fixed point as
;   298 * (Y - 16) + @yuv_U[lane] * (U - 128) + @yuv_V[lane] * (V - 128)
; clamped to [0, 65535], shifted right by 8 and truncated to i8.
; NOTE(review): lane 3 has zero U and V multipliers, so it carries only the
; luma term rather than a constant; no rounding bias is added before the
; shift. Confirm both against the intended contract of this function.
define <4 x i8> @_Z18rsYuvToRGBA_uchar4hhh(i8 %pY, i8 %pU, i8 %pV) nounwind readnone alwaysinline {
  %y_wide = zext i8 %pY to i32
  %u_wide = zext i8 %pU to i32
  %v_wide = zext i8 %pV to i32

  ; Remove the offsets and pre-scale luma.
  %y_off = add i32 -16, %y_wide
  %y_scaled = mul i32 298, %y_off
  %u_off = add i32 -128, %u_wide
  %v_off = add i32 -128, %v_wide

  ; Broadcast with the i32 splat helper defined in the HELPERS section.
  %y_vec = tail call <4 x i32> @smear_4i(i32 %y_scaled) nounwind readnone
  %u_vec = tail call <4 x i32> @smear_4i(i32 %u_off) nounwind readnone
  %v_vec = tail call <4 x i32> @smear_4i(i32 %v_off) nounwind readnone

  ; The globals are declared align 16, so the loads may state the same.
  %u_coef = load <4 x i32>* @yuv_U, align 16
  %v_coef = load <4 x i32>* @yuv_V, align 16
  %u_term = mul <4 x i32> %u_vec, %u_coef
  %v_term = mul <4 x i32> %v_vec, %v_coef
  %yu_sum = add <4 x i32> %y_vec, %u_term
  %yuv_sum = add <4 x i32> %yu_sum, %v_term

  ; Disabled earlier variant, kept for reference:
  ;  %r1 = tail call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %_y3, <4 x i32> <i32 8, i32 8, i32 8, i32 8>) nounwind readnone
  ;  %r2 = trunc <4 x i16> %r1 to <4 x i8>
  ;  ret <4 x i8> %r2

  %lo_bound = load <4 x i32>* @yuv_0, align 16
  %hi_bound = load <4 x i32>* @yuv_255, align 16
  %clamped_lo = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %yuv_sum, <4 x i32> %lo_bound) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %clamped_lo, <4 x i32> %hi_bound) nounwind readnone
  %shifted = lshr <4 x i32> %clamped, <i32 8, i32 8, i32 8, i32 8>
  %pixel = trunc <4 x i32> %shifted to <4 x i8>
  ret <4 x i8> %pixel
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;              half_RECIP                ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Scalar reciprocal estimate: %v is placed in lane 0 of a 2-lane vector (lane 1
; stays undefined), VRECPE is applied, and lane 0 is returned.
; Carries the same nounwind/readnone attributes as the vector overloads; the
; body only calls a readnone intrinsic.
define float @_Z10half_recipf(float %v) nounwind readnone {
  %vec = insertelement <2 x float> undef, float %v, i32 0
  %est = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %vec) nounwind readnone
  %lane0 = extractelement <2 x float> %est, i32 0
  ret float %lane0
}

; Per-lane reciprocal estimate of a <2 x float> vector.
define <2 x float> @_Z10half_recip2Dv2_h(<2 x float> %v) nounwind readnone {
  %est = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %v) nounwind readnone
  ret <2 x float> %est
}

; Per-lane reciprocal estimate of a <3 x float> vector, computed on four lanes
; (the fourth lane is undefined) and narrowed back to three.
define <3 x float> @_Z10half_recip3Dv3_h(<3 x float> %v) nounwind readnone {
  %v4 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %est4 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %v4) nounwind readnone
  %est3 = shufflevector <4 x float> %est4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %est3
}

; Per-lane reciprocal estimate of a <4 x float> vector.
define <4 x float> @_Z10half_recip4Dv4_h(<4 x float> %v) nounwind readnone {
  %est = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %v) nounwind readnone
  ret <4 x float> %est
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;              half_SQRT                 ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; The half_sqrt overloads approximate the square root as the reciprocal
; estimate (VRECPE) of the reciprocal-square-root estimate (VRSQRTE).

; Scalar form: %v goes through lane 0 of a 2-lane vector (lane 1 undefined).
; Carries the same nounwind/readnone attributes as the vector overloads.
define float @_Z9half_sqrtf(float %v) nounwind readnone {
  %vec = insertelement <2 x float> undef, float %v, i32 0
  %rsqrt = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %vec) nounwind readnone
  %sqrt = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %rsqrt) nounwind readnone
  %lane0 = extractelement <2 x float> %sqrt, i32 0
  ret float %lane0
}

define <2 x float> @_Z9half_sqrt2Dv2_h(<2 x float> %v) nounwind readnone {
  %rsqrt = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
  %sqrt = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %rsqrt) nounwind readnone
  ret <2 x float> %sqrt
}

; Three-lane form: computed on four lanes (the fourth lane is undefined) and
; narrowed back to three.
define <3 x float> @_Z9half_sqrt3Dv3_h(<3 x float> %v) nounwind readnone {
  %v4 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %rsqrt4 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v4) nounwind readnone
  %sqrt4 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %rsqrt4) nounwind readnone
  %sqrt3 = shufflevector <4 x float> %sqrt4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %sqrt3
}

define <4 x float> @_Z9half_sqrt4Dv4_h(<4 x float> %v) nounwind readnone {
  %rsqrt = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
  %sqrt = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %rsqrt) nounwind readnone
  ret <4 x float> %sqrt
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;              half_RSQRT                ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Scalar reciprocal-square-root estimate: %v goes through lane 0 of a 2-lane
; vector (lane 1 undefined) and VRSQRTE.
; Carries the same nounwind/readnone attributes as the vector overloads.
define float @_Z10half_rsqrtf(float %v) nounwind readnone {
  %vec = insertelement <2 x float> undef, float %v, i32 0
  %est = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %vec) nounwind readnone
  %lane0 = extractelement <2 x float> %est, i32 0
  ret float %lane0
}

; Per-lane reciprocal-square-root estimate of a <2 x float> vector.
define <2 x float> @_Z10half_rsqrt2Dv2_h(<2 x float> %v) nounwind readnone {
  %est = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
  ret <2 x float> %est
}

; Per-lane reciprocal-square-root estimate of a <3 x float> vector, computed on
; four lanes (the fourth lane is undefined) and narrowed back to three.
define <3 x float> @_Z10half_rsqrt3Dv3_h(<3 x float> %v) nounwind readnone {
  %v4 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %est4 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v4) nounwind readnone
  %est3 = shufflevector <4 x float> %est4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %est3
}

; Per-lane reciprocal-square-root estimate of a <4 x float> vector.
define <4 x float> @_Z10half_rsqrt4Dv4_h(<4 x float> %v) nounwind readnone {
  %est = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
  ret <4 x float> %est
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;                matrix                  ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly

; Matrix types: each wraps a flat float array of 16, 9 or 4 elements.
%struct.rs_matrix4x4 = type { [16 x float] }
%struct.rs_matrix3x3 = type { [9 x float] }
%struct.rs_matrix2x2 = type { [4 x float] }

; Broadcasts %in to all four lanes of a <4 x float>.
define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline {
  %lane0 = insertelement <4 x float> undef, float %in, i32 0
  %splat = shufflevector <4 x float> %lane0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}


; 3x3 matrix times <3 x float>.
; Result lane i = in.x * m[i] + in.y * m[3 + i] + in.z * m[6 + i], i = 0..2.
define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
  ; Broadcast every input component across four lanes.
  %in_x = extractelement <3 x float> %in, i32 0
  %in_y = extractelement <3 x float> %in, i32 1
  %in_z = extractelement <3 x float> %in, i32 2
  %xs = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
  %ys = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
  %zs = tail call <4 x float> @smear_f(float %in_z) nounwind readnone

  ; m[0..3]
  %m0_ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
  %m0_raw = bitcast float* %m0_ptr to i8*
  %m0_vec = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %m0_raw, i32 4) nounwind

  ; m[3..6]
  %m3_ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
  %m3_raw = bitcast float* %m3_ptr to i8*
  %m3_vec = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %m3_raw, i32 4) nounwind

  ; The third group is read starting at element 5, so the 4-wide load covers
  ; m[5..8], the last four of the nine elements; the shuffle then drops m[5],
  ; leaving m[6], m[7], m[8] and one undefined lane.
  %m5_ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 5
  %m5_raw = bitcast float* %m5_ptr to i8*
  %m5_vec = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %m5_raw, i32 4) nounwind
  %m6_vec = shufflevector <4 x float> %m5_vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>

  %x_term = fmul <4 x float> %xs, %m0_vec
  %y_term = fmul <4 x float> %ys, %m3_vec
  %xy_sum = fadd <4 x float> %x_term, %y_term
  %z_term = fmul <4 x float> %zs, %m6_vec
  %xyz_sum = fadd <4 x float> %z_term, %xy_sum
  %out = shufflevector <4 x float> %xyz_sum, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %out
}

; 3x3 matrix times <2 x float>.
; Result lane i = in.x * m[i] + in.y * m[3 + i], i = 0..2.
define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {
  %in_x = extractelement <2 x float> %in, i32 0
  %in_y = extractelement <2 x float> %in, i32 1
  %xs = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
  %ys = tail call <4 x float> @smear_f(float %in_y) nounwind readnone

  ; m[0..3]
  %m0_ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
  %m0_vptr = bitcast float* %m0_ptr to <4 x float>*
  %m0_vec = load <4 x float>* %m0_vptr, align 4
  ; m[3..6]
  %m3_ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
  %m3_vptr = bitcast float* %m3_ptr to <4 x float>*
  %m3_vec = load <4 x float>* %m3_vptr, align 4

  %x_term = fmul <4 x float> %xs, %m0_vec
  %y_term = fmul <4 x float> %ys, %m3_vec
  %xy_sum = fadd <4 x float> %x_term, %y_term
  %out = shufflevector <4 x float> %xy_sum, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %out
}

; 4x4 matrix times <4 x float>.
; Result lane i = in.x * m[i] + in.y * m[4 + i] + in.z * m[8 + i] + in.w * m[12 + i].
define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {
  %in_x = extractelement <4 x float> %in, i32 0
  %in_y = extractelement <4 x float> %in, i32 1
  %in_z = extractelement <4 x float> %in, i32 2
  %in_w = extractelement <4 x float> %in, i32 3
  %xs = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
  %ys = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
  %zs = tail call <4 x float> @smear_f(float %in_z) nounwind readnone
  %ws = tail call <4 x float> @smear_f(float %in_w) nounwind readnone

  ; Four consecutive groups of four elements: m[0..3], m[4..7], m[8..11], m[12..15].
  %m0_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
  %m0_vptr = bitcast float* %m0_ptr to <4 x float>*
  %m0_vec = load <4 x float>* %m0_vptr, align 4
  %m4_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
  %m4_vptr = bitcast float* %m4_ptr to <4 x float>*
  %m4_vec = load <4 x float>* %m4_vptr, align 4
  %m8_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
  %m8_vptr = bitcast float* %m8_ptr to <4 x float>*
  %m8_vec = load <4 x float>* %m8_vptr, align 4
  %m12_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
  %m12_vptr = bitcast float* %m12_ptr to <4 x float>*
  %m12_vec = load <4 x float>* %m12_vptr, align 4

  %x_term = fmul <4 x float> %xs, %m0_vec
  %y_term = fmul <4 x float> %ys, %m4_vec
  %xy_sum = fadd <4 x float> %x_term, %y_term
  %z_term = fmul <4 x float> %zs, %m8_vec
  %xyz_sum = fadd <4 x float> %xy_sum, %z_term
  %w_term = fmul <4 x float> %ws, %m12_vec
  %out = fadd <4 x float> %xyz_sum, %w_term
  ret <4 x float> %out
}

; 4x4 matrix times <3 x float>.
; Result lane i = m[12 + i] + in.x * m[i] + in.y * m[4 + i] + in.z * m[8 + i];
; the m[12..15] group is added unscaled.
define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {
  %in_x = extractelement <3 x float> %in, i32 0
  %in_y = extractelement <3 x float> %in, i32 1
  %in_z = extractelement <3 x float> %in, i32 2
  %xs = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
  %ys = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
  %zs = tail call <4 x float> @smear_f(float %in_z) nounwind readnone

  %m0_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
  %m0_vptr = bitcast float* %m0_ptr to <4 x float>*
  %m0_vec = load <4 x float>* %m0_vptr, align 4
  %m4_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
  %m4_vptr = bitcast float* %m4_ptr to <4 x float>*
  %m4_vec = load <4 x float>* %m4_vptr, align 4
  %m8_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
  %m8_vptr = bitcast float* %m8_ptr to <4 x float>*
  %m8_vec = load <4 x float>* %m8_vptr, align 4
  %m12_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
  %m12_vptr = bitcast float* %m12_ptr to <4 x float>*
  %m12_vec = load <4 x float>* %m12_vptr, align 4

  %x_term = fmul <4 x float> %xs, %m0_vec
  %wx_sum = fadd <4 x float> %m12_vec, %x_term
  %y_term = fmul <4 x float> %ys, %m4_vec
  %wxy_sum = fadd <4 x float> %wx_sum, %y_term
  %z_term = fmul <4 x float> %zs, %m8_vec
  %out = fadd <4 x float> %wxy_sum, %z_term
  ret <4 x float> %out
}

; 4x4 matrix times <2 x float>.
; Result lane i = m[12 + i] + in.x * m[i] + in.y * m[4 + i];
; the m[12..15] group is added unscaled and m[8..11] is not read.
define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {
  %in_x = extractelement <2 x float> %in, i32 0
  %in_y = extractelement <2 x float> %in, i32 1
  %xs = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
  %ys = tail call <4 x float> @smear_f(float %in_y) nounwind readnone

  %m0_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
  %m0_vptr = bitcast float* %m0_ptr to <4 x float>*
  %m0_vec = load <4 x float>* %m0_vptr, align 4
  %m4_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
  %m4_vptr = bitcast float* %m4_ptr to <4 x float>*
  %m4_vec = load <4 x float>* %m4_vptr, align 4
  %m12_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
  %m12_vptr = bitcast float* %m12_ptr to <4 x float>*
  %m12_vec = load <4 x float>* %m12_vptr, align 4

  %x_term = fmul <4 x float> %xs, %m0_vec
  %wx_sum = fadd <4 x float> %m12_vec, %x_term
  %y_term = fmul <4 x float> %ys, %m4_vec
  %out = fadd <4 x float> %wx_sum, %y_term
  ret <4 x float> %out
}



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;              pixel ops                 ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


; Splat constants used when packing float colors.
@fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16
@fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16
@fc_0 = internal constant <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, align 16

declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone
declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone

; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
; Computes clamp(255 * color + 0.5, 0, 255) per lane and converts the result
; with convert_uchar4.
define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone {
  %max255 = load <4 x float>* @fc_255.0, align 16
  %bias = load <4 x float>* @fc_0.5, align 16
  %zero = load <4 x float>* @fc_0, align 16
  %scaled = fmul <4 x float> %max255, %color
  %biased = fadd <4 x float> %bias, %scaled
  %clamped = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %biased, <4 x float> %zero, <4 x float> %max255) nounwind readnone
  %packed = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %clamped) nounwind readnone
  ret <4 x i8> %packed
}

; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
; Extends the color with a fourth component of 1.0 and packs it with the
; float4 overload.
define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<3 x float> %color) nounwind readnone {
  ; Mask index 3 selects lane 0 of the second operand, i.e. the constant 1.0.
  %rgba = shufflevector <3 x float> %color, <3 x float> <float 1.0, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
  ret <4 x i8> %packed
}

; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
; Builds <r, g, b, 1.0> and packs it with the float4 overload.
define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone {
  %with_r = insertelement <4 x float> <float undef, float undef, float undef, float 1.0>, float %r, i32 0
  %with_rg = insertelement <4 x float> %with_r, float %g, i32 1
  %rgba = insertelement <4 x float> %with_rg, float %b, i32 2
  %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
  ret <4 x i8> %packed
}

; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
; Builds <r, g, b, a> and packs it with the float4 overload.
define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone {
  %with_a = insertelement <4 x float> undef, float %a, i32 3
  %with_ba = insertelement <4 x float> %with_a, float %b, i32 2
  %with_gba = insertelement <4 x float> %with_ba, float %g, i32 1
  %rgba = insertelement <4 x float> %with_gba, float %r, i32 0
  %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
  ret <4 x i8> %packed
}
