Home | History | Annotate | Download | only in arch
      1 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
      2 target triple = "armv7-none-linux-gnueabi"
      3 
      4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      5 ;;;;;;;;;               INTRINSICS               ;;;;;;;;;;
      6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      7 
      8 declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
      9 declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
     10 declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     11 declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     12 declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     13 declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     14 declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     15 declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     16 
     17 declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
     18 declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
     19 declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     20 declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     21 declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     22 declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     23 declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     24 declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     25 
     26 declare <8 x i8>  @llvm.arm.neon.vqshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
     27 declare <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
     28 declare <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
     29 
     30 declare <8 x i8>  @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
     31 declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
     32 declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
     33 
     34 declare <8 x i8>  @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
     35 declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
     36 declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
     37 
     38 declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone
     39 declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
     40 
     41 declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
     42 declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
     43 
     44 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     45 ;;;;;;;;;                HELPERS                 ;;;;;;;;;;
     46 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     47 
     48 define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
     49   %1 = insertelement <4 x float> undef, float %in, i32 0
     50   %2 = insertelement <4 x float> %1, float %in, i32 1
     51   %3 = insertelement <4 x float> %2, float %in, i32 2
     52   %4 = insertelement <4 x float> %3, float %in, i32 3
     53   ret <4 x float> %4
     54 }
     55 
     56 define internal <4 x i32> @smear_4i(i32 %in) nounwind readnone alwaysinline {
     57   %1 = insertelement <4 x i32> undef, i32 %in, i32 0
     58   %2 = insertelement <4 x i32> %1, i32 %in, i32 1
     59   %3 = insertelement <4 x i32> %2, i32 %in, i32 2
     60   %4 = insertelement <4 x i32> %3, i32 %in, i32 3
     61   ret <4 x i32> %4
     62 }
     63 
     64 define internal <4 x i16> @smear_4s(i16 %in) nounwind readnone alwaysinline {
     65   %1 = insertelement <4 x i16> undef, i16 %in, i32 0
     66   %2 = insertelement <4 x i16> %1, i16 %in, i32 1
     67   %3 = insertelement <4 x i16> %2, i16 %in, i32 2
     68   %4 = insertelement <4 x i16> %3, i16 %in, i32 3
     69   ret <4 x i16> %4
     70 }
     71 
     72 
     73 
     74 define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
     75   %1 = insertelement <2 x float> undef, float %in, i32 0
     76   %2 = insertelement <2 x float> %1, float %in, i32 1
     77   ret <2 x float> %2
     78 }
     79 
     80 define internal <2 x i32> @smear_2i(i32 %in) nounwind readnone alwaysinline {
     81   %1 = insertelement <2 x i32> undef, i32 %in, i32 0
     82   %2 = insertelement <2 x i32> %1, i32 %in, i32 1
     83   ret <2 x i32> %2
     84 }
     85 
     86 define internal <2 x i16> @smear_2s(i16 %in) nounwind readnone alwaysinline {
     87   %1 = insertelement <2 x i16> undef, i16 %in, i32 0
     88   %2 = insertelement <2 x i16> %1, i16 %in, i32 1
     89   ret <2 x i16> %2
     90 }
     91 
     92 
     93 define internal <4 x i32> @smear_4i32(i32 %in) nounwind readnone alwaysinline {
     94   %1 = insertelement <4 x i32> undef, i32 %in, i32 0
     95   %2 = insertelement <4 x i32> %1, i32 %in, i32 1
     96   %3 = insertelement <4 x i32> %2, i32 %in, i32 2
     97   %4 = insertelement <4 x i32> %3, i32 %in, i32 3
     98   ret <4 x i32> %4
     99 }
    100 
    101 
    102 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    103 ;;;;;;;;;                 CLAMP                  ;;;;;;;;;;
    104 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    105 
    106 define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readonly {
    107   %1 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
    108   %2 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %low) nounwind readnone
    109   ret <4 x float> %2
    110 }
    111 
    112 define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readonly {
    113   %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
    114   %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
    115   %out = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %_low, <4 x float> %_high) nounwind readonly
    116   ret <4 x float> %out
    117 }
    118 
    119 define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readonly {
    120   %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    121   %_low = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    122   %_high = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    123   %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
    124   %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
    125   %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    126   ret <3 x float> %c
    127 }
    128 
    129 define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readonly {
    130   %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    131   %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
    132   %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
    133   %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
    134   %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
    135   %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    136   ret <3 x float> %c
    137 }
    138 
    139 define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readonly {
    140   %1 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
    141   %2 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %1, <2 x float> %low) nounwind readnone
    142   ret <2 x float> %2
    143 }
    144 
    145 define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readonly {
    146   %_high = tail call <2 x float> @smear_2f(float %high) nounwind readnone
    147   %_low = tail call <2 x float> @smear_2f(float %low) nounwind readnone
    148   %a = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %_high) nounwind readnone
    149   %b = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %_low) nounwind readnone
    150   ret <2 x float> %b
    151 }
    152 
    153 define float @_Z5clampfff(float %value, float %low, float %high) nounwind readonly {
    154   %1 = fcmp olt float %value, %high
    155   %2 = select i1 %1, float %value, float %high
    156   %3 = fcmp ogt float %2, %low
    157   %4 = select i1 %3, float %2, float %low
    158   ret float %4
    159 }
    160 
    161 
    162 
    163 define <4 x i32> @_Z5clampDv4_iS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readonly {
    164   %1 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
    165   %2 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %low) nounwind readnone
    166   ret <4 x i32> %2
    167 }
    168 
    169 define <4 x i32> @_Z5clampDv4_iii(<4 x i32> %value, i32 %low, i32 %high) nounwind readonly {
    170   %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
    171   %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
    172   %1 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value, <4 x i32> %_high) nounwind readnone
    173   %2 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %_low) nounwind readnone
    174   ret <4 x i32> %2
    175 }
    176 
    177 define <3 x i32> @_Z5clampDv3_iS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readonly {
    178   %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    179   %_low = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    180   %_high = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    181   %a = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
    182   %b = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
    183   %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    184   ret <3 x i32> %c
    185 }
    186 
    187 define <3 x i32> @_Z5clampDv3_iii(<3 x i32> %value, i32 %low, i32 %high) nounwind readonly {
    188   %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    189   %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
    190   %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
    191   %a = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
    192   %b = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
    193   %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    194   ret <3 x i32> %c
    195 }
    196 
    197 define <2 x i32> @_Z5clampDv2_iS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readonly {
    198   %1 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
    199   %2 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %1, <2 x i32> %low) nounwind readnone
    200   ret <2 x i32> %2
    201 }
    202 
    203 define <2 x i32> @_Z5clampDv2_iii(<2 x i32> %value, i32 %low, i32 %high) nounwind readonly {
    204   %_high = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
    205   %_low = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
    206   %a = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %value, <2 x i32> %_high) nounwind readnone
    207   %b = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a, <2 x i32> %_low) nounwind readnone
    208   ret <2 x i32> %b
    209 }
    210 
    211 
    212 
    213 define <4 x i32> @_Z5clampDv4_jS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readonly {
    214   %1 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
    215   %2 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %low) nounwind readnone
    216   ret <4 x i32> %2
    217 }
    218 
    219 define <4 x i32> @_Z5clampDv4_jjj(<4 x i32> %value, i32 %low, i32 %high) nounwind readonly {
    220   %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
    221   %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
    222   %1 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value, <4 x i32> %_high) nounwind readnone
    223   %2 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %_low) nounwind readnone
    224   ret <4 x i32> %2
    225 }
    226 
    227 define <3 x i32> @_Z5clampDv3_jS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readonly {
    228   %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    229   %_low = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    230   %_high = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    231   %a = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
    232   %b = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
    233   %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    234   ret <3 x i32> %c
    235 }
    236 
    237 define <3 x i32> @_Z5clampDv3_jjj(<3 x i32> %value, i32 %low, i32 %high) nounwind readonly {
    238   %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    239   %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
    240   %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
    241   %a = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
    242   %b = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
    243   %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    244   ret <3 x i32> %c
    245 }
    246 
    247 define <2 x i32> @_Z5clampDv2_jS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readonly {
    248   %1 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
    249   %2 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %1, <2 x i32> %low) nounwind readnone
    250   ret <2 x i32> %2
    251 }
    252 
    253 define <2 x i32> @_Z5clampDv2_jjj(<2 x i32> %value, i32 %low, i32 %high) nounwind readonly {
    254   %_high = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
    255   %_low = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
    256   %a = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %value, <2 x i32> %_high) nounwind readnone
    257   %b = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a, <2 x i32> %_low) nounwind readnone
    258   ret <2 x i32> %b
    259 }
    260 
    261 
    262 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    263 ;;;;;;;;;                  FMAX                  ;;;;;;;;;;
    264 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    265 
    266 define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readonly {
    267   %1 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
    268   ret <4 x float> %1
    269 }
    270 
    271 define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readonly {
    272   %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
    273   %2 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
    274   ret <4 x float> %2
    275 }
    276 
    277 define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readonly {
    278   %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    279   %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    280   %3 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
    281   %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    282   ret <3 x float> %4
    283 }
    284 
    285 define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readonly {
    286   %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    287   %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
    288   %3 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
    289   %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    290   ret <3 x float> %c
    291 }
    292 
    293 define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readonly {
    294   %1 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
    295   ret <2 x float> %1
    296 }
    297 
    298 define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readonly {
    299   %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
    300   %2 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
    301   ret <2 x float> %2
    302 }
    303 
    304 define float @_Z4fmaxff(float %v1, float %v2) nounwind readonly {
    305   %1 = fcmp ogt float %v1, %v2
    306   %2 = select i1 %1, float %v1, float %v2
    307   ret float %2
    308 }
    309 
    310 
    311 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    312 ;;;;;;;;;                  FMIN                  ;;;;;;;;;;
    313 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    314 
    315 define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readonly {
    316   %1 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
    317   ret <4 x float> %1
    318 }
    319 
    320 define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readonly {
    321   %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
    322   %2 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
    323   ret <4 x float> %2
    324 }
    325 
    326 define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readonly {
    327   %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    328   %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    329   %3 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
    330   %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    331   ret <3 x float> %4
    332 }
    333 
    334 define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readonly {
    335   %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    336   %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
    337   %3 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
    338   %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    339   ret <3 x float> %c
    340 }
    341 
    342 define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readonly {
    343   %1 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
    344   ret <2 x float> %1
    345 }
    346 
    347 define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readonly {
    348   %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
    349   %2 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
    350   ret <2 x float> %2
    351 }
    352 
    353 define float @_Z4fminff(float %v1, float %v2) nounwind readnone {
    354   %1 = fcmp olt float %v1, %v2
    355   %2 = select i1 %1, float %v1, float %v2
    356   ret float %2
    357 }
    358 
    359 
    360 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    361 ;;;;;;;;;                  MAX                   ;;;;;;;;;;
    362 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    363 
    364 define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone {
    365   %1 = icmp sgt i8 %v1, %v2
    366   %2 = select i1 %1, i8 %v1, i8 %v2
    367   ret i8 %2
    368 }
    369 
    370 define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
    371   %1 = sext <2 x i8> %v1 to <2 x i32>
    372   %2 = sext <2 x i8> %v2 to <2 x i32>
    373   %3 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
    374   %4 = trunc <2 x i32> %3 to <2 x i8>
    375   ret <2 x i8> %4
    376 }
    377 
    378 define <3 x i8> @_Z3maxDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
    379   %1 = sext <3 x i8> %v1 to <3 x i32>
    380   %2 = sext <3 x i8> %v2 to <3 x i32>
    381   %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    382   %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    383   %5 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
    384   %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    385   %7 = trunc <3 x i32> %6 to <3 x i8>
    386   ret <3 x i8> %7
    387 }
    388 
    389 define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
    390   %1 = sext <4 x i8> %v1 to <4 x i32>
    391   %2 = sext <4 x i8> %v2 to <4 x i32>
    392   %3 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    393   %4 = trunc <4 x i32> %3 to <4 x i8>
    394   ret <4 x i8> %4
    395 }
    396 
    397 define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone {
    398   %1 = icmp sgt i16 %v1, %v2
    399   %2 = select i1 %1, i16 %v1, i16 %v2
    400   ret i16 %2
    401 }
    402 
    403 define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
    404   %1 = sext <2 x i16> %v1 to <2 x i32>
    405   %2 = sext <2 x i16> %v2 to <2 x i32>
    406   %3 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
    407   %4 = trunc <2 x i32> %3 to <2 x i16>
    408   ret <2 x i16> %4
    409 }
    410 
    411 define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
    412   %1 = sext <3 x i16> %v1 to <3 x i32>
    413   %2 = sext <3 x i16> %v2 to <3 x i32>
    414   %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    415   %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    416   %5 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
    417   %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    418   %7 = trunc <3 x i32> %6 to <3 x i16>
    419   ret <3 x i16> %7
    420 }
    421 
    422 define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
    423   %1 = sext <4 x i16> %v1 to <4 x i32>
    424   %2 = sext <4 x i16> %v2 to <4 x i32>
    425   %3 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    426   %4 = trunc <4 x i32> %3 to <4 x i16>
    427   ret <4 x i16> %4
    428 }
    429 
    430 define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone {
    431   %1 = icmp sgt i32 %v1, %v2
    432   %2 = select i1 %1, i32 %v1, i32 %v2
    433   ret i32 %2
    434 }
    435 
    436 define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
    437   %1 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
    438   ret <2 x i32> %1
    439 }
    440 
    441 define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
    442   %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    443   %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    444   %3 = tail call <4 x i32   > @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    445   %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    446   ret <3 x i32> %4
    447 }
    448 
    449 define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
    450   %1 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
    451   ret <4 x i32> %1
    452 }
    453 
    454 define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone {
    455   %1 = icmp sgt i64 %v1, %v2
    456   %2 = select i1 %1, i64 %v1, i64 %v2
    457   ret i64 %2
    458 }
    459 
; TODO: add max() overloads for signed long (i64) vector types.
    461 
    462 define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
    463   %1 = icmp ugt i8 %v1, %v2
    464   %2 = select i1 %1, i8 %v1, i8 %v2
    465   ret i8 %2
    466 }
    467 
    468 define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
    469   %1 = zext <2 x i8> %v1 to <2 x i32>
    470   %2 = zext <2 x i8> %v2 to <2 x i32>
    471   %3 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
    472   %4 = trunc <2 x i32> %3 to <2 x i8>
    473   ret <2 x i8> %4
    474 }
    475 
    476 define <3 x i8> @_Z3maxDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
    477   %1 = zext <3 x i8> %v1 to <3 x i32>
    478   %2 = zext <3 x i8> %v2 to <3 x i32>
    479   %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    480   %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    481   %5 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
    482   %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    483   %7 = trunc <3 x i32> %6 to <3 x i8>
    484   ret <3 x i8> %7
    485 }
    486 
    487 define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
    488   %1 = zext <4 x i8> %v1 to <4 x i32>
    489   %2 = zext <4 x i8> %v2 to <4 x i32>
    490   %3 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    491   %4 = trunc <4 x i32> %3 to <4 x i8>
    492   ret <4 x i8> %4
    493 }
    494 
    495 define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
    496   %1 = icmp ugt i16 %v1, %v2
    497   %2 = select i1 %1, i16 %v1, i16 %v2
    498   ret i16 %2
    499 }
    500 
    501 define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
    502   %1 = zext <2 x i16> %v1 to <2 x i32>
    503   %2 = zext <2 x i16> %v2 to <2 x i32>
    504   %3 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
    505   %4 = trunc <2 x i32> %3 to <2 x i16>
    506   ret <2 x i16> %4
    507 }
    508 
    509 define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
    510   %1 = zext <3 x i16> %v1 to <3 x i32>
    511   %2 = zext <3 x i16> %v2 to <3 x i32>
    512   %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    513   %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    514   %5 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
    515   %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    516   %7 = trunc <3 x i32> %6 to <3 x i16>
    517   ret <3 x i16> %7
    518 }
    519 
    520 define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
    521   %1 = zext <4 x i16> %v1 to <4 x i32>
    522   %2 = zext <4 x i16> %v2 to <4 x i32>
    523   %3 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    524   %4 = trunc <4 x i32> %3 to <4 x i16>
    525   ret <4 x i16> %4
    526 }
    527 
    528 define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone {
    529   %1 = icmp ugt i32 %v1, %v2
    530   %2 = select i1 %1, i32 %v1, i32 %v2
    531   ret i32 %2
    532 }
    533 
    534 define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
    535   %1 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
    536   ret <2 x i32> %1
    537 }
    538 
    539 define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
    540   %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    541   %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    542   %3 = tail call <4 x i32   > @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    543   %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    544   ret <3 x i32> %4
    545 }
    546 
    547 define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
    548   %1 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
    549   ret <4 x i32> %1
    550 }
    551 
    552 define i64 @_Z3maxyy(i64 %v1, i64 %v2) nounwind readnone {
    553   %1 = icmp ugt i64 %v1, %v2
    554   %2 = select i1 %1, i64 %v1, i64 %v2
    555   ret i64 %2
    556 }
    557 
; TODO: add max() overloads for unsigned long (i64) vector types.
    559 
    560 define float @_Z3maxff(float %v1, float %v2) nounwind readnone {
    561   %1 = tail call float @_Z4fmaxff(float %v1, float %v2)
    562   ret float %1
    563 }
    564 
    565 define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
    566   %1 = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
    567   ret <2 x float> %1
    568 }
    569 
    570 define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
    571   %1 = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
    572   ret <2 x float> %1
    573 }
    574 
    575 define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
    576   %1 = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
    577   ret <3 x float> %1
    578 }
    579 
    580 define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
    581   %1 = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
    582   ret <3 x float> %1
    583 }
    584 
    585 define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
    586   %1 = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
    587   ret <4 x float> %1
    588 }
    589 
    590 define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
    591   %1 = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
    592   ret <4 x float> %1
    593 }
    594 
    595 
    596 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    597 ;;;;;;;;;                  MIN                   ;;;;;;;;;;
    598 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    599 
    600 define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {
    601   %1 = icmp slt i8 %v1, %v2
    602   %2 = select i1 %1, i8 %v1, i8 %v2
    603   ret i8 %2
    604 }
    605 
    606 define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
    607   %1 = sext <2 x i8> %v1 to <2 x i32>
    608   %2 = sext <2 x i8> %v2 to <2 x i32>
    609   %3 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
    610   %4 = trunc <2 x i32> %3 to <2 x i8>
    611   ret <2 x i8> %4
    612 }
    613 
    614 define <3 x i8> @_Z3minDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
    615   %1 = sext <3 x i8> %v1 to <3 x i32>
    616   %2 = sext <3 x i8> %v2 to <3 x i32>
    617   %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    618   %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    619   %5 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
    620   %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    621   %7 = trunc <3 x i32> %6 to <3 x i8>
    622   ret <3 x i8> %7
    623 }
    624 
    625 define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
    626   %1 = sext <4 x i8> %v1 to <4 x i32>
    627   %2 = sext <4 x i8> %v2 to <4 x i32>
    628   %3 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    629   %4 = trunc <4 x i32> %3 to <4 x i8>
    630   ret <4 x i8> %4
    631 }
    632 
    633 define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {
    634   %1 = icmp slt i16 %v1, %v2
    635   %2 = select i1 %1, i16 %v1, i16 %v2
    636   ret i16 %2
    637 }
    638 
    639 define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
    640   %1 = sext <2 x i16> %v1 to <2 x i32>
    641   %2 = sext <2 x i16> %v2 to <2 x i32>
    642   %3 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
    643   %4 = trunc <2 x i32> %3 to <2 x i16>
    644   ret <2 x i16> %4
    645 }
    646 
    647 define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
    648   %1 = sext <3 x i16> %v1 to <3 x i32>
    649   %2 = sext <3 x i16> %v2 to <3 x i32>
    650   %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    651   %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    652   %5 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
    653   %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    654   %7 = trunc <3 x i32> %6 to <3 x i16>
    655   ret <3 x i16> %7
    656 }
    657 
    658 define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
    659   %1 = sext <4 x i16> %v1 to <4 x i32>
    660   %2 = sext <4 x i16> %v2 to <4 x i32>
    661   %3 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    662   %4 = trunc <4 x i32> %3 to <4 x i16>
    663   ret <4 x i16> %4
    664 }
    665 
    666 define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {
    667   %1 = icmp slt i32 %v1, %v2
    668   %2 = select i1 %1, i32 %v1, i32 %v2
    669   ret i32 %2
    670 }
    671 
    672 define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
    673   %1 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
    674   ret <2 x i32> %1
    675 }
    676 
    677 define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
    678   %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    679   %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    680   %3 = tail call <4 x i32   > @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    681   %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    682   ret <3 x i32> %4
    683 }
    684 
    685 define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
    686   %1 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
    687   ret <4 x i32> %1
    688 }
    689 
    690 define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {
    691   %1 = icmp slt i64 %v1, %v2
    692   %2 = select i1 %1, i64 %v1, i64 %v2
    693   ret i64 %2
    694 }
    695 
    696 ; TODO:  long vector types
    697 
    698 define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
    699   %1 = icmp ult i8 %v1, %v2
    700   %2 = select i1 %1, i8 %v1, i8 %v2
    701   ret i8 %2
    702 }
    703 
    704 define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
    705   %1 = zext <2 x i8> %v1 to <2 x i32>
    706   %2 = zext <2 x i8> %v2 to <2 x i32>
    707   %3 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
    708   %4 = trunc <2 x i32> %3 to <2 x i8>
    709   ret <2 x i8> %4
    710 }
    711 
    712 define <3 x i8> @_Z3minDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
    713   %1 = zext <3 x i8> %v1 to <3 x i32>
    714   %2 = zext <3 x i8> %v2 to <3 x i32>
    715   %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    716   %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    717   %5 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
    718   %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    719   %7 = trunc <3 x i32> %6 to <3 x i8>
    720   ret <3 x i8> %7
    721 }
    722 
    723 define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
    724   %1 = zext <4 x i8> %v1 to <4 x i32>
    725   %2 = zext <4 x i8> %v2 to <4 x i32>
    726   %3 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    727   %4 = trunc <4 x i32> %3 to <4 x i8>
    728   ret <4 x i8> %4
    729 }
    730 
    731 define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
    732   %1 = icmp ult i16 %v1, %v2
    733   %2 = select i1 %1, i16 %v1, i16 %v2
    734   ret i16 %2
    735 }
    736 
    737 define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
    738   %1 = zext <2 x i16> %v1 to <2 x i32>
    739   %2 = zext <2 x i16> %v2 to <2 x i32>
    740   %3 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
    741   %4 = trunc <2 x i32> %3 to <2 x i16>
    742   ret <2 x i16> %4
    743 }
    744 
    745 define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
    746   %1 = zext <3 x i16> %v1 to <3 x i32>
    747   %2 = zext <3 x i16> %v2 to <3 x i32>
    748   %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    749   %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    750   %5 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
    751   %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    752   %7 = trunc <3 x i32> %6 to <3 x i16>
    753   ret <3 x i16> %7
    754 }
    755 
    756 define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
    757   %1 = zext <4 x i16> %v1 to <4 x i32>
    758   %2 = zext <4 x i16> %v2 to <4 x i32>
    759   %3 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    760   %4 = trunc <4 x i32> %3 to <4 x i16>
    761   ret <4 x i16> %4
    762 }
    763 
    764 define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {
    765   %1 = icmp ult i32 %v1, %v2
    766   %2 = select i1 %1, i32 %v1, i32 %v2
    767   ret i32 %2
    768 }
    769 
    770 define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
    771   %1 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
    772   ret <2 x i32> %1
    773 }
    774 
    775 define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
    776   %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    777   %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    778   %3 = tail call <4 x i32   > @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
    779   %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    780   ret <3 x i32> %4
    781 }
    782 
    783 define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
    784   %1 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
    785   ret <4 x i32> %1
    786 }
    787 
    788 define i64 @_Z3minyy(i64 %v1, i64 %v2) nounwind readnone {
    789   %1 = icmp ult i64 %v1, %v2
    790   %2 = select i1 %1, i64 %v1, i64 %v2
    791   ret i64 %2
    792 }
    793 
    794 ; TODO:  long vector types
    795 
    796 define float @_Z3minff(float %v1, float %v2) nounwind readnone {
    797   %1 = tail call float @_Z4fminff(float %v1, float %v2)
    798   ret float %1
    799 }
    800 
    801 define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
    802   %1 = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2)
    803   ret <2 x float> %1
    804 }
    805 
    806 define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
    807   %1 = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2)
    808   ret <2 x float> %1
    809 }
    810 
    811 define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
    812   %1 = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2)
    813   ret <3 x float> %1
    814 }
    815 
    816 define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
    817   %1 = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2)
    818   ret <3 x float> %1
    819 }
    820 
    821 define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
    822   %1 = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2)
    823   ret <4 x float> %1
    824 }
    825 
    826 define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
    827   %1 = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2)
    828   ret <4 x float> %1
    829 }
    830 
    831 
    832 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    833 ;;;;;;;;;                  YUV                   ;;;;;;;;;;
    834 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    835 
    836 @yuv_U = internal constant <4 x i32> <i32 0, i32 -100, i32 516, i32 0>, align 16
    837 @yuv_V = internal constant <4 x i32> <i32 409, i32 -208, i32 0, i32 0>, align 16
    838 @yuv_0 = internal constant <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
    839 @yuv_255 = internal constant <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, align 16
    840 
    841 
    842 define <4 x i8> @_Z18rsYuvToRGBA_uchar4hhh(i8 %pY, i8 %pU, i8 %pV) nounwind readnone alwaysinline {
    843   %_sy = zext i8 %pY to i32
    844   %_su = zext i8 %pU to i32
    845   %_sv = zext i8 %pV to i32
    846 
    847   %_sy2 = add i32 -16, %_sy
    848   %_sy3 = mul i32 298, %_sy2
    849   %_su2 = add i32 -128, %_su
    850   %_sv2 = add i32 -128, %_sv
    851   %_y = tail call <4 x i32> @smear_4i32(i32 %_sy3) nounwind readnone
    852   %_u = tail call <4 x i32> @smear_4i32(i32 %_su2) nounwind readnone
    853   %_v = tail call <4 x i32> @smear_4i32(i32 %_sv2) nounwind readnone
    854 
    855   %mu = load <4 x i32>* @yuv_U, align 8
    856   %mv = load <4 x i32>* @yuv_V, align 8
    857   %_u2 = mul <4 x i32> %_u, %mu
    858   %_v2 = mul <4 x i32> %_v, %mv
    859   %_y2 = add <4 x i32> %_y, %_u2
    860   %_y3 = add <4 x i32> %_y2, %_v2
    861 
    862  ; %r1 = tail call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %_y3, <4 x i32> <i32 8, i32 8, i32 8, i32 8>) nounwind readnone
    863 ;  %r2 = trunc <4 x i16> %r1 to <4 x i8>
    864 ;  ret <4 x i8> %r2
    865 
    866   %c0 = load <4 x i32>* @yuv_0, align 8
    867   %c255 = load <4 x i32>* @yuv_255, align 8
    868   %r1 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %_y3, <4 x i32> %c0) nounwind readnone
    869   %r2 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %r1, <4 x i32> %c255) nounwind readnone
    870   %r3 = lshr <4 x i32> %r2, <i32 8, i32 8, i32 8, i32 8>
    871   %r4 = trunc <4 x i32> %r3 to <4 x i8>
    872   ret <4 x i8> %r4
    873 }
    874 
    875 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    876 ;;;;;;;;;              half_RECIP              ;;;;;;;;;;
    877 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    878 
    879 define float @_Z10half_recipf(float %v) {
    880   %1 = insertelement <2 x float> undef, float %v, i32 0
    881   %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
    882   %3 = extractelement <2 x float> %2, i32 0
    883   ret float %3
    884 }
    885 
    886 define <2 x float> @_Z10half_recip2Dv2_h(<2 x float> %v) nounwind readnone {
    887   %1 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %v) nounwind readnone
    888   ret <2 x float> %1
    889 }
    890 
    891 define <3 x float> @_Z10half_recip3Dv3_h(<3 x float> %v) nounwind readnone {
    892   %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    893   %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
    894   %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    895   ret <3 x float> %3
    896 }
    897 
    898 define <4 x float> @_Z10half_recip4Dv4_h(<4 x float> %v) nounwind readnone {
    899   %1 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %v) nounwind readnone
    900   ret <4 x float> %1
    901 }
    902 
    903 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    904 ;;;;;;;;;              half_SQRT               ;;;;;;;;;;
    905 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    906 
    907 define float @_Z9half_sqrtf(float %v) {
    908   %1 = insertelement <2 x float> undef, float %v, i32 0
    909   %2 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %1) nounwind readnone
    910   %3 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %2) nounwind readnone
    911   %4 = extractelement <2 x float> %3, i32 0
    912   ret float %4
    913 }
    914 
    915 define <2 x float> @_Z9half_sqrt2Dv2_h(<2 x float> %v) nounwind readnone {
    916   %1 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
    917   %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
    918   ret <2 x float> %2
    919 }
    920 
    921 define <3 x float> @_Z9half_sqrt3Dv3_h(<3 x float> %v) nounwind readnone {
    922   %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    923   %2 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %1) nounwind readnone
    924   %3 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %2) nounwind readnone
    925   %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    926   ret <3 x float> %4
    927 }
    928 
    929 define <4 x float> @_Z9half_sqrt4Dv4_h(<4 x float> %v) nounwind readnone {
    930   %1 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
    931   %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
    932   ret <4 x float> %2
    933 }
    934 
    935 
    936 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    937 ;;;;;;;;;              half_RSQRT              ;;;;;;;;;;
    938 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    939 
    940 define float @_Z10half_rsqrtf(float %v) {
    941   %1 = insertelement <2 x float> undef, float %v, i32 0
    942   %2 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %1) nounwind readnone
    943   %3 = extractelement <2 x float> %2, i32 0
    944   ret float %3
    945 }
    946 
    947 define <2 x float> @_Z10half_rsqrt2Dv2_h(<2 x float> %v) nounwind readnone {
    948   %1 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
    949   ret <2 x float> %1
    950 }
    951 
    952 define <3 x float> @_Z10half_rsqrt3Dv3_h(<3 x float> %v) nounwind readnone {
    953   %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    954   %2 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %1) nounwind readnone
    955   %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    956   ret <3 x float> %3
    957 }
    958 
    959 define <4 x float> @_Z10half_rsqrt4Dv4_h(<4 x float> %v) nounwind readnone {
    960   %1 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
    961   ret <4 x float> %1
    962 }
    963 
    964 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    965 ;;;;;;;;;              matrix                    ;;;;;;;;;;
    966 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    967 
    968 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
    969 
    970 %struct.rs_matrix4x4 = type { [16 x float] }
    971 %struct.rs_matrix3x3 = type { [9 x float] }
    972 %struct.rs_matrix2x2 = type { [4 x float] }
    973 
    974 define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline {
    975   %1 = insertelement <4 x float> undef, float %in, i32 0
    976   %2 = insertelement <4 x float> %1, float %in, i32 1
    977   %3 = insertelement <4 x float> %2, float %in, i32 2
    978   %4 = insertelement <4 x float> %3, float %in, i32 3
    979   ret <4 x float> %4
    980 }
    981 
    982 
    983 define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
    984   %x0 = extractelement <3 x float> %in, i32 0
    985   %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
    986   %y0 = extractelement <3 x float> %in, i32 1
    987   %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
    988   %z0 = extractelement <3 x float> %in, i32 2
    989   %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
    990 
    991   %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
    992   %px2 = bitcast float* %px to i8*
    993   %xm = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %px2, i32 4) nounwind
    994 
    995   %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
    996   %py2 = bitcast float* %py to i8*
    997   %ym = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %py2, i32 4) nounwind
    998 
    999   %pz = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 5
   1000   %pz2 = bitcast float* %pz to i8*
   1001   %zm2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %pz2, i32 4) nounwind
   1002   %zm = shufflevector <4 x float> %zm2, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   1003 
   1004   %a1 = fmul <4 x float> %x, %xm
   1005   %a2 = fmul <4 x float> %y, %ym
   1006   %a3 = fadd <4 x float> %a1, %a2
   1007   %a4 = fmul <4 x float> %z, %zm
   1008   %a5 = fadd <4 x float> %a4, %a3
   1009   %a6 = shufflevector <4 x float> %a5, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
   1010   ret <3 x float> %a6
   1011 }
   1012 
   1013 define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {
   1014   %x0 = extractelement <2 x float> %in, i32 0
   1015   %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
   1016   %y0 = extractelement <2 x float> %in, i32 1
   1017   %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
   1018 
   1019   %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
   1020   %px2 = bitcast float* %px to <4 x float>*
   1021   %xm = load <4 x float>* %px2, align 4
   1022   %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
   1023   %py2 = bitcast float* %py to <4 x float>*
   1024   %ym = load <4 x float>* %py2, align 4
   1025 
   1026   %a1 = fmul <4 x float> %x, %xm
   1027   %a2 = fmul <4 x float> %y, %ym
   1028   %a3 = fadd <4 x float> %a1, %a2
   1029   %a4 = shufflevector <4 x float> %a3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
   1030   ret <3 x float> %a4
   1031 }
   1032 
   1033 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {
   1034   %x0 = extractelement <4 x float> %in, i32 0
   1035   %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
   1036   %y0 = extractelement <4 x float> %in, i32 1
   1037   %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
   1038   %z0 = extractelement <4 x float> %in, i32 2
   1039   %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
   1040   %w0 = extractelement <4 x float> %in, i32 3
   1041   %w = tail call <4 x float> @smear_f(float %w0) nounwind readnone
   1042 
   1043   %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   1044   %px2 = bitcast float* %px to <4 x float>*
   1045   %xm = load <4 x float>* %px2, align 4
   1046   %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   1047   %py2 = bitcast float* %py to <4 x float>*
   1048   %ym = load <4 x float>* %py2, align 4
   1049   %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
   1050   %pz2 = bitcast float* %pz to <4 x float>*
   1051   %zm = load <4 x float>* %pz2, align 4
   1052   %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   1053   %pw2 = bitcast float* %pw to <4 x float>*
   1054   %wm = load <4 x float>* %pw2, align 4
   1055 
   1056   %a1 = fmul <4 x float> %x, %xm
   1057   %a2 = fmul <4 x float> %y, %ym
   1058   %a3 = fadd <4 x float> %a1, %a2
   1059   %a4 = fmul <4 x float> %z, %zm
   1060   %a5 = fadd <4 x float> %a3, %a4
   1061   %a6 = fmul <4 x float> %w, %wm
   1062   %a7 = fadd <4 x float> %a5, %a6
   1063   ret <4 x float> %a7
   1064 }
   1065 
   1066 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {
   1067   %x0 = extractelement <3 x float> %in, i32 0
   1068   %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
   1069   %y0 = extractelement <3 x float> %in, i32 1
   1070   %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
   1071   %z0 = extractelement <3 x float> %in, i32 2
   1072   %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
   1073 
   1074   %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   1075   %px2 = bitcast float* %px to <4 x float>*
   1076   %xm = load <4 x float>* %px2, align 4
   1077   %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   1078   %py2 = bitcast float* %py to <4 x float>*
   1079   %ym = load <4 x float>* %py2, align 4
   1080   %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
   1081   %pz2 = bitcast float* %pz to <4 x float>*
   1082   %zm = load <4 x float>* %pz2, align 4
   1083   %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   1084   %pw2 = bitcast float* %pw to <4 x float>*
   1085   %wm = load <4 x float>* %pw2, align 4
   1086 
   1087   %a1 = fmul <4 x float> %x, %xm
   1088   %a2 = fadd <4 x float> %wm, %a1
   1089   %a3 = fmul <4 x float> %y, %ym
   1090   %a4 = fadd <4 x float> %a2, %a3
   1091   %a5 = fmul <4 x float> %z, %zm
   1092   %a6 = fadd <4 x float> %a4, %a5
   1093   ret <4 x float> %a6
   1094 }
   1095 
   1096 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {
   1097   %x0 = extractelement <2 x float> %in, i32 0
   1098   %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
   1099   %y0 = extractelement <2 x float> %in, i32 1
   1100   %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
   1101 
   1102   %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   1103   %px2 = bitcast float* %px to <4 x float>*
   1104   %xm = load <4 x float>* %px2, align 4
   1105   %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   1106   %py2 = bitcast float* %py to <4 x float>*
   1107   %ym = load <4 x float>* %py2, align 4
   1108   %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   1109   %pw2 = bitcast float* %pw to <4 x float>*
   1110   %wm = load <4 x float>* %pw2, align 4
   1111 
   1112   %a1 = fmul <4 x float> %x, %xm
   1113   %a2 = fadd <4 x float> %wm, %a1
   1114   %a3 = fmul <4 x float> %y, %ym
   1115   %a4 = fadd <4 x float> %a2, %a3
   1116   ret <4 x float> %a4
   1117 }
   1118 
   1119 
   1120 
   1121 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   1122 ;;;;;;;;;              pixel ops                 ;;;;;;;;;;
   1123 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   1124 
   1125 
   1126 @fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16
   1127 @fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16
   1128 @fc_0 = internal constant <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, align 16
   1129 
   1130 declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone
   1131 declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone
   1132 
   1133 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
   1134 define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone {
   1135     %f255 = load <4 x float>* @fc_255.0, align 16
   1136     %f05 = load <4 x float>* @fc_0.5, align 16
   1137     %f0 = load <4 x float>* @fc_0, align 16
   1138     %v1 = fmul <4 x float> %f255, %color
   1139     %v2 = fadd <4 x float> %f05, %v1
   1140     %v3 = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %v2, <4 x float> %f0, <4 x float> %f255) nounwind readnone
   1141     %v4 = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %v3) nounwind readnone
   1142     ret <4 x i8> %v4
   1143 }
   1144 
   1145 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
   1146 define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<3 x float> %color) nounwind readnone {
   1147     %1 = shufflevector <3 x float> %color, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1148     %2 = insertelement <4 x float> %1, float 1.0, i32 3
   1149     %3 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %2) nounwind readnone
   1150     ret <4 x i8> %3
   1151 }
   1152 
   1153 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
   1154 define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone {
   1155     %1 = insertelement <4 x float> undef, float %r, i32 0
   1156     %2 = insertelement <4 x float> %1, float %g, i32 1
   1157     %3 = insertelement <4 x float> %2, float %b, i32 2
   1158     %4 = insertelement <4 x float> %3, float 1.0, i32 3
   1159     %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone
   1160     ret <4 x i8> %5
   1161 }
   1162 
   1163 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
   1164 define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone {
   1165     %1 = insertelement <4 x float> undef, float %r, i32 0
   1166     %2 = insertelement <4 x float> %1, float %g, i32 1
   1167     %3 = insertelement <4 x float> %2, float %b, i32 2
   1168     %4 = insertelement <4 x float> %3, float %a, i32 3
   1169     %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone
   1170     ret <4 x i8> %5
   1171 }
   1172 
   1173