Home | History | Annotate | Download | only in arch
      1 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
      2 target triple = "armv7-none-linux-gnueabi"
      3 
      4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      5 ;;;;;;;;;               INTRINSICS               ;;;;;;;;;;
      6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      7 
      8 declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
      9 declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
     10 declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     11 declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     12 declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     13 declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     14 declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     15 declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     16 
     17 declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
     18 declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
     19 declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     20 declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     21 declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     22 declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     23 declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     24 declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     25 
     26 declare <8 x i8>  @llvm.arm.neon.vqshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
     27 declare <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
     28 declare <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
     29 
     30 declare <8 x i8>  @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
     31 declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
     32 declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
     33 
     34 declare <8 x i8>  @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
     35 declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
     36 declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
     37 
     38 declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone
     39 declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
     40 
     41 declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
     42 declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
     43 
     44 declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
     45 declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
     46 
     47 declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
     48 declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
     49 
     50 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     51 ;;;;;;;;;                HELPERS                 ;;;;;;;;;;
     52 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     53 
     ; Broadcast a scalar float into every lane of a <4 x float>.
     ; Declared internal + alwaysinline, so it is meant to be folded into callers.
     define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
       ; Seed lane 0, then splat it: an all-zero shuffle mask copies lane 0 to all lanes.
       %seed = insertelement <4 x float> undef, float %in, i32 0
       %splat = shufflevector <4 x float> %seed, <4 x float> undef, <4 x i32> zeroinitializer
       ret <4 x float> %splat
     }
     61 
     ; Broadcast a scalar i32 into every lane of a <4 x i32>.
     define internal <4 x i32> @smear_4i(i32 %in) nounwind readnone alwaysinline {
       ; Place the scalar in lane 0 and replicate it with a zero shuffle mask.
       %seed = insertelement <4 x i32> undef, i32 %in, i32 0
       %splat = shufflevector <4 x i32> %seed, <4 x i32> undef, <4 x i32> zeroinitializer
       ret <4 x i32> %splat
     }
     69 
     ; Broadcast a scalar i16 into every lane of a <4 x i16>.
     define internal <4 x i16> @smear_4s(i16 %in) nounwind readnone alwaysinline {
       ; Place the scalar in lane 0 and replicate it with a zero shuffle mask.
       %seed = insertelement <4 x i16> undef, i16 %in, i32 0
       %splat = shufflevector <4 x i16> %seed, <4 x i16> undef, <4 x i32> zeroinitializer
       ret <4 x i16> %splat
     }
     77 
     78 
     79 
     ; Broadcast a scalar float into both lanes of a <2 x float>.
     define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
       ; Lane 0 receives the scalar; the zero mask duplicates it into lane 1.
       %seed = insertelement <2 x float> undef, float %in, i32 0
       %splat = shufflevector <2 x float> %seed, <2 x float> undef, <2 x i32> zeroinitializer
       ret <2 x float> %splat
     }
     85 
     ; Broadcast a scalar i32 into both lanes of a <2 x i32>.
     define internal <2 x i32> @smear_2i(i32 %in) nounwind readnone alwaysinline {
       ; Lane 0 receives the scalar; the zero mask duplicates it into lane 1.
       %seed = insertelement <2 x i32> undef, i32 %in, i32 0
       %splat = shufflevector <2 x i32> %seed, <2 x i32> undef, <2 x i32> zeroinitializer
       ret <2 x i32> %splat
     }
     91 
     ; Broadcast a scalar i16 into both lanes of a <2 x i16>.
     define internal <2 x i16> @smear_2s(i16 %in) nounwind readnone alwaysinline {
       ; Lane 0 receives the scalar; the zero mask duplicates it into lane 1.
       %seed = insertelement <2 x i16> undef, i16 %in, i32 0
       %splat = shufflevector <2 x i16> %seed, <2 x i16> undef, <2 x i32> zeroinitializer
       ret <2 x i16> %splat
     }
     97 
     98 
     ; Broadcast a scalar i32 into every lane of a <4 x i32>.
     ; Produces the same value as @smear_4i; kept as a separate symbol.
     define internal <4 x i32> @smear_4i32(i32 %in) nounwind readnone alwaysinline {
       ; Place the scalar in lane 0 and replicate it with a zero shuffle mask.
       %seed = insertelement <4 x i32> undef, i32 %in, i32 0
       %splat = shufflevector <4 x i32> %seed, <4 x i32> undef, <4 x i32> zeroinitializer
       ret <4 x i32> %splat
     }
    106 
    107 
    108 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    109 ;;;;;;;;;                 CLAMP                  ;;;;;;;;;;
    110 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    111 
    ; clamp on <4 x float> with per-lane bounds: max(min(value, high), low).
    define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readonly {
      ; Cap every lane at the upper bound first ...
      %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
      ; ... then lift it to the lower bound.
      %clamped = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low) nounwind readnone
      ret <4 x float> %clamped
    }
    117 
    ; clamp on <4 x float> with scalar bounds: broadcasts low/high and forwards to
    ; the all-vector overload @_Z5clampDv4_fS_S_.
    define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readonly {
      %low.splat = tail call <4 x float> @smear_4f(float %low) nounwind readnone
      %high.splat = tail call <4 x float> @smear_4f(float %high) nounwind readnone
      %clamped = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low.splat, <4 x float> %high.splat) nounwind readonly
      ret <4 x float> %clamped
    }
    124 
    ; clamp on <3 x float> with per-lane bounds, computed in 4-lane registers.
    define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readonly {
      ; Widen each operand to four lanes. Mask index 3 selects lane 0 of the undef
      ; operand, so the padding lane is a don't-care.
      %value.w = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %high.w = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %low.w = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      ; max(min(value, high), low) per lane.
      %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value.w, <4 x float> %high.w) nounwind readnone
      %clamped.w = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low.w) nounwind readnone
      ; Drop the padding lane again.
      %clamped = shufflevector <4 x float> %clamped.w, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x float> %clamped
    }
    134 
    ; clamp on <3 x float> with scalar bounds, computed in 4-lane registers.
    define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readonly {
      ; Broadcast the scalar bounds to four lanes.
      %low.splat = tail call <4 x float> @smear_4f(float %low) nounwind readnone
      %high.splat = tail call <4 x float> @smear_4f(float %high) nounwind readnone
      ; Widen the value; mask index 3 reads the undef operand (padding lane).
      %value.w = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      ; max(min(value, high), low) per lane.
      %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value.w, <4 x float> %high.splat) nounwind readnone
      %clamped.w = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low.splat) nounwind readnone
      ; Keep only the three real lanes.
      %clamped = shufflevector <4 x float> %clamped.w, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x float> %clamped
    }
    144 
    ; clamp on <2 x float> with per-lane bounds: max(min(value, high), low).
    define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readonly {
      ; Upper bound first, lower bound second.
      %capped = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
      %clamped = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %capped, <2 x float> %low) nounwind readnone
      ret <2 x float> %clamped
    }
    150 
    ; clamp on <2 x float> with scalar bounds broadcast to both lanes.
    define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readonly {
      %low.splat = tail call <2 x float> @smear_2f(float %low) nounwind readnone
      %high.splat = tail call <2 x float> @smear_2f(float %high) nounwind readnone
      ; max(min(value, high), low) per lane.
      %capped = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high.splat) nounwind readnone
      %clamped = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %capped, <2 x float> %low.splat) nounwind readnone
      ret <2 x float> %clamped
    }
    158 
    ; Scalar float clamp built from two ordered compare/select pairs.
    ; Ordered compares are false when either operand is NaN, so each select then
    ; falls through to its bound operand.
    define float @_Z5clampfff(float %value, float %low, float %high) nounwind readonly {
      ; (high > value) is the mirrored form of (value < high).
      %under.high = fcmp ogt float %high, %value
      %capped = select i1 %under.high, float %value, float %high
      ; (low < capped) is the mirrored form of (capped > low).
      %over.low = fcmp olt float %low, %capped
      %clamped = select i1 %over.low, float %capped, float %low
      ret float %clamped
    }
    166 
    167 
    168 
    ; Signed clamp on <4 x i32> with per-lane bounds: max(min(value, high), low).
    define <4 x i32> @_Z5clampDv4_iS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readonly {
      ; Signed min against the upper bound, then signed max against the lower bound.
      %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
      %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low) nounwind readnone
      ret <4 x i32> %clamped
    }
    174 
    ; Signed clamp on <4 x i32> with scalar bounds broadcast to all lanes.
    define <4 x i32> @_Z5clampDv4_iii(<4 x i32> %value, i32 %low, i32 %high) nounwind readonly {
      %low.splat = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
      %high.splat = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
      ; max(min(value, high), low) per lane, signed.
      %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value, <4 x i32> %high.splat) nounwind readnone
      %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low.splat) nounwind readnone
      ret <4 x i32> %clamped
    }
    182 
    ; Signed clamp on <3 x i32> with per-lane bounds, computed in 4-lane registers.
    define <3 x i32> @_Z5clampDv3_iS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readonly {
      ; Widen to four lanes; mask index 3 reads the undef operand (padding lane).
      %value.w = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %high.w = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %low.w = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      ; max(min(value, high), low) per lane, signed.
      %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value.w, <4 x i32> %high.w) nounwind readnone
      %clamped.w = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low.w) nounwind readnone
      ; Keep only the three real lanes.
      %clamped = shufflevector <4 x i32> %clamped.w, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x i32> %clamped
    }
    192 
    ; Signed clamp on <3 x i32> with scalar bounds, computed in 4-lane registers.
    define <3 x i32> @_Z5clampDv3_iii(<3 x i32> %value, i32 %low, i32 %high) nounwind readonly {
      ; Broadcast the scalar bounds.
      %low.splat = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
      %high.splat = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
      ; Widen the value; mask index 3 reads the undef operand (padding lane).
      %value.w = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      ; max(min(value, high), low) per lane, signed.
      %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value.w, <4 x i32> %high.splat) nounwind readnone
      %clamped.w = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low.splat) nounwind readnone
      ; Keep only the three real lanes.
      %clamped = shufflevector <4 x i32> %clamped.w, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x i32> %clamped
    }
    202 
    ; Signed clamp on <2 x i32> with per-lane bounds: max(min(value, high), low).
    define <2 x i32> @_Z5clampDv2_iS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readonly {
      ; Signed min against the upper bound, then signed max against the lower bound.
      %capped = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
      %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %capped, <2 x i32> %low) nounwind readnone
      ret <2 x i32> %clamped
    }
    208 
    ; Signed clamp on <2 x i32> with scalar bounds broadcast to both lanes.
    define <2 x i32> @_Z5clampDv2_iii(<2 x i32> %value, i32 %low, i32 %high) nounwind readonly {
      %low.splat = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
      %high.splat = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
      ; max(min(value, high), low) per lane, signed.
      %capped = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %value, <2 x i32> %high.splat) nounwind readnone
      %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %capped, <2 x i32> %low.splat) nounwind readnone
      ret <2 x i32> %clamped
    }
    216 
    217 
    218 
    ; Unsigned clamp on <4 x i32> with per-lane bounds: max(min(value, high), low).
    define <4 x i32> @_Z5clampDv4_jS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readonly {
      ; Unsigned min against the upper bound, then unsigned max against the lower bound.
      %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
      %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low) nounwind readnone
      ret <4 x i32> %clamped
    }
    224 
    ; Unsigned clamp on <4 x i32> with scalar bounds broadcast to all lanes.
    define <4 x i32> @_Z5clampDv4_jjj(<4 x i32> %value, i32 %low, i32 %high) nounwind readonly {
      %low.splat = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
      %high.splat = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
      ; max(min(value, high), low) per lane, unsigned.
      %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value, <4 x i32> %high.splat) nounwind readnone
      %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low.splat) nounwind readnone
      ret <4 x i32> %clamped
    }
    232 
    ; Unsigned clamp on <3 x i32> with per-lane bounds, computed in 4-lane registers.
    define <3 x i32> @_Z5clampDv3_jS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readonly {
      ; Widen to four lanes; mask index 3 reads the undef operand (padding lane).
      %value.w = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %high.w = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %low.w = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      ; max(min(value, high), low) per lane, unsigned.
      %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value.w, <4 x i32> %high.w) nounwind readnone
      %clamped.w = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low.w) nounwind readnone
      ; Keep only the three real lanes.
      %clamped = shufflevector <4 x i32> %clamped.w, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x i32> %clamped
    }
    242 
    ; Unsigned clamp on <3 x i32> with scalar bounds, computed in 4-lane registers.
    define <3 x i32> @_Z5clampDv3_jjj(<3 x i32> %value, i32 %low, i32 %high) nounwind readonly {
      ; Broadcast the scalar bounds.
      %low.splat = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
      %high.splat = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
      ; Widen the value; mask index 3 reads the undef operand (padding lane).
      %value.w = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      ; max(min(value, high), low) per lane, unsigned.
      %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value.w, <4 x i32> %high.splat) nounwind readnone
      %clamped.w = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low.splat) nounwind readnone
      ; Keep only the three real lanes.
      %clamped = shufflevector <4 x i32> %clamped.w, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x i32> %clamped
    }
    252 
    ; Unsigned clamp on <2 x i32> with per-lane bounds: max(min(value, high), low).
    define <2 x i32> @_Z5clampDv2_jS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readonly {
      ; Unsigned min against the upper bound, then unsigned max against the lower bound.
      %capped = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
      %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %capped, <2 x i32> %low) nounwind readnone
      ret <2 x i32> %clamped
    }
    258 
    ; Unsigned clamp on <2 x i32> with scalar bounds broadcast to both lanes.
    define <2 x i32> @_Z5clampDv2_jjj(<2 x i32> %value, i32 %low, i32 %high) nounwind readonly {
      %low.splat = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
      %high.splat = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
      ; max(min(value, high), low) per lane, unsigned.
      %capped = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %value, <2 x i32> %high.splat) nounwind readnone
      %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %capped, <2 x i32> %low.splat) nounwind readnone
      ret <2 x i32> %clamped
    }
    266 
    267 
    268 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    269 ;;;;;;;;;                  FMAX                  ;;;;;;;;;;
    270 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    271 
    ; fmax on two <4 x float> operands: a single lane-wise NEON max.
    define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readonly {
      %larger = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
      ret <4 x float> %larger
    }
    276 
    ; fmax of a <4 x float> against a scalar broadcast to all lanes.
    define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readonly {
      %v2.splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
      %larger = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2.splat) nounwind readnone
      ret <4 x float> %larger
    }
    282 
    ; fmax on two <3 x float> operands, computed in 4-lane registers.
    define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readonly {
      ; Widen both inputs; mask index 3 reads the undef operand (padding lane).
      %v2.w = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %v1.w = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %larger.w = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1.w, <4 x float> %v2.w) nounwind readnone
      ; Keep only the three real lanes.
      %larger = shufflevector <4 x float> %larger.w, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x float> %larger
    }
    290 
    ; fmax of a <3 x float> against a scalar, computed in 4-lane registers.
    define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readonly {
      %v2.splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
      ; Widen the vector operand; mask index 3 reads the undef operand (padding lane).
      %v1.w = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %larger.w = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1.w, <4 x float> %v2.splat) nounwind readnone
      ; Keep only the three real lanes.
      %larger = shufflevector <4 x float> %larger.w, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x float> %larger
    }
    298 
    ; fmax on two <2 x float> operands: a single lane-wise NEON max.
    define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readonly {
      %larger = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
      ret <2 x float> %larger
    }
    303 
    ; fmax of a <2 x float> against a scalar broadcast to both lanes.
    define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readonly {
      %v2.splat = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
      %larger = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2.splat) nounwind readnone
      ret <2 x float> %larger
    }
    309 
    ; Scalar fmax: returns %v1 when (%v1 > %v2) holds as an ordered compare,
    ; otherwise %v2 (which is also what is returned when either operand is NaN).
    ;
    ; The body is pure register arithmetic with no loads or stores, so the function
    ; is marked readnone rather than readonly. This matches the sibling @_Z4fminff
    ; and the readnone wrapper @_Z3maxff that calls this function.
    define float @_Z4fmaxff(float %v1, float %v2) nounwind readnone {
      %v1.greater = fcmp ogt float %v1, %v2
      %larger = select i1 %v1.greater, float %v1, float %v2
      ret float %larger
    }
    315 
    316 
    317 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    318 ;;;;;;;;;                  FMIN                  ;;;;;;;;;;
    319 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    320 
    ; fmin on two <4 x float> operands: a single lane-wise NEON min.
    define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readonly {
      %smaller = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
      ret <4 x float> %smaller
    }
    325 
    ; fmin of a <4 x float> against a scalar broadcast to all lanes.
    define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readonly {
      %v2.splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
      %smaller = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2.splat) nounwind readnone
      ret <4 x float> %smaller
    }
    331 
    ; fmin on two <3 x float> operands, computed in 4-lane registers.
    define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readonly {
      ; Widen both inputs; mask index 3 reads the undef operand (padding lane).
      %v2.w = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %v1.w = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %smaller.w = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1.w, <4 x float> %v2.w) nounwind readnone
      ; Keep only the three real lanes.
      %smaller = shufflevector <4 x float> %smaller.w, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x float> %smaller
    }
    339 
    ; fmin of a <3 x float> against a scalar, computed in 4-lane registers.
    define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readonly {
      %v2.splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
      ; Widen the vector operand; mask index 3 reads the undef operand (padding lane).
      %v1.w = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %smaller.w = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1.w, <4 x float> %v2.splat) nounwind readnone
      ; Keep only the three real lanes.
      %smaller = shufflevector <4 x float> %smaller.w, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x float> %smaller
    }
    347 
    ; fmin on two <2 x float> operands: a single lane-wise NEON min.
    define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readonly {
      %smaller = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
      ret <2 x float> %smaller
    }
    352 
    ; fmin of a <2 x float> against a scalar broadcast to both lanes.
    define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readonly {
      %v2.splat = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
      %smaller = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2.splat) nounwind readnone
      ret <2 x float> %smaller
    }
    358 
    ; Scalar fmin: yields %v1 when it is strictly below %v2 (ordered compare),
    ; otherwise %v2.
    define float @_Z4fminff(float %v1, float %v2) nounwind readnone {
      ; (v2 > v1) is the mirrored form of (v1 < v2); both are false on NaN.
      %v1.less = fcmp ogt float %v2, %v1
      %smaller = select i1 %v1.less, float %v1, float %v2
      ret float %smaller
    }
    364 
    365 
    366 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    367 ;;;;;;;;;                  MAX                   ;;;;;;;;;;
    368 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    369 
    ; Signed 8-bit max of two scalars.
    define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone {
      ; (v2 < v1) is the mirrored form of (v1 > v2).
      %v1.greater = icmp slt i8 %v2, %v1
      %larger = select i1 %v1.greater, i8 %v1, i8 %v2
      ret i8 %larger
    }
    375 
    ; Signed max on <2 x i8>: widen to i32 lanes, take the NEON max, narrow back.
    define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
      ; Sign-extend so the signed 32-bit max orders lanes like the 8-bit values.
      %v2.wide = sext <2 x i8> %v2 to <2 x i32>
      %v1.wide = sext <2 x i8> %v1 to <2 x i32>
      %larger.wide = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %v1.wide, <2 x i32> %v2.wide) nounwind readnone
      %larger = trunc <2 x i32> %larger.wide to <2 x i8>
      ret <2 x i8> %larger
    }
    383 
    ; Signed max on <3 x i8>: sign-extend to i32, pad to four lanes, NEON max,
    ; then strip the padding lane and truncate back to i8.
    define <3 x i8> @_Z3maxDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
      ; First operand: sign-extend, then pad (mask index 3 reads the undef operand).
      %v1.wide = sext <3 x i8> %v1 to <3 x i32>
      %v1.pad = shufflevector <3 x i32> %v1.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      ; Second operand: same treatment.
      %v2.wide = sext <3 x i8> %v2 to <3 x i32>
      %v2.pad = shufflevector <3 x i32> %v2.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %larger.pad = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1.pad, <4 x i32> %v2.pad) nounwind readnone
      ; Back to three i8 lanes.
      %larger.wide = shufflevector <4 x i32> %larger.pad, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
      %larger = trunc <3 x i32> %larger.wide to <3 x i8>
      ret <3 x i8> %larger
    }
    394 
    ; Signed max on <4 x i8>: widen to i32 lanes, take the NEON max, narrow back.
    define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
      ; Sign-extend so the signed 32-bit max orders lanes like the 8-bit values.
      %v2.wide = sext <4 x i8> %v2 to <4 x i32>
      %v1.wide = sext <4 x i8> %v1 to <4 x i32>
      %larger.wide = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1.wide, <4 x i32> %v2.wide) nounwind readnone
      %larger = trunc <4 x i32> %larger.wide to <4 x i8>
      ret <4 x i8> %larger
    }
    402 
    ; Signed 16-bit max of two scalars.
    define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone {
      ; (v2 < v1) is the mirrored form of (v1 > v2).
      %v1.greater = icmp slt i16 %v2, %v1
      %larger = select i1 %v1.greater, i16 %v1, i16 %v2
      ret i16 %larger
    }
    408 
    ; Signed max on <2 x i16>: widen to i32 lanes, take the NEON max, narrow back.
    define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
      ; Sign-extend so the signed 32-bit max orders lanes like the 16-bit values.
      %v2.wide = sext <2 x i16> %v2 to <2 x i32>
      %v1.wide = sext <2 x i16> %v1 to <2 x i32>
      %larger.wide = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %v1.wide, <2 x i32> %v2.wide) nounwind readnone
      %larger = trunc <2 x i32> %larger.wide to <2 x i16>
      ret <2 x i16> %larger
    }
    416 
    ; Signed max on <3 x i16>: sign-extend to i32, pad to four lanes, NEON max,
    ; then strip the padding lane and truncate back to i16.
    define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
      ; First operand: sign-extend, then pad (mask index 3 reads the undef operand).
      %v1.wide = sext <3 x i16> %v1 to <3 x i32>
      %v1.pad = shufflevector <3 x i32> %v1.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      ; Second operand: same treatment.
      %v2.wide = sext <3 x i16> %v2 to <3 x i32>
      %v2.pad = shufflevector <3 x i32> %v2.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %larger.pad = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1.pad, <4 x i32> %v2.pad) nounwind readnone
      ; Back to three i16 lanes.
      %larger.wide = shufflevector <4 x i32> %larger.pad, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
      %larger = trunc <3 x i32> %larger.wide to <3 x i16>
      ret <3 x i16> %larger
    }
    427 
    ; Signed max on <4 x i16>.
    ;
    ; Uses the native 16-bit NEON intrinsic (@llvm.arm.neon.vmaxs.v4i16, declared
    ; with the other intrinsics at the top of this file) directly on the inputs.
    ; The previous sext -> v4i32 max -> trunc sequence computed the same lanes,
    ; because a signed max commutes with sign extension, but needed a widen and a
    ; narrow around the compare.
    define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
      %larger = tail call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
      ret <4 x i16> %larger
    }
    435 
    ; Signed 32-bit max of two scalars.
    define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone {
      ; (v2 < v1) is the mirrored form of (v1 > v2).
      %v1.greater = icmp slt i32 %v2, %v1
      %larger = select i1 %v1.greater, i32 %v1, i32 %v2
      ret i32 %larger
    }
    441 
    ; Signed max on <2 x i32>: a single lane-wise NEON max.
    define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
      %larger = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
      ret <2 x i32> %larger
    }
    446 
    ; Signed max on <3 x i32>, computed in 4-lane registers.
    define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
      ; Pad both inputs to four lanes; mask index 3 reads the undef operand.
      %v2.pad = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %v1.pad = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %larger.pad = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1.pad, <4 x i32> %v2.pad) nounwind readnone
      ; Keep only the three real lanes.
      %larger = shufflevector <4 x i32> %larger.pad, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x i32> %larger
    }
    454 
    ; Signed max on <4 x i32>: a single lane-wise NEON max.
    define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
      %larger = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
      ret <4 x i32> %larger
    }
    459 
    ; Signed 64-bit max of two scalars.
    define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone {
      ; (v2 < v1) is the mirrored form of (v1 > v2).
      %v1.greater = icmp slt i64 %v2, %v1
      %larger = select i1 %v1.greater, i64 %v1, i64 %v2
      ret i64 %larger
    }
    465 
    466 ; TODO:  long vector types
    467 
    ; Unsigned 8-bit max of two scalars.
    define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
      ; (v2 < v1) is the mirrored form of (v1 > v2), unsigned.
      %v1.greater = icmp ult i8 %v2, %v1
      %larger = select i1 %v1.greater, i8 %v1, i8 %v2
      ret i8 %larger
    }
    473 
    ; Unsigned max on <2 x i8>: widen to i32 lanes, take the NEON max, narrow back.
    define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
      ; Zero-extend so the unsigned 32-bit max orders lanes like the 8-bit values.
      %v2.wide = zext <2 x i8> %v2 to <2 x i32>
      %v1.wide = zext <2 x i8> %v1 to <2 x i32>
      %larger.wide = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %v1.wide, <2 x i32> %v2.wide) nounwind readnone
      %larger = trunc <2 x i32> %larger.wide to <2 x i8>
      ret <2 x i8> %larger
    }
    481 
    ; Unsigned max on <3 x i8>: zero-extend to i32, pad to four lanes, NEON max,
    ; then strip the padding lane and truncate back to i8.
    define <3 x i8> @_Z3maxDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
      ; First operand: zero-extend, then pad (mask index 3 reads the undef operand).
      %v1.wide = zext <3 x i8> %v1 to <3 x i32>
      %v1.pad = shufflevector <3 x i32> %v1.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      ; Second operand: same treatment.
      %v2.wide = zext <3 x i8> %v2 to <3 x i32>
      %v2.pad = shufflevector <3 x i32> %v2.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %larger.pad = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1.pad, <4 x i32> %v2.pad) nounwind readnone
      ; Back to three i8 lanes.
      %larger.wide = shufflevector <4 x i32> %larger.pad, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
      %larger = trunc <3 x i32> %larger.wide to <3 x i8>
      ret <3 x i8> %larger
    }
    492 
    ; Unsigned max on <4 x i8>: widen to i32 lanes, take the NEON max, narrow back.
    define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
      ; Zero-extend so the unsigned 32-bit max orders lanes like the 8-bit values.
      %v2.wide = zext <4 x i8> %v2 to <4 x i32>
      %v1.wide = zext <4 x i8> %v1 to <4 x i32>
      %larger.wide = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1.wide, <4 x i32> %v2.wide) nounwind readnone
      %larger = trunc <4 x i32> %larger.wide to <4 x i8>
      ret <4 x i8> %larger
    }
    500 
    ; Unsigned 16-bit max of two scalars.
    define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
      ; (v2 < v1) is the mirrored form of (v1 > v2), unsigned.
      %v1.greater = icmp ult i16 %v2, %v1
      %larger = select i1 %v1.greater, i16 %v1, i16 %v2
      ret i16 %larger
    }
    506 
    ; Unsigned max on <2 x i16>: widen to i32 lanes, take the NEON max, narrow back.
    define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
      ; Zero-extend so the unsigned 32-bit max orders lanes like the 16-bit values.
      %v2.wide = zext <2 x i16> %v2 to <2 x i32>
      %v1.wide = zext <2 x i16> %v1 to <2 x i32>
      %larger.wide = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %v1.wide, <2 x i32> %v2.wide) nounwind readnone
      %larger = trunc <2 x i32> %larger.wide to <2 x i16>
      ret <2 x i16> %larger
    }
    514 
    ; Unsigned max on <3 x i16>: zero-extend to i32, pad to four lanes, NEON max,
    ; then strip the padding lane and truncate back to i16.
    define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
      ; First operand: zero-extend, then pad (mask index 3 reads the undef operand).
      %v1.wide = zext <3 x i16> %v1 to <3 x i32>
      %v1.pad = shufflevector <3 x i32> %v1.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      ; Second operand: same treatment.
      %v2.wide = zext <3 x i16> %v2 to <3 x i32>
      %v2.pad = shufflevector <3 x i32> %v2.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %larger.pad = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1.pad, <4 x i32> %v2.pad) nounwind readnone
      ; Back to three i16 lanes.
      %larger.wide = shufflevector <4 x i32> %larger.pad, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
      %larger = trunc <3 x i32> %larger.wide to <3 x i16>
      ret <3 x i16> %larger
    }
    525 
    ; Unsigned max on <4 x i16>.
    ;
    ; Uses the native 16-bit NEON intrinsic (@llvm.arm.neon.vmaxu.v4i16, declared
    ; with the other intrinsics at the top of this file) directly on the inputs.
    ; The previous zext -> v4i32 max -> trunc sequence computed the same lanes,
    ; because an unsigned max commutes with zero extension, but needed a widen and
    ; a narrow around the compare.
    define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
      %larger = tail call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
      ret <4 x i16> %larger
    }
    533 
    ; Unsigned 32-bit max of two scalars.
    define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone {
      ; (v2 < v1) is the mirrored form of (v1 > v2), unsigned.
      %v1.greater = icmp ult i32 %v2, %v1
      %larger = select i1 %v1.greater, i32 %v1, i32 %v2
      ret i32 %larger
    }
    539 
    ; Unsigned max on <2 x i32>: a single lane-wise NEON max.
    define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
      %larger = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
      ret <2 x i32> %larger
    }
    544 
    ; Unsigned max on <3 x i32>, computed in 4-lane registers.
    define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
      ; Pad both inputs to four lanes; mask index 3 reads the undef operand.
      %v2.pad = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %v1.pad = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      %larger.pad = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1.pad, <4 x i32> %v2.pad) nounwind readnone
      ; Keep only the three real lanes.
      %larger = shufflevector <4 x i32> %larger.pad, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
      ret <3 x i32> %larger
    }
    552 
    ; Unsigned max on <4 x i32>: a single lane-wise NEON max.
    define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
      %larger = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
      ret <4 x i32> %larger
    }
    557 
    558 
    559 ; TODO:  long vector types
    560 
    ; max for scalar float: thin forwarder to the fmax implementation @_Z4fmaxff.
    define float @_Z3maxff(float %v1, float %v2) nounwind readnone {
      %larger = tail call float @_Z4fmaxff(float %v1, float %v2)
      ret float %larger
    }
    565 
    566 define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
          ; max(float2, float2): thin wrapper that forwards to fmax(float2, float2).
    567   %result = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
    568   ret <2 x float> %result
    569 }
    570 
    571 define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
          ; max(float2, float): thin wrapper that forwards to fmax(float2, float).
    572   %result = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
    573   ret <2 x float> %result
    574 }
    575 
    576 define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
          ; max(float3, float3): thin wrapper that forwards to fmax(float3, float3).
    577   %result = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
    578   ret <3 x float> %result
    579 }
    580 
    581 define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
          ; max(float3, float): thin wrapper that forwards to fmax(float3, float).
    582   %result = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
    583   ret <3 x float> %result
    584 }
    585 
    586 define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
          ; max(float4, float4): thin wrapper that forwards to fmax(float4, float4).
    587   %result = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
    588   ret <4 x float> %result
    589 }
    590 
    591 define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
          ; max(float4, float): thin wrapper that forwards to fmax(float4, float).
    592   %result = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
    593   ret <4 x float> %result
    594 }
    595 
    596 
    597 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    598 ;;;;;;;;;                  MIN                   ;;;;;;;;;;
    599 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    600 
    601 define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {
          ; min(char, char): yields %v1 when it is strictly smaller (signed
          ; compare), otherwise %v2.
    602   %v2_above_v1 = icmp sgt i8 %v2, %v1
    603   %smaller = select i1 %v2_above_v1, i8 %v1, i8 %v2
    604   ret i8 %smaller
    605 }
    606 
    607 define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
          ; min(char2, char2): sign-extend to 32-bit lanes, take the NEON signed
          ; min on <2 x i32>, then narrow back to 8 bits.
    608   %lhs32 = sext <2 x i8> %v1 to <2 x i32>
    609   %rhs32 = sext <2 x i8> %v2 to <2 x i32>
    610   %min32 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %lhs32, <2 x i32> %rhs32) nounwind readnone
    611   %min8 = trunc <2 x i32> %min32 to <2 x i8>
    612   ret <2 x i8> %min8
    613 }
    614 
    615 define <3 x i8> @_Z3minDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
          ; min(char3, char3): sign-extend to 32-bit lanes, pad to four lanes
          ; (mask index 3 picks an undef lane), take the 4-lane signed min, then
          ; drop the padding lane and narrow back to 8 bits.
    616   %lhs32 = sext <3 x i8> %v1 to <3 x i32>
    617   %rhs32 = sext <3 x i8> %v2 to <3 x i32>
    618   %lhs4 = shufflevector <3 x i32> %lhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    619   %rhs4 = shufflevector <3 x i32> %rhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    620   %min4 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
    621   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    622   %min8 = trunc <3 x i32> %min3 to <3 x i8>
    623   ret <3 x i8> %min8
    624 }
    625 
    626 define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
          ; min(char4, char4): sign-extend to 32-bit lanes, take the NEON signed
          ; min on <4 x i32>, then narrow back to 8 bits.
    627   %lhs32 = sext <4 x i8> %v1 to <4 x i32>
    628   %rhs32 = sext <4 x i8> %v2 to <4 x i32>
    629   %min32 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs32, <4 x i32> %rhs32) nounwind readnone
    630   %min8 = trunc <4 x i32> %min32 to <4 x i8>
    631   ret <4 x i8> %min8
    632 }
    633 
    634 define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {
          ; min(short, short): yields %v1 when it is strictly smaller (signed
          ; compare), otherwise %v2.
    635   %v2_above_v1 = icmp sgt i16 %v2, %v1
    636   %smaller = select i1 %v2_above_v1, i16 %v1, i16 %v2
    637   ret i16 %smaller
    638 }
    639 
    640 define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
          ; min(short2, short2): sign-extend to 32-bit lanes, take the NEON signed
          ; min on <2 x i32>, then narrow back to 16 bits.
    641   %lhs32 = sext <2 x i16> %v1 to <2 x i32>
    642   %rhs32 = sext <2 x i16> %v2 to <2 x i32>
    643   %min32 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %lhs32, <2 x i32> %rhs32) nounwind readnone
    644   %min16 = trunc <2 x i32> %min32 to <2 x i16>
    645   ret <2 x i16> %min16
    646 }
    647 
    648 define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
          ; min(short3, short3): sign-extend to 32-bit lanes, pad to four lanes
          ; (mask index 3 picks an undef lane), take the 4-lane signed min, then
          ; drop the padding lane and narrow back to 16 bits.
    649   %lhs32 = sext <3 x i16> %v1 to <3 x i32>
    650   %rhs32 = sext <3 x i16> %v2 to <3 x i32>
    651   %lhs4 = shufflevector <3 x i32> %lhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    652   %rhs4 = shufflevector <3 x i32> %rhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    653   %min4 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
    654   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    655   %min16 = trunc <3 x i32> %min3 to <3 x i16>
    656   ret <3 x i16> %min16
    657 }
    658 
    659 define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
          ; min(short4, short4): lane-wise signed minimum.
          ; The 16-bit NEON intrinsic (declared at the top of this file) is called
          ; directly; sign-extending to <4 x i32> and truncating back produced the
          ; same lanes with two extra conversions.
    662   %1 = tail call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
    664   ret <4 x i16> %1
    665 }
    666 
    667 define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {
          ; min(int, int): yields %v1 when it is strictly smaller (signed
          ; compare), otherwise %v2.
    668   %v2_above_v1 = icmp sgt i32 %v2, %v1
    669   %smaller = select i1 %v2_above_v1, i32 %v1, i32 %v2
    670   ret i32 %smaller
    671 }
    672 
    673 define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
          ; min(int2, int2): forwards to the NEON signed min on <2 x i32>.
    674   %smaller = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
    675   ret <2 x i32> %smaller
    676 }
    677 
    678 define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
          ; min(int3, int3): pad each operand to four lanes (mask index 3 picks a
          ; lane of the undef operand), take the 4-lane signed min, then keep only
          ; lanes 0..2.
    679   %lhs4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    680   %rhs4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    681   %min4 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
    682   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    683   ret <3 x i32> %min3
    684 }
    685 
    686 define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
          ; min(int4, int4): forwards to the NEON signed min on <4 x i32>.
    687   %smaller = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
    688   ret <4 x i32> %smaller
    689 }
    690 
    691 define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {
          ; min(long long, long long): yields %v1 when it is strictly smaller
          ; (signed compare), otherwise %v2.
    692   %v2_above_v1 = icmp sgt i64 %v2, %v1
    693   %smaller = select i1 %v2_above_v1, i64 %v1, i64 %v2
    694   ret i64 %smaller
    695 }
    696 
    697 ; TODO:  long vector types
    698 
    699 define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
          ; min(uchar, uchar): yields %v1 when it is strictly smaller (unsigned
          ; compare), otherwise %v2.
    700   %v2_above_v1 = icmp ugt i8 %v2, %v1
    701   %smaller = select i1 %v2_above_v1, i8 %v1, i8 %v2
    702   ret i8 %smaller
    703 }
    704 
    705 define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
          ; min(uchar2, uchar2): zero-extend to 32-bit lanes, take the NEON
          ; unsigned min on <2 x i32>, then narrow back to 8 bits.
    706   %lhs32 = zext <2 x i8> %v1 to <2 x i32>
    707   %rhs32 = zext <2 x i8> %v2 to <2 x i32>
    708   %min32 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %lhs32, <2 x i32> %rhs32) nounwind readnone
    709   %min8 = trunc <2 x i32> %min32 to <2 x i8>
    710   ret <2 x i8> %min8
    711 }
    712 
    713 define <3 x i8> @_Z3minDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
          ; min(uchar3, uchar3): zero-extend to 32-bit lanes, pad to four lanes
          ; (mask index 3 picks an undef lane), take the 4-lane unsigned min, then
          ; drop the padding lane and narrow back to 8 bits.
    714   %lhs32 = zext <3 x i8> %v1 to <3 x i32>
    715   %rhs32 = zext <3 x i8> %v2 to <3 x i32>
    716   %lhs4 = shufflevector <3 x i32> %lhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    717   %rhs4 = shufflevector <3 x i32> %rhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    718   %min4 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
    719   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    720   %min8 = trunc <3 x i32> %min3 to <3 x i8>
    721   ret <3 x i8> %min8
    722 }
    723 
    724 define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
          ; min(uchar4, uchar4): zero-extend to 32-bit lanes, take the NEON
          ; unsigned min on <4 x i32>, then narrow back to 8 bits.
    725   %lhs32 = zext <4 x i8> %v1 to <4 x i32>
    726   %rhs32 = zext <4 x i8> %v2 to <4 x i32>
    727   %min32 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs32, <4 x i32> %rhs32) nounwind readnone
    728   %min8 = trunc <4 x i32> %min32 to <4 x i8>
    729   ret <4 x i8> %min8
    730 }
    731 
    732 define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
          ; min(ushort, ushort): yields %v1 when it is strictly smaller (unsigned
          ; compare), otherwise %v2.
    733   %v2_above_v1 = icmp ugt i16 %v2, %v1
    734   %smaller = select i1 %v2_above_v1, i16 %v1, i16 %v2
    735   ret i16 %smaller
    736 }
    737 
    738 define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
          ; min(ushort2, ushort2): zero-extend to 32-bit lanes, take the NEON
          ; unsigned min on <2 x i32>, then narrow back to 16 bits.
    739   %lhs32 = zext <2 x i16> %v1 to <2 x i32>
    740   %rhs32 = zext <2 x i16> %v2 to <2 x i32>
    741   %min32 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %lhs32, <2 x i32> %rhs32) nounwind readnone
    742   %min16 = trunc <2 x i32> %min32 to <2 x i16>
    743   ret <2 x i16> %min16
    744 }
    745 
    746 define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
          ; min(ushort3, ushort3): zero-extend to 32-bit lanes, pad to four lanes
          ; (mask index 3 picks an undef lane), take the 4-lane unsigned min, then
          ; drop the padding lane and narrow back to 16 bits.
    747   %lhs32 = zext <3 x i16> %v1 to <3 x i32>
    748   %rhs32 = zext <3 x i16> %v2 to <3 x i32>
    749   %lhs4 = shufflevector <3 x i32> %lhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    750   %rhs4 = shufflevector <3 x i32> %rhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    751   %min4 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
    752   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    753   %min16 = trunc <3 x i32> %min3 to <3 x i16>
    754   ret <3 x i16> %min16
    755 }
    756 
    757 define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
          ; min(ushort4, ushort4): lane-wise unsigned minimum.
          ; The 16-bit NEON intrinsic (declared at the top of this file) is called
          ; directly; zero-extending to <4 x i32> and truncating back produced the
          ; same lanes with two extra conversions.
    760   %1 = tail call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
    762   ret <4 x i16> %1
    763 }
    764 
    765 define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {
          ; min(uint, uint): yields %v1 when it is strictly smaller (unsigned
          ; compare), otherwise %v2.
    766   %v2_above_v1 = icmp ugt i32 %v2, %v1
    767   %smaller = select i1 %v2_above_v1, i32 %v1, i32 %v2
    768   ret i32 %smaller
    769 }
    770 
    771 define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
          ; min(uint2, uint2): forwards to the NEON unsigned min on <2 x i32>.
    772   %smaller = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
    773   ret <2 x i32> %smaller
    774 }
    775 
    776 define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
          ; min(uint3, uint3): pad each operand to four lanes (mask index 3 picks a
          ; lane of the undef operand), take the 4-lane unsigned min, then keep
          ; only lanes 0..2.
    777   %lhs4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    778   %rhs4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    779   %min4 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
    780   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    781   ret <3 x i32> %min3
    782 }
    783 
    784 define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
          ; min(uint4, uint4): forwards to the NEON unsigned min on <4 x i32>.
    785   %smaller = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
    786   ret <4 x i32> %smaller
    787 }
    788 
    789 
    790 ; TODO:  long vector types
    791 
    792 define float @_Z3minff(float %v1, float %v2) nounwind readnone {
          ; min(float, float): thin wrapper that forwards to fmin(float, float).
    793   %result = tail call float @_Z4fminff(float %v1, float %v2)
    794   ret float %result
    795 }
    796 
    797 define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
          ; min(float2, float2): thin wrapper that forwards to fmin(float2, float2).
    798   %result = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2)
    799   ret <2 x float> %result
    800 }
    801 
    802 define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
          ; min(float2, float): thin wrapper that forwards to fmin(float2, float).
    803   %result = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2)
    804   ret <2 x float> %result
    805 }
    806 
    807 define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
          ; min(float3, float3): thin wrapper that forwards to fmin(float3, float3).
    808   %result = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2)
    809   ret <3 x float> %result
    810 }
    811 
    812 define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
          ; min(float3, float): thin wrapper that forwards to fmin(float3, float).
    813   %result = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2)
    814   ret <3 x float> %result
    815 }
    816 
    817 define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
          ; min(float4, float4): thin wrapper that forwards to fmin(float4, float4).
    818   %result = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2)
    819   ret <4 x float> %result
    820 }
    821 
    822 define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
          ; min(float4, float): thin wrapper that forwards to fmin(float4, float).
    823   %result = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2)
    824   ret <4 x float> %result
    825 }
    826 
    827 
    828 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    829 ;;;;;;;;;                  YUV                   ;;;;;;;;;;
    830 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    831 
    832 @yuv_U = internal constant <4 x i32> <i32 0, i32 -100, i32 516, i32 0>, align 16 ; per-lane multipliers for the (U - 128) term in rsYuvToRGBA_uchar4
    833 @yuv_V = internal constant <4 x i32> <i32 409, i32 -208, i32 0, i32 0>, align 16 ; per-lane multipliers for the (V - 128) term in rsYuvToRGBA_uchar4
    834 @yuv_0 = internal constant <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16 ; lower clamp bound used by rsYuvToRGBA_uchar4
    835 @yuv_255 = internal constant <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, align 16 ; upper clamp bound (65535, applied before the final shift right by 8)
    836 
    837 
    838 define <4 x i8> @_Z18rsYuvToRGBA_uchar4hhh(i8 %pY, i8 %pU, i8 %pV) nounwind readnone alwaysinline {
          ; rsYuvToRGBA_uchar4(uchar, uchar, uchar): fixed-point YUV conversion.
          ; Per lane i the result is
          ;   clamp(298*(Y-16) + yuv_U[i]*(U-128) + yuv_V[i]*(V-128), 0, 65535) >> 8
          ; truncated to 8 bits.
          ; NOTE(review): lane 3 has zero U and V multipliers, so it carries the
          ; scaled luma term rather than a constant -- confirm callers do not
          ; rely on that lane being opaque alpha.

          ; Widen the three input bytes to 32 bits (unsigned).
    839   %y_wide = zext i8 %pY to i32
    840   %u_wide = zext i8 %pU to i32
    841   %v_wide = zext i8 %pV to i32
    842 
          ; Remove the offsets (16 for Y, 128 for U and V) and scale Y by 298,
          ; then replicate each scalar through smear_4i32 (defined elsewhere in
          ; this file; presumably a 4-lane splat like smear_f -- verify).
    843   %y_centered = add i32 %y_wide, -16
    844   %y_scaled = mul i32 %y_centered, 298
    845   %u_centered = add i32 %u_wide, -128
    846   %v_centered = add i32 %v_wide, -128
    847   %y_vec = tail call <4 x i32> @smear_4i32(i32 %y_scaled) nounwind readnone
    848   %u_vec = tail call <4 x i32> @smear_4i32(i32 %u_centered) nounwind readnone
    849   %v_vec = tail call <4 x i32> @smear_4i32(i32 %v_centered) nounwind readnone
    850 
          ; Weight U and V by their per-lane multipliers and accumulate onto Y.
    851   %u_coeff = load <4 x i32>, <4 x i32>* @yuv_U, align 8
    852   %v_coeff = load <4 x i32>, <4 x i32>* @yuv_V, align 8
    853   %u_term = mul <4 x i32> %u_vec, %u_coeff
    854   %v_term = mul <4 x i32> %v_vec, %v_coeff
    855   %sum_yu = add <4 x i32> %y_vec, %u_term
    856   %sum_yuv = add <4 x i32> %sum_yu, %v_term
    857 
          ; Disabled alternative using a saturating narrowing shift; the explicit
          ; clamp below is the code that runs.
    858  ; %r1 = tail call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %_y3, <4 x i32> <i32 8, i32 8, i32 8, i32 8>) nounwind readnone
    859 ;  %r2 = trunc <4 x i16> %r1 to <4 x i8>
    860 ;  ret <4 x i8> %r2
    861 
          ; Clamp to [0, 65535] (signed max, then signed min), drop the 8
          ; fractional bits and narrow each lane to a byte.
    862   %lower = load <4 x i32>, <4 x i32>* @yuv_0, align 8
    863   %upper = load <4 x i32>, <4 x i32>* @yuv_255, align 8
    864   %at_least_lower = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %sum_yuv, <4 x i32> %lower) nounwind readnone
    865   %clamped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %at_least_lower, <4 x i32> %upper) nounwind readnone
    866   %shifted = lshr <4 x i32> %clamped, <i32 8, i32 8, i32 8, i32 8>
    867   %packed = trunc <4 x i32> %shifted to <4 x i8>
    868   ret <4 x i8> %packed
    869 }
    870 
    871 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    872 ;;;;;;;;;              half_RECIP              ;;;;;;;;;;
    873 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    874 
    875 define <2 x float> @_Z10half_recipDv2_f(<2 x float> %v) nounwind readnone {
          ; half_recip(float2): NEON reciprocal estimate (vrecpe) refined twice;
          ; each refinement multiplies the current value by vrecps(current, %v).
    876   %estimate = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %v) nounwind readnone
    877   %step1 = tail call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %estimate, <2 x float> %v) nounwind readnone
    878   %refined1 = fmul <2 x float> %estimate, %step1
    879   %step2 = tail call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %refined1, <2 x float> %v) nounwind readnone
    880   %refined2 = fmul <2 x float> %step2, %refined1
    881   ret <2 x float> %refined2
    882 }
    883 
    884 define <4 x float> @_Z10half_recipDv4_f(<4 x float> %v) nounwind readnone {
          ; half_recip(float4): NEON reciprocal estimate (vrecpe) refined twice;
          ; each refinement multiplies the current value by vrecps(current, %v).
    885   %estimate = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %v) nounwind readnone
    886   %step1 = tail call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %estimate, <4 x float> %v) nounwind readnone
    887   %refined1 = fmul <4 x float> %estimate, %step1
    888   %step2 = tail call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %refined1, <4 x float> %v) nounwind readnone
    889   %refined2 = fmul <4 x float> %step2, %refined1
    890   ret <4 x float> %refined2
    891 }
    892 
    893 define <3 x float> @_Z10half_recipDv3_f(<3 x float> %v) nounwind readnone {
          ; half_recip(float3): pad to four lanes (lane 3 is undef), reuse the
          ; float4 implementation, then keep lanes 0..2.
    894   %padded = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    895   %recip4 = tail call <4 x float> @_Z10half_recipDv4_f(<4 x float> %padded) nounwind readnone
    896   %recip3 = shufflevector <4 x float> %recip4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    897   ret <3 x float> %recip3
    898 }
    899 
    900 
    901 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    902 ;;;;;;;;;              half_RSQRT              ;;;;;;;;;;
    903 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    904 
    905 define float @_Z10half_rsqrtf(float %v) nounwind readnone {
          ; half_rsqrt(float): place %v in lane 0 of a 2-lane vector (lane 1 stays
          ; undef), run the NEON reciprocal-square-root estimate and return lane 0.
          ; The function touches no memory and only calls a readnone intrinsic, so
          ; it carries the same nounwind/readnone attributes as the vector
          ; overloads of half_rsqrt.
    906   %1 = insertelement <2 x float> undef, float %v, i32 0
    907   %2 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %1) nounwind readnone
    908   %3 = extractelement <2 x float> %2, i32 0
    909   ret float %3
    910 }
    911 
    912 define <2 x float> @_Z10half_rsqrtDv2_f(<2 x float> %v) nounwind readnone {
          ; half_rsqrt(float2): returns the NEON reciprocal-square-root estimate
          ; (vrsqrte) as is, with no refinement step.
    913   %estimate = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
    914   ret <2 x float> %estimate
    915 }
    916 
    917 define <3 x float> @_Z10half_rsqrtDv3_f(<3 x float> %v) nounwind readnone {
          ; half_rsqrt(float3): pad to four lanes (lane 3 is undef), take the
          ; 4-lane vrsqrte estimate, then keep lanes 0..2.
    918   %padded = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    919   %estimate4 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %padded) nounwind readnone
    920   %estimate3 = shufflevector <4 x float> %estimate4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    921   ret <3 x float> %estimate3
    922 }
    923 
    924 define <4 x float> @_Z10half_rsqrtDv4_f(<4 x float> %v) nounwind readnone {
          ; half_rsqrt(float4): returns the NEON reciprocal-square-root estimate
          ; (vrsqrte) as is, with no refinement step.
    925   %estimate = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
    926   ret <4 x float> %estimate
    927 }
    928 
    929 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    930 ;;;;;;;;;              matrix                    ;;;;;;;;;;
    931 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    932 
    933 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly ; NEON 4-float vector load; the i32 operand is the alignment in bytes
    934 
    935 %struct.rs_matrix4x4 = type { [16 x float] } ; 4x4 matrix stored as 16 consecutive floats
    936 %struct.rs_matrix3x3 = type { [9 x float] } ; 3x3 matrix stored as 9 consecutive floats
    937 %struct.rs_matrix2x2 = type { [4 x float] } ; 2x2 matrix stored as 4 consecutive floats
    938 
    939 define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline {
          ; Broadcast %in to all four lanes: seed lane 0, then replicate it with an
          ; all-zero shuffle mask.
    940   %seed = insertelement <4 x float> undef, float %in, i32 0
    943   %splat = shufflevector <4 x float> %seed, <4 x float> undef, <4 x i32> zeroinitializer
    944   ret <4 x float> %splat
    945 }
    946 
    947 
    948 define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
          ; rsMatrixMultiply(const rs_matrix3x3*, float3).
          ; Result lane i (i = 0..2) is in.x*m[i] + in.y*m[3+i] + in.z*m[6+i].

          ; Broadcast each input component across a 4-lane vector.
    949   %in_x = extractelement <3 x float> %in, i32 0
    950   %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
    951   %in_y = extractelement <3 x float> %in, i32 1
    952   %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
    953   %in_z = extractelement <3 x float> %in, i32 2
    954   %z_splat = tail call <4 x float> @smear_f(float %in_z) nounwind readnone
    955 
          ; Elements 0..3 of the matrix.
    956   %m0_ptr = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
    957   %m0_raw = bitcast float* %m0_ptr to i8*
    958   %m0_vec = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %m0_raw, i32 4) nounwind
    959 
          ; Elements 3..6 of the matrix.
    960   %m3_ptr = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
    961   %m3_raw = bitcast float* %m3_ptr to i8*
    962   %m3_vec = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %m3_raw, i32 4) nounwind
    963 
          ; The last three elements are fetched by loading elements 5..8 (which
          ; stays inside the 9-float array) and shifting down one lane; mask
          ; index 4 selects an undef lane for the unused lane 3.
    964   %m5_ptr = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 5
    965   %m5_raw = bitcast float* %m5_ptr to i8*
    966   %m5_vec = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %m5_raw, i32 4) nounwind
    967   %m6_vec = shufflevector <4 x float> %m5_vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
    968 
          ; Multiply-accumulate the three terms and return lanes 0..2.
    969   %x_term = fmul <4 x float> %x_splat, %m0_vec
    970   %y_term = fmul <4 x float> %y_splat, %m3_vec
    971   %xy_sum = fadd <4 x float> %x_term, %y_term
    972   %z_term = fmul <4 x float> %z_splat, %m6_vec
    973   %xyz_sum = fadd <4 x float> %z_term, %xy_sum
    974   %result = shufflevector <4 x float> %xyz_sum, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    975   ret <3 x float> %result
    976 }
    977 
    978 define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {
          ; rsMatrixMultiply(const rs_matrix3x3*, float2).
          ; Result lane i (i = 0..2) is in.x*m[i] + in.y*m[3+i].

          ; Broadcast each input component across a 4-lane vector.
    979   %in_x = extractelement <2 x float> %in, i32 0
    980   %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
    981   %in_y = extractelement <2 x float> %in, i32 1
    982   %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
    983 
          ; Load elements 0..3 and 3..6 as 4-float vectors (4-byte aligned).
    984   %m0_ptr = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
    985   %m0_vptr = bitcast float* %m0_ptr to <4 x float>*
    986   %m0_vec = load <4 x float>, <4 x float>* %m0_vptr, align 4
    987   %m3_ptr = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
    988   %m3_vptr = bitcast float* %m3_ptr to <4 x float>*
    989   %m3_vec = load <4 x float>, <4 x float>* %m3_vptr, align 4
    990 
          ; Sum the two products and return lanes 0..2.
    991   %x_term = fmul <4 x float> %x_splat, %m0_vec
    992   %y_term = fmul <4 x float> %y_splat, %m3_vec
    993   %xy_sum = fadd <4 x float> %x_term, %y_term
    994   %result = shufflevector <4 x float> %xy_sum, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    995   ret <3 x float> %result
    996 }
    997 
    998 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {
          ; rsMatrixMultiply(const rs_matrix4x4*, float4).
          ; Result lane i is in.x*m[i] + in.y*m[4+i] + in.z*m[8+i] + in.w*m[12+i].

          ; Broadcast each input component across a 4-lane vector.
    999   %in_x = extractelement <4 x float> %in, i32 0
   1000   %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
   1001   %in_y = extractelement <4 x float> %in, i32 1
   1002   %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
   1003   %in_z = extractelement <4 x float> %in, i32 2
   1004   %z_splat = tail call <4 x float> @smear_f(float %in_z) nounwind readnone
   1005   %in_w = extractelement <4 x float> %in, i32 3
   1006   %w_splat = tail call <4 x float> @smear_f(float %in_w) nounwind readnone
   1007 
          ; Load the four groups of four floats (elements 0, 4, 8 and 12 onward).
   1008   %m0_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   1009   %m0_vptr = bitcast float* %m0_ptr to <4 x float>*
   1010   %m0_vec = load <4 x float>, <4 x float>* %m0_vptr, align 4
   1011   %m4_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   1012   %m4_vptr = bitcast float* %m4_ptr to <4 x float>*
   1013   %m4_vec = load <4 x float>, <4 x float>* %m4_vptr, align 4
   1014   %m8_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
   1015   %m8_vptr = bitcast float* %m8_ptr to <4 x float>*
   1016   %m8_vec = load <4 x float>, <4 x float>* %m8_vptr, align 4
   1017   %m12_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   1018   %m12_vptr = bitcast float* %m12_ptr to <4 x float>*
   1019   %m12_vec = load <4 x float>, <4 x float>* %m12_vptr, align 4
   1020 
          ; Accumulate the four products in x, y, z, w order.
   1021   %x_term = fmul <4 x float> %x_splat, %m0_vec
   1022   %y_term = fmul <4 x float> %y_splat, %m4_vec
   1023   %xy_sum = fadd <4 x float> %x_term, %y_term
   1024   %z_term = fmul <4 x float> %z_splat, %m8_vec
   1025   %xyz_sum = fadd <4 x float> %xy_sum, %z_term
   1026   %w_term = fmul <4 x float> %w_splat, %m12_vec
   1027   %total = fadd <4 x float> %xyz_sum, %w_term
   1028   ret <4 x float> %total
   1029 }
   1030 
   1031 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {
          ; rsMatrixMultiply(const rs_matrix4x4*, float3).
          ; Result lane i is m[12+i] + in.x*m[i] + in.y*m[4+i] + in.z*m[8+i];
          ; the last group of four floats is added unscaled.

          ; Broadcast each input component across a 4-lane vector.
   1032   %in_x = extractelement <3 x float> %in, i32 0
   1033   %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
   1034   %in_y = extractelement <3 x float> %in, i32 1
   1035   %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
   1036   %in_z = extractelement <3 x float> %in, i32 2
   1037   %z_splat = tail call <4 x float> @smear_f(float %in_z) nounwind readnone
   1038 
          ; Load the four groups of four floats (elements 0, 4, 8 and 12 onward).
   1039   %m0_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   1040   %m0_vptr = bitcast float* %m0_ptr to <4 x float>*
   1041   %m0_vec = load <4 x float>, <4 x float>* %m0_vptr, align 4
   1042   %m4_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   1043   %m4_vptr = bitcast float* %m4_ptr to <4 x float>*
   1044   %m4_vec = load <4 x float>, <4 x float>* %m4_vptr, align 4
   1045   %m8_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
   1046   %m8_vptr = bitcast float* %m8_ptr to <4 x float>*
   1047   %m8_vec = load <4 x float>, <4 x float>* %m8_vptr, align 4
   1048   %m12_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   1049   %m12_vptr = bitcast float* %m12_ptr to <4 x float>*
   1050   %m12_vec = load <4 x float>, <4 x float>* %m12_vptr, align 4
   1051 
          ; Start from the unscaled last group, then add the x, y and z products.
   1052   %x_term = fmul <4 x float> %x_splat, %m0_vec
   1053   %with_x = fadd <4 x float> %m12_vec, %x_term
   1054   %y_term = fmul <4 x float> %y_splat, %m4_vec
   1055   %with_xy = fadd <4 x float> %with_x, %y_term
   1056   %z_term = fmul <4 x float> %z_splat, %m8_vec
   1057   %total = fadd <4 x float> %with_xy, %z_term
   1058   ret <4 x float> %total
   1059 }
   1060 
   1061 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {
          ; rsMatrixMultiply(const rs_matrix4x4*, float2).
          ; Result lane i is m[12+i] + in.x*m[i] + in.y*m[4+i]; the last group of
          ; four floats is added unscaled and elements 8..11 are not read.

          ; Broadcast each input component across a 4-lane vector.
   1062   %in_x = extractelement <2 x float> %in, i32 0
   1063   %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
   1064   %in_y = extractelement <2 x float> %in, i32 1
   1065   %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
   1066 
          ; Load the groups starting at elements 0, 4 and 12.
   1067   %m0_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   1068   %m0_vptr = bitcast float* %m0_ptr to <4 x float>*
   1069   %m0_vec = load <4 x float>, <4 x float>* %m0_vptr, align 4
   1070   %m4_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   1071   %m4_vptr = bitcast float* %m4_ptr to <4 x float>*
   1072   %m4_vec = load <4 x float>, <4 x float>* %m4_vptr, align 4
   1073   %m12_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   1074   %m12_vptr = bitcast float* %m12_ptr to <4 x float>*
   1075   %m12_vec = load <4 x float>, <4 x float>* %m12_vptr, align 4
   1076 
          ; Start from the unscaled last group, then add the x and y products.
   1077   %x_term = fmul <4 x float> %x_splat, %m0_vec
   1078   %with_x = fadd <4 x float> %m12_vec, %x_term
   1079   %y_term = fmul <4 x float> %y_splat, %m4_vec
   1080   %total = fadd <4 x float> %with_x, %y_term
   1081   ret <4 x float> %total
   1082 }
   1083 
   1084 
   1085 
   1086 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   1087 ;;;;;;;;;              pixel ops                 ;;;;;;;;;;
   1088 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   1089 
   1090 
   1091 @fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16 ; scale factor and upper clamp bound in rsPackColorTo8888
   1092 @fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16 ; bias added after scaling in rsPackColorTo8888
   1093 @fc_0 = internal constant <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, align 16 ; lower clamp bound in rsPackColorTo8888
   1094 
   1095 declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone ; convert_uchar4(float4), defined outside this file
   1096 declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone ; convert_float4(uchar4), defined outside this file
   1097 
   1098 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
   1099 define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone {
            ; Computes convert_uchar4(clamp(255.0 * color + 0.5, 0.0, 255.0)) using
            ; the splat constants @fc_255.0, @fc_0.5 and @fc_0.
   1100     %scale = load <4 x float>, <4 x float>* @fc_255.0, align 16
   1101     %half = load <4 x float>, <4 x float>* @fc_0.5, align 16
   1102     %zero = load <4 x float>, <4 x float>* @fc_0, align 16
   1103     %scaled = fmul <4 x float> %scale, %color
   1104     %biased = fadd <4 x float> %half, %scaled
   1105     %clamped = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %biased, <4 x float> %zero, <4 x float> %scale) nounwind readnone
   1106     %packed = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %clamped) nounwind readnone
   1107     ret <4 x i8> %packed
   1108 }
   1109 
   1110 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
   1111 define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<3 x float> %color) nounwind readnone {
            ; Pad the float3 to four lanes, set lane 3 to 1.0, then delegate to the
            ; float4 overload.
   1112     %padded = shufflevector <3 x float> %color, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   1113     %with_one = insertelement <4 x float> %padded, float 1.0, i32 3
   1114     %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %with_one) nounwind readnone
   1115     ret <4 x i8> %packed
   1116 }
   1117 
   1118 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
   1119 define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone {
            ; Start from a vector whose lane 3 is already 1.0, fill lanes 0..2 with
            ; r, g, b, then delegate to the float4 overload.
   1120     %with_r = insertelement <4 x float> <float undef, float undef, float undef, float 1.0>, float %r, i32 0
   1121     %with_rg = insertelement <4 x float> %with_r, float %g, i32 1
   1122     %rgb_one = insertelement <4 x float> %with_rg, float %b, i32 2
   1124     %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgb_one) nounwind readnone
   1125     ret <4 x i8> %packed
   1126 }
   1127 
   1128 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
   1129 define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone {
            ; Assemble <r, g, b, a> lane by lane, then delegate to the float4
            ; overload.
   1130     %with_r = insertelement <4 x float> undef, float %r, i32 0
   1131     %with_rg = insertelement <4 x float> %with_r, float %g, i32 1
   1132     %with_rgb = insertelement <4 x float> %with_rg, float %b, i32 2
   1133     %rgba = insertelement <4 x float> %with_rgb, float %a, i32 3
   1134     %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
   1135     ret <4 x i8> %packed
   1136 }
   1137 
   1138