Home | History | Annotate | Download | only in arch
      1 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; little-endian, ELF name mangling, i64/i128 at natural alignment, native integer widths 32/64, 128-bit stack alignment
      2 target triple = "aarch64-linux-android" ; 64-bit ARM, Linux, Android environment
      3 
      4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      5 ;;;;;;;;;               INTRINSICS               ;;;;;;;;;;
      6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      7 
      8 declare <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone ; lane-wise maximum intrinsics (float, signed int, unsigned int)
      9 declare <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone
     10 declare <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     11 declare <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     12 declare <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     13 declare <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     14 declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     15 declare <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     16 ; Lane-wise minimum intrinsics (float, signed int, unsigned int).
     17 declare <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone
     18 declare <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone
     19 declare <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     20 declare <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     21 declare <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     22 declare <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     23 declare <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     24 declare <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     25 ; NOTE(review): upstream sqshl appears to be overloaded on a single vector type (operands same type as result); the wider operand types below look inconsistent - confirm against IntrinsicsAArch64.td and any callers before changing.
     26 declare <8 x i8>  @llvm.aarch64.neon.sqshl.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
     27 declare <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
     28 declare <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
     29 ; NOTE(review): upstream sqshrun appears to take (wide vector, i32 shift amount); the vector second operand below looks inconsistent - confirm before use.
     30 declare <8 x i8>  @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
     31 declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
     32 declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
     33 ; Reciprocal estimate.
     34 declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float>) nounwind readnone
     35 declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float>) nounwind readnone
     36 ; Reciprocal square-root estimate.
     37 declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone
     38 declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone
     39 ; Reciprocal step (pairs with frecpe).
     40 declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
     41 declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
     42 ; Reciprocal square-root step (pairs with frsqrte).
     43 declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
     44 declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
     45 
     46 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     47 ;;;;;;;;;                HELPERS                 ;;;;;;;;;;
     48 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     49 
     50 define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline { ; splat %in into all four float lanes
     51   %lane0 = insertelement <4 x float> undef, float %in, i32 0
     52   %lane01 = insertelement <4 x float> %lane0, float %in, i32 1
     53   %lane012 = insertelement <4 x float> %lane01, float %in, i32 2
     54   %splat = insertelement <4 x float> %lane012, float %in, i32 3
     55   ret <4 x float> %splat
     56 }
     57 
     58 define internal <4 x i32> @smear_4i(i32 %in) nounwind readnone alwaysinline { ; splat %in into all four i32 lanes
     59   %lane0 = insertelement <4 x i32> undef, i32 %in, i32 0
     60   %lane01 = insertelement <4 x i32> %lane0, i32 %in, i32 1
     61   %lane012 = insertelement <4 x i32> %lane01, i32 %in, i32 2
     62   %splat = insertelement <4 x i32> %lane012, i32 %in, i32 3
     63   ret <4 x i32> %splat
     64 }
     65 
     66 define internal <4 x i16> @smear_4s(i16 %in) nounwind readnone alwaysinline { ; splat %in into all four i16 lanes
     67   %lane0 = insertelement <4 x i16> undef, i16 %in, i32 0
     68   %lane01 = insertelement <4 x i16> %lane0, i16 %in, i32 1
     69   %lane012 = insertelement <4 x i16> %lane01, i16 %in, i32 2
     70   %splat = insertelement <4 x i16> %lane012, i16 %in, i32 3
     71   ret <4 x i16> %splat
     72 }
     73 
     74 
     75 
     76 define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline { ; splat %in into both float lanes
     77   %lane0 = insertelement <2 x float> undef, float %in, i32 0
     78   %splat = insertelement <2 x float> %lane0, float %in, i32 1
     79   ret <2 x float> %splat
     80 }
     81 
     82 define internal <2 x i32> @smear_2i(i32 %in) nounwind readnone alwaysinline { ; splat %in into both i32 lanes
     83   %lane0 = insertelement <2 x i32> undef, i32 %in, i32 0
     84   %splat = insertelement <2 x i32> %lane0, i32 %in, i32 1
     85   ret <2 x i32> %splat
     86 }
     87 
     88 define internal <2 x i16> @smear_2s(i16 %in) nounwind readnone alwaysinline { ; splat %in into both i16 lanes
     89   %lane0 = insertelement <2 x i16> undef, i16 %in, i32 0
     90   %splat = insertelement <2 x i16> %lane0, i16 %in, i32 1
     91   ret <2 x i16> %splat
     92 }
     93 
     94 
     95 define internal <4 x i32> @smear_4i32(i32 %in) nounwind readnone alwaysinline { ; splat %in into four i32 lanes (same body as @smear_4i)
     96   %lane0 = insertelement <4 x i32> undef, i32 %in, i32 0
     97   %lane01 = insertelement <4 x i32> %lane0, i32 %in, i32 1
     98   %lane012 = insertelement <4 x i32> %lane01, i32 %in, i32 2
     99   %splat = insertelement <4 x i32> %lane012, i32 %in, i32 3
    100   ret <4 x i32> %splat
    101 }
    102 
    103 
    104 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    105 ;;;;;;;;;                 CLAMP                  ;;;;;;;;;;
    106 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    107 
    108 define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readonly { ; per-lane fmax(fmin(value, high), low)
    109   %capped = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
    110   %clamped = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %capped, <4 x float> %low) nounwind readnone
    111   ret <4 x float> %clamped
    112 }
    113 
    114 define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readonly { ; scalar bounds are splatted, then the vector-bounds clamp is reused
    115   %low4 = tail call <4 x float> @smear_4f(float %low) nounwind readnone
    116   %high4 = tail call <4 x float> @smear_4f(float %high) nounwind readnone
    117   %clamped = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low4, <4 x float> %high4) nounwind readonly
    118   ret <4 x float> %clamped
    119 }
    120 
    121 define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readonly { ; widen to 4 lanes (lane 3 undef), clamp, keep lanes 0-2
    122   %value4 = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    123   %low4 = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    124   %high4 = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    125   %capped = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %value4, <4 x float> %high4) nounwind readnone
    126   %clamped = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %capped, <4 x float> %low4) nounwind readnone
    127   %result = shufflevector <4 x float> %clamped, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    128   ret <3 x float> %result
    129 }
    130 
    131 define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readonly { ; widen value to 4 lanes, splat scalar bounds, clamp, keep lanes 0-2
    132   %value4 = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    133   %low4 = tail call <4 x float> @smear_4f(float %low) nounwind readnone
    134   %high4 = tail call <4 x float> @smear_4f(float %high) nounwind readnone
    135   %capped = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %value4, <4 x float> %high4) nounwind readnone
    136   %clamped = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %capped, <4 x float> %low4) nounwind readnone
    137   %result = shufflevector <4 x float> %clamped, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    138   ret <3 x float> %result
    139 }
    140 
    141 define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readonly { ; per-lane fmax(fmin(value, high), low)
    142   %capped = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
    143   %clamped = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %capped, <2 x float> %low) nounwind readnone
    144   ret <2 x float> %clamped
    145 }
    146 
    147 define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readonly { ; splat scalar bounds to 2 lanes, then fmax(fmin(value, high), low)
    148   %low2 = tail call <2 x float> @smear_2f(float %low) nounwind readnone
    149   %high2 = tail call <2 x float> @smear_2f(float %high) nounwind readnone
    150   %capped = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %value, <2 x float> %high2) nounwind readnone
    151   %clamped = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %capped, <2 x float> %low2) nounwind readnone
    152   ret <2 x float> %clamped
    153 }
    154 
    155 define float @_Z5clampfff(float %value, float %low, float %high) nounwind readonly { ; scalar clamp via ordered compares; a false (or NaN) compare selects the bound
    156   %below.high = fcmp olt float %value, %high
    157   %capped = select i1 %below.high, float %value, float %high
    158   %above.low = fcmp ogt float %capped, %low
    159   %clamped = select i1 %above.low, float %capped, float %low
    160   ret float %clamped
    161 }
    162 
    163 
    164 
    165 define <4 x i32> @_Z5clampDv4_iS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readonly { ; signed per-lane smax(smin(value, high), low)
    166   %capped = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
    167   %clamped = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %capped, <4 x i32> %low) nounwind readnone
    168   ret <4 x i32> %clamped
    169 }
    170 
    171 define <4 x i32> @_Z5clampDv4_iii(<4 x i32> %value, i32 %low, i32 %high) nounwind readonly { ; splat scalar bounds, then signed smax(smin(value, high), low)
    172   %low4 = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
    173   %high4 = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
    174   %capped = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value, <4 x i32> %high4) nounwind readnone
    175   %clamped = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
    176   ret <4 x i32> %clamped
    177 }
    178 
    179 define <3 x i32> @_Z5clampDv3_iS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readonly { ; widen to 4 lanes (lane 3 undef), signed clamp, keep lanes 0-2
    180   %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    181   %low4 = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    182   %high4 = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    183   %capped = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
    184   %clamped = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
    185   %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    186   ret <3 x i32> %result
    187 }
    188 
    189 define <3 x i32> @_Z5clampDv3_iii(<3 x i32> %value, i32 %low, i32 %high) nounwind readonly { ; widen value to 4 lanes, splat scalar bounds, signed clamp, keep lanes 0-2
    190   %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    191   %low4 = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
    192   %high4 = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
    193   %capped = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
    194   %clamped = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
    195   %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    196   ret <3 x i32> %result
    197 }
    198 
    199 define <2 x i32> @_Z5clampDv2_iS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readonly { ; signed per-lane smax(smin(value, high), low)
    200   %capped = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
    201   %clamped = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %capped, <2 x i32> %low) nounwind readnone
    202   ret <2 x i32> %clamped
    203 }
    204 
    205 define <2 x i32> @_Z5clampDv2_iii(<2 x i32> %value, i32 %low, i32 %high) nounwind readonly { ; splat scalar bounds to 2 lanes, then signed smax(smin(value, high), low)
    206   %low2 = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
    207   %high2 = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
    208   %capped = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %value, <2 x i32> %high2) nounwind readnone
    209   %clamped = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %capped, <2 x i32> %low2) nounwind readnone
    210   ret <2 x i32> %clamped
    211 }
    212 
    213 
    214 
    215 define <4 x i32> @_Z5clampDv4_jS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readonly { ; unsigned per-lane umax(umin(value, high), low)
    216   %capped = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
    217   %clamped = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %capped, <4 x i32> %low) nounwind readnone
    218   ret <4 x i32> %clamped
    219 }
    220 
    221 define <4 x i32> @_Z5clampDv4_jjj(<4 x i32> %value, i32 %low, i32 %high) nounwind readonly { ; splat scalar bounds, then unsigned umax(umin(value, high), low)
    222   %low4 = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
    223   %high4 = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
    224   %capped = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value, <4 x i32> %high4) nounwind readnone
    225   %clamped = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
    226   ret <4 x i32> %clamped
    227 }
    228 
    229 define <3 x i32> @_Z5clampDv3_jS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readonly { ; widen to 4 lanes (lane 3 undef), unsigned clamp, keep lanes 0-2
    230   %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    231   %low4 = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    232   %high4 = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    233   %capped = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
    234   %clamped = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
    235   %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    236   ret <3 x i32> %result
    237 }
    238 
    239 define <3 x i32> @_Z5clampDv3_jjj(<3 x i32> %value, i32 %low, i32 %high) nounwind readonly { ; widen value to 4 lanes, splat scalar bounds, unsigned clamp, keep lanes 0-2
    240   %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    241   %low4 = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
    242   %high4 = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
    243   %capped = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
    244   %clamped = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
    245   %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    246   ret <3 x i32> %result
    247 }
    248 
    249 define <2 x i32> @_Z5clampDv2_jS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readonly { ; unsigned per-lane umax(umin(value, high), low)
    250   %capped = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
    251   %clamped = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %capped, <2 x i32> %low) nounwind readnone
    252   ret <2 x i32> %clamped
    253 }
    254 
    255 define <2 x i32> @_Z5clampDv2_jjj(<2 x i32> %value, i32 %low, i32 %high) nounwind readonly { ; splat scalar bounds to 2 lanes, then unsigned umax(umin(value, high), low)
    256   %low2 = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
    257   %high2 = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
    258   %capped = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %value, <2 x i32> %high2) nounwind readnone
    259   %clamped = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %capped, <2 x i32> %low2) nounwind readnone
    260   ret <2 x i32> %clamped
    261 }
    262 
    263 
    264 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    265 ;;;;;;;;;                  FMAX                  ;;;;;;;;;;
    266 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    267 
    268 define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone { ; per-lane NEON fmax; touches no memory, so readnone (its readnone caller @_Z3maxDv4_fS_ relies on that)
    269   %vmax = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
    270   ret <4 x float> %vmax
    271 }
    272 
    273 define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone { ; splat %v2, then per-lane fmax; only readnone callees, so readnone (matches readnone caller @_Z3maxDv4_ff)
    274   %v2.splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
    275   %vmax = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %v1, <4 x float> %v2.splat) nounwind readnone
    276   ret <4 x float> %vmax
    277 }
    278 
    279 define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone { ; widen to 4 lanes (lane 3 undef), fmax, keep lanes 0-2; no memory access, so readnone (matches readnone caller @_Z3maxDv3_fS_)
    280   %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    281   %b4 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    282   %vmax4 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a4, <4 x float> %b4) nounwind readnone
    283   %vmax3 = shufflevector <4 x float> %vmax4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    284   ret <3 x float> %vmax3
    285 }
    286 
    287 define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone { ; widen %v1 to 4 lanes, splat %v2, fmax, keep lanes 0-2; no memory access, so readnone (matches readnone caller @_Z3maxDv3_ff)
    288   %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    289   %v2.splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
    290   %vmax4 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a4, <4 x float> %v2.splat) nounwind readnone
    291   %vmax3 = shufflevector <4 x float> %vmax4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    292   ret <3 x float> %vmax3
    293 }
    294 
    295 define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone { ; per-lane NEON fmax; touches no memory, so readnone (matches readnone caller @_Z3maxDv2_fS_)
    296   %vmax = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
    297   ret <2 x float> %vmax
    298 }
    299 
    300 define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone { ; splat %v2, then per-lane fmax; only readnone callees, so readnone (matches readnone caller @_Z3maxDv2_ff)
    301   %v2.splat = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
    302   %vmax = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %v1, <2 x float> %v2.splat) nounwind readnone
    303   ret <2 x float> %vmax
    304 }
    305 
    306 define float @_Z4fmaxff(float %v1, float %v2) nounwind readnone { ; scalar max; pure compare/select, so readnone like @_Z4fminff and its readnone caller @_Z3maxff
    307   %v1.greater = fcmp ogt float %v1, %v2 ; ordered compare: false when either operand is NaN, in which case %v2 is returned
    308   %larger = select i1 %v1.greater, float %v1, float %v2
    309   ret float %larger
    310 }
    311 
    312 
    313 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    314 ;;;;;;;;;                  FMIN                  ;;;;;;;;;;
    315 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    316 
    317 define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readonly { ; per-lane NEON fmin
    318   %vmin = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
    319   ret <4 x float> %vmin
    320 }
    321 
    322 define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readonly { ; splat %v2, then per-lane fmin
    323   %v2.splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
    324   %vmin = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %v1, <4 x float> %v2.splat) nounwind readnone
    325   ret <4 x float> %vmin
    326 }
    327 
    328 define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readonly { ; widen to 4 lanes (lane 3 undef), fmin, keep lanes 0-2
    329   %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    330   %b4 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    331   %vmin4 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %a4, <4 x float> %b4) nounwind readnone
    332   %vmin3 = shufflevector <4 x float> %vmin4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    333   ret <3 x float> %vmin3
    334 }
    335 
    336 define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readonly { ; widen %v1 to 4 lanes, splat %v2, fmin, keep lanes 0-2
    337   %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    338   %v2.splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
    339   %vmin4 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %a4, <4 x float> %v2.splat) nounwind readnone
    340   %vmin3 = shufflevector <4 x float> %vmin4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    341   ret <3 x float> %vmin3
    342 }
    343 
    344 define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readonly { ; per-lane NEON fmin
    345   %vmin = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
    346   ret <2 x float> %vmin
    347 }
    348 
    349 define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readonly { ; splat %v2, then per-lane fmin
    350   %v2.splat = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
    351   %vmin = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %v1, <2 x float> %v2.splat) nounwind readnone
    352   ret <2 x float> %vmin
    353 }
    354 
    355 define float @_Z4fminff(float %v1, float %v2) nounwind readnone { ; scalar min via ordered compare + select
    356   %v1.less = fcmp olt float %v1, %v2 ; false when either operand is NaN, in which case %v2 is returned
    357   %smaller = select i1 %v1.less, float %v1, float %v2
    358   ret float %smaller
    359 }
    360 
    361 
    362 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    363 ;;;;;;;;;                  MAX                   ;;;;;;;;;;
    364 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    365 
    366 define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone { ; signed i8 max
    367   %v1.greater = icmp sgt i8 %v1, %v2
    368   %larger = select i1 %v1.greater, i8 %v1, i8 %v2
    369   ret i8 %larger
    370 }
    371 
    372 define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone { ; sign-extend to i32 lanes, smax, truncate back
    373   %a32 = sext <2 x i8> %v1 to <2 x i32>
    374   %b32 = sext <2 x i8> %v2 to <2 x i32>
    375   %max32 = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %a32, <2 x i32> %b32) nounwind readnone
    376   %max8 = trunc <2 x i32> %max32 to <2 x i8>
    377   ret <2 x i8> %max8
    378 }
    379 
    380 define <3 x i8> @_Z3maxDv3_cS_(i32 %v1, i32 %v2) nounwind readnone { ; operands arrive packed in i32; reinterpret as <4 x i8>, signed max in i32 lanes, return lanes 0-2
    381   %a8 = bitcast i32 %v1 to <4 x i8>
    382   %b8 = bitcast i32 %v2 to <4 x i8>
    383   %a32 = sext <4 x i8> %a8 to <4 x i32>
    384   %b32 = sext <4 x i8> %b8 to <4 x i32>
    385   %max32 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a32, <4 x i32> %b32) nounwind readnone
    386   %max32.lo3 = shufflevector <4 x i32> %max32, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    387   %max8 = trunc <3 x i32> %max32.lo3 to <3 x i8>
    388   ret <3 x i8> %max8
    389 }
    390 
    391 define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone { ; sign-extend to i32 lanes, smax, truncate back
    392   %a32 = sext <4 x i8> %v1 to <4 x i32>
    393   %b32 = sext <4 x i8> %v2 to <4 x i32>
    394   %max32 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a32, <4 x i32> %b32) nounwind readnone
    395   %max8 = trunc <4 x i32> %max32 to <4 x i8>
    396   ret <4 x i8> %max8
    397 }
    398 
    399 define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone { ; signed i16 max
    400   %v1.greater = icmp sgt i16 %v1, %v2
    401   %larger = select i1 %v1.greater, i16 %v1, i16 %v2
    402   ret i16 %larger
    403 }
    404 
    405 define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone { ; sign-extend to i32 lanes, smax, truncate back
    406   %a32 = sext <2 x i16> %v1 to <2 x i32>
    407   %b32 = sext <2 x i16> %v2 to <2 x i32>
    408   %max32 = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %a32, <2 x i32> %b32) nounwind readnone
    409   %max16 = trunc <2 x i32> %max32 to <2 x i16>
    410   ret <2 x i16> %max16
    411 }
    412 
    413 define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone { ; sign-extend, widen to 4 lanes (lane 3 undef), smax, keep lanes 0-2, truncate
    414   %a32 = sext <3 x i16> %v1 to <3 x i32>
    415   %b32 = sext <3 x i16> %v2 to <3 x i32>
    416   %a32x4 = shufflevector <3 x i32> %a32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    417   %b32x4 = shufflevector <3 x i32> %b32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    418   %max32x4 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a32x4, <4 x i32> %b32x4) nounwind readnone
    419   %max32x3 = shufflevector <4 x i32> %max32x4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    420   %max16 = trunc <3 x i32> %max32x3 to <3 x i16>
    421   ret <3 x i16> %max16
    422 }
    423 
    424 define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone { ; sign-extend to i32 lanes, smax, truncate back
    425   %a32 = sext <4 x i16> %v1 to <4 x i32>
    426   %b32 = sext <4 x i16> %v2 to <4 x i32>
    427   %max32 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a32, <4 x i32> %b32) nounwind readnone
    428   %max16 = trunc <4 x i32> %max32 to <4 x i16>
    429   ret <4 x i16> %max16
    430 }
    431 
    432 define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone { ; signed i32 max
    433   %v1.greater = icmp sgt i32 %v1, %v2
    434   %larger = select i1 %v1.greater, i32 %v1, i32 %v2
    435   ret i32 %larger
    436 }
    437 
    438 define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone { ; signed per-lane max
    439   %vmax = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
    440   ret <2 x i32> %vmax
    441 }
    442 
    443 define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone { ; widen to 4 lanes (lane 3 undef), signed max, keep lanes 0-2
    444   %a4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    445   %b4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    446   %vmax4 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
    447   %vmax3 = shufflevector <4 x i32> %vmax4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    448   ret <3 x i32> %vmax3
    449 }
    450 
    451 define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone { ; signed per-lane max
    452   %vmax = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
    453   ret <4 x i32> %vmax
    454 }
    455 
    456 define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone { ; signed i64 max
    457   %v1.greater = icmp sgt i64 %v1, %v2
    458   %larger = select i1 %v1.greater, i64 %v1, i64 %v2
    459   ret i64 %larger
    460 }
    461 
    462 ; TODO:  long vector types
    463 
    464 define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone { ; unsigned i8 max
    465   %v1.greater = icmp ugt i8 %v1, %v2
    466   %larger = select i1 %v1.greater, i8 %v1, i8 %v2
    467   ret i8 %larger
    468 }
    469 
    470 define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone { ; zero-extend to i32 lanes, umax, truncate back
    471   %a32 = zext <2 x i8> %v1 to <2 x i32>
    472   %b32 = zext <2 x i8> %v2 to <2 x i32>
    473   %max32 = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %a32, <2 x i32> %b32) nounwind readnone
    474   %max8 = trunc <2 x i32> %max32 to <2 x i8>
    475   ret <2 x i8> %max8
    476 }
    477 
    478 define <3 x i8> @_Z3maxDv3_hS_(i32 %v1, i32 %v2) nounwind readnone { ; operands arrive packed in i32; reinterpret as <4 x i8>, unsigned max in i32 lanes, return lanes 0-2
    479   %a8 = bitcast i32 %v1 to <4 x i8>
    480   %b8 = bitcast i32 %v2 to <4 x i8>
    481   %a32 = zext <4 x i8> %a8 to <4 x i32>
    482   %b32 = zext <4 x i8> %b8 to <4 x i32>
    483   %max32 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a32, <4 x i32> %b32) nounwind readnone
    484   %max32.lo3 = shufflevector <4 x i32> %max32, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    485   %max8 = trunc <3 x i32> %max32.lo3 to <3 x i8>
    486   ret <3 x i8> %max8
    487 }
    488 
    489 define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone { ; zero-extend to i32 lanes, umax, truncate back
    490   %a32 = zext <4 x i8> %v1 to <4 x i32>
    491   %b32 = zext <4 x i8> %v2 to <4 x i32>
    492   %max32 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a32, <4 x i32> %b32) nounwind readnone
    493   %max8 = trunc <4 x i32> %max32 to <4 x i8>
    494   ret <4 x i8> %max8
    495 }
    496 
    497 define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone { ; unsigned i16 max
    498   %v1.greater = icmp ugt i16 %v1, %v2
    499   %larger = select i1 %v1.greater, i16 %v1, i16 %v2
    500   ret i16 %larger
    501 }
    502 
    503 define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone { ; zero-extend to i32 lanes, umax, truncate back
    504   %a32 = zext <2 x i16> %v1 to <2 x i32>
    505   %b32 = zext <2 x i16> %v2 to <2 x i32>
    506   %max32 = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %a32, <2 x i32> %b32) nounwind readnone
    507   %max16 = trunc <2 x i32> %max32 to <2 x i16>
    508   ret <2 x i16> %max16
    509 }
    510 
    511 define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone { ; zero-extend, widen to 4 lanes (lane 3 undef), umax, keep lanes 0-2, truncate
    512   %a32 = zext <3 x i16> %v1 to <3 x i32>
    513   %b32 = zext <3 x i16> %v2 to <3 x i32>
    514   %a32x4 = shufflevector <3 x i32> %a32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    515   %b32x4 = shufflevector <3 x i32> %b32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    516   %max32x4 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a32x4, <4 x i32> %b32x4) nounwind readnone
    517   %max32x3 = shufflevector <4 x i32> %max32x4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    518   %max16 = trunc <3 x i32> %max32x3 to <3 x i16>
    519   ret <3 x i16> %max16
    520 }
    521 
    522 define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone { ; zero-extend to i32 lanes, umax, truncate back
    523   %a32 = zext <4 x i16> %v1 to <4 x i32>
    524   %b32 = zext <4 x i16> %v2 to <4 x i32>
    525   %max32 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a32, <4 x i32> %b32) nounwind readnone
    526   %max16 = trunc <4 x i32> %max32 to <4 x i16>
    527   ret <4 x i16> %max16
    528 }
    529 
    530 define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone { ; unsigned i32 max
    531   %v1.greater = icmp ugt i32 %v1, %v2
    532   %larger = select i1 %v1.greater, i32 %v1, i32 %v2
    533   ret i32 %larger
    534 }
    535 
    536 define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone { ; unsigned per-lane max
    537   %vmax = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
    538   ret <2 x i32> %vmax
    539 }
    540 
    541 define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone { ; widen to 4 lanes (lane 3 undef), unsigned max, keep lanes 0-2
    542   %a4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    543   %b4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    544   %vmax4 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
    545   %vmax3 = shufflevector <4 x i32> %vmax4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    546   ret <3 x i32> %vmax3
    547 }
    548 
    549 define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone { ; unsigned per-lane max
    550   %vmax = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
    551   ret <4 x i32> %vmax
    552 }
    553 
    554 
    555 ; TODO:  long vector types
    556 
    557 define float @_Z3maxff(float %v1, float %v2) nounwind readnone { ; forwards to @_Z4fmaxff
    558   %fwd = tail call float @_Z4fmaxff(float %v1, float %v2)
    559   ret float %fwd
    560 }
    561 
    562 define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone { ; forwards to @_Z4fmaxDv2_fS_
    563   %fwd = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
    564   ret <2 x float> %fwd
    565 }
    566 
    567 define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone { ; forwards to @_Z4fmaxDv2_ff
    568   %fwd = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
    569   ret <2 x float> %fwd
    570 }
    571 
    572 define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
          ; max(float3, float3) forwards directly to fmax(float3, float3).
    573   %fmax_result = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
    574   ret <3 x float> %fmax_result
    575 }
    576 
    577 define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
          ; max(float3, float) forwards directly to fmax(float3, float).
    578   %fmax_result = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
    579   ret <3 x float> %fmax_result
    580 }
    581 
    582 define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
          ; max(float4, float4) forwards directly to fmax(float4, float4).
    583   %fmax_result = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
    584   ret <4 x float> %fmax_result
    585 }
    586 
    587 define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
          ; max(float4, float) forwards directly to fmax(float4, float).
    588   %fmax_result = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
    589   ret <4 x float> %fmax_result
    590 }
    591 
    592 
    593 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    594 ;;;;;;;;;                  MIN                   ;;;;;;;;;;
    595 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    596 
    597 define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {
          ; min(char, char): signed compare, then pick the smaller operand
          ; (%v2 is returned when the two are equal).
    598   %v1_is_smaller = icmp slt i8 %v1, %v2
    599   %smaller = select i1 %v1_is_smaller, i8 %v1, i8 %v2
    600   ret i8 %smaller
    601 }
    602 
    603 define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
          ; min(char2, char2): sign-extend the lanes to i32, take the v2i32
          ; signed min, and narrow the result back to i8 lanes.
    604   %lhs32 = sext <2 x i8> %v1 to <2 x i32>
    605   %rhs32 = sext <2 x i8> %v2 to <2 x i32>
    606   %min32 = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %lhs32, <2 x i32> %rhs32) nounwind readnone
    607   %min8 = trunc <2 x i32> %min32 to <2 x i8>
    608   ret <2 x i8> %min8
    609 }
    610 
    611 define <3 x i8> @_Z3minDv3_cS_(i32 %v1, i32 %v2) nounwind readnone {
          ; min(char3, char3). Both operands arrive as i32 values and are
          ; reinterpreted as four i8 lanes; the lanes are sign-extended, reduced
          ; with the v4i32 signed min, and lanes 0-2 are narrowed back to i8.
    612   %lhs_bytes = bitcast i32 %v1 to <4 x i8>
    613   %rhs_bytes = bitcast i32 %v2 to <4 x i8>
    614   %lhs32 = sext <4 x i8> %lhs_bytes to <4 x i32>
    615   %rhs32 = sext <4 x i8> %rhs_bytes to <4 x i32>
    616   %min4 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %lhs32, <4 x i32> %rhs32) nounwind readnone
    617   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    618   %min8 = trunc <3 x i32> %min3 to <3 x i8>
    619   ret <3 x i8> %min8
    620 }
    621 
    622 define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
          ; min(char4, char4): sign-extend the lanes to i32, take the v4i32
          ; signed min, and narrow the result back to i8 lanes.
    623   %lhs32 = sext <4 x i8> %v1 to <4 x i32>
    624   %rhs32 = sext <4 x i8> %v2 to <4 x i32>
    625   %min32 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %lhs32, <4 x i32> %rhs32) nounwind readnone
    626   %min8 = trunc <4 x i32> %min32 to <4 x i8>
    627   ret <4 x i8> %min8
    628 }
    629 
    630 define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {
          ; min(short, short): signed compare, then pick the smaller operand
          ; (%v2 is returned when the two are equal).
    631   %v1_is_smaller = icmp slt i16 %v1, %v2
    632   %smaller = select i1 %v1_is_smaller, i16 %v1, i16 %v2
    633   ret i16 %smaller
    634 }
    635 
    636 define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
          ; min(short2, short2): sign-extend the lanes to i32, take the v2i32
          ; signed min, and narrow the result back to i16 lanes.
    637   %lhs32 = sext <2 x i16> %v1 to <2 x i32>
    638   %rhs32 = sext <2 x i16> %v2 to <2 x i32>
    639   %min32 = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %lhs32, <2 x i32> %rhs32) nounwind readnone
    640   %min16 = trunc <2 x i32> %min32 to <2 x i16>
    641   ret <2 x i16> %min16
    642 }
    643 
    644 define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
          ; min(short3, short3): sign-extend to i32 lanes, pad to four lanes
          ; (mask index 3 selects a lane of the undef operand), take the v4i32
          ; signed min, then drop back to three lanes and narrow to i16.
    645   %lhs32 = sext <3 x i16> %v1 to <3 x i32>
    646   %rhs32 = sext <3 x i16> %v2 to <3 x i32>
    647   %lhs4 = shufflevector <3 x i32> %lhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    648   %rhs4 = shufflevector <3 x i32> %rhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    649   %min4 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
    650   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    651   %min16 = trunc <3 x i32> %min3 to <3 x i16>
    652   ret <3 x i16> %min16
    653 }
    654 
    655 define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
          ; min(short4, short4): sign-extend the lanes to i32, take the v4i32
          ; signed min, and narrow the result back to i16 lanes.
    656   %lhs32 = sext <4 x i16> %v1 to <4 x i32>
    657   %rhs32 = sext <4 x i16> %v2 to <4 x i32>
    658   %min32 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %lhs32, <4 x i32> %rhs32) nounwind readnone
    659   %min16 = trunc <4 x i32> %min32 to <4 x i16>
    660   ret <4 x i16> %min16
    661 }
    662 
    663 define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {
          ; min(int, int): signed compare, then pick the smaller operand
          ; (%v2 is returned when the two are equal).
    664   %v1_is_smaller = icmp slt i32 %v1, %v2
    665   %smaller = select i1 %v1_is_smaller, i32 %v1, i32 %v2
    666   ret i32 %smaller
    667 }
    668 
    669 define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
          ; min(int2, int2): a single lane-wise signed NEON min.
    670   %lane_min = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
    671   ret <2 x i32> %lane_min
    672 }
    673 
    674 define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
          ; min(int3, int3): widen both operands to four lanes (mask index 3
          ; selects a lane of the undef operand), run the v4i32 signed min,
          ; then keep lanes 0-2 of the result.
    675   %lhs4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    676   %rhs4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    677   %min4 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
    678   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    679   ret <3 x i32> %min3
    680 }
    681 
    682 define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
          ; min(int4, int4): a single lane-wise signed NEON min.
    683   %lane_min = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
    684   ret <4 x i32> %lane_min
    685 }
    686 
    687 define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {
          ; min(long long, long long): signed 64-bit compare, then pick the
          ; smaller operand (%v2 is returned when the two are equal).
    688   %v1_is_smaller = icmp slt i64 %v1, %v2
    689   %smaller = select i1 %v1_is_smaller, i64 %v1, i64 %v2
    690   ret i64 %smaller
    691 }
    692 
    693 ; TODO:  long vector types
    694 
    695 define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
          ; min(uchar, uchar): unsigned compare, then pick the smaller operand
          ; (%v2 is returned when the two are equal).
    696   %v1_is_smaller = icmp ult i8 %v1, %v2
    697   %smaller = select i1 %v1_is_smaller, i8 %v1, i8 %v2
    698   ret i8 %smaller
    699 }
    700 
    701 define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
          ; min(uchar2, uchar2): zero-extend the lanes to i32, take the v2i32
          ; unsigned min, and narrow the result back to i8 lanes.
    702   %lhs32 = zext <2 x i8> %v1 to <2 x i32>
    703   %rhs32 = zext <2 x i8> %v2 to <2 x i32>
    704   %min32 = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %lhs32, <2 x i32> %rhs32) nounwind readnone
    705   %min8 = trunc <2 x i32> %min32 to <2 x i8>
    706   ret <2 x i8> %min8
    707 }
    708 
    709 define <3 x i8> @_Z3minDv3_hS_(i32 %v1, i32 %v2) nounwind readnone {
          ; min(uchar3, uchar3). Both operands arrive as i32 values and are
          ; reinterpreted as four i8 lanes; the lanes are zero-extended, reduced
          ; with the v4i32 unsigned min, and lanes 0-2 are narrowed back to i8.
    710   %lhs_bytes = bitcast i32 %v1 to <4 x i8>
    711   %rhs_bytes = bitcast i32 %v2 to <4 x i8>
    712   %lhs32 = zext <4 x i8> %lhs_bytes to <4 x i32>
    713   %rhs32 = zext <4 x i8> %rhs_bytes to <4 x i32>
    714   %min4 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %lhs32, <4 x i32> %rhs32) nounwind readnone
    715   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    716   %min8 = trunc <3 x i32> %min3 to <3 x i8>
    717   ret <3 x i8> %min8
    718 }
    719 
    720 define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
          ; min(uchar4, uchar4): zero-extend the lanes to i32, take the v4i32
          ; unsigned min, and narrow the result back to i8 lanes.
    721   %lhs32 = zext <4 x i8> %v1 to <4 x i32>
    722   %rhs32 = zext <4 x i8> %v2 to <4 x i32>
    723   %min32 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %lhs32, <4 x i32> %rhs32) nounwind readnone
    724   %min8 = trunc <4 x i32> %min32 to <4 x i8>
    725   ret <4 x i8> %min8
    726 }
    727 
    728 define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
          ; min(ushort, ushort): unsigned compare, then pick the smaller operand
          ; (%v2 is returned when the two are equal).
    729   %v1_is_smaller = icmp ult i16 %v1, %v2
    730   %smaller = select i1 %v1_is_smaller, i16 %v1, i16 %v2
    731   ret i16 %smaller
    732 }
    733 
    734 define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
          ; min(ushort2, ushort2): zero-extend the lanes to i32, take the v2i32
          ; unsigned min, and narrow the result back to i16 lanes.
    735   %lhs32 = zext <2 x i16> %v1 to <2 x i32>
    736   %rhs32 = zext <2 x i16> %v2 to <2 x i32>
    737   %min32 = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %lhs32, <2 x i32> %rhs32) nounwind readnone
    738   %min16 = trunc <2 x i32> %min32 to <2 x i16>
    739   ret <2 x i16> %min16
    740 }
    741 
    742 define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
          ; min(ushort3, ushort3): zero-extend to i32 lanes, pad to four lanes
          ; (mask index 3 selects a lane of the undef operand), take the v4i32
          ; unsigned min, then drop back to three lanes and narrow to i16.
    743   %lhs32 = zext <3 x i16> %v1 to <3 x i32>
    744   %rhs32 = zext <3 x i16> %v2 to <3 x i32>
    745   %lhs4 = shufflevector <3 x i32> %lhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    746   %rhs4 = shufflevector <3 x i32> %rhs32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    747   %min4 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
    748   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    749   %min16 = trunc <3 x i32> %min3 to <3 x i16>
    750   ret <3 x i16> %min16
    751 }
    752 
    753 define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
          ; min(ushort4, ushort4): zero-extend the lanes to i32, take the v4i32
          ; unsigned min, and narrow the result back to i16 lanes.
    754   %lhs32 = zext <4 x i16> %v1 to <4 x i32>
    755   %rhs32 = zext <4 x i16> %v2 to <4 x i32>
    756   %min32 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %lhs32, <4 x i32> %rhs32) nounwind readnone
    757   %min16 = trunc <4 x i32> %min32 to <4 x i16>
    758   ret <4 x i16> %min16
    759 }
    760 
    761 define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {
          ; min(uint, uint): unsigned compare, then pick the smaller operand
          ; (%v2 is returned when the two are equal).
    762   %v1_is_smaller = icmp ult i32 %v1, %v2
    763   %smaller = select i1 %v1_is_smaller, i32 %v1, i32 %v2
    764   ret i32 %smaller
    765 }
    766 
    767 define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
          ; min(uint2, uint2): a single lane-wise unsigned NEON min.
    768   %lane_min = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
    769   ret <2 x i32> %lane_min
    770 }
    771 
    772 define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
          ; min(uint3, uint3): widen both operands to four lanes (mask index 3
          ; selects a lane of the undef operand), run the v4i32 unsigned min,
          ; then keep lanes 0-2 of the result.
    773   %lhs4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    774   %rhs4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    775   %min4 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
    776   %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
    777   ret <3 x i32> %min3
    778 }
    779 
    780 define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
          ; min(uint4, uint4): a single lane-wise unsigned NEON min.
    781   %lane_min = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
    782   ret <4 x i32> %lane_min
    783 }
    784 
    785 
    786 ; TODO:  long vector types
    787 
    788 define float @_Z3minff(float %v1, float %v2) nounwind readnone {
          ; min(float, float) forwards directly to fmin(float, float).
    789   %fmin_result = tail call float @_Z4fminff(float %v1, float %v2)
    790   ret float %fmin_result
    791 }
    792 
    793 define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
          ; min(float2, float2) forwards directly to fmin(float2, float2).
    794   %fmin_result = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2)
    795   ret <2 x float> %fmin_result
    796 }
    797 
    798 define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
          ; min(float2, float) forwards directly to fmin(float2, float).
    799   %fmin_result = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2)
    800   ret <2 x float> %fmin_result
    801 }
    802 
    803 define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
          ; min(float3, float3) forwards directly to fmin(float3, float3).
    804   %fmin_result = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2)
    805   ret <3 x float> %fmin_result
    806 }
    807 
    808 define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
          ; min(float3, float) forwards directly to fmin(float3, float).
    809   %fmin_result = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2)
    810   ret <3 x float> %fmin_result
    811 }
    812 
    813 define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
          ; min(float4, float4) forwards directly to fmin(float4, float4).
    814   %fmin_result = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2)
    815   ret <4 x float> %fmin_result
    816 }
    817 
    818 define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
          ; min(float4, float) forwards directly to fmin(float4, float).
    819   %fmin_result = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2)
    820   ret <4 x float> %fmin_result
    821 }
    822 
    823 
    824 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    825 ;;;;;;;;;                  YUV                   ;;;;;;;;;;
    826 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    827 
    828 @yuv_U = internal constant <4 x i32> <i32 0, i32 -100, i32 516, i32 0>, align 16 ; per-lane multipliers for the (U - 128) term in rsYuvToRGBA_uchar4
    829 @yuv_V = internal constant <4 x i32> <i32 409, i32 -208, i32 0, i32 0>, align 16 ; per-lane multipliers for the (V - 128) term in rsYuvToRGBA_uchar4
    830 @yuv_0 = internal constant <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16 ; lower clamp bound used by rsYuvToRGBA_uchar4
    831 @yuv_255 = internal constant <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, align 16 ; upper clamp bound; applied before the >> 8, so 65535 becomes 255 after the shift
    832 
    833 
    834 define <4 x i8> @_Z18rsYuvToRGBA_uchar4hhh(i8 %pY, i8 %pU, i8 %pV) nounwind readnone alwaysinline {
          ; rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v).
          ; Fixed-point conversion: each lane is 298*(Y-16) plus the per-lane
          ; @yuv_U / @yuv_V multiples of (U-128) and (V-128); the sum is clamped
          ; to [0, 65535], shifted right by 8 and narrowed to a byte.
          ; NOTE(review): lane 3 of both coefficient vectors is 0, so the fourth
          ; output byte is only the clamped luma term - confirm whether callers
          ; expect it to be set elsewhere.
    835   %y32 = zext i8 %pY to i32
    836   %u32 = zext i8 %pU to i32
    837   %v32 = zext i8 %pV to i32
    838 
          ; Remove the offsets and pre-scale the luma term.
    839   %y_off = add i32 -16, %y32
    840   %y_scaled = mul i32 298, %y_off
    841   %u_off = add i32 -128, %u32
    842   %v_off = add i32 -128, %v32
          ; smear_4i32 is defined outside this part of the file; presumably it
          ; splats the scalar into all four lanes like smear_f - TODO confirm.
    843   %y_vec = tail call <4 x i32> @smear_4i32(i32 %y_scaled) nounwind readnone
    844   %u_vec = tail call <4 x i32> @smear_4i32(i32 %u_off) nounwind readnone
    845   %v_vec = tail call <4 x i32> @smear_4i32(i32 %v_off) nounwind readnone
    846 
    847   %u_coef = load <4 x i32>, <4 x i32>* @yuv_U, align 8
    848   %v_coef = load <4 x i32>, <4 x i32>* @yuv_V, align 8
    849   %u_term = mul <4 x i32> %u_vec, %u_coef
    850   %v_term = mul <4 x i32> %v_vec, %v_coef
    851   %sum_yu = add <4 x i32> %y_vec, %u_term
    852   %sum_yuv = add <4 x i32> %sum_yu, %v_term
    853 
    854  ; %r1 = tail call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %_y3, <4 x i32> <i32 8, i32 8, i32 8, i32 8>) nounwind readnone
    855 ;  %r2 = trunc <4 x i16> %r1 to <4 x i8>
    856 ;  ret <4 x i8> %r2
    857 
          ; Clamp with smax/smin, then drop the 8 fractional bits.
    858   %lo = load <4 x i32>, <4 x i32>* @yuv_0, align 8
    859   %hi = load <4 x i32>, <4 x i32>* @yuv_255, align 8
    860   %clamped_lo = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %sum_yuv, <4 x i32> %lo) nounwind readnone
    861   %clamped = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %clamped_lo, <4 x i32> %hi) nounwind readnone
    862   %shifted = lshr <4 x i32> %clamped, <i32 8, i32 8, i32 8, i32 8>
    863   %bytes = trunc <4 x i32> %shifted to <4 x i8>
    864   ret <4 x i8> %bytes
    865 }
    866 
    867 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    868 ;;;;;;;;;              half_RECIP              ;;;;;;;;;;
    869 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    870 
    871 define <2 x float> @_Z10half_recipDv2_f(<2 x float> %v) nounwind readnone {
          ; half_recip(float2): start from the NEON reciprocal estimate (frecpe)
          ; and apply two reciprocal-step (frecps) + multiply refinement rounds.
    872   %estimate = tail call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %v) nounwind readnone
    873   %step1 = tail call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %estimate, <2 x float> %v) nounwind readnone
    874   %refined1 = fmul <2 x float> %estimate, %step1
    875   %step2 = tail call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %refined1, <2 x float> %v) nounwind readnone
    876   %refined2 = fmul <2 x float> %step2, %refined1
    877   ret <2 x float> %refined2
    878 }
    879 
    880 define <4 x float> @_Z10half_recipDv4_f(<4 x float> %v) nounwind readnone {
          ; half_recip(float4): start from the NEON reciprocal estimate (frecpe)
          ; and apply two reciprocal-step (frecps) + multiply refinement rounds.
    881   %estimate = tail call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %v) nounwind readnone
    882   %step1 = tail call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %estimate, <4 x float> %v) nounwind readnone
    883   %refined1 = fmul <4 x float> %estimate, %step1
    884   %step2 = tail call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %refined1, <4 x float> %v) nounwind readnone
    885   %refined2 = fmul <4 x float> %step2, %refined1
    886   ret <4 x float> %refined2
    887 }
    888 
    889 define <3 x float> @_Z10half_recipDv3_f(<3 x float> %v) nounwind readnone {
          ; half_recip(float3): pad to four lanes (mask index 3 selects a lane of
          ; the undef operand), reuse the float4 overload, keep lanes 0-2.
    890   %v4 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    891   %recip4 = tail call <4 x float> @_Z10half_recipDv4_f(<4 x float> %v4) nounwind readnone
    892   %recip3 = shufflevector <4 x float> %recip4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    893   ret <3 x float> %recip3
    894 }
    895 
    896 
    897 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    898 ;;;;;;;;;              half_RSQRT              ;;;;;;;;;;
    899 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    900 
    901 define float @_Z10half_rsqrtf(float %v) nounwind readnone {
          ; half_rsqrt(float): place the scalar in lane 0 of a <2 x float>
          ; (lane 1 stays undef), run the same frsqrte estimate plus one
          ; frsqrts/multiply refinement round as the vector overloads, and
          ; return lane 0.
          ; The function is marked nounwind readnone like its vector siblings:
          ; the body is pure arithmetic and its only calls are intrinsics that
          ; are already tagged nounwind readnone at the call sites.
    902   %v2 = insertelement <2 x float> undef, float %v, i32 0
    903   %estimate = tail call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %v2) nounwind readnone
    904   %estimate_sq = fmul <2 x float> %estimate, %estimate
    905   %step = tail call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %v2, <2 x float> %estimate_sq) nounwind readnone
    906   %refined = fmul <2 x float> %estimate, %step
    907   %lane0 = extractelement <2 x float> %refined, i32 0
    908   ret float %lane0
    909 }
    910 
    911 define <2 x float> @_Z10half_rsqrtDv2_f(<2 x float> %v) nounwind readnone {
          ; half_rsqrt(float2): NEON reciprocal-square-root estimate (frsqrte)
          ; followed by one frsqrts + multiply refinement round.
    912   %estimate = tail call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %v) nounwind readnone
    913   %estimate_sq = fmul <2 x float> %estimate, %estimate
    914   %step = tail call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %v, <2 x float> %estimate_sq) nounwind readnone
    915   %refined = fmul <2 x float> %estimate, %step
    916   ret <2 x float> %refined
    917 }
    918 
    919 define <3 x float> @_Z10half_rsqrtDv3_f(<3 x float> %v) nounwind readnone {
          ; half_rsqrt(float3): pad to four lanes (mask index 3 selects a lane of
          ; the undef operand), run the frsqrte estimate plus one
          ; frsqrts/multiply refinement round on four lanes, keep lanes 0-2.
    920   %v4 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    921   %estimate = tail call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %v4) nounwind readnone
    922   %estimate_sq = fmul <4 x float> %estimate, %estimate
    923   %step = tail call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %v4, <4 x float> %estimate_sq) nounwind readnone
    924   %refined = fmul <4 x float> %estimate, %step
    925   %refined3 = shufflevector <4 x float> %refined, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    926   ret <3 x float> %refined3
    927 }
    928 
    929 define <4 x float> @_Z10half_rsqrtDv4_f(<4 x float> %v) nounwind readnone {
          ; half_rsqrt(float4): NEON reciprocal-square-root estimate (frsqrte)
          ; followed by one frsqrts + multiply refinement round.
    930   %estimate = tail call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %v) nounwind readnone
    931   %estimate_sq = fmul <4 x float> %estimate, %estimate
    932   %step = tail call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %v, <4 x float> %estimate_sq) nounwind readnone
    933   %refined = fmul <4 x float> %estimate, %step
    934   ret <4 x float> %refined
    935 }
    936 
    937 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    938 ;;;;;;;;;              matrix                    ;;;;;;;;;;
    939 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    940 
    941 %struct.rs_matrix4x4 = type { [16 x float] } ; 4x4 matrix held as one flat array of 16 floats
    942 %struct.rs_matrix3x3 = type { [9 x float] } ; 3x3 matrix held as one flat array of 9 floats
    943 %struct.rs_matrix2x2 = type { [4 x float] } ; 2x2 matrix held as one flat array of 4 floats
    944 
    945 define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline {
          ; Build a <4 x float> whose four lanes all hold %in.
    946   %lane0 = insertelement <4 x float> undef, float %in, i32 0
    947   %lane01 = insertelement <4 x float> %lane0, float %in, i32 1
    948   %lane012 = insertelement <4 x float> %lane01, float %in, i32 2
    949   %splat = insertelement <4 x float> %lane012, float %in, i32 3
    950   ret <4 x float> %splat
    951 }
    952 
    953 
    954 define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
          ; rsMatrixMultiply(const rs_matrix3x3*, float3).
          ; Result lanes 0-2 are in.x * m[0..2] + in.y * m[3..5] + in.z * m[6..8];
          ; the work is done on four lanes and lane 3 is discarded at the end.
    955   %in_x = extractelement <3 x float> %in, i32 0
    956   %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
    957   %in_y = extractelement <3 x float> %in, i32 1
    958   %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
    959   %in_z = extractelement <3 x float> %in, i32 2
    960   %z_splat = tail call <4 x float> @smear_f(float %in_z) nounwind readnone
    961 
          ; Elements 0..3 of the matrix array.
    962   %m0_ptr = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
    963   %m0_vec_ptr = bitcast float* %m0_ptr to <4 x float>*
    964   %m0_3 = load <4 x float>, <4 x float>* %m0_vec_ptr, align 4
    965 
          ; Elements 3..6 of the matrix array.
    966   %m3_ptr = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
    967   %m3_vec_ptr = bitcast float* %m3_ptr to <4 x float>*
    968   ; %m3_6 = call <4 x float> @llvm.aarch64.neon.ld4.v4f32(i8* %m3_vec_ptr, i32 4) nounwind
    969   %m3_6 = load <4 x float>, <4 x float>* %m3_vec_ptr, align 4
    970 
          ; The last four-float load starts at element 5 (covering 5..8, the end
          ; of the 9-element array); the shuffle then moves elements 6..8 into
          ; lanes 0-2. Mask index 4 selects a lane of the undef operand.
    971   %m5_ptr = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 5
    972   %m5_vec_ptr = bitcast float* %m5_ptr to <4 x float>*
    973 ;  %m5_8 = call <4 x float> @llvm.aarch64.neon.ld4.v4f32(i8* %m5_vec_ptr, i32 4) nounwind
    974   %m5_8 = load <4 x float>, <4 x float>* %m5_vec_ptr, align 4
    975   %m6_8 = shufflevector <4 x float> %m5_8, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
    976 
    977   %x_term = fmul <4 x float> %x_splat, %m0_3
    978   %y_term = fmul <4 x float> %y_splat, %m3_6
    979   %xy_sum = fadd <4 x float> %x_term, %y_term
    980   %z_term = fmul <4 x float> %z_splat, %m6_8
    981   %xyz_sum = fadd <4 x float> %z_term, %xy_sum
    982   %result = shufflevector <4 x float> %xyz_sum, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
    983   ret <3 x float> %result
    984 }
    985 
    986 define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {
          ; rsMatrixMultiply(const rs_matrix3x3*, float2).
          ; Result lanes 0-2 are in.x * m[0..2] + in.y * m[3..5]; elements 6..8
          ; of the matrix are not read. Lane 3 of the four-lane work vectors is
          ; discarded at the end.
    987   %in_x = extractelement <2 x float> %in, i32 0
    988   %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
    989   %in_y = extractelement <2 x float> %in, i32 1
    990   %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
    991 
    992   %m0_ptr = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
    993   %m0_vec_ptr = bitcast float* %m0_ptr to <4 x float>*
    994   %m0_3 = load <4 x float>, <4 x float>* %m0_vec_ptr, align 4
    995   %m3_ptr = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
    996   %m3_vec_ptr = bitcast float* %m3_ptr to <4 x float>*
    997   %m3_6 = load <4 x float>, <4 x float>* %m3_vec_ptr, align 4
    998 
    999   %x_term = fmul <4 x float> %x_splat, %m0_3
   1000   %y_term = fmul <4 x float> %y_splat, %m3_6
   1001   %xy_sum = fadd <4 x float> %x_term, %y_term
   1002   %result = shufflevector <4 x float> %xy_sum, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
   1003   ret <3 x float> %result
   1004 }
   1005 
   1006 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {
          ; rsMatrixMultiply(const rs_matrix4x4*, float4).
          ; Result = in.x * m[0..3] + in.y * m[4..7] + in.z * m[8..11]
          ;        + in.w * m[12..15], accumulated in that order.
   1007   %in_x = extractelement <4 x float> %in, i32 0
   1008   %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
   1009   %in_y = extractelement <4 x float> %in, i32 1
   1010   %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
   1011   %in_z = extractelement <4 x float> %in, i32 2
   1012   %z_splat = tail call <4 x float> @smear_f(float %in_z) nounwind readnone
   1013   %in_w = extractelement <4 x float> %in, i32 3
   1014   %w_splat = tail call <4 x float> @smear_f(float %in_w) nounwind readnone
   1015 
          ; Four consecutive four-float groups of the matrix array.
   1016   %m0_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   1017   %m0_vec_ptr = bitcast float* %m0_ptr to <4 x float>*
   1018   %m0_3 = load <4 x float>, <4 x float>* %m0_vec_ptr, align 4
   1019   %m4_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   1020   %m4_vec_ptr = bitcast float* %m4_ptr to <4 x float>*
   1021   %m4_7 = load <4 x float>, <4 x float>* %m4_vec_ptr, align 4
   1022   %m8_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
   1023   %m8_vec_ptr = bitcast float* %m8_ptr to <4 x float>*
   1024   %m8_11 = load <4 x float>, <4 x float>* %m8_vec_ptr, align 4
   1025   %m12_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   1026   %m12_vec_ptr = bitcast float* %m12_ptr to <4 x float>*
   1027   %m12_15 = load <4 x float>, <4 x float>* %m12_vec_ptr, align 4
   1028 
   1029   %x_term = fmul <4 x float> %x_splat, %m0_3
   1030   %y_term = fmul <4 x float> %y_splat, %m4_7
   1031   %xy_sum = fadd <4 x float> %x_term, %y_term
   1032   %z_term = fmul <4 x float> %z_splat, %m8_11
   1033   %xyz_sum = fadd <4 x float> %xy_sum, %z_term
   1034   %w_term = fmul <4 x float> %w_splat, %m12_15
   1035   %result = fadd <4 x float> %xyz_sum, %w_term
   1036   ret <4 x float> %result
   1037 }
   1038 
   1039 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {
          ; rsMatrixMultiply(const rs_matrix4x4*, float3).
          ; Result = m[12..15] + in.x * m[0..3] + in.y * m[4..7]
          ;        + in.z * m[8..11]; m[12..15] is added unscaled.
   1040   %in_x = extractelement <3 x float> %in, i32 0
   1041   %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
   1042   %in_y = extractelement <3 x float> %in, i32 1
   1043   %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
   1044   %in_z = extractelement <3 x float> %in, i32 2
   1045   %z_splat = tail call <4 x float> @smear_f(float %in_z) nounwind readnone
   1046 
   1047   %m0_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   1048   %m0_vec_ptr = bitcast float* %m0_ptr to <4 x float>*
   1049   %m0_3 = load <4 x float>, <4 x float>* %m0_vec_ptr, align 4
   1050   %m4_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   1051   %m4_vec_ptr = bitcast float* %m4_ptr to <4 x float>*
   1052   %m4_7 = load <4 x float>, <4 x float>* %m4_vec_ptr, align 4
   1053   %m8_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
   1054   %m8_vec_ptr = bitcast float* %m8_ptr to <4 x float>*
   1055   %m8_11 = load <4 x float>, <4 x float>* %m8_vec_ptr, align 4
   1056   %m12_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   1057   %m12_vec_ptr = bitcast float* %m12_ptr to <4 x float>*
   1058   %m12_15 = load <4 x float>, <4 x float>* %m12_vec_ptr, align 4
   1059 
   1060   %x_term = fmul <4 x float> %x_splat, %m0_3
   1061   %acc_x = fadd <4 x float> %m12_15, %x_term
   1062   %y_term = fmul <4 x float> %y_splat, %m4_7
   1063   %acc_xy = fadd <4 x float> %acc_x, %y_term
   1064   %z_term = fmul <4 x float> %z_splat, %m8_11
   1065   %result = fadd <4 x float> %acc_xy, %z_term
   1066   ret <4 x float> %result
   1067 }
   1068 
   1069 define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {
          ; rsMatrixMultiply(const rs_matrix4x4*, float2).
          ; Result = m[12..15] + in.x * m[0..3] + in.y * m[4..7]; elements 8..11
          ; are not read and m[12..15] is added unscaled.
   1070   %in_x = extractelement <2 x float> %in, i32 0
   1071   %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
   1072   %in_y = extractelement <2 x float> %in, i32 1
   1073   %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
   1074 
   1075   %m0_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   1076   %m0_vec_ptr = bitcast float* %m0_ptr to <4 x float>*
   1077   %m0_3 = load <4 x float>, <4 x float>* %m0_vec_ptr, align 4
   1078   %m4_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   1079   %m4_vec_ptr = bitcast float* %m4_ptr to <4 x float>*
   1080   %m4_7 = load <4 x float>, <4 x float>* %m4_vec_ptr, align 4
   1081   %m12_ptr = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   1082   %m12_vec_ptr = bitcast float* %m12_ptr to <4 x float>*
   1083   %m12_15 = load <4 x float>, <4 x float>* %m12_vec_ptr, align 4
   1084 
   1085   %x_term = fmul <4 x float> %x_splat, %m0_3
   1086   %acc_x = fadd <4 x float> %m12_15, %x_term
   1087   %y_term = fmul <4 x float> %y_splat, %m4_7
   1088   %result = fadd <4 x float> %acc_x, %y_term
   1089   ret <4 x float> %result
   1090 }
   1091 
   1092 
   1093 
   1094 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   1095 ;;;;;;;;;              pixel ops                 ;;;;;;;;;;
   1096 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   1097 
   1098 
   1099 @fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16 ; scale factor and upper clamp bound in rsPackColorTo8888(float4)
   1100 @fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16 ; bias added after scaling in rsPackColorTo8888(float4)
   1101 @fc_0 = internal constant <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, align 16 ; lower clamp bound in rsPackColorTo8888(float4)
   1102 
   1103 declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone ; convert_uchar4(float4), defined elsewhere; called by rsPackColorTo8888(float4)
   1104 declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone ; convert_float4(uchar4), defined elsewhere; no caller appears in this part of the file
   1105 
   1106 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
   1107 define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone {
            ; Scale each channel by 255, add 0.5, clamp to [0, 255] with
            ; clamp(float4, float4, float4), then convert with convert_uchar4.
   1108     %scale = load <4 x float>, <4 x float>* @fc_255.0, align 16
   1109     %bias = load <4 x float>, <4 x float>* @fc_0.5, align 16
   1110     %zero = load <4 x float>, <4 x float>* @fc_0, align 16
   1111     %scaled = fmul <4 x float> %scale, %color
   1112     %biased = fadd <4 x float> %bias, %scaled
   1113     %clamped = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %biased, <4 x float> %zero, <4 x float> %scale) nounwind readnone
   1114     %packed = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %clamped) nounwind readnone
   1115     ret <4 x i8> %packed
   1116 }
   1117 
   1118 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
   1119 define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<4 x i32> %color) nounwind readnone {
            ; The float3 colour arrives as <4 x i32>; reinterpret it as four
            ; floats, overwrite lane 3 with 1.0, and reuse the float4 overload.
   1120     %rgb_floats = bitcast <4 x i32> %color to <4 x float>
   1121     %rgba = insertelement <4 x float> %rgb_floats, float 1.0, i32 3
   1122     %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
   1123     ret <4 x i8> %packed
   1124 }
   1125 
   1126 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
   1127 define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone {
            ; Assemble <r, g, b, 1.0> and reuse the float4 overload.
   1128     %with_r = insertelement <4 x float> undef, float %r, i32 0
   1129     %with_rg = insertelement <4 x float> %with_r, float %g, i32 1
   1130     %with_rgb = insertelement <4 x float> %with_rg, float %b, i32 2
   1131     %rgba = insertelement <4 x float> %with_rgb, float 1.0, i32 3
   1132     %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
   1133     ret <4 x i8> %packed
   1134 }
   1135 
   1136 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
   1137 define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone {
            ; Assemble <r, g, b, a> and reuse the float4 overload.
   1138     %with_r = insertelement <4 x float> undef, float %r, i32 0
   1139     %with_rg = insertelement <4 x float> %with_r, float %g, i32 1
   1140     %with_rgb = insertelement <4 x float> %with_rg, float %b, i32 2
   1141     %rgba = insertelement <4 x float> %with_rgb, float %a, i32 3
   1142     %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
   1143     ret <4 x i8> %packed
   1144 }
   1145 
   1146