; Home | History | Annotate | Download | only in AArch64  (source-browser navigation text, not part of the test)
      1 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
      2 
      3 define <8 x i8> @subhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
      4 ;CHECK-LABEL: subhn8b:
      5 ;CHECK: subhn.8b
        ; Loads two <8 x i16> operands from %A and %B, passes them to the
        ; llvm.aarch64.neon.subhn.v8i8 intrinsic and returns its <8 x i8> result.
        ; FileCheck requires "subhn.8b" to appear in llc's output for this function.
      6         %tmp1 = load <8 x i16>, <8 x i16>* %A
      7         %tmp2 = load <8 x i16>, <8 x i16>* %B
      8         %tmp3 = call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
      9         ret <8 x i8> %tmp3
     10 }
     11 
     12 define <4 x i16> @subhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
     13 ;CHECK-LABEL: subhn4h:
     14 ;CHECK: subhn.4h
        ; Loads two <4 x i32> operands from %A and %B, passes them to the
        ; llvm.aarch64.neon.subhn.v4i16 intrinsic and returns its <4 x i16> result.
        ; FileCheck requires "subhn.4h" to appear in llc's output for this function.
     15         %tmp1 = load <4 x i32>, <4 x i32>* %A
     16         %tmp2 = load <4 x i32>, <4 x i32>* %B
     17         %tmp3 = call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
     18         ret <4 x i16> %tmp3
     19 }
     20 
     21 define <2 x i32> @subhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
     22 ;CHECK-LABEL: subhn2s:
     23 ;CHECK: subhn.2s
        ; Loads two <2 x i64> operands from %A and %B, passes them to the
        ; llvm.aarch64.neon.subhn.v2i32 intrinsic and returns its <2 x i32> result.
        ; FileCheck requires "subhn.2s" to appear in llc's output for this function.
     24         %tmp1 = load <2 x i64>, <2 x i64>* %A
     25         %tmp2 = load <2 x i64>, <2 x i64>* %B
     26         %tmp3 = call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
     27         ret <2 x i32> %tmp3
     28 }
     29 
     30 define <16 x i8> @subhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind  {
     31 ;CHECK-LABEL: subhn2_16b:
     32 ;CHECK: subhn.8b
     33 ;CHECK-NEXT: subhn2.16b
        ; Calls the subhn.v8i8 intrinsic twice on the same (%a, %b) pair and
        ; concatenates the two <8 x i8> results into one <16 x i8> with an
        ; identity shuffle mask (0..15). The second directive uses the NEXT form,
        ; so "subhn2.16b" must be on the output line right after "subhn.8b".
     34   %vsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
     35   %vsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
     36   %res = shufflevector <8 x i8> %vsubhn2.i, <8 x i8> %vsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     37   ret <16 x i8> %res
     38 }
     39 
     40 define <8 x i16> @subhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind  {
     41 ;CHECK-LABEL: subhn2_8h:
     42 ;CHECK: subhn.4h
     43 ;CHECK-NEXT: subhn2.8h
        ; Calls the subhn.v4i16 intrinsic twice on the same (%a, %b) pair and
        ; concatenates the two <4 x i16> results into one <8 x i16> with an
        ; identity shuffle mask (0..7). The second directive uses the NEXT form,
        ; so "subhn2.8h" must be on the output line right after "subhn.4h".
     44   %vsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
     45   %vsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
     46   %res = shufflevector <4 x i16> %vsubhn2.i, <4 x i16> %vsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
     47   ret <8 x i16> %res
     48 }
     49 
     50 define <4 x i32> @subhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind  {
     51 ;CHECK-LABEL: subhn2_4s:
     52 ;CHECK: subhn.2s
     53 ;CHECK-NEXT: subhn2.4s
        ; Calls the subhn.v2i32 intrinsic twice on the same (%a, %b) pair and
        ; concatenates the two <2 x i32> results into one <4 x i32> with an
        ; identity shuffle mask (0..3). The second directive uses the NEXT form,
        ; so "subhn2.4s" must be on the output line right after "subhn.2s".
     54   %vsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
     55   %vsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
     56   %res = shufflevector <2 x i32> %vsubhn2.i, <2 x i32> %vsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
     57   ret <4 x i32> %res
     58 }
     59 
     60 declare <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
     61 declare <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
     62 declare <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
        ; The three declarations above are the subhn intrinsics called by the
        ; subhn* tests in this file, one per result type (v2i32, v4i16, v8i8).
        ; Each takes two vectors whose elements are twice as wide as the result's
        ; and is marked nounwind readnone.
     63 
     64 define <8 x i8> @rsubhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
     65 ;CHECK-LABEL: rsubhn8b:
     66 ;CHECK: rsubhn.8b
        ; Loads two <8 x i16> operands from %A and %B, passes them to the
        ; llvm.aarch64.neon.rsubhn.v8i8 intrinsic and returns its <8 x i8> result.
        ; FileCheck requires "rsubhn.8b" to appear in llc's output for this function.
     67         %tmp1 = load <8 x i16>, <8 x i16>* %A
     68         %tmp2 = load <8 x i16>, <8 x i16>* %B
     69         %tmp3 = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
     70         ret <8 x i8> %tmp3
     71 }
     72 
     73 define <4 x i16> @rsubhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
     74 ;CHECK-LABEL: rsubhn4h:
     75 ;CHECK: rsubhn.4h
        ; Loads two <4 x i32> operands from %A and %B, passes them to the
        ; llvm.aarch64.neon.rsubhn.v4i16 intrinsic and returns its <4 x i16> result.
        ; FileCheck requires "rsubhn.4h" to appear in llc's output for this function.
     76         %tmp1 = load <4 x i32>, <4 x i32>* %A
     77         %tmp2 = load <4 x i32>, <4 x i32>* %B
     78         %tmp3 = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
     79         ret <4 x i16> %tmp3
     80 }
     81 
     82 define <2 x i32> @rsubhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
     83 ;CHECK-LABEL: rsubhn2s:
     84 ;CHECK: rsubhn.2s
        ; Loads two <2 x i64> operands from %A and %B, passes them to the
        ; llvm.aarch64.neon.rsubhn.v2i32 intrinsic and returns its <2 x i32> result.
        ; FileCheck requires "rsubhn.2s" to appear in llc's output for this function.
     85         %tmp1 = load <2 x i64>, <2 x i64>* %A
     86         %tmp2 = load <2 x i64>, <2 x i64>* %B
     87         %tmp3 = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
     88         ret <2 x i32> %tmp3
     89 }
     90 
     91 define <16 x i8> @rsubhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind  {
     92 ;CHECK-LABEL: rsubhn2_16b:
     93 ;CHECK: rsubhn.8b
     94 ;CHECK-NEXT: rsubhn2.16b
        ; Calls the rsubhn.v8i8 intrinsic twice on the same (%a, %b) pair and
        ; concatenates the two <8 x i8> results into one <16 x i8> with an
        ; identity shuffle mask (0..15). The second directive uses the NEXT form,
        ; so "rsubhn2.16b" must be on the output line right after "rsubhn.8b".
     95   %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
     96   %vrsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
     97   %res = shufflevector <8 x i8> %vrsubhn2.i, <8 x i8> %vrsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     98   ret <16 x i8> %res
     99 }
    100 
    101 define <8 x i16> @rsubhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind  {
    102 ;CHECK-LABEL: rsubhn2_8h:
    103 ;CHECK: rsubhn.4h
    104 ;CHECK-NEXT: rsubhn2.8h
        ; Calls the rsubhn.v4i16 intrinsic twice on the same (%a, %b) pair and
        ; concatenates the two <4 x i16> results into one <8 x i16> with an
        ; identity shuffle mask (0..7). The second directive uses the NEXT form,
        ; so "rsubhn2.8h" must be on the output line right after "rsubhn.4h".
    105   %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
    106   %vrsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
    107   %res = shufflevector <4 x i16> %vrsubhn2.i, <4 x i16> %vrsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    108   ret <8 x i16> %res
    109 }
    110 
    111 define <4 x i32> @rsubhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind  {
    112 ;CHECK-LABEL: rsubhn2_4s:
    113 ;CHECK: rsubhn.2s
    114 ;CHECK-NEXT: rsubhn2.4s
        ; Calls the rsubhn.v2i32 intrinsic twice on the same (%a, %b) pair and
        ; concatenates the two <2 x i32> results into one <4 x i32> with an
        ; identity shuffle mask (0..3). The second directive uses the NEXT form,
        ; so "rsubhn2.4s" must be on the output line right after "rsubhn.2s".
    115   %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
    116   %vrsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
    117   %res = shufflevector <2 x i32> %vrsubhn2.i, <2 x i32> %vrsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    118   ret <4 x i32> %res
    119 }
    120 
    121 declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
    122 declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
    123 declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
        ; The three declarations above are the rsubhn intrinsics called by the
        ; rsubhn* tests in this file, one per result type (v2i32, v4i16, v8i8).
        ; Each takes two vectors whose elements are twice as wide as the result's
        ; and is marked nounwind readnone.
    124 
    125 define <8 x i16> @ssubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
    126 ;CHECK-LABEL: ssubl8h:
    127 ;CHECK: ssubl.8h
        ; Loads two <8 x i8> vectors, sign-extends both to <8 x i16> and subtracts
        ; the value loaded from %B from the value loaded from %A.
        ; FileCheck requires "ssubl.8h" to appear in llc's output for this function.
    128         %tmp1 = load <8 x i8>, <8 x i8>* %A
    129         %tmp2 = load <8 x i8>, <8 x i8>* %B
    130   %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
    131   %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
    132   %tmp5 = sub <8 x i16> %tmp3, %tmp4
    133         ret <8 x i16> %tmp5
    134 }
    135 
    136 define <4 x i32> @ssubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
    137 ;CHECK-LABEL: ssubl4s:
    138 ;CHECK: ssubl.4s
        ; Loads two <4 x i16> vectors, sign-extends both to <4 x i32> and subtracts
        ; the value loaded from %B from the value loaded from %A.
        ; FileCheck requires "ssubl.4s" to appear in llc's output for this function.
    139         %tmp1 = load <4 x i16>, <4 x i16>* %A
    140         %tmp2 = load <4 x i16>, <4 x i16>* %B
    141   %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
    142   %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
    143   %tmp5 = sub <4 x i32> %tmp3, %tmp4
    144         ret <4 x i32> %tmp5
    145 }
    146 
    147 define <2 x i64> @ssubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
    148 ;CHECK-LABEL: ssubl2d:
    149 ;CHECK: ssubl.2d
        ; Loads two <2 x i32> vectors, sign-extends both to <2 x i64> and subtracts
        ; the value loaded from %B from the value loaded from %A.
        ; FileCheck requires "ssubl.2d" to appear in llc's output for this function.
    150         %tmp1 = load <2 x i32>, <2 x i32>* %A
    151         %tmp2 = load <2 x i32>, <2 x i32>* %B
    152   %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
    153   %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
    154   %tmp5 = sub <2 x i64> %tmp3, %tmp4
    155         ret <2 x i64> %tmp5
    156 }
    157 
    158 define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
    159 ;CHECK-LABEL: ssubl2_8h:
    160 ;CHECK: ssubl.8h
        ; For each loaded <16 x i8> vector, selects the upper eight lanes
        ; (indices 8-15), sign-extends them to <8 x i16>, then subtracts the
        ; %B-derived value from the %A-derived one.
        ; NOTE(review): the function name says ssubl2, but the expected text is
        ; "ssubl.8h" without the "2" suffix; presumably the high-half selection
        ; is folded into the loads -- confirm against llc output.
    161         %tmp1 = load <16 x i8>, <16 x i8>* %A
    162         %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    163         %ext1 = sext <8 x i8> %high1 to <8 x i16>
    164 
    165         %tmp2 = load <16 x i8>, <16 x i8>* %B
    166         %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    167         %ext2 = sext <8 x i8> %high2 to <8 x i16>
    168 
    169         %res = sub <8 x i16> %ext1, %ext2
    170         ret <8 x i16> %res
    171 }
    172 
    173 define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
    174 ;CHECK-LABEL: ssubl2_4s:
    175 ;CHECK: ssubl.4s
        ; For each loaded <8 x i16> vector, selects the upper four lanes
        ; (indices 4-7), sign-extends them to <4 x i32>, then subtracts the
        ; %B-derived value from the %A-derived one.
        ; NOTE(review): the function name says ssubl2, but the expected text is
        ; "ssubl.4s" without the "2" suffix; presumably the high-half selection
        ; is folded into the loads -- confirm against llc output.
    176         %tmp1 = load <8 x i16>, <8 x i16>* %A
    177         %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    178         %ext1 = sext <4 x i16> %high1 to <4 x i32>
    179 
    180         %tmp2 = load <8 x i16>, <8 x i16>* %B
    181         %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    182         %ext2 = sext <4 x i16> %high2 to <4 x i32>
    183 
    184         %res = sub <4 x i32> %ext1, %ext2
    185         ret <4 x i32> %res
    186 }
    187 
    188 define <2 x i64> @ssubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
    189 ;CHECK-LABEL: ssubl2_2d:
    190 ;CHECK: ssubl.2d
        ; For each loaded <4 x i32> vector, selects the upper two lanes
        ; (indices 2-3), sign-extends them to <2 x i64>, then subtracts the
        ; %B-derived value from the %A-derived one.
        ; NOTE(review): the function name says ssubl2, but the expected text is
        ; "ssubl.2d" without the "2" suffix; presumably the high-half selection
        ; is folded into the loads -- confirm against llc output.
    191         %tmp1 = load <4 x i32>, <4 x i32>* %A
    192         %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    193         %ext1 = sext <2 x i32> %high1 to <2 x i64>
    194 
    195         %tmp2 = load <4 x i32>, <4 x i32>* %B
    196         %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    197         %ext2 = sext <2 x i32> %high2 to <2 x i64>
    198 
    199         %res = sub <2 x i64> %ext1, %ext2
    200         ret <2 x i64> %res
    201 }
    202 
    203 define <8 x i16> @usubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
    204 ;CHECK-LABEL: usubl8h:
    205 ;CHECK: usubl.8h
        ; Loads two <8 x i8> vectors, zero-extends both to <8 x i16> and subtracts
        ; the value loaded from %B from the value loaded from %A.
        ; FileCheck requires "usubl.8h" to appear in llc's output for this function.
    206   %tmp1 = load <8 x i8>, <8 x i8>* %A
    207   %tmp2 = load <8 x i8>, <8 x i8>* %B
    208   %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
    209   %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
    210   %tmp5 = sub <8 x i16> %tmp3, %tmp4
    211   ret <8 x i16> %tmp5
    212 }
    213 
    214 define <4 x i32> @usubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
    215 ;CHECK-LABEL: usubl4s:
    216 ;CHECK: usubl.4s
        ; Loads two <4 x i16> vectors, zero-extends both to <4 x i32> and subtracts
        ; the value loaded from %B from the value loaded from %A.
        ; FileCheck requires "usubl.4s" to appear in llc's output for this function.
    217   %tmp1 = load <4 x i16>, <4 x i16>* %A
    218   %tmp2 = load <4 x i16>, <4 x i16>* %B
    219   %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
    220   %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
    221   %tmp5 = sub <4 x i32> %tmp3, %tmp4
    222   ret <4 x i32> %tmp5
    223 }
    224 
    225 define <2 x i64> @usubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
    226 ;CHECK-LABEL: usubl2d:
    227 ;CHECK: usubl.2d
        ; Loads two <2 x i32> vectors, zero-extends both to <2 x i64> and subtracts
        ; the value loaded from %B from the value loaded from %A.
        ; FileCheck requires "usubl.2d" to appear in llc's output for this function.
    228   %tmp1 = load <2 x i32>, <2 x i32>* %A
    229   %tmp2 = load <2 x i32>, <2 x i32>* %B
    230   %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
    231   %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
    232   %tmp5 = sub <2 x i64> %tmp3, %tmp4
    233   ret <2 x i64> %tmp5
    234 }
    235 
    236 define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
    237 ;CHECK-LABEL: usubl2_8h:
    238 ;CHECK: usubl.8h
        ; For each loaded <16 x i8> vector, selects the upper eight lanes
        ; (indices 8-15), zero-extends them to <8 x i16>, then subtracts the
        ; %B-derived value from the %A-derived one.
        ; NOTE(review): the function name says usubl2, but the expected text is
        ; "usubl.8h" without the "2" suffix; presumably the high-half selection
        ; is folded into the loads -- confirm against llc output.
    239   %tmp1 = load <16 x i8>, <16 x i8>* %A
    240   %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    241   %ext1 = zext <8 x i8> %high1 to <8 x i16>
    242 
    243   %tmp2 = load <16 x i8>, <16 x i8>* %B
    244   %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    245   %ext2 = zext <8 x i8> %high2 to <8 x i16>
    246 
    247   %res = sub <8 x i16> %ext1, %ext2
    248   ret <8 x i16> %res
    249 }
    250 
    251 define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
    252 ;CHECK-LABEL: usubl2_4s:
    253 ;CHECK: usubl.4s
        ; For each loaded <8 x i16> vector, selects the upper four lanes
        ; (indices 4-7), zero-extends them to <4 x i32>, then subtracts the
        ; %B-derived value from the %A-derived one.
        ; NOTE(review): the function name says usubl2, but the expected text is
        ; "usubl.4s" without the "2" suffix; presumably the high-half selection
        ; is folded into the loads -- confirm against llc output.
    254   %tmp1 = load <8 x i16>, <8 x i16>* %A
    255   %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    256   %ext1 = zext <4 x i16> %high1 to <4 x i32>
    257 
    258   %tmp2 = load <8 x i16>, <8 x i16>* %B
    259   %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    260   %ext2 = zext <4 x i16> %high2 to <4 x i32>
    261 
    262   %res = sub <4 x i32> %ext1, %ext2
    263   ret <4 x i32> %res
    264 }
    265 
    266 define <2 x i64> @usubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
    267 ;CHECK-LABEL: usubl2_2d:
    268 ;CHECK: usubl.2d
        ; For each loaded <4 x i32> vector, selects the upper two lanes
        ; (indices 2-3), zero-extends them to <2 x i64>, then subtracts the
        ; %B-derived value from the %A-derived one.
        ; NOTE(review): the function name says usubl2, but the expected text is
        ; "usubl.2d" without the "2" suffix; presumably the high-half selection
        ; is folded into the loads -- confirm against llc output.
    269   %tmp1 = load <4 x i32>, <4 x i32>* %A
    270   %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    271   %ext1 = zext <2 x i32> %high1 to <2 x i64>
    272 
    273   %tmp2 = load <4 x i32>, <4 x i32>* %B
    274   %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    275   %ext2 = zext <2 x i32> %high2 to <2 x i64>
    276 
    277   %res = sub <2 x i64> %ext1, %ext2
    278   ret <2 x i64> %res
    279 }
    280 
    281 define <8 x i16> @ssubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
    282 ;CHECK-LABEL: ssubw8h:
    283 ;CHECK: ssubw.8h
        ; Loads a <8 x i16> from %A and a <8 x i8> from %B; only the %B value is
        ; sign-extended (to <8 x i16>) before being subtracted from the %A value.
        ; FileCheck requires "ssubw.8h" to appear in llc's output for this function.
    284         %tmp1 = load <8 x i16>, <8 x i16>* %A
    285         %tmp2 = load <8 x i8>, <8 x i8>* %B
    286   %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
    287   %tmp4 = sub <8 x i16> %tmp1, %tmp3
    288         ret <8 x i16> %tmp4
    289 }
    290 
    291 define <4 x i32> @ssubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
    292 ;CHECK-LABEL: ssubw4s:
    293 ;CHECK: ssubw.4s
        ; Loads a <4 x i32> from %A and a <4 x i16> from %B; only the %B value is
        ; sign-extended (to <4 x i32>) before being subtracted from the %A value.
        ; FileCheck requires "ssubw.4s" to appear in llc's output for this function.
    294         %tmp1 = load <4 x i32>, <4 x i32>* %A
    295         %tmp2 = load <4 x i16>, <4 x i16>* %B
    296   %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
    297   %tmp4 = sub <4 x i32> %tmp1, %tmp3
    298         ret <4 x i32> %tmp4
    299 }
    300 
    301 define <2 x i64> @ssubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
    302 ;CHECK-LABEL: ssubw2d:
    303 ;CHECK: ssubw.2d
        ; Loads a <2 x i64> from %A and a <2 x i32> from %B; only the %B value is
        ; sign-extended (to <2 x i64>) before being subtracted from the %A value.
        ; FileCheck requires "ssubw.2d" to appear in llc's output for this function.
    304         %tmp1 = load <2 x i64>, <2 x i64>* %A
    305         %tmp2 = load <2 x i32>, <2 x i32>* %B
    306   %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
    307   %tmp4 = sub <2 x i64> %tmp1, %tmp3
    308         ret <2 x i64> %tmp4
    309 }
    310 
    311 define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
    312 ;CHECK-LABEL: ssubw2_8h:
    313 ;CHECK: ssubw.8h
        ; Subtracts from the <8 x i16> loaded from %A the sign-extended upper
        ; eight lanes (indices 8-15) of the <16 x i8> loaded from %B.
        ; NOTE(review): the function name says ssubw2, but the expected text is
        ; "ssubw.8h" without the "2" suffix; presumably the high-half selection
        ; is folded into the load -- confirm against llc output.
    314         %tmp1 = load <8 x i16>, <8 x i16>* %A
    315 
    316         %tmp2 = load <16 x i8>, <16 x i8>* %B
    317         %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    318         %ext2 = sext <8 x i8> %high2 to <8 x i16>
    319 
    320         %res = sub <8 x i16> %tmp1, %ext2
    321         ret <8 x i16> %res
    322 }
    323 
    324 define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
    325 ;CHECK-LABEL: ssubw2_4s:
    326 ;CHECK: ssubw.4s
        ; Subtracts from the <4 x i32> loaded from %A the sign-extended upper
        ; four lanes (indices 4-7) of the <8 x i16> loaded from %B.
        ; NOTE(review): the function name says ssubw2, but the expected text is
        ; "ssubw.4s" without the "2" suffix; presumably the high-half selection
        ; is folded into the load -- confirm against llc output.
    327         %tmp1 = load <4 x i32>, <4 x i32>* %A
    328 
    329         %tmp2 = load <8 x i16>, <8 x i16>* %B
    330         %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    331         %ext2 = sext <4 x i16> %high2 to <4 x i32>
    332 
    333         %res = sub <4 x i32> %tmp1, %ext2
    334         ret <4 x i32> %res
    335 }
    336 
    337 define <2 x i64> @ssubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
    338 ;CHECK-LABEL: ssubw2_2d:
    339 ;CHECK: ssubw.2d
        ; Subtracts from the <2 x i64> loaded from %A the sign-extended upper
        ; two lanes (indices 2-3) of the <4 x i32> loaded from %B.
        ; NOTE(review): the function name says ssubw2, but the expected text is
        ; "ssubw.2d" without the "2" suffix; presumably the high-half selection
        ; is folded into the load -- confirm against llc output.
    340         %tmp1 = load <2 x i64>, <2 x i64>* %A
    341 
    342         %tmp2 = load <4 x i32>, <4 x i32>* %B
    343         %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    344         %ext2 = sext <2 x i32> %high2 to <2 x i64>
    345 
    346         %res = sub <2 x i64> %tmp1, %ext2
    347         ret <2 x i64> %res
    348 }
    349 
    350 define <8 x i16> @usubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
    351 ;CHECK-LABEL: usubw8h:
    352 ;CHECK: usubw.8h
        ; Loads a <8 x i16> from %A and a <8 x i8> from %B; only the %B value is
        ; zero-extended (to <8 x i16>) before being subtracted from the %A value.
        ; FileCheck requires "usubw.8h" to appear in llc's output for this function.
    353         %tmp1 = load <8 x i16>, <8 x i16>* %A
    354         %tmp2 = load <8 x i8>, <8 x i8>* %B
    355   %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
    356   %tmp4 = sub <8 x i16> %tmp1, %tmp3
    357         ret <8 x i16> %tmp4
    358 }
    359 
    360 define <4 x i32> @usubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
    361 ;CHECK-LABEL: usubw4s:
    362 ;CHECK: usubw.4s
        ; Loads a <4 x i32> from %A and a <4 x i16> from %B; only the %B value is
        ; zero-extended (to <4 x i32>) before being subtracted from the %A value.
        ; FileCheck requires "usubw.4s" to appear in llc's output for this function.
    363         %tmp1 = load <4 x i32>, <4 x i32>* %A
    364         %tmp2 = load <4 x i16>, <4 x i16>* %B
    365   %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
    366   %tmp4 = sub <4 x i32> %tmp1, %tmp3
    367         ret <4 x i32> %tmp4
    368 }
    369 
    370 define <2 x i64> @usubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
    371 ;CHECK-LABEL: usubw2d:
    372 ;CHECK: usubw.2d
        ; Loads a <2 x i64> from %A and a <2 x i32> from %B; only the %B value is
        ; zero-extended (to <2 x i64>) before being subtracted from the %A value.
        ; FileCheck requires "usubw.2d" to appear in llc's output for this function.
    373         %tmp1 = load <2 x i64>, <2 x i64>* %A
    374         %tmp2 = load <2 x i32>, <2 x i32>* %B
    375   %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
    376   %tmp4 = sub <2 x i64> %tmp1, %tmp3
    377         ret <2 x i64> %tmp4
    378 }
    379 
    380 define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
    381 ;CHECK-LABEL: usubw2_8h:
    382 ;CHECK: usubw.8h
        ; Subtracts from the <8 x i16> loaded from %A the zero-extended upper
        ; eight lanes (indices 8-15) of the <16 x i8> loaded from %B.
        ; NOTE(review): the function name says usubw2, but the expected text is
        ; "usubw.8h" without the "2" suffix; presumably the high-half selection
        ; is folded into the load -- confirm against llc output.
    383         %tmp1 = load <8 x i16>, <8 x i16>* %A
    384 
    385         %tmp2 = load <16 x i8>, <16 x i8>* %B
    386         %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    387         %ext2 = zext <8 x i8> %high2 to <8 x i16>
    388 
    389         %res = sub <8 x i16> %tmp1, %ext2
    390         ret <8 x i16> %res
    391 }
    392 
    393 define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
    394 ;CHECK-LABEL: usubw2_4s:
    395 ;CHECK: usubw.4s
        ; Subtracts from the <4 x i32> loaded from %A the zero-extended upper
        ; four lanes (indices 4-7) of the <8 x i16> loaded from %B.
        ; NOTE(review): the function name says usubw2, but the expected text is
        ; "usubw.4s" without the "2" suffix; presumably the high-half selection
        ; is folded into the load -- confirm against llc output.
    396         %tmp1 = load <4 x i32>, <4 x i32>* %A
    397 
    398         %tmp2 = load <8 x i16>, <8 x i16>* %B
    399         %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    400         %ext2 = zext <4 x i16> %high2 to <4 x i32>
    401 
    402         %res = sub <4 x i32> %tmp1, %ext2
    403         ret <4 x i32> %res
    404 }
    405 
    406 define <2 x i64> @usubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
    407 ;CHECK-LABEL: usubw2_2d:
    408 ;CHECK: usubw.2d
        ; Subtracts from the <2 x i64> loaded from %A the zero-extended upper
        ; two lanes (indices 2-3) of the <4 x i32> loaded from %B.
        ; NOTE(review): the function name says usubw2, but the expected text is
        ; "usubw.2d" without the "2" suffix; presumably the high-half selection
        ; is folded into the load -- confirm against llc output.
    409         %tmp1 = load <2 x i64>, <2 x i64>* %A
    410 
    411         %tmp2 = load <4 x i32>, <4 x i32>* %B
    412         %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    413         %ext2 = zext <2 x i32> %high2 to <2 x i64>
    414 
    415         %res = sub <2 x i64> %tmp1, %ext2
    416         ret <2 x i64> %res
    417 }
    418