; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s

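; These tests check that an IR multiply whose operands are sign- or
; zero-extended from a half-width vector is selected as a single NEON
; widening multiply (smull/umull), and that the add/sub-of-product forms
; map to the accumulating variants (smlal/umlal, smlsl/umlsl).
;
; For orientation only (not part of the test), the first pattern below
; corresponds roughly to C source along these lines; the function name is
; illustrative:
;
;   #include <stdint.h>
;   void widening_mul(int16_t r[8], const int8_t a[8], const int8_t b[8]) {
;     for (int i = 0; i < 8; i++)
;       r[i] = (int16_t)a[i] * (int16_t)b[i];  // widen each lane, then multiply
;   }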
define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: smull_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: smull_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: umull_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: umull_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: umull_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

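; The accumulating forms: an add whose addend is a widening multiply,
; i.e. add(%acc, mul(ext %a, ext %b)), folds into a single smlal/umlal
; instead of a widening multiply followed by a separate add.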
define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlal_v8i8_v8i16:
; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlal_v4i16_v4i32:
; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlal_v2i32_v2i64:
; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlal_v8i8_v8i16:
; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlal_v4i16_v4i32:
; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlal_v2i32_v2i64:
; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

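; The subtracting forms: sub(%acc, mul(ext %a, ext %b)) becomes a single
; smlsl/umlsl, the multiply-subtract counterpart of the patterns above.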
define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlsl_v8i8_v8i16:
; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlsl_v4i16_v4i32:
; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlsl_v2i32_v2i64:
; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlsl_v8i8_v8i16:
; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlsl_v4i16_v4i32:
; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlsl_v2i32_v2i64:
; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
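; A constant splat can stand in for the extended operand when every element
; fits the narrow type: -12 is representable as a sign-extended i8, so the
; <8 x i16> splat below can be rebuilt as the sign extension of an i8 splat
; and the multiply still selects as smull.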
define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big to be the
; result of sign-extending i8 values (-999 is outside [-128, 127]).
; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK: mov
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = sext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
  ret <4 x i32> %tmp4
}

define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = sext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
  ret <2 x i64> %tmp4
}

define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use UMULL if the BUILD_VECTOR element values are too big to be the
; result of zero-extending i8 values (999 is outside [0, 255]).
; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK: mov
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
  ret <8 x i16> %tmp4
}

define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  ret <4 x i32> %tmp4
}

define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  ret <2 x i64> %tmp4
}

define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
; If one operand has a zero-extend and the other a sign-extend, smull
; cannot be used. Here %vec is sign-extended, but the splat of 255 is only
; representable as a zero-extended i8, so a full-width mul is used instead.
; CHECK-LABEL: smullWithInconsistentExtensions:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %1 = sext <8 x i8> %vec to <8 x i16>
  %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = extractelement <8 x i16> %2, i32 0
  ret i16 %3
}

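; Distribution: the multiply of a sum, (%7 + %11) * %8, is distributed so
; that both products can use the widening forms: an umull computes one
; product and an umlal accumulates the other, avoiding a full-width mul.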
define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
entry:
; CHECK-LABEL: distribute:
; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
  %0 = trunc i32 %mul to i8
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %3 = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  %5 = extractelement <2 x double> %4, i32 1
  %6 = bitcast double %5 to <8 x i8>
  %7 = zext <8 x i8> %6 to <8 x i16>
  %8 = zext <8 x i8> %2 to <8 x i16>
  %9 = extractelement <2 x double> %4, i32 0
  %10 = bitcast double %9 to <8 x i8>
  %11 = zext <8 x i8> %10 to <8 x i16>
  %12 = add <8 x i16> %7, %11
  %13 = mul <8 x i16> %12, %8
  %14 = bitcast i16* %dst to i8*
  tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
  ret void
}

declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly

declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind