Home | History | Annotate | Download | only in meta
      1 // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #ifndef GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_
     16 #define GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_
     17 
     18 #ifdef GEMMLOWP_NEON_64
     19 
     20 #include <cassert>
     21 #include <cstdint>
     22 
     23 namespace gemmlowp {
     24 namespace meta {
     25 
     26 template <>
     27 inline void
     28 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 1,
     29           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
     30                        const FusedKernelParams<QuantizedStaticPreprocessed,
     31                                                RowMajor>& params,
     32                        uint8_t* result) {
     33 #ifdef DEBUG
     34 #ifdef DEBUG_METAGEMM_VERBOSE
     35   std::cout << __FILE__ << "(" << __LINE__
     36             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
     37                "QuantizedStaticPreprocessed, RowMajor, 1, 1, 8>::Multiply()"
     38             << std::endl
     39             << std::flush;
     40 #endif
     41 #endif
     42   asm volatile(
     43       "prfm pldl1keep, [%x[lhs]]\n"
     44       "prfm pldl1keep, [%x[rhs]]\n"
     45 
     46       // Clear aggregators.
     47       "movi v0.4s, #0\n"
     48 
     49       // General NxM lanes loop.
     50       "1:"
     51 
     52       // Subtract counter.
     53       "subs %x[count], %x[count], #8\n"
     54 
     55       "ld1 {v1.2s}, [%x[lhs]], #8\n"
     56       "ld1 {v2.2s}, [%x[rhs]], #8\n"
     57       "prfm pldl1keep, [%x[lhs], #64]\n"
     58       "prfm pldl1keep, [%x[rhs], #64]\n"
     59       "umull v3.8h, v2.8b, v1.8b\n"
     60       "uadalp v0.4s, v3.8h\n"
     61 
     62       // Loop break.
     63       "bgt 1b\n"
     64 
     65       // StaticQuantization::Prepare
     66       "ld1 {v4.4s}, [%x[lhs]], #16\n"
     67       "ld1 {v5.4s}, [%x[rhs]], #16\n"
     68       "dup v6.4s, %w[multiplicative_offset]\n"
     69       "dup v7.4s, %w[rounding_offset]\n"
     70       "dup v8.4s, %w[shift]\n"
     71       "dup v4.4s, v4.s[0]\n"
     72 
     73       // RowMajorOutput::Prepare
     74 
     75       // Reduce aggregators.
     76       "addp v0.4s, v0.4s, v0.4s\n"
     77       "addp v0.4s, v0.4s, v0.4s\n"
     78 
     79       // StaticQuantization::Transform
     80       "add v0.4s, v0.4s, v4.4s\n"
     81       "add v0.4s, v0.4s, v5.4s\n"
     82       "mul v0.4s, v0.4s, v6.4s\n"
     83       "add v0.4s, v0.4s, v7.4s\n"
     84       "sshl v0.4s, v0.4s, v8.4s\n"
     85       "sqxtn v0.4h, v0.4s\n"
     86       "sqxtun v0.8b, v0.8h\n"
     87 
     88       // RowMajorOutput::Output
     89       "st1 {v0.b}[0], [%x[result]], #1\n"
     90       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
     91       : [count] "r"(params.kernel.count),
     92         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
     93         [shift] "r"(params.kernel.shift),
     94         [stride] "r"(params.output_stream.stride),
     95         [rounding_offset] "r"(params.kernel.rounding_offset)
     96       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
     97 }
     98 
     99 template <>
    100 inline void
    101 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 2,
    102           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    103                        const FusedKernelParams<QuantizedStaticPreprocessed,
    104                                                RowMajor>& params,
    105                        uint8_t* result) {
    106 #ifdef DEBUG
    107 #ifdef DEBUG_METAGEMM_VERBOSE
    108   std::cout << __FILE__ << "(" << __LINE__
    109             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    110                "QuantizedStaticPreprocessed, RowMajor, 1, 2, 8>::Multiply()"
    111             << std::endl
    112             << std::flush;
    113 #endif
    114 #endif
    115   asm volatile(
    116       "prfm pldl1keep, [%x[lhs]]\n"
    117       "prfm pldl1keep, [%x[rhs]]\n"
    118 
    119       // Clear aggregators.
    120       "movi v0.4s, #0\n"
    121       "movi v1.4s, #0\n"
    122 
    123       // General NxM lanes loop.
    124       "1:"
    125 
    126       // Subtract counter.
    127       "subs %x[count], %x[count], #8\n"
    128 
    129       "ld1 {v2.2s}, [%x[lhs]], #8\n"
    130       "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n"
    131       "prfm pldl1keep, [%x[lhs], #64]\n"
    132       "prfm pldl1keep, [%x[rhs], #64]\n"
    133       "umull v5.8h, v3.8b, v2.8b\n"
    134       "umull v6.8h, v4.8b, v2.8b\n"
    135       "uadalp v0.4s, v5.8h\n"
    136       "uadalp v1.4s, v6.8h\n"
    137 
    138       // Loop break.
    139       "bgt 1b\n"
    140 
    141       // StaticQuantization::Prepare
    142       "ld1 {v4.4s}, [%x[lhs]], #16\n"
    143       "ld1 {v5.4s}, [%x[rhs]], #16\n"
    144       "dup v6.4s, %w[multiplicative_offset]\n"
    145       "dup v7.4s, %w[rounding_offset]\n"
    146       "dup v8.4s, %w[shift]\n"
    147       "dup v4.4s, v4.s[0]\n"
    148 
    149       // RowMajorOutput::Prepare
    150 
    151       // Reduce aggregators.
    152       "addp v0.4s, v0.4s, v1.4s\n"
    153       "addp v0.4s, v0.4s, v0.4s\n"
    154 
    155       // StaticQuantization::Transform
    156       "add v0.4s, v0.4s, v4.4s\n"
    157       "add v0.4s, v0.4s, v5.4s\n"
    158       "mul v0.4s, v0.4s, v6.4s\n"
    159       "add v0.4s, v0.4s, v7.4s\n"
    160       "sshl v0.4s, v0.4s, v8.4s\n"
    161       "sqxtn v0.4h, v0.4s\n"
    162       "sqxtun v0.8b, v0.8h\n"
    163 
    164       // RowMajorOutput::Output
    165       "st1 {v0.h}[0], [%x[result]], #2\n"
    166       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    167       : [count] "r"(params.kernel.count),
    168         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    169         [shift] "r"(params.kernel.shift),
    170         [stride] "r"(params.output_stream.stride),
    171         [rounding_offset] "r"(params.kernel.rounding_offset)
    172       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
    173 }
    174 
    175 template <>
    176 inline void
    177 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 3,
    178           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    179                        const FusedKernelParams<QuantizedStaticPreprocessed,
    180                                                RowMajor>& params,
    181                        uint8_t* result) {
    182 #ifdef DEBUG
    183 #ifdef DEBUG_METAGEMM_VERBOSE
    184   std::cout << __FILE__ << "(" << __LINE__
    185             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    186                "QuantizedStaticPreprocessed, RowMajor, 1, 3, 8>::Multiply()"
    187             << std::endl
    188             << std::flush;
    189 #endif
    190 #endif
    191   asm volatile(
    192       "prfm pldl1keep, [%x[lhs]]\n"
    193       "prfm pldl1keep, [%x[rhs]]\n"
    194 
    195       // Clear aggregators.
    196       "movi v0.4s, #0\n"
    197       "movi v1.4s, #0\n"
    198       "movi v2.4s, #0\n"
    199 
    200       // General NxM lanes loop.
    201       "1:"
    202 
    203       // Subtract counter.
    204       "subs %x[count], %x[count], #8\n"
    205 
    206       "ld1 {v3.2s}, [%x[lhs]], #8\n"
    207       "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n"
    208       "prfm pldl1keep, [%x[lhs], #64]\n"
    209       "prfm pldl1keep, [%x[rhs], #64]\n"
    210       "umull v7.8h, v4.8b, v3.8b\n"
    211       "umull v8.8h, v5.8b, v3.8b\n"
    212       "umull v9.8h, v6.8b, v3.8b\n"
    213       "uadalp v0.4s, v7.8h\n"
    214       "uadalp v1.4s, v8.8h\n"
    215       "uadalp v2.4s, v9.8h\n"
    216 
    217       // Loop break.
    218       "bgt 1b\n"
    219 
    220       // StaticQuantization::Prepare
    221       "ld1 {v4.4s}, [%x[lhs]], #16\n"
    222       "ld1 {v5.4s}, [%x[rhs]], #16\n"
    223       "dup v6.4s, %w[multiplicative_offset]\n"
    224       "dup v7.4s, %w[rounding_offset]\n"
    225       "dup v8.4s, %w[shift]\n"
    226       "dup v4.4s, v4.s[0]\n"
    227 
    228       // RowMajorOutput::Prepare
    229 
    230       // Reduce aggregators.
    231       "addp v0.4s, v0.4s, v1.4s\n"
    232       "addp v2.4s, v2.4s, v2.4s\n"
    233       "addp v0.4s, v0.4s, v2.4s\n"
    234 
    235       // StaticQuantization::Transform
    236       "add v0.4s, v0.4s, v4.4s\n"
    237       "add v0.4s, v0.4s, v5.4s\n"
    238       "mul v0.4s, v0.4s, v6.4s\n"
    239       "add v0.4s, v0.4s, v7.4s\n"
    240       "sshl v0.4s, v0.4s, v8.4s\n"
    241       "sqxtn v0.4h, v0.4s\n"
    242       "sqxtun v0.8b, v0.8h\n"
    243 
    244       // RowMajorOutput::Output
    245       "st1 {v0.h}[0], [%x[result]], #2\n"
    246       "st1 {v0.b}[2], [%x[result]], #1\n"
    247       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    248       : [count] "r"(params.kernel.count),
    249         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    250         [shift] "r"(params.kernel.shift),
    251         [stride] "r"(params.output_stream.stride),
    252         [rounding_offset] "r"(params.kernel.rounding_offset)
    253       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc",
    254         "memory");
    255 }
    256 
    257 template <>
    258 inline void
    259 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 4,
    260           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    261                        const FusedKernelParams<QuantizedStaticPreprocessed,
    262                                                RowMajor>& params,
    263                        uint8_t* result) {
    264 #ifdef DEBUG
    265 #ifdef DEBUG_METAGEMM_VERBOSE
    266   std::cout << __FILE__ << "(" << __LINE__
    267             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    268                "QuantizedStaticPreprocessed, RowMajor, 1, 4, 8>::Multiply()"
    269             << std::endl
    270             << std::flush;
    271 #endif
    272 #endif
    273   asm volatile(
    274       "prfm pldl1keep, [%x[lhs]]\n"
    275       "prfm pldl1keep, [%x[rhs]]\n"
    276 
    277       // Clear aggregators.
    278       "movi v0.4s, #0\n"
    279       "movi v1.4s, #0\n"
    280       "movi v2.4s, #0\n"
    281       "mov v3.16b, v0.16b\n"
    282 
    283       // General NxM lanes loop.
    284       "1:"
    285 
    286       // Subtract counter.
    287       "subs %x[count], %x[count], #8\n"
    288 
    289       "ld1 {v4.2s}, [%x[lhs]], #8\n"
    290       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
    291       "prfm pldl1keep, [%x[lhs], #64]\n"
    292       "prfm pldl1keep, [%x[rhs], #64]\n"
    293       "umull v9.8h, v5.8b, v4.8b\n"
    294       "umull v10.8h, v6.8b, v4.8b\n"
    295       "umull v11.8h, v7.8b, v4.8b\n"
    296       "umull v12.8h, v8.8b, v4.8b\n"
    297       "uadalp v0.4s, v9.8h\n"
    298       "uadalp v1.4s, v10.8h\n"
    299       "uadalp v2.4s, v11.8h\n"
    300       "uadalp v3.4s, v12.8h\n"
    301 
    302       // Loop break.
    303       "bgt 1b\n"
    304 
    305       // StaticQuantization::Prepare
    306       "ld1 {v4.4s}, [%x[lhs]], #16\n"
    307       "ld1 {v5.4s}, [%x[rhs]], #16\n"
    308       "dup v6.4s, %w[multiplicative_offset]\n"
    309       "dup v7.4s, %w[rounding_offset]\n"
    310       "dup v8.4s, %w[shift]\n"
    311       "dup v4.4s, v4.s[0]\n"
    312 
    313       // RowMajorOutput::Prepare
    314 
    315       // Reduce aggregators.
    316       "addp v0.4s, v0.4s, v1.4s\n"
    317       "addp v2.4s, v2.4s, v3.4s\n"
    318       "addp v0.4s, v0.4s, v2.4s\n"
    319 
    320       // StaticQuantization::Transform
    321       "add v0.4s, v0.4s, v4.4s\n"
    322       "add v0.4s, v0.4s, v5.4s\n"
    323       "mul v0.4s, v0.4s, v6.4s\n"
    324       "add v0.4s, v0.4s, v7.4s\n"
    325       "sshl v0.4s, v0.4s, v8.4s\n"
    326       "sqxtn v0.4h, v0.4s\n"
    327       "sqxtun v0.8b, v0.8h\n"
    328 
    329       // RowMajorOutput::Output
    330       "st1 {v0.s}[0], [%x[result]], #4\n"
    331       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    332       : [count] "r"(params.kernel.count),
    333         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    334         [shift] "r"(params.kernel.shift),
    335         [stride] "r"(params.output_stream.stride),
    336         [rounding_offset] "r"(params.kernel.rounding_offset)
    337       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
    338         "v11", "v12", "cc", "memory");
    339 }
    340 
    341 template <>
    342 inline void
    343 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 5,
    344           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    345                        const FusedKernelParams<QuantizedStaticPreprocessed,
    346                                                RowMajor>& params,
    347                        uint8_t* result) {
    348 #ifdef DEBUG
    349 #ifdef DEBUG_METAGEMM_VERBOSE
    350   std::cout << __FILE__ << "(" << __LINE__
    351             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    352                "QuantizedStaticPreprocessed, RowMajor, 1, 5, 8>::Multiply()"
    353             << std::endl
    354             << std::flush;
    355 #endif
    356 #endif
    357   asm volatile(
    358       "prfm pldl1keep, [%x[lhs]]\n"
    359       "prfm pldl1keep, [%x[rhs]]\n"
    360 
    361       // Clear aggregators.
    362       "movi v0.4s, #0\n"
    363       "movi v1.4s, #0\n"
    364       "movi v2.4s, #0\n"
    365       "mov v3.16b, v0.16b\n"
    366       "mov v4.16b, v1.16b\n"
    367 
    368       // General 1xM lanes loop.
    369       "1:"
    370 
    371       // Subtract counter.
    372       "subs %x[count], %x[count], #8\n"
    373 
    374       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
    375       "ld1 {v9.2s}, [%x[lhs]], #8\n"
    376       "prfm pldl1keep, [%x[lhs], #64]\n"
    377       "umull v10.8h, v5.8b, v9.8b\n"
    378       "umull v11.8h, v6.8b, v9.8b\n"
    379       "umull v12.8h, v7.8b, v9.8b\n"
    380       "umull v13.8h, v8.8b, v9.8b\n"
    381       "ld1 {v5.2s}, [%x[rhs]], #8\n"
    382       "prfm pldl1keep, [%x[rhs], #128]\n"
    383       "uadalp v0.4s, v10.8h\n"
    384       "uadalp v1.4s, v11.8h\n"
    385       "uadalp v2.4s, v12.8h\n"
    386       "uadalp v3.4s, v13.8h\n"
    387       "umull v10.8h, v5.8b, v9.8b\n"
    388       "uadalp v4.4s, v10.8h\n"
    389 
    390       // Loop break.
    391       "bgt 1b\n"
    392 
    393       // StaticQuantization::Prepare
    394       "ld1 {v5.4s}, [%x[lhs]], #16\n"
    395       "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n"
    396       "dup v8.4s, %w[multiplicative_offset]\n"
    397       "dup v9.4s, %w[rounding_offset]\n"
    398       "dup v10.4s, %w[shift]\n"
    399       "dup v5.4s, v5.s[0]\n"
    400 
    401       // RowMajorOutput::Prepare
    402 
    403       // Reduce aggregators.
    404       "addp v0.4s, v0.4s, v1.4s\n"
    405       "addp v2.4s, v2.4s, v3.4s\n"
    406       "addp v4.4s, v4.4s, v4.4s\n"
    407       "addp v0.4s, v0.4s, v2.4s\n"
    408       "addp v1.4s, v4.4s, v4.4s\n"
    409 
    410       // StaticQuantization::Transform
    411       "add v0.4s, v0.4s, v5.4s\n"
    412       "add v1.4s, v1.4s, v5.4s\n"
    413       "add v0.4s, v0.4s, v6.4s\n"
    414       "add v1.4s, v1.4s, v7.4s\n"
    415       "mul v0.4s, v0.4s, v8.4s\n"
    416       "mul v1.4s, v1.4s, v8.4s\n"
    417       "add v0.4s, v0.4s, v9.4s\n"
    418       "add v1.4s, v1.4s, v9.4s\n"
    419       "sshl v0.4s, v0.4s, v10.4s\n"
    420       "sshl v1.4s, v1.4s, v10.4s\n"
    421       "sqxtn v0.4h, v0.4s\n"
    422       "sqxtn2 v0.8h, v1.4s\n"
    423       "sqxtun v0.8b, v0.8h\n"
    424 
    425       // RowMajorOutput::Output
    426       "st1 {v0.s}[0], [%x[result]], #4\n"
    427       "st1 {v0.b}[4], [%x[result]], #1\n"
    428       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    429       : [count] "r"(params.kernel.count),
    430         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    431         [shift] "r"(params.kernel.shift),
    432         [stride] "r"(params.output_stream.stride),
    433         [rounding_offset] "r"(params.kernel.rounding_offset)
    434       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
    435         "v11", "v12", "v13", "cc", "memory");
    436 }
    437 
    438 template <>
    439 inline void
    440 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 6,
    441           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    442                        const FusedKernelParams<QuantizedStaticPreprocessed,
    443                                                RowMajor>& params,
    444                        uint8_t* result) {
    445 #ifdef DEBUG
    446 #ifdef DEBUG_METAGEMM_VERBOSE
    447   std::cout << __FILE__ << "(" << __LINE__
    448             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    449                "QuantizedStaticPreprocessed, RowMajor, 1, 6, 8>::Multiply()"
    450             << std::endl
    451             << std::flush;
    452 #endif
    453 #endif
    454   asm volatile(
    455       "prfm pldl1keep, [%x[lhs]]\n"
    456       "prfm pldl1keep, [%x[rhs]]\n"
    457 
    458       // Clear aggregators.
    459       "movi v0.4s, #0\n"
    460       "movi v1.4s, #0\n"
    461       "movi v2.4s, #0\n"
    462       "mov v3.16b, v0.16b\n"
    463       "mov v4.16b, v1.16b\n"
    464       "mov v5.16b, v2.16b\n"
    465 
    466       // General 1xM lanes loop.
    467       "1:"
    468 
    469       // Subtract counter.
    470       "subs %x[count], %x[count], #8\n"
    471 
    472       "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n"
    473       "ld1 {v10.2s}, [%x[lhs]], #8\n"
    474       "prfm pldl1keep, [%x[lhs], #64]\n"
    475       "umull v11.8h, v6.8b, v10.8b\n"
    476       "umull v12.8h, v7.8b, v10.8b\n"
    477       "umull v13.8h, v8.8b, v10.8b\n"
    478       "umull v14.8h, v9.8b, v10.8b\n"
    479       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
    480       "prfm pldl1keep, [%x[rhs], #128]\n"
    481       "uadalp v0.4s, v11.8h\n"
    482       "uadalp v1.4s, v12.8h\n"
    483       "uadalp v2.4s, v13.8h\n"
    484       "uadalp v3.4s, v14.8h\n"
    485       "umull v11.8h, v6.8b, v10.8b\n"
    486       "umull v12.8h, v7.8b, v10.8b\n"
    487       "uadalp v4.4s, v11.8h\n"
    488       "uadalp v5.4s, v12.8h\n"
    489 
    490       // Loop break.
    491       "bgt 1b\n"
    492 
    493       // StaticQuantization::Prepare
    494       "ld1 {v6.4s}, [%x[lhs]], #16\n"
    495       "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n"
    496       "dup v9.4s, %w[multiplicative_offset]\n"
    497       "dup v10.4s, %w[rounding_offset]\n"
    498       "dup v11.4s, %w[shift]\n"
    499       "dup v6.4s, v6.s[0]\n"
    500 
    501       // RowMajorOutput::Prepare
    502 
    503       // Reduce aggregators.
    504       "addp v0.4s, v0.4s, v1.4s\n"
    505       "addp v2.4s, v2.4s, v3.4s\n"
    506       "addp v4.4s, v4.4s, v5.4s\n"
    507       "addp v0.4s, v0.4s, v2.4s\n"
    508       "addp v1.4s, v4.4s, v4.4s\n"
    509 
    510       // StaticQuantization::Transform
    511       "add v0.4s, v0.4s, v6.4s\n"
    512       "add v1.4s, v1.4s, v6.4s\n"
    513       "add v0.4s, v0.4s, v7.4s\n"
    514       "add v1.4s, v1.4s, v8.4s\n"
    515       "mul v0.4s, v0.4s, v9.4s\n"
    516       "mul v1.4s, v1.4s, v9.4s\n"
    517       "add v0.4s, v0.4s, v10.4s\n"
    518       "add v1.4s, v1.4s, v10.4s\n"
    519       "sshl v0.4s, v0.4s, v11.4s\n"
    520       "sshl v1.4s, v1.4s, v11.4s\n"
    521       "sqxtn v0.4h, v0.4s\n"
    522       "sqxtn2 v0.8h, v1.4s\n"
    523       "sqxtun v0.8b, v0.8h\n"
    524 
    525       // RowMajorOutput::Output
    526       "st1 {v0.s}[0], [%x[result]], #4\n"
    527       "st1 {v0.h}[2], [%x[result]], #2\n"
    528       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    529       : [count] "r"(params.kernel.count),
    530         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    531         [shift] "r"(params.kernel.shift),
    532         [stride] "r"(params.output_stream.stride),
    533         [rounding_offset] "r"(params.kernel.rounding_offset)
    534       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
    535         "v11", "v12", "v13", "v14", "cc", "memory");
    536 }
    537 
    538 template <>
    539 inline void
    540 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 7,
    541           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    542                        const FusedKernelParams<QuantizedStaticPreprocessed,
    543                                                RowMajor>& params,
    544                        uint8_t* result) {
    545 #ifdef DEBUG
    546 #ifdef DEBUG_METAGEMM_VERBOSE
    547   std::cout << __FILE__ << "(" << __LINE__
    548             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    549                "QuantizedStaticPreprocessed, RowMajor, 1, 7, 8>::Multiply()"
    550             << std::endl
    551             << std::flush;
    552 #endif
    553 #endif
    554   asm volatile(
    555       "prfm pldl1keep, [%x[lhs]]\n"
    556       "prfm pldl1keep, [%x[rhs]]\n"
    557 
    558       // Clear aggregators.
    559       "movi v0.4s, #0\n"
    560       "movi v1.4s, #0\n"
    561       "movi v2.4s, #0\n"
    562       "mov v3.16b, v0.16b\n"
    563       "mov v4.16b, v1.16b\n"
    564       "mov v5.16b, v2.16b\n"
    565       "mov v6.16b, v3.16b\n"
    566 
    567       // General 1xM lanes loop.
    568       "1:"
    569 
    570       // Subtract counter.
    571       "subs %x[count], %x[count], #8\n"
    572 
    573       "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n"
    574       "ld1 {v11.2s}, [%x[lhs]], #8\n"
    575       "prfm pldl1keep, [%x[lhs], #64]\n"
    576       "umull v12.8h, v7.8b, v11.8b\n"
    577       "umull v13.8h, v8.8b, v11.8b\n"
    578       "umull v14.8h, v9.8b, v11.8b\n"
    579       "umull v15.8h, v10.8b, v11.8b\n"
    580       "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n"
    581       "prfm pldl1keep, [%x[rhs], #128]\n"
    582       "uadalp v0.4s, v12.8h\n"
    583       "uadalp v1.4s, v13.8h\n"
    584       "uadalp v2.4s, v14.8h\n"
    585       "uadalp v3.4s, v15.8h\n"
    586       "umull v12.8h, v7.8b, v11.8b\n"
    587       "umull v13.8h, v8.8b, v11.8b\n"
    588       "umull v14.8h, v9.8b, v11.8b\n"
    589       "uadalp v4.4s, v12.8h\n"
    590       "uadalp v5.4s, v13.8h\n"
    591       "uadalp v6.4s, v14.8h\n"
    592 
    593       // Loop break.
    594       "bgt 1b\n"
    595 
    596       // StaticQuantization::Prepare
    597       "ld1 {v7.4s}, [%x[lhs]], #16\n"
    598       "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n"
    599       "dup v10.4s, %w[multiplicative_offset]\n"
    600       "dup v11.4s, %w[rounding_offset]\n"
    601       "dup v12.4s, %w[shift]\n"
    602       "dup v7.4s, v7.s[0]\n"
    603 
    604       // RowMajorOutput::Prepare
    605 
    606       // Reduce aggregators.
    607       "addp v0.4s, v0.4s, v1.4s\n"
    608       "addp v2.4s, v2.4s, v3.4s\n"
    609       "addp v4.4s, v4.4s, v5.4s\n"
    610       "addp v6.4s, v6.4s, v6.4s\n"
    611       "addp v0.4s, v0.4s, v2.4s\n"
    612       "addp v1.4s, v4.4s, v6.4s\n"
    613 
    614       // StaticQuantization::Transform
    615       "add v0.4s, v0.4s, v7.4s\n"
    616       "add v1.4s, v1.4s, v7.4s\n"
    617       "add v0.4s, v0.4s, v8.4s\n"
    618       "add v1.4s, v1.4s, v9.4s\n"
    619       "mul v0.4s, v0.4s, v10.4s\n"
    620       "mul v1.4s, v1.4s, v10.4s\n"
    621       "add v0.4s, v0.4s, v11.4s\n"
    622       "add v1.4s, v1.4s, v11.4s\n"
    623       "sshl v0.4s, v0.4s, v12.4s\n"
    624       "sshl v1.4s, v1.4s, v12.4s\n"
    625       "sqxtn v0.4h, v0.4s\n"
    626       "sqxtn2 v0.8h, v1.4s\n"
    627       "sqxtun v0.8b, v0.8h\n"
    628 
    629       // RowMajorOutput::Output
    630       "st1 {v0.s}[0], [%x[result]], #4\n"
    631       "st1 {v0.h}[2], [%x[result]], #2\n"
    632       "st1 {v0.b}[6], [%x[result]], #1\n"
    633       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    634       : [count] "r"(params.kernel.count),
    635         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    636         [shift] "r"(params.kernel.shift),
    637         [stride] "r"(params.output_stream.stride),
    638         [rounding_offset] "r"(params.kernel.rounding_offset)
    639       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
    640         "v11", "v12", "v13", "v14", "v15", "cc", "memory");
    641 }
    642 
    643 template <>
    644 inline void
    645 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 8,
    646           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    647                        const FusedKernelParams<QuantizedStaticPreprocessed,
    648                                                RowMajor>& params,
    649                        uint8_t* result) {
    650 #ifdef DEBUG
    651 #ifdef DEBUG_METAGEMM_VERBOSE
    652   std::cout << __FILE__ << "(" << __LINE__
    653             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    654                "QuantizedStaticPreprocessed, RowMajor, 1, 8, 8>::Multiply()"
    655             << std::endl
    656             << std::flush;
    657 #endif
    658 #endif
    659   asm volatile(
    660       "prfm pldl1keep, [%x[lhs]]\n"
    661       "prfm pldl1keep, [%x[rhs]]\n"
    662 
    663       // Clear aggregators.
    664       "movi v0.4s, #0\n"
    665       "movi v1.4s, #0\n"
    666       "movi v2.4s, #0\n"
    667       "mov v3.16b, v0.16b\n"
    668       "mov v4.16b, v1.16b\n"
    669       "mov v5.16b, v2.16b\n"
    670       "mov v6.16b, v3.16b\n"
    671       "mov v7.16b, v4.16b\n"
    672 
    673       // 1x8 lanes loop.
    674       "1:"
    675 
    676       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
    677       "ld1 {v8.2s}, [%x[lhs]], #8\n"
    678       "umull v13.8h, v8.8b, v9.8b\n"
    679       "umull v14.8h, v8.8b, v10.8b\n"
    680       "umull v15.8h, v8.8b, v11.8b\n"
    681       "umull v16.8h, v8.8b, v12.8b\n"
    682       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
    683       "uadalp v0.4s, v13.8h\n"
    684       "uadalp v1.4s, v14.8h\n"
    685       "uadalp v2.4s, v15.8h\n"
    686       "uadalp v3.4s, v16.8h\n"
    687       "prfm pldl1keep, [%x[rhs], #256]\n"
    688       "umull v17.8h, v8.8b, v9.8b\n"
    689       "umull v13.8h, v8.8b, v10.8b\n"
    690       "umull v14.8h, v8.8b, v11.8b\n"
    691       "umull v15.8h, v8.8b, v12.8b\n"
    692       "prfm pldl1keep, [%x[lhs], #32]\n"
    693 
    694       // Subtract counter.
    695       "subs %x[count], %x[count], #8\n"
    696 
    697       "uadalp v4.4s, v17.8h\n"
    698       "uadalp v5.4s, v13.8h\n"
    699       "uadalp v6.4s, v14.8h\n"
    700       "uadalp v7.4s, v15.8h\n"
    701 
    702       // Loop break.
    703       "bgt 1b\n"
    704 
    705       // StaticQuantization::Prepare
    706       "ld1 {v8.4s}, [%x[lhs]], #16\n"
    707       "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n"
    708       "dup v11.4s, %w[multiplicative_offset]\n"
    709       "dup v12.4s, %w[rounding_offset]\n"
    710       "dup v13.4s, %w[shift]\n"
    711       "dup v8.4s, v8.s[0]\n"
    712 
    713       // RowMajorOutput::Prepare
    714 
    715       // Reduce aggregators.
    716       "addp v0.4s, v0.4s, v1.4s\n"
    717       "addp v2.4s, v2.4s, v3.4s\n"
    718       "addp v4.4s, v4.4s, v5.4s\n"
    719       "addp v6.4s, v6.4s, v7.4s\n"
    720       "addp v0.4s, v0.4s, v2.4s\n"
    721       "addp v1.4s, v4.4s, v6.4s\n"
    722 
    723       // StaticQuantization::Transform
    724       "add v0.4s, v0.4s, v8.4s\n"
    725       "add v1.4s, v1.4s, v8.4s\n"
    726       "add v0.4s, v0.4s, v9.4s\n"
    727       "add v1.4s, v1.4s, v10.4s\n"
    728       "mul v0.4s, v0.4s, v11.4s\n"
    729       "mul v1.4s, v1.4s, v11.4s\n"
    730       "add v0.4s, v0.4s, v12.4s\n"
    731       "add v1.4s, v1.4s, v12.4s\n"
    732       "sshl v0.4s, v0.4s, v13.4s\n"
    733       "sshl v1.4s, v1.4s, v13.4s\n"
    734       "sqxtn v0.4h, v0.4s\n"
    735       "sqxtn2 v0.8h, v1.4s\n"
    736       "sqxtun v0.8b, v0.8h\n"
    737 
    738       // RowMajorOutput::Output
    739       "st1 {v0.2s}, [%x[result]], #8\n"
    740       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    741       : [count] "r"(params.kernel.count),
    742         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    743         [shift] "r"(params.kernel.shift),
    744         [stride] "r"(params.output_stream.stride),
    745         [rounding_offset] "r"(params.kernel.rounding_offset)
    746       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
    747         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
    748 }
    749 
    750 template <>
    751 inline void
    752 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 1,
    753           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    754                        const FusedKernelParams<QuantizedStaticPreprocessed,
    755                                                RowMajor>& params,
    756                        uint8_t* result) {
    757 #ifdef DEBUG
    758 #ifdef DEBUG_METAGEMM_VERBOSE
    759   std::cout << __FILE__ << "(" << __LINE__
    760             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    761                "QuantizedStaticPreprocessed, RowMajor, 2, 1, 8>::Multiply()"
    762             << std::endl
    763             << std::flush;
    764 #endif
    765 #endif
    766   asm volatile(
    767       "prfm pldl1keep, [%x[lhs]]\n"
    768       "prfm pldl1keep, [%x[rhs]]\n"
    769 
    770       // Clear aggregators.
    771       "movi v0.4s, #0\n"
    772       "movi v1.4s, #0\n"
    773 
    774       // General NxM lanes loop.
    775       "1:"
    776 
    777       // Subtract counter.
    778       "subs %x[count], %x[count], #8\n"
    779 
    780       "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n"
    781       "ld1 {v4.2s}, [%x[rhs]], #8\n"
    782       "prfm pldl1keep, [%x[lhs], #64]\n"
    783       "prfm pldl1keep, [%x[rhs], #64]\n"
    784       "umull v5.8h, v4.8b, v2.8b\n"
    785       "umull v6.8h, v4.8b, v3.8b\n"
    786       "uadalp v0.4s, v5.8h\n"
    787       "uadalp v1.4s, v6.8h\n"
    788 
    789       // Loop break.
    790       "bgt 1b\n"
    791 
    792       // StaticQuantization::Prepare
    793       "ld1 {v4.4s}, [%x[lhs]], #16\n"
    794       "ld1 {v5.4s}, [%x[rhs]], #16\n"
    795       "dup v6.4s, %w[multiplicative_offset]\n"
    796       "dup v7.4s, %w[rounding_offset]\n"
    797       "dup v8.4s, %w[shift]\n"
    798       "dup v2.4s, v4.s[0]\n"
    799       "dup v4.4s, v4.s[1]\n"
    800 
    801       // RowMajorOutput::Prepare
    802       "add x0, %x[result], %x[stride]\n"
    803 
    804       // Reduce aggregators.
    805       "addp v0.4s, v0.4s, v0.4s\n"
    806       "addp v0.4s, v0.4s, v0.4s\n"
    807       "addp v1.4s, v1.4s, v1.4s\n"
    808       "addp v1.4s, v1.4s, v1.4s\n"
    809 
    810       // StaticQuantization::Transform
    811       "add v0.4s, v0.4s, v2.4s\n"
    812       "add v1.4s, v1.4s, v4.4s\n"
    813       "add v0.4s, v0.4s, v5.4s\n"
    814       "add v1.4s, v1.4s, v5.4s\n"
    815       "mul v0.4s, v0.4s, v6.4s\n"
    816       "mul v1.4s, v1.4s, v6.4s\n"
    817       "add v0.4s, v0.4s, v7.4s\n"
    818       "add v1.4s, v1.4s, v7.4s\n"
    819       "sshl v0.4s, v0.4s, v8.4s\n"
    820       "sshl v1.4s, v1.4s, v8.4s\n"
    821       "sqxtn v0.4h, v0.4s\n"
    822       "sqxtn v1.4h, v1.4s\n"
    823       "sqxtun v0.8b, v0.8h\n"
    824       "sqxtun v1.8b, v1.8h\n"
    825 
    826       // RowMajorOutput::Output
    827       "st1 {v0.b}[0], [%x[result]], #1\n"
    828       "st1 {v1.b}[0], [x0], #1\n"
    829       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    830       : [count] "r"(params.kernel.count),
    831         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    832         [shift] "r"(params.kernel.shift),
    833         [stride] "r"(params.output_stream.stride),
    834         [rounding_offset] "r"(params.kernel.rounding_offset)
    835       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc",
    836         "memory");
    837 }
    838 
    839 template <>
    840 inline void
    841 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 2,
    842           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    843                        const FusedKernelParams<QuantizedStaticPreprocessed,
    844                                                RowMajor>& params,
    845                        uint8_t* result) {
    846 #ifdef DEBUG
    847 #ifdef DEBUG_METAGEMM_VERBOSE
    848   std::cout << __FILE__ << "(" << __LINE__
    849             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    850                "QuantizedStaticPreprocessed, RowMajor, 2, 2, 8>::Multiply()"
    851             << std::endl
    852             << std::flush;
    853 #endif
    854 #endif
    855   asm volatile(
    856       "prfm pldl1keep, [%x[lhs]]\n"
    857       "prfm pldl1keep, [%x[rhs]]\n"
    858 
    859       // Clear aggregators.
    860       "movi v0.4s, #0\n"
    861       "movi v1.4s, #0\n"
    862       "movi v2.4s, #0\n"
    863       "mov v3.16b, v0.16b\n"
    864 
    865       // General NxM lanes loop.
    866       "1:"
    867 
    868       // Subtract counter.
    869       "subs %x[count], %x[count], #8\n"
    870 
    871       "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n"
    872       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
    873       "prfm pldl1keep, [%x[lhs], #64]\n"
    874       "prfm pldl1keep, [%x[rhs], #64]\n"
    875       "umull v8.8h, v6.8b, v4.8b\n"
    876       "umull v9.8h, v7.8b, v4.8b\n"
    877       "umull v10.8h, v6.8b, v5.8b\n"
    878       "umull v11.8h, v7.8b, v5.8b\n"
    879       "uadalp v0.4s, v8.8h\n"
    880       "uadalp v1.4s, v9.8h\n"
    881       "uadalp v2.4s, v10.8h\n"
    882       "uadalp v3.4s, v11.8h\n"
    883 
    884       // Loop break.
    885       "bgt 1b\n"
    886 
    887       // StaticQuantization::Prepare
    888       "ld1 {v4.4s}, [%x[lhs]], #16\n"
    889       "ld1 {v5.4s}, [%x[rhs]], #16\n"
    890       "dup v6.4s, %w[multiplicative_offset]\n"
    891       "dup v7.4s, %w[rounding_offset]\n"
    892       "dup v8.4s, %w[shift]\n"
    893       "dup v9.4s, v4.s[0]\n"
    894       "dup v4.4s, v4.s[1]\n"
    895 
    896       // RowMajorOutput::Prepare
    897       "add x0, %x[result], %x[stride]\n"
    898 
    899       // Reduce aggregators.
    900       "addp v0.4s, v0.4s, v1.4s\n"
    901       "addp v0.4s, v0.4s, v0.4s\n"
    902       "addp v2.4s, v2.4s, v3.4s\n"
    903       "addp v2.4s, v2.4s, v2.4s\n"
    904 
    905       // StaticQuantization::Transform
    906       "add v0.4s, v0.4s, v9.4s\n"
    907       "add v2.4s, v2.4s, v4.4s\n"
    908       "add v0.4s, v0.4s, v5.4s\n"
    909       "add v2.4s, v2.4s, v5.4s\n"
    910       "mul v0.4s, v0.4s, v6.4s\n"
    911       "mul v2.4s, v2.4s, v6.4s\n"
    912       "add v0.4s, v0.4s, v7.4s\n"
    913       "add v2.4s, v2.4s, v7.4s\n"
    914       "sshl v0.4s, v0.4s, v8.4s\n"
    915       "sshl v2.4s, v2.4s, v8.4s\n"
    916       "sqxtn v0.4h, v0.4s\n"
    917       "sqxtn v2.4h, v2.4s\n"
    918       "sqxtun v0.8b, v0.8h\n"
    919       "sqxtun v2.8b, v2.8h\n"
    920 
    921       // RowMajorOutput::Output
    922       "st1 {v0.h}[0], [%x[result]], #2\n"
    923       "st1 {v2.h}[0], [x0], #2\n"
    924       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    925       : [count] "r"(params.kernel.count),
    926         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    927         [shift] "r"(params.kernel.shift),
    928         [stride] "r"(params.output_stream.stride),
    929         [rounding_offset] "r"(params.kernel.rounding_offset)
    930       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
    931         "v11", "cc", "memory");
    932 }
    933 
    934 template <>
    935 inline void
    936 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 3,
    937           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    938                        const FusedKernelParams<QuantizedStaticPreprocessed,
    939                                                RowMajor>& params,
    940                        uint8_t* result) {
    941 #ifdef DEBUG
    942 #ifdef DEBUG_METAGEMM_VERBOSE
    943   std::cout << __FILE__ << "(" << __LINE__
    944             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    945                "QuantizedStaticPreprocessed, RowMajor, 2, 3, 8>::Multiply()"
    946             << std::endl
    947             << std::flush;
    948 #endif
    949 #endif
    950   asm volatile(
    951       "prfm pldl1keep, [%x[lhs]]\n"
    952       "prfm pldl1keep, [%x[rhs]]\n"
    953 
    954       // Clear aggregators.
    955       "movi v0.4s, #0\n"
    956       "movi v1.4s, #0\n"
    957       "movi v2.4s, #0\n"
    958       "mov v3.16b, v0.16b\n"
    959       "mov v4.16b, v1.16b\n"
    960       "mov v5.16b, v2.16b\n"
    961 
    962       // General NxM lanes loop.
    963       "1:"
    964 
    965       // Subtract counter.
    966       "subs %x[count], %x[count], #8\n"
    967 
    968       "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n"
    969       "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n"
    970       "prfm pldl1keep, [%x[lhs], #64]\n"
    971       "prfm pldl1keep, [%x[rhs], #64]\n"
    972       "umull v11.8h, v8.8b, v6.8b\n"
    973       "umull v12.8h, v9.8b, v6.8b\n"
    974       "umull v13.8h, v10.8b, v6.8b\n"
    975       "umull v14.8h, v8.8b, v7.8b\n"
    976       "umull v15.8h, v9.8b, v7.8b\n"
    977       "umull v16.8h, v10.8b, v7.8b\n"
    978       "uadalp v0.4s, v11.8h\n"
    979       "uadalp v1.4s, v12.8h\n"
    980       "uadalp v2.4s, v13.8h\n"
    981       "uadalp v3.4s, v14.8h\n"
    982       "uadalp v4.4s, v15.8h\n"
    983       "uadalp v5.4s, v16.8h\n"
    984 
    985       // Loop break.
    986       "bgt 1b\n"
    987 
    988       // StaticQuantization::Prepare
    989       "ld1 {v6.4s}, [%x[lhs]], #16\n"
    990       "ld1 {v7.4s}, [%x[rhs]], #16\n"
    991       "dup v8.4s, %w[multiplicative_offset]\n"
    992       "dup v9.4s, %w[rounding_offset]\n"
    993       "dup v10.4s, %w[shift]\n"
    994       "dup v11.4s, v6.s[0]\n"
    995       "dup v6.4s, v6.s[1]\n"
    996 
    997       // RowMajorOutput::Prepare
    998       "add x0, %x[result], %x[stride]\n"
    999 
   1000       // Reduce aggregators.
   1001       "addp v0.4s, v0.4s, v1.4s\n"
   1002       "addp v2.4s, v2.4s, v2.4s\n"
   1003       "addp v0.4s, v0.4s, v2.4s\n"
   1004       "addp v3.4s, v3.4s, v4.4s\n"
   1005       "addp v5.4s, v5.4s, v5.4s\n"
   1006       "addp v3.4s, v3.4s, v5.4s\n"
   1007 
   1008       // StaticQuantization::Transform
   1009       "add v0.4s, v0.4s, v11.4s\n"
   1010       "add v3.4s, v3.4s, v6.4s\n"
   1011       "add v0.4s, v0.4s, v7.4s\n"
   1012       "add v3.4s, v3.4s, v7.4s\n"
   1013       "mul v0.4s, v0.4s, v8.4s\n"
   1014       "mul v3.4s, v3.4s, v8.4s\n"
   1015       "add v0.4s, v0.4s, v9.4s\n"
   1016       "add v3.4s, v3.4s, v9.4s\n"
   1017       "sshl v0.4s, v0.4s, v10.4s\n"
   1018       "sshl v3.4s, v3.4s, v10.4s\n"
   1019       "sqxtn v0.4h, v0.4s\n"
   1020       "sqxtn v3.4h, v3.4s\n"
   1021       "sqxtun v0.8b, v0.8h\n"
   1022       "sqxtun v3.8b, v3.8h\n"
   1023 
   1024       // RowMajorOutput::Output
   1025       "st1 {v0.h}[0], [%x[result]], #2\n"
   1026       "st1 {v0.b}[2], [%x[result]], #1\n"
   1027       "st1 {v3.h}[0], [x0], #2\n"
   1028       "st1 {v3.b}[2], [x0], #1\n"
   1029       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1030       : [count] "r"(params.kernel.count),
   1031         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
   1032         [shift] "r"(params.kernel.shift),
   1033         [stride] "r"(params.output_stream.stride),
   1034         [rounding_offset] "r"(params.kernel.rounding_offset)
   1035       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   1036         "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
   1037 }
   1038 
   1039 template <>
   1040 inline void
   1041 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 4,
   1042           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1043                        const FusedKernelParams<QuantizedStaticPreprocessed,
   1044                                                RowMajor>& params,
   1045                        uint8_t* result) {
   1046 #ifdef DEBUG
   1047 #ifdef DEBUG_METAGEMM_VERBOSE
   1048   std::cout << __FILE__ << "(" << __LINE__
   1049             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
   1050                "QuantizedStaticPreprocessed, RowMajor, 2, 4, 8>::Multiply()"
   1051             << std::endl
   1052             << std::flush;
   1053 #endif
   1054 #endif
   1055   asm volatile(
   1056       "prfm pldl1keep, [%x[lhs]]\n"
   1057       "prfm pldl1keep, [%x[rhs]]\n"
   1058 
   1059       // Clear aggregators.
   1060       "movi v0.4s, #0\n"
   1061       "movi v1.4s, #0\n"
   1062       "movi v2.4s, #0\n"
   1063       "mov v3.16b, v0.16b\n"
   1064       "mov v4.16b, v1.16b\n"
   1065       "mov v5.16b, v2.16b\n"
   1066       "mov v6.16b, v3.16b\n"
   1067       "mov v7.16b, v4.16b\n"
   1068 
   1069       // 2x4 lanes loop.
   1070       "1:"
   1071 
   1072       "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n"
   1073       "ld1 {v8.8b}, [%x[lhs]], #8\n"
   1074       "umull v14.8h, v8.8b, v10.8b\n"
   1075       "ld1 {v9.8b}, [%x[lhs]], #8\n"
   1076       "umull v15.8h, v8.8b, v11.8b\n"
   1077       "prfm pldl1keep, [%x[rhs], #64]\n"
   1078       "umull v16.8h, v8.8b, v12.8b\n"
   1079       "prfm pldl1keep, [%x[lhs], #64]\n"
   1080       "umull v17.8h, v8.8b, v13.8b\n"
   1081       "umull v18.8h, v9.8b, v10.8b\n"
   1082       "uadalp v0.4s, v14.8h\n"
   1083       "uadalp v1.4s, v15.8h\n"
   1084       "uadalp v2.4s, v16.8h\n"
   1085       "umull v14.8h, v9.8b, v11.8b\n"
   1086       "umull v15.8h, v9.8b, v12.8b\n"
   1087       "umull v16.8h, v9.8b, v13.8b\n"
   1088 
   1089       // Subtract counter.
   1090       "subs %x[count], %x[count], #8\n"
   1091 
   1092       "uadalp v3.4s, v17.8h\n"
   1093       "uadalp v4.4s, v18.8h\n"
   1094       "uadalp v5.4s, v14.8h\n"
   1095       "uadalp v6.4s, v15.8h\n"
   1096       "uadalp v7.4s, v16.8h\n"
   1097 
   1098       // Loop break.
   1099       "bgt 1b\n"
   1100 
   1101       // StaticQuantization::Prepare
   1102       "ld1 {v8.4s}, [%x[lhs]], #16\n"
   1103       "ld1 {v9.4s}, [%x[rhs]], #16\n"
   1104       "dup v10.4s, %w[multiplicative_offset]\n"
   1105       "dup v11.4s, %w[rounding_offset]\n"
   1106       "dup v12.4s, %w[shift]\n"
   1107       "dup v13.4s, v8.s[0]\n"
   1108       "dup v8.4s, v8.s[1]\n"
   1109 
   1110       // RowMajorOutput::Prepare
   1111       "add x0, %x[result], %x[stride]\n"
   1112 
   1113       // Reduce aggregators.
   1114       "addp v0.4s, v0.4s, v1.4s\n"
   1115       "addp v2.4s, v2.4s, v3.4s\n"
   1116       "addp v0.4s, v0.4s, v2.4s\n"
   1117       "addp v4.4s, v4.4s, v5.4s\n"
   1118       "addp v6.4s, v6.4s, v7.4s\n"
   1119       "addp v4.4s, v4.4s, v6.4s\n"
   1120 
   1121       // StaticQuantization::Transform
   1122       "add v0.4s, v0.4s, v13.4s\n"
   1123       "add v4.4s, v4.4s, v8.4s\n"
   1124       "add v0.4s, v0.4s, v9.4s\n"
   1125       "add v4.4s, v4.4s, v9.4s\n"
   1126       "mul v0.4s, v0.4s, v10.4s\n"
   1127       "mul v4.4s, v4.4s, v10.4s\n"
   1128       "add v0.4s, v0.4s, v11.4s\n"
   1129       "add v4.4s, v4.4s, v11.4s\n"
   1130       "sshl v0.4s, v0.4s, v12.4s\n"
   1131       "sshl v4.4s, v4.4s, v12.4s\n"
   1132       "sqxtn v0.4h, v0.4s\n"
   1133       "sqxtn v4.4h, v4.4s\n"
   1134       "sqxtun v0.8b, v0.8h\n"
   1135       "sqxtun v4.8b, v4.8h\n"
   1136 
   1137       // RowMajorOutput::Output
   1138       "st1 {v0.s}[0], [%x[result]], #4\n"
   1139       "st1 {v4.s}[0], [x0], #4\n"
   1140       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1141       : [count] "r"(params.kernel.count),
   1142         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
   1143         [shift] "r"(params.kernel.shift),
   1144         [stride] "r"(params.output_stream.stride),
   1145         [rounding_offset] "r"(params.kernel.rounding_offset)
   1146       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   1147         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory");
   1148 }
   1149 
   1150 template <>
   1151 inline void
   1152 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 1,
   1153           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1154                        const FusedKernelParams<QuantizedStaticPreprocessed,
   1155                                                RowMajor>& params,
   1156                        uint8_t* result) {
   1157 #ifdef DEBUG
   1158 #ifdef DEBUG_METAGEMM_VERBOSE
   1159   std::cout << __FILE__ << "(" << __LINE__
   1160             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
   1161                "QuantizedStaticPreprocessed, RowMajor, 3, 1, 8>::Multiply()"
   1162             << std::endl
   1163             << std::flush;
   1164 #endif
   1165 #endif
   1166   asm volatile(
   1167       "prfm pldl1keep, [%x[lhs]]\n"
   1168       "prfm pldl1keep, [%x[rhs]]\n"
   1169 
   1170       // Clear aggregators.
   1171       "movi v0.4s, #0\n"
   1172       "movi v1.4s, #0\n"
   1173       "movi v2.4s, #0\n"
   1174 
   1175       // General NxM lanes loop.
   1176       "1:"
   1177 
   1178       // Subtract counter.
   1179       "subs %x[count], %x[count], #8\n"
   1180 
   1181       "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n"
   1182       "ld1 {v6.2s}, [%x[rhs]], #8\n"
   1183       "prfm pldl1keep, [%x[lhs], #64]\n"
   1184       "prfm pldl1keep, [%x[rhs], #64]\n"
   1185       "umull v7.8h, v6.8b, v3.8b\n"
   1186       "umull v8.8h, v6.8b, v4.8b\n"
   1187       "umull v9.8h, v6.8b, v5.8b\n"
   1188       "uadalp v0.4s, v7.8h\n"
   1189       "uadalp v1.4s, v8.8h\n"
   1190       "uadalp v2.4s, v9.8h\n"
   1191 
   1192       // Loop break.
   1193       "bgt 1b\n"
   1194 
   1195       // StaticQuantization::Prepare
   1196       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   1197       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   1198       "dup v6.4s, %w[multiplicative_offset]\n"
   1199       "dup v7.4s, %w[rounding_offset]\n"
   1200       "dup v8.4s, %w[shift]\n"
   1201       "dup v3.4s, v4.s[0]\n"
   1202       "dup v9.4s, v4.s[1]\n"
   1203       "dup v4.4s, v4.s[2]\n"
   1204 
   1205       // RowMajorOutput::Prepare
   1206       "add x0, %x[result], %x[stride]\n"
   1207       "add x1, x0, %x[stride]\n"
   1208 
   1209       // Reduce aggregators.
   1210       "addp v0.4s, v0.4s, v0.4s\n"
   1211       "addp v0.4s, v0.4s, v0.4s\n"
   1212       "addp v1.4s, v1.4s, v1.4s\n"
   1213       "addp v1.4s, v1.4s, v1.4s\n"
   1214       "addp v2.4s, v2.4s, v2.4s\n"
   1215       "addp v2.4s, v2.4s, v2.4s\n"
   1216 
   1217       // StaticQuantization::Transform
   1218       "add v0.4s, v0.4s, v3.4s\n"
   1219       "add v1.4s, v1.4s, v9.4s\n"
   1220       "add v2.4s, v2.4s, v4.4s\n"
   1221       "add v0.4s, v0.4s, v5.4s\n"
   1222       "add v1.4s, v1.4s, v5.4s\n"
   1223       "add v2.4s, v2.4s, v5.4s\n"
   1224       "mul v0.4s, v0.4s, v6.4s\n"
   1225       "mul v1.4s, v1.4s, v6.4s\n"
   1226       "mul v2.4s, v2.4s, v6.4s\n"
   1227       "add v0.4s, v0.4s, v7.4s\n"
   1228       "add v1.4s, v1.4s, v7.4s\n"
   1229       "add v2.4s, v2.4s, v7.4s\n"
   1230       "sshl v0.4s, v0.4s, v8.4s\n"
   1231       "sshl v1.4s, v1.4s, v8.4s\n"
   1232       "sshl v2.4s, v2.4s, v8.4s\n"
   1233       "sqxtn v0.4h, v0.4s\n"
   1234       "sqxtn v1.4h, v1.4s\n"
   1235       "sqxtn v2.4h, v2.4s\n"
   1236       "sqxtun v0.8b, v0.8h\n"
   1237       "sqxtun v1.8b, v1.8h\n"
   1238       "sqxtun v2.8b, v2.8h\n"
   1239 
   1240       // RowMajorOutput::Output
   1241       "st1 {v0.b}[0], [%x[result]], #1\n"
   1242       "st1 {v1.b}[0], [x0], #1\n"
   1243       "st1 {v2.b}[0], [x1], #1\n"
   1244       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1245       : [count] "r"(params.kernel.count),
   1246         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
   1247         [shift] "r"(params.kernel.shift),
   1248         [stride] "r"(params.output_stream.stride),
   1249         [rounding_offset] "r"(params.kernel.rounding_offset)
   1250       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
   1251         "cc", "memory");
   1252 }
   1253 
   1254 template <>
   1255 inline void
   1256 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 2,
   1257           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1258                        const FusedKernelParams<QuantizedStaticPreprocessed,
   1259                                                RowMajor>& params,
   1260                        uint8_t* result) {
   1261 #ifdef DEBUG
   1262 #ifdef DEBUG_METAGEMM_VERBOSE
   1263   std::cout << __FILE__ << "(" << __LINE__
   1264             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
   1265                "QuantizedStaticPreprocessed, RowMajor, 3, 2, 8>::Multiply()"
   1266             << std::endl
   1267             << std::flush;
   1268 #endif
   1269 #endif
   1270   asm volatile(
   1271       "prfm pldl1keep, [%x[lhs]]\n"
   1272       "prfm pldl1keep, [%x[rhs]]\n"
   1273 
   1274       // Clear aggregators.
   1275       "movi v0.4s, #0\n"
   1276       "movi v1.4s, #0\n"
   1277       "movi v2.4s, #0\n"
   1278       "mov v3.16b, v0.16b\n"
   1279       "mov v4.16b, v1.16b\n"
   1280       "mov v5.16b, v2.16b\n"
   1281 
   1282       // General NxM lanes loop.
   1283       "1:"
   1284 
   1285       // Subtract counter.
   1286       "subs %x[count], %x[count], #8\n"
   1287 
   1288       "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n"
   1289       "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n"
   1290       "prfm pldl1keep, [%x[lhs], #64]\n"
   1291       "prfm pldl1keep, [%x[rhs], #64]\n"
   1292       "umull v11.8h, v9.8b, v6.8b\n"
   1293       "umull v12.8h, v10.8b, v6.8b\n"
   1294       "umull v13.8h, v9.8b, v7.8b\n"
   1295       "umull v14.8h, v10.8b, v7.8b\n"
   1296       "umull v15.8h, v9.8b, v8.8b\n"
   1297       "umull v16.8h, v10.8b, v8.8b\n"
   1298       "uadalp v0.4s, v11.8h\n"
   1299       "uadalp v1.4s, v12.8h\n"
   1300       "uadalp v2.4s, v13.8h\n"
   1301       "uadalp v3.4s, v14.8h\n"
   1302       "uadalp v4.4s, v15.8h\n"
   1303       "uadalp v5.4s, v16.8h\n"
   1304 
   1305       // Loop break.
   1306       "bgt 1b\n"
   1307 
   1308       // StaticQuantization::Prepare
   1309       "ld1 {v6.4s}, [%x[lhs]], #16\n"
   1310       "ld1 {v7.4s}, [%x[rhs]], #16\n"
   1311       "dup v8.4s, %w[multiplicative_offset]\n"
   1312       "dup v9.4s, %w[rounding_offset]\n"
   1313       "dup v10.4s, %w[shift]\n"
   1314       "dup v11.4s, v6.s[0]\n"
   1315       "dup v12.4s, v6.s[1]\n"
   1316       "dup v6.4s, v6.s[2]\n"
   1317 
   1318       // RowMajorOutput::Prepare
   1319       "add x0, %x[result], %x[stride]\n"
   1320       "add x1, x0, %x[stride]\n"
   1321 
   1322       // Reduce aggregators.
   1323       "addp v0.4s, v0.4s, v1.4s\n"
   1324       "addp v0.4s, v0.4s, v0.4s\n"
   1325       "addp v2.4s, v2.4s, v3.4s\n"
   1326       "addp v2.4s, v2.4s, v2.4s\n"
   1327       "addp v4.4s, v4.4s, v5.4s\n"
   1328       "addp v4.4s, v4.4s, v4.4s\n"
   1329 
   1330       // StaticQuantization::Transform
   1331       "add v0.4s, v0.4s, v11.4s\n"
   1332       "add v2.4s, v2.4s, v12.4s\n"
   1333       "add v4.4s, v4.4s, v6.4s\n"
   1334       "add v0.4s, v0.4s, v7.4s\n"
   1335       "add v2.4s, v2.4s, v7.4s\n"
   1336       "add v4.4s, v4.4s, v7.4s\n"
   1337       "mul v0.4s, v0.4s, v8.4s\n"
   1338       "mul v2.4s, v2.4s, v8.4s\n"
   1339       "mul v4.4s, v4.4s, v8.4s\n"
   1340       "add v0.4s, v0.4s, v9.4s\n"
   1341       "add v2.4s, v2.4s, v9.4s\n"
   1342       "add v4.4s, v4.4s, v9.4s\n"
   1343       "sshl v0.4s, v0.4s, v10.4s\n"
   1344       "sshl v2.4s, v2.4s, v10.4s\n"
   1345       "sshl v4.4s, v4.4s, v10.4s\n"
   1346       "sqxtn v0.4h, v0.4s\n"
   1347       "sqxtn v2.4h, v2.4s\n"
   1348       "sqxtn v4.4h, v4.4s\n"
   1349       "sqxtun v0.8b, v0.8h\n"
   1350       "sqxtun v2.8b, v2.8h\n"
   1351       "sqxtun v4.8b, v4.8h\n"
   1352 
   1353       // RowMajorOutput::Output
   1354       "st1 {v0.h}[0], [%x[result]], #2\n"
   1355       "st1 {v2.h}[0], [x0], #2\n"
   1356       "st1 {v4.h}[0], [x1], #2\n"
   1357       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1358       : [count] "r"(params.kernel.count),
   1359         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
   1360         [shift] "r"(params.kernel.shift),
   1361         [stride] "r"(params.output_stream.stride),
   1362         [rounding_offset] "r"(params.kernel.rounding_offset)
   1363       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
   1364         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
   1365 }
   1366 
   1367 template <>
   1368 inline void
   1369 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 3,
   1370           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1371                        const FusedKernelParams<QuantizedStaticPreprocessed,
   1372                                                RowMajor>& params,
   1373                        uint8_t* result) {
   1374 #ifdef DEBUG
   1375 #ifdef DEBUG_METAGEMM_VERBOSE
   1376   std::cout << __FILE__ << "(" << __LINE__
   1377             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
   1378                "QuantizedStaticPreprocessed, RowMajor, 3, 3, 8>::Multiply()"
   1379             << std::endl
   1380             << std::flush;
   1381 #endif
   1382 #endif
   1383   asm volatile(
   1384       "prfm pldl1keep, [%x[lhs]]\n"
   1385       "prfm pldl1keep, [%x[rhs]]\n"
   1386 
   1387       // Clear aggregators.
   1388       "movi v0.4s, #0\n"
   1389       "movi v1.4s, #0\n"
   1390       "movi v2.4s, #0\n"
   1391       "mov v3.16b, v0.16b\n"
   1392       "mov v4.16b, v1.16b\n"
   1393       "mov v5.16b, v2.16b\n"
   1394       "mov v6.16b, v3.16b\n"
   1395       "mov v7.16b, v4.16b\n"
   1396       "mov v8.16b, v5.16b\n"
   1397 
   1398       // 3x3 lanes loop.
   1399       "1:"
   1400 
   1401       "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n"
   1402       "ld1 {v9.8b}, [%x[lhs]], #8\n"
   1403       "umull v15.8h, v9.8b, v12.8b\n"
   1404       "ld1 {v10.8b}, [%x[lhs]], #8\n"
   1405       "umull v16.8h, v9.8b, v13.8b\n"
   1406       "ld1 {v11.8b}, [%x[lhs]], #8\n"
   1407       "umull v17.8h, v9.8b, v14.8b\n"
   1408       "prfm pldl1keep, [%x[lhs], #64]\n"
   1409       "umull v18.8h, v10.8b, v12.8b\n"
   1410       "prfm pldl1keep, [%x[rhs], #64]\n"
   1411       "uadalp v0.4s, v15.8h\n"
   1412       "uadalp v1.4s, v16.8h\n"
   1413       "uadalp v2.4s, v17.8h\n"
   1414       "uadalp v3.4s, v18.8h\n"
   1415       "umull v15.8h, v10.8b, v13.8b\n"
   1416       "umull v16.8h, v10.8b, v14.8b\n"
   1417       "umull v17.8h, v11.8b, v12.8b\n"
   1418       "umull v18.8h, v11.8b, v13.8b\n"
   1419 
   1420       // Subtract counter.
   1421       "subs %x[count], %x[count], #8\n"
   1422 
   1423       "umull v9.8h, v11.8b, v14.8b\n"
   1424       "uadalp v4.4s, v15.8h\n"
   1425       "uadalp v5.4s, v16.8h\n"
   1426       "uadalp v6.4s, v17.8h\n"
   1427       "uadalp v7.4s, v18.8h\n"
   1428       "uadalp v8.4s, v9.8h\n"
   1429 
   1430       // Loop break.
   1431       "bgt 1b\n"
   1432 
   1433       // StaticQuantization::Prepare
   1434       "ld1 {v9.4s}, [%x[lhs]], #16\n"
   1435       "ld1 {v10.4s}, [%x[rhs]], #16\n"
   1436       "dup v11.4s, %w[multiplicative_offset]\n"
   1437       "dup v12.4s, %w[rounding_offset]\n"
   1438       "dup v13.4s, %w[shift]\n"
   1439       "dup v14.4s, v9.s[0]\n"
   1440       "dup v15.4s, v9.s[1]\n"
   1441       "dup v9.4s, v9.s[2]\n"
   1442 
   1443       // RowMajorOutput::Prepare
   1444       "add x0, %x[result], %x[stride]\n"
   1445       "add x1, x0, %x[stride]\n"
   1446 
   1447       // Reduce aggregators.
   1448       "addp v0.4s, v0.4s, v1.4s\n"
   1449       "addp v2.4s, v2.4s, v2.4s\n"
   1450       "addp v0.4s, v0.4s, v2.4s\n"
   1451       "addp v3.4s, v3.4s, v4.4s\n"
   1452       "addp v5.4s, v5.4s, v5.4s\n"
   1453       "addp v3.4s, v3.4s, v5.4s\n"
   1454       "addp v6.4s, v6.4s, v7.4s\n"
   1455       "addp v8.4s, v8.4s, v8.4s\n"
   1456       "addp v6.4s, v6.4s, v8.4s\n"
   1457 
   1458       // StaticQuantization::Transform
   1459       "add v0.4s, v0.4s, v14.4s\n"
   1460       "add v3.4s, v3.4s, v15.4s\n"
   1461       "add v6.4s, v6.4s, v9.4s\n"
   1462       "add v0.4s, v0.4s, v10.4s\n"
   1463       "add v3.4s, v3.4s, v10.4s\n"
   1464       "add v6.4s, v6.4s, v10.4s\n"
   1465       "mul v0.4s, v0.4s, v11.4s\n"
   1466       "mul v3.4s, v3.4s, v11.4s\n"
   1467       "mul v6.4s, v6.4s, v11.4s\n"
   1468       "add v0.4s, v0.4s, v12.4s\n"
   1469       "add v3.4s, v3.4s, v12.4s\n"
   1470       "add v6.4s, v6.4s, v12.4s\n"
   1471       "sshl v0.4s, v0.4s, v13.4s\n"
   1472       "sshl v3.4s, v3.4s, v13.4s\n"
   1473       "sshl v6.4s, v6.4s, v13.4s\n"
   1474       "sqxtn v0.4h, v0.4s\n"
   1475       "sqxtn v3.4h, v3.4s\n"
   1476       "sqxtn v6.4h, v6.4s\n"
   1477       "sqxtun v0.8b, v0.8h\n"
   1478       "sqxtun v3.8b, v3.8h\n"
   1479       "sqxtun v6.8b, v6.8h\n"
   1480 
   1481       // RowMajorOutput::Output
   1482       "st1 {v0.h}[0], [%x[result]], #2\n"
   1483       "st1 {v0.b}[2], [%x[result]], #1\n"
   1484       "st1 {v3.h}[0], [x0], #2\n"
   1485       "st1 {v3.b}[2], [x0], #1\n"
   1486       "st1 {v6.h}[0], [x1], #2\n"
   1487       "st1 {v6.b}[2], [x1], #1\n"
   1488       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1489       : [count] "r"(params.kernel.count),
   1490         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
   1491         [shift] "r"(params.kernel.shift),
   1492         [stride] "r"(params.output_stream.stride),
   1493         [rounding_offset] "r"(params.kernel.rounding_offset)
   1494       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
   1495         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc",
   1496         "memory");
   1497 }
   1498 
   1499 template <>
   1500 inline void MulKernel<
   1501     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1,
   1502     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1503                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1504                                          RowMajor>& params,
   1505                  int32_t* result) {
   1506 #ifdef DEBUG
   1507 #ifdef DEBUG_METAGEMM_VERBOSE
   1508   std::cout << __FILE__ << "(" << __LINE__
   1509             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1510                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, "
   1511                "8>::Multiply()"
   1512             << std::endl
   1513             << std::flush;
   1514 #endif
   1515 #endif
   1516   asm volatile(
   1517       "prfm pldl1keep, [%x[lhs]]\n"
   1518       "prfm pldl1keep, [%x[rhs]]\n"
   1519 
   1520       // Clear aggregators.
   1521       "movi v0.4s, #0\n"
   1522 
   1523       // General NxM lanes loop.
   1524       "1:"
   1525 
   1526       // Subtract counter.
   1527       "subs %x[count], %x[count], #8\n"
   1528 
   1529       "ld1 {v1.2s}, [%x[lhs]], #8\n"
   1530       "ld1 {v2.2s}, [%x[rhs]], #8\n"
   1531       "prfm pldl1keep, [%x[lhs], #64]\n"
   1532       "prfm pldl1keep, [%x[rhs], #64]\n"
   1533       "umull v3.8h, v2.8b, v1.8b\n"
   1534       "uadalp v0.4s, v3.8h\n"
   1535 
   1536       // Loop break.
   1537       "bgt 1b\n"
   1538 
   1539       // StaticQuantizationInt32::Prepare
   1540       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   1541       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   1542       "dup v4.4s, v4.s[0]\n"
   1543 
   1544       // RowMajorOutput::Prepare
   1545 
   1546       // Reduce aggregators.
   1547       "addp v0.4s, v0.4s, v0.4s\n"
   1548       "addp v0.4s, v0.4s, v0.4s\n"
   1549 
   1550       // StaticQuantizationInt32::Transform
   1551       "add v0.4s, v0.4s, v4.4s\n"
   1552       "add v0.4s, v0.4s, v5.4s\n"
   1553 
   1554       // RowMajorOutput::Output
   1555       "st1 {v0.s}[0], [%x[result]], #4\n"
   1556       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1557       : [count] "r"(params.kernel.count),
   1558         [stride] "r"(params.output_stream.stride)
   1559       : "v0", "v1", "v2", "v3", "v4", "v5", "cc", "memory");
   1560 }
   1561 
   1562 template <>
   1563 inline void MulKernel<
   1564     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2,
   1565     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1566                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1567                                          RowMajor>& params,
   1568                  int32_t* result) {
   1569 #ifdef DEBUG
   1570 #ifdef DEBUG_METAGEMM_VERBOSE
   1571   std::cout << __FILE__ << "(" << __LINE__
   1572             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1573                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, "
   1574                "8>::Multiply()"
   1575             << std::endl
   1576             << std::flush;
   1577 #endif
   1578 #endif
   1579   asm volatile(
   1580       "prfm pldl1keep, [%x[lhs]]\n"
   1581       "prfm pldl1keep, [%x[rhs]]\n"
   1582 
   1583       // Clear aggregators.
   1584       "movi v0.4s, #0\n"
   1585       "movi v1.4s, #0\n"
   1586 
   1587       // General NxM lanes loop.
   1588       "1:"
   1589 
   1590       // Subtract counter.
   1591       "subs %x[count], %x[count], #8\n"
   1592 
   1593       "ld1 {v2.2s}, [%x[lhs]], #8\n"
   1594       "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n"
   1595       "prfm pldl1keep, [%x[lhs], #64]\n"
   1596       "prfm pldl1keep, [%x[rhs], #64]\n"
   1597       "umull v5.8h, v3.8b, v2.8b\n"
   1598       "umull v6.8h, v4.8b, v2.8b\n"
   1599       "uadalp v0.4s, v5.8h\n"
   1600       "uadalp v1.4s, v6.8h\n"
   1601 
   1602       // Loop break.
   1603       "bgt 1b\n"
   1604 
   1605       // StaticQuantizationInt32::Prepare
   1606       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   1607       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   1608       "dup v4.4s, v4.s[0]\n"
   1609 
   1610       // RowMajorOutput::Prepare
   1611 
   1612       // Reduce aggregators.
   1613       "addp v0.4s, v0.4s, v1.4s\n"
   1614       "addp v0.4s, v0.4s, v0.4s\n"
   1615 
   1616       // StaticQuantizationInt32::Transform
   1617       "add v0.4s, v0.4s, v4.4s\n"
   1618       "add v0.4s, v0.4s, v5.4s\n"
   1619 
   1620       // RowMajorOutput::Output
   1621       "st1 {v0.2s}, [%x[result]], #8\n"
   1622       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1623       : [count] "r"(params.kernel.count),
   1624         [stride] "r"(params.output_stream.stride)
   1625       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
   1626 }
   1627 
   1628 template <>
   1629 inline void MulKernel<
   1630     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3,
   1631     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1632                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1633                                          RowMajor>& params,
   1634                  int32_t* result) {
   1635 #ifdef DEBUG
   1636 #ifdef DEBUG_METAGEMM_VERBOSE
   1637   std::cout << __FILE__ << "(" << __LINE__
   1638             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1639                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, "
   1640                "8>::Multiply()"
   1641             << std::endl
   1642             << std::flush;
   1643 #endif
   1644 #endif
   1645   asm volatile(
   1646       "prfm pldl1keep, [%x[lhs]]\n"
   1647       "prfm pldl1keep, [%x[rhs]]\n"
   1648 
   1649       // Clear aggregators.
   1650       "movi v0.4s, #0\n"
   1651       "movi v1.4s, #0\n"
   1652       "movi v2.4s, #0\n"
   1653 
   1654       // General NxM lanes loop.
   1655       "1:"
   1656 
   1657       // Subtract counter.
   1658       "subs %x[count], %x[count], #8\n"
   1659 
   1660       "ld1 {v3.2s}, [%x[lhs]], #8\n"
   1661       "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n"
   1662       "prfm pldl1keep, [%x[lhs], #64]\n"
   1663       "prfm pldl1keep, [%x[rhs], #64]\n"
   1664       "umull v7.8h, v4.8b, v3.8b\n"
   1665       "umull v8.8h, v5.8b, v3.8b\n"
   1666       "umull v9.8h, v6.8b, v3.8b\n"
   1667       "uadalp v0.4s, v7.8h\n"
   1668       "uadalp v1.4s, v8.8h\n"
   1669       "uadalp v2.4s, v9.8h\n"
   1670 
   1671       // Loop break.
   1672       "bgt 1b\n"
   1673 
   1674       // StaticQuantizationInt32::Prepare
   1675       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   1676       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   1677       "dup v4.4s, v4.s[0]\n"
   1678 
   1679       // RowMajorOutput::Prepare
   1680 
   1681       // Reduce aggregators.
   1682       "addp v0.4s, v0.4s, v1.4s\n"
   1683       "addp v2.4s, v2.4s, v2.4s\n"
   1684       "addp v0.4s, v0.4s, v2.4s\n"
   1685 
   1686       // StaticQuantizationInt32::Transform
   1687       "add v0.4s, v0.4s, v4.4s\n"
   1688       "add v0.4s, v0.4s, v5.4s\n"
   1689 
   1690       // RowMajorOutput::Output
   1691       "st1 {v0.2s}, [%x[result]], #8\n"
   1692       "st1 {v0.s}[2], [%x[result]], #4\n"
   1693       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1694       : [count] "r"(params.kernel.count),
   1695         [stride] "r"(params.output_stream.stride)
   1696       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc",
   1697         "memory");
   1698 }
   1699 
   1700 template <>
   1701 inline void MulKernel<
   1702     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4,
   1703     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1704                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1705                                          RowMajor>& params,
   1706                  int32_t* result) {
   1707 #ifdef DEBUG
   1708 #ifdef DEBUG_METAGEMM_VERBOSE
   1709   std::cout << __FILE__ << "(" << __LINE__
   1710             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1711                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, "
   1712                "8>::Multiply()"
   1713             << std::endl
   1714             << std::flush;
   1715 #endif
   1716 #endif
   1717   asm volatile(
   1718       "prfm pldl1keep, [%x[lhs]]\n"
   1719       "prfm pldl1keep, [%x[rhs]]\n"
   1720 
   1721       // Clear aggregators.
   1722       "movi v0.4s, #0\n"
   1723       "movi v1.4s, #0\n"
   1724       "movi v2.4s, #0\n"
   1725       "mov v3.16b, v0.16b\n"
   1726 
   1727       // General NxM lanes loop.
   1728       "1:"
   1729 
   1730       // Subtract counter.
   1731       "subs %x[count], %x[count], #8\n"
   1732 
   1733       "ld1 {v4.2s}, [%x[lhs]], #8\n"
   1734       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
   1735       "prfm pldl1keep, [%x[lhs], #64]\n"
   1736       "prfm pldl1keep, [%x[rhs], #64]\n"
   1737       "umull v9.8h, v5.8b, v4.8b\n"
   1738       "umull v10.8h, v6.8b, v4.8b\n"
   1739       "umull v11.8h, v7.8b, v4.8b\n"
   1740       "umull v12.8h, v8.8b, v4.8b\n"
   1741       "uadalp v0.4s, v9.8h\n"
   1742       "uadalp v1.4s, v10.8h\n"
   1743       "uadalp v2.4s, v11.8h\n"
   1744       "uadalp v3.4s, v12.8h\n"
   1745 
   1746       // Loop break.
   1747       "bgt 1b\n"
   1748 
   1749       // StaticQuantizationInt32::Prepare
   1750       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   1751       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   1752       "dup v4.4s, v4.s[0]\n"
   1753 
   1754       // RowMajorOutput::Prepare
   1755 
   1756       // Reduce aggregators.
   1757       "addp v0.4s, v0.4s, v1.4s\n"
   1758       "addp v2.4s, v2.4s, v3.4s\n"
   1759       "addp v0.4s, v0.4s, v2.4s\n"
   1760 
   1761       // StaticQuantizationInt32::Transform
   1762       "add v0.4s, v0.4s, v4.4s\n"
   1763       "add v0.4s, v0.4s, v5.4s\n"
   1764 
   1765       // RowMajorOutput::Output
   1766       "st1 {v0.4s}, [%x[result]], #16\n"
   1767       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1768       : [count] "r"(params.kernel.count),
   1769         [stride] "r"(params.output_stream.stride)
   1770       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   1771         "v11", "v12", "cc", "memory");
   1772 }
   1773 
   1774 template <>
   1775 inline void MulKernel<
   1776     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5,
   1777     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1778                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1779                                          RowMajor>& params,
   1780                  int32_t* result) {
   1781 #ifdef DEBUG
   1782 #ifdef DEBUG_METAGEMM_VERBOSE
   1783   std::cout << __FILE__ << "(" << __LINE__
   1784             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1785                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, "
   1786                "8>::Multiply()"
   1787             << std::endl
   1788             << std::flush;
   1789 #endif
   1790 #endif
   1791   asm volatile(
   1792       "prfm pldl1keep, [%x[lhs]]\n"
   1793       "prfm pldl1keep, [%x[rhs]]\n"
   1794 
   1795       // Clear aggregators.
   1796       "movi v0.4s, #0\n"
   1797       "movi v1.4s, #0\n"
   1798       "movi v2.4s, #0\n"
   1799       "mov v3.16b, v0.16b\n"
   1800       "mov v4.16b, v1.16b\n"
   1801 
   1802       // General 1xM lanes loop.
   1803       "1:"
   1804 
   1805       // Subtract counter.
   1806       "subs %x[count], %x[count], #8\n"
   1807 
   1808       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
   1809       "ld1 {v9.2s}, [%x[lhs]], #8\n"
   1810       "prfm pldl1keep, [%x[lhs], #64]\n"
   1811       "umull v10.8h, v5.8b, v9.8b\n"
   1812       "umull v11.8h, v6.8b, v9.8b\n"
   1813       "umull v12.8h, v7.8b, v9.8b\n"
   1814       "umull v13.8h, v8.8b, v9.8b\n"
   1815       "ld1 {v5.2s}, [%x[rhs]], #8\n"
   1816       "prfm pldl1keep, [%x[rhs], #128]\n"
   1817       "uadalp v0.4s, v10.8h\n"
   1818       "uadalp v1.4s, v11.8h\n"
   1819       "uadalp v2.4s, v12.8h\n"
   1820       "uadalp v3.4s, v13.8h\n"
   1821       "umull v10.8h, v5.8b, v9.8b\n"
   1822       "uadalp v4.4s, v10.8h\n"
   1823 
   1824       // Loop break.
   1825       "bgt 1b\n"
   1826 
   1827       // StaticQuantizationInt32::Prepare
   1828       "ld1 {v5.4s}, [%x[lhs]], #16\n"
   1829       "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n"
   1830       "dup v5.4s, v5.s[0]\n"
   1831 
   1832       // RowMajorOutput::Prepare
   1833 
   1834       // Reduce aggregators.
   1835       "addp v0.4s, v0.4s, v1.4s\n"
   1836       "addp v2.4s, v2.4s, v3.4s\n"
   1837       "addp v4.4s, v4.4s, v4.4s\n"
   1838       "addp v0.4s, v0.4s, v2.4s\n"
   1839       "addp v1.4s, v4.4s, v4.4s\n"
   1840 
   1841       // StaticQuantizationInt32::Transform
   1842       "add v0.4s, v0.4s, v5.4s\n"
   1843       "add v1.4s, v1.4s, v5.4s\n"
   1844       "add v0.4s, v0.4s, v6.4s\n"
   1845       "add v1.4s, v1.4s, v7.4s\n"
   1846 
   1847       // RowMajorOutput::Output
   1848       "st1 {v0.4s}, [%x[result]], #16\n"
   1849       "st1 {v1.s}[0], [%x[result]], #4\n"
   1850       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1851       : [count] "r"(params.kernel.count),
   1852         [stride] "r"(params.output_stream.stride)
   1853       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   1854         "v11", "v12", "v13", "cc", "memory");
   1855 }
   1856 
   1857 template <>
   1858 inline void MulKernel<
   1859     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6,
   1860     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1861                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1862                                          RowMajor>& params,
   1863                  int32_t* result) {
   1864 #ifdef DEBUG
   1865 #ifdef DEBUG_METAGEMM_VERBOSE
   1866   std::cout << __FILE__ << "(" << __LINE__
   1867             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1868                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, "
   1869                "8>::Multiply()"
   1870             << std::endl
   1871             << std::flush;
   1872 #endif
   1873 #endif
   1874   asm volatile(
   1875       "prfm pldl1keep, [%x[lhs]]\n"
   1876       "prfm pldl1keep, [%x[rhs]]\n"
   1877 
   1878       // Clear aggregators.
   1879       "movi v0.4s, #0\n"
   1880       "movi v1.4s, #0\n"
   1881       "movi v2.4s, #0\n"
   1882       "mov v3.16b, v0.16b\n"
   1883       "mov v4.16b, v1.16b\n"
   1884       "mov v5.16b, v2.16b\n"
   1885 
   1886       // General 1xM lanes loop.
   1887       "1:"
   1888 
   1889       // Subtract counter.
   1890       "subs %x[count], %x[count], #8\n"
   1891 
   1892       "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n"
   1893       "ld1 {v10.2s}, [%x[lhs]], #8\n"
   1894       "prfm pldl1keep, [%x[lhs], #64]\n"
   1895       "umull v11.8h, v6.8b, v10.8b\n"
   1896       "umull v12.8h, v7.8b, v10.8b\n"
   1897       "umull v13.8h, v8.8b, v10.8b\n"
   1898       "umull v14.8h, v9.8b, v10.8b\n"
   1899       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
   1900       "prfm pldl1keep, [%x[rhs], #128]\n"
   1901       "uadalp v0.4s, v11.8h\n"
   1902       "uadalp v1.4s, v12.8h\n"
   1903       "uadalp v2.4s, v13.8h\n"
   1904       "uadalp v3.4s, v14.8h\n"
   1905       "umull v11.8h, v6.8b, v10.8b\n"
   1906       "umull v12.8h, v7.8b, v10.8b\n"
   1907       "uadalp v4.4s, v11.8h\n"
   1908       "uadalp v5.4s, v12.8h\n"
   1909 
   1910       // Loop break.
   1911       "bgt 1b\n"
   1912 
   1913       // StaticQuantizationInt32::Prepare
   1914       "ld1 {v6.4s}, [%x[lhs]], #16\n"
   1915       "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n"
   1916       "dup v6.4s, v6.s[0]\n"
   1917 
   1918       // RowMajorOutput::Prepare
   1919 
   1920       // Reduce aggregators.
   1921       "addp v0.4s, v0.4s, v1.4s\n"
   1922       "addp v2.4s, v2.4s, v3.4s\n"
   1923       "addp v4.4s, v4.4s, v5.4s\n"
   1924       "addp v0.4s, v0.4s, v2.4s\n"
   1925       "addp v1.4s, v4.4s, v4.4s\n"
   1926 
   1927       // StaticQuantizationInt32::Transform
   1928       "add v0.4s, v0.4s, v6.4s\n"
   1929       "add v1.4s, v1.4s, v6.4s\n"
   1930       "add v0.4s, v0.4s, v7.4s\n"
   1931       "add v1.4s, v1.4s, v8.4s\n"
   1932 
   1933       // RowMajorOutput::Output
   1934       "st1 {v0.4s}, [%x[result]], #16\n"
   1935       "st1 {v1.2s}, [%x[result]], #8\n"
   1936       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1937       : [count] "r"(params.kernel.count),
   1938         [stride] "r"(params.output_stream.stride)
   1939       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   1940         "v11", "v12", "v13", "v14", "cc", "memory");
   1941 }
   1942 
   1943 template <>
   1944 inline void MulKernel<
   1945     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7,
   1946     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1947                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1948                                          RowMajor>& params,
   1949                  int32_t* result) {
   1950 #ifdef DEBUG
   1951 #ifdef DEBUG_METAGEMM_VERBOSE
   1952   std::cout << __FILE__ << "(" << __LINE__
   1953             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1954                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, "
   1955                "8>::Multiply()"
   1956             << std::endl
   1957             << std::flush;
   1958 #endif
   1959 #endif
   1960   asm volatile(
   1961       "prfm pldl1keep, [%x[lhs]]\n"
   1962       "prfm pldl1keep, [%x[rhs]]\n"
   1963 
   1964       // Clear aggregators.
   1965       "movi v0.4s, #0\n"
   1966       "movi v1.4s, #0\n"
   1967       "movi v2.4s, #0\n"
   1968       "mov v3.16b, v0.16b\n"
   1969       "mov v4.16b, v1.16b\n"
   1970       "mov v5.16b, v2.16b\n"
   1971       "mov v6.16b, v3.16b\n"
   1972 
   1973       // General 1xM lanes loop.
   1974       "1:"
   1975 
   1976       // Subtract counter.
   1977       "subs %x[count], %x[count], #8\n"
   1978 
   1979       "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n"
   1980       "ld1 {v11.2s}, [%x[lhs]], #8\n"
   1981       "prfm pldl1keep, [%x[lhs], #64]\n"
   1982       "umull v12.8h, v7.8b, v11.8b\n"
   1983       "umull v13.8h, v8.8b, v11.8b\n"
   1984       "umull v14.8h, v9.8b, v11.8b\n"
   1985       "umull v15.8h, v10.8b, v11.8b\n"
   1986       "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n"
   1987       "prfm pldl1keep, [%x[rhs], #128]\n"
   1988       "uadalp v0.4s, v12.8h\n"
   1989       "uadalp v1.4s, v13.8h\n"
   1990       "uadalp v2.4s, v14.8h\n"
   1991       "uadalp v3.4s, v15.8h\n"
   1992       "umull v12.8h, v7.8b, v11.8b\n"
   1993       "umull v13.8h, v8.8b, v11.8b\n"
   1994       "umull v14.8h, v9.8b, v11.8b\n"
   1995       "uadalp v4.4s, v12.8h\n"
   1996       "uadalp v5.4s, v13.8h\n"
   1997       "uadalp v6.4s, v14.8h\n"
   1998 
   1999       // Loop break.
   2000       "bgt 1b\n"
   2001 
   2002       // StaticQuantizationInt32::Prepare
   2003       "ld1 {v7.4s}, [%x[lhs]], #16\n"
   2004       "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n"
   2005       "dup v7.4s, v7.s[0]\n"
   2006 
   2007       // RowMajorOutput::Prepare
   2008 
   2009       // Reduce aggregators.
   2010       "addp v0.4s, v0.4s, v1.4s\n"
   2011       "addp v2.4s, v2.4s, v3.4s\n"
   2012       "addp v4.4s, v4.4s, v5.4s\n"
   2013       "addp v6.4s, v6.4s, v6.4s\n"
   2014       "addp v0.4s, v0.4s, v2.4s\n"
   2015       "addp v1.4s, v4.4s, v6.4s\n"
   2016 
   2017       // StaticQuantizationInt32::Transform
   2018       "add v0.4s, v0.4s, v7.4s\n"
   2019       "add v1.4s, v1.4s, v7.4s\n"
   2020       "add v0.4s, v0.4s, v8.4s\n"
   2021       "add v1.4s, v1.4s, v9.4s\n"
   2022 
   2023       // RowMajorOutput::Output
   2024       "st1 {v0.4s}, [%x[result]], #16\n"
   2025       "st1 {v1.2s}, [%x[result]], #8\n"
   2026       "st1 {v1.s}[2], [%x[result]], #4\n"
   2027       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2028       : [count] "r"(params.kernel.count),
   2029         [stride] "r"(params.output_stream.stride)
   2030       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   2031         "v11", "v12", "v13", "v14", "v15", "cc", "memory");
   2032 }
   2033 
   2034 template <>
   2035 inline void MulKernel<
   2036     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8,
   2037     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2038                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2039                                          RowMajor>& params,
   2040                  int32_t* result) {
   2041 #ifdef DEBUG
   2042 #ifdef DEBUG_METAGEMM_VERBOSE
   2043   std::cout << __FILE__ << "(" << __LINE__
   2044             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2045                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, "
   2046                "8>::Multiply()"
   2047             << std::endl
   2048             << std::flush;
   2049 #endif
   2050 #endif
   2051   asm volatile(
   2052       "prfm pldl1keep, [%x[lhs]]\n"
   2053       "prfm pldl1keep, [%x[rhs]]\n"
   2054 
   2055       // Clear aggregators.
   2056       "movi v0.4s, #0\n"
   2057       "movi v1.4s, #0\n"
   2058       "movi v2.4s, #0\n"
   2059       "mov v3.16b, v0.16b\n"
   2060       "mov v4.16b, v1.16b\n"
   2061       "mov v5.16b, v2.16b\n"
   2062       "mov v6.16b, v3.16b\n"
   2063       "mov v7.16b, v4.16b\n"
   2064 
   2065       // 1x8 lanes loop.
   2066       "1:"
   2067 
   2068       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
   2069       "ld1 {v8.2s}, [%x[lhs]], #8\n"
   2070       "umull v13.8h, v8.8b, v9.8b\n"
   2071       "umull v14.8h, v8.8b, v10.8b\n"
   2072       "umull v15.8h, v8.8b, v11.8b\n"
   2073       "umull v16.8h, v8.8b, v12.8b\n"
   2074       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
   2075       "uadalp v0.4s, v13.8h\n"
   2076       "uadalp v1.4s, v14.8h\n"
   2077       "uadalp v2.4s, v15.8h\n"
   2078       "uadalp v3.4s, v16.8h\n"
   2079       "prfm pldl1keep, [%x[rhs], #256]\n"
   2080       "umull v17.8h, v8.8b, v9.8b\n"
   2081       "umull v13.8h, v8.8b, v10.8b\n"
   2082       "umull v14.8h, v8.8b, v11.8b\n"
   2083       "umull v15.8h, v8.8b, v12.8b\n"
   2084       "prfm pldl1keep, [%x[lhs], #32]\n"
   2085 
   2086       // Subtract counter.
   2087       "subs %x[count], %x[count], #8\n"
   2088 
   2089       "uadalp v4.4s, v17.8h\n"
   2090       "uadalp v5.4s, v13.8h\n"
   2091       "uadalp v6.4s, v14.8h\n"
   2092       "uadalp v7.4s, v15.8h\n"
   2093 
   2094       // Loop break.
   2095       "bgt 1b\n"
   2096 
   2097       // StaticQuantizationInt32::Prepare
   2098       "ld1 {v8.4s}, [%x[lhs]], #16\n"
   2099       "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n"
   2100       "dup v8.4s, v8.s[0]\n"
   2101 
   2102       // RowMajorOutput::Prepare
   2103 
   2104       // Reduce aggregators.
   2105       "addp v0.4s, v0.4s, v1.4s\n"
   2106       "addp v2.4s, v2.4s, v3.4s\n"
   2107       "addp v4.4s, v4.4s, v5.4s\n"
   2108       "addp v6.4s, v6.4s, v7.4s\n"
   2109       "addp v0.4s, v0.4s, v2.4s\n"
   2110       "addp v1.4s, v4.4s, v6.4s\n"
   2111 
   2112       // StaticQuantizationInt32::Transform
   2113       "add v0.4s, v0.4s, v8.4s\n"
   2114       "add v1.4s, v1.4s, v8.4s\n"
   2115       "add v0.4s, v0.4s, v9.4s\n"
   2116       "add v1.4s, v1.4s, v10.4s\n"
   2117 
   2118       // RowMajorOutput::Output
   2119       "st1 {v0.4s, v1.4s}, [%x[result]], #32\n"
   2120       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2121       : [count] "r"(params.kernel.count),
   2122         [stride] "r"(params.output_stream.stride)
   2123       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   2124         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
   2125 }
   2126 
   2127 template <>
   2128 inline void MulKernel<
   2129     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1,
   2130     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2131                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2132                                          RowMajor>& params,
   2133                  int32_t* result) {
   2134 #ifdef DEBUG
   2135 #ifdef DEBUG_METAGEMM_VERBOSE
   2136   std::cout << __FILE__ << "(" << __LINE__
   2137             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2138                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, "
   2139                "8>::Multiply()"
   2140             << std::endl
   2141             << std::flush;
   2142 #endif
   2143 #endif
   2144   asm volatile(
   2145       "prfm pldl1keep, [%x[lhs]]\n"
   2146       "prfm pldl1keep, [%x[rhs]]\n"
   2147 
   2148       // Clear aggregators.
   2149       "movi v0.4s, #0\n"
   2150       "movi v1.4s, #0\n"
   2151 
   2152       // General NxM lanes loop.
   2153       "1:"
   2154 
   2155       // Subtract counter.
   2156       "subs %x[count], %x[count], #8\n"
   2157 
   2158       "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n"
   2159       "ld1 {v4.2s}, [%x[rhs]], #8\n"
   2160       "prfm pldl1keep, [%x[lhs], #64]\n"
   2161       "prfm pldl1keep, [%x[rhs], #64]\n"
   2162       "umull v5.8h, v4.8b, v2.8b\n"
   2163       "umull v6.8h, v4.8b, v3.8b\n"
   2164       "uadalp v0.4s, v5.8h\n"
   2165       "uadalp v1.4s, v6.8h\n"
   2166 
   2167       // Loop break.
   2168       "bgt 1b\n"
   2169 
   2170       // StaticQuantizationInt32::Prepare
   2171       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   2172       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   2173       "dup v2.4s, v4.s[0]\n"
   2174       "dup v4.4s, v4.s[1]\n"
   2175 
   2176       // RowMajorOutput::Prepare
   2177       "add x0, %x[result], %x[stride]\n"
   2178 
   2179       // Reduce aggregators.
   2180       "addp v0.4s, v0.4s, v0.4s\n"
   2181       "addp v0.4s, v0.4s, v0.4s\n"
   2182       "addp v1.4s, v1.4s, v1.4s\n"
   2183       "addp v1.4s, v1.4s, v1.4s\n"
   2184 
   2185       // StaticQuantizationInt32::Transform
   2186       "add v0.4s, v0.4s, v2.4s\n"
   2187       "add v1.4s, v1.4s, v4.4s\n"
   2188       "add v0.4s, v0.4s, v5.4s\n"
   2189       "add v1.4s, v1.4s, v5.4s\n"
   2190 
   2191       // RowMajorOutput::Output
   2192       "st1 {v0.s}[0], [%x[result]], #4\n"
   2193       "st1 {v1.s}[0], [x0], #4\n"
   2194       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2195       : [count] "r"(params.kernel.count),
   2196         [stride] "r"(params.output_stream.stride)
   2197       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
   2198 }
   2199 
   2200 template <>
   2201 inline void MulKernel<
   2202     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2,
   2203     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2204                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2205                                          RowMajor>& params,
   2206                  int32_t* result) {
   2207 #ifdef DEBUG
   2208 #ifdef DEBUG_METAGEMM_VERBOSE
   2209   std::cout << __FILE__ << "(" << __LINE__
   2210             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2211                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, "
   2212                "8>::Multiply()"
   2213             << std::endl
   2214             << std::flush;
   2215 #endif
   2216 #endif
   2217   asm volatile(
   2218       "prfm pldl1keep, [%x[lhs]]\n"
   2219       "prfm pldl1keep, [%x[rhs]]\n"
   2220 
   2221       // Clear aggregators.
   2222       "movi v0.4s, #0\n"
   2223       "movi v1.4s, #0\n"
   2224       "movi v2.4s, #0\n"
   2225       "mov v3.16b, v0.16b\n"
   2226 
   2227       // General NxM lanes loop.
   2228       "1:"
   2229 
   2230       // Subtract counter.
   2231       "subs %x[count], %x[count], #8\n"
   2232 
   2233       "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n"
   2234       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
   2235       "prfm pldl1keep, [%x[lhs], #64]\n"
   2236       "prfm pldl1keep, [%x[rhs], #64]\n"
   2237       "umull v8.8h, v6.8b, v4.8b\n"
   2238       "umull v9.8h, v7.8b, v4.8b\n"
   2239       "umull v10.8h, v6.8b, v5.8b\n"
   2240       "umull v11.8h, v7.8b, v5.8b\n"
   2241       "uadalp v0.4s, v8.8h\n"
   2242       "uadalp v1.4s, v9.8h\n"
   2243       "uadalp v2.4s, v10.8h\n"
   2244       "uadalp v3.4s, v11.8h\n"
   2245 
   2246       // Loop break.
   2247       "bgt 1b\n"
   2248 
   2249       // StaticQuantizationInt32::Prepare
   2250       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   2251       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   2252       "dup v6.4s, v4.s[0]\n"
   2253       "dup v4.4s, v4.s[1]\n"
   2254 
   2255       // RowMajorOutput::Prepare
   2256       "add x0, %x[result], %x[stride]\n"
   2257 
   2258       // Reduce aggregators.
   2259       "addp v0.4s, v0.4s, v1.4s\n"
   2260       "addp v0.4s, v0.4s, v0.4s\n"
   2261       "addp v2.4s, v2.4s, v3.4s\n"
   2262       "addp v2.4s, v2.4s, v2.4s\n"
   2263 
   2264       // StaticQuantizationInt32::Transform
   2265       "add v0.4s, v0.4s, v6.4s\n"
   2266       "add v2.4s, v2.4s, v4.4s\n"
   2267       "add v0.4s, v0.4s, v5.4s\n"
   2268       "add v2.4s, v2.4s, v5.4s\n"
   2269 
   2270       // RowMajorOutput::Output
   2271       "st1 {v0.2s}, [%x[result]], #8\n"
   2272       "st1 {v2.2s}, [x0], #8\n"
   2273       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2274       : [count] "r"(params.kernel.count),
   2275         [stride] "r"(params.output_stream.stride)
   2276       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   2277         "v11", "cc", "memory");
   2278 }
   2279 
   2280 template <>
   2281 inline void MulKernel<
   2282     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3,
   2283     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2284                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2285                                          RowMajor>& params,
   2286                  int32_t* result) {
   2287 #ifdef DEBUG
   2288 #ifdef DEBUG_METAGEMM_VERBOSE
   2289   std::cout << __FILE__ << "(" << __LINE__
   2290             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2291                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, "
   2292                "8>::Multiply()"
   2293             << std::endl
   2294             << std::flush;
   2295 #endif
   2296 #endif
   2297   asm volatile(
   2298       "prfm pldl1keep, [%x[lhs]]\n"
   2299       "prfm pldl1keep, [%x[rhs]]\n"
   2300 
   2301       // Clear aggregators.
   2302       "movi v0.4s, #0\n"
   2303       "movi v1.4s, #0\n"
   2304       "movi v2.4s, #0\n"
   2305       "mov v3.16b, v0.16b\n"
   2306       "mov v4.16b, v1.16b\n"
   2307       "mov v5.16b, v2.16b\n"
   2308 
   2309       // General NxM lanes loop.
   2310       "1:"
   2311 
   2312       // Subtract counter.
   2313       "subs %x[count], %x[count], #8\n"
   2314 
   2315       "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n"
   2316       "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n"
   2317       "prfm pldl1keep, [%x[lhs], #64]\n"
   2318       "prfm pldl1keep, [%x[rhs], #64]\n"
   2319       "umull v11.8h, v8.8b, v6.8b\n"
   2320       "umull v12.8h, v9.8b, v6.8b\n"
   2321       "umull v13.8h, v10.8b, v6.8b\n"
   2322       "umull v14.8h, v8.8b, v7.8b\n"
   2323       "umull v15.8h, v9.8b, v7.8b\n"
   2324       "umull v16.8h, v10.8b, v7.8b\n"
   2325       "uadalp v0.4s, v11.8h\n"
   2326       "uadalp v1.4s, v12.8h\n"
   2327       "uadalp v2.4s, v13.8h\n"
   2328       "uadalp v3.4s, v14.8h\n"
   2329       "uadalp v4.4s, v15.8h\n"
   2330       "uadalp v5.4s, v16.8h\n"
   2331 
   2332       // Loop break.
   2333       "bgt 1b\n"
   2334 
   2335       // StaticQuantizationInt32::Prepare
   2336       "ld1 {v6.4s}, [%x[lhs]], #16\n"
   2337       "ld1 {v7.4s}, [%x[rhs]], #16\n"
   2338       "dup v8.4s, v6.s[0]\n"
   2339       "dup v6.4s, v6.s[1]\n"
   2340 
   2341       // RowMajorOutput::Prepare
   2342       "add x0, %x[result], %x[stride]\n"
   2343 
   2344       // Reduce aggregators.
   2345       "addp v0.4s, v0.4s, v1.4s\n"
   2346       "addp v2.4s, v2.4s, v2.4s\n"
   2347       "addp v0.4s, v0.4s, v2.4s\n"
   2348       "addp v3.4s, v3.4s, v4.4s\n"
   2349       "addp v5.4s, v5.4s, v5.4s\n"
   2350       "addp v3.4s, v3.4s, v5.4s\n"
   2351 
   2352       // StaticQuantizationInt32::Transform
   2353       "add v0.4s, v0.4s, v8.4s\n"
   2354       "add v3.4s, v3.4s, v6.4s\n"
   2355       "add v0.4s, v0.4s, v7.4s\n"
   2356       "add v3.4s, v3.4s, v7.4s\n"
   2357 
   2358       // RowMajorOutput::Output
   2359       "st1 {v0.2s}, [%x[result]], #8\n"
   2360       "st1 {v0.s}[2], [%x[result]], #4\n"
   2361       "st1 {v3.2s}, [x0], #8\n"
   2362       "st1 {v3.s}[2], [x0], #4\n"
   2363       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2364       : [count] "r"(params.kernel.count),
   2365         [stride] "r"(params.output_stream.stride)
   2366       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   2367         "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
   2368 }
   2369 
   2370 template <>
   2371 inline void MulKernel<
   2372     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4,
   2373     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2374                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2375                                          RowMajor>& params,
   2376                  int32_t* result) {
          // AArch64 NEON kernel for the <2, 4, 8> specialization: multiplies two
          // 8-byte lhs groups against four 8-byte rhs groups per iteration and
          // writes a 2x4 block of int32 sums.
          //
          // lhs, rhs: input byte streams. Every loop iteration consumes 16 bytes of
          //   lhs and 32 bytes of rhs. After the loop, 16 more bytes (four 32-bit
          //   values) are read from each stream and added to the sums.
          // params.kernel.count: loop counter, reduced by 8 per iteration. The loop
          //   body always runs once and repeats while the counter stays above zero.
          // params.output_stream.stride: added directly to the result address to
          //   locate the second output row, i.e. it is used as a byte offset.
          // result: receives four int32 values; four more go to result + stride.
          //
          // NOTE(review): [count] is bound as an input-only operand ("r") but the
          // "subs" in the loop writes to it; GCC's extended-asm documentation says
          // input-only operands must not be modified. Consider a local copy bound
          // as "+r".
   2377 #ifdef DEBUG
   2378 #ifdef DEBUG_METAGEMM_VERBOSE
   2379   std::cout << __FILE__ << "(" << __LINE__
   2380             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2381                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, "
   2382                "8>::Multiply()"
   2383             << std::endl
   2384             << std::flush;
   2385 #endif
   2386 #endif
   2387   asm volatile(
   2388       "prfm pldl1keep, [%x[lhs]]\n"
   2389       "prfm pldl1keep, [%x[rhs]]\n"
   2390 
   2391       // Clear aggregators.
              // v0-v7 are the eight 4x32-bit running sums, one per lhs/rhs pair.
   2392       "movi v0.4s, #0\n"
   2393       "movi v1.4s, #0\n"
   2394       "movi v2.4s, #0\n"
   2395       "mov v3.16b, v0.16b\n"
   2396       "mov v4.16b, v1.16b\n"
   2397       "mov v5.16b, v2.16b\n"
   2398       "mov v6.16b, v3.16b\n"
   2399       "mov v7.16b, v4.16b\n"
   2400 
   2401       // 2x4 lanes loop.
              // v8/v9 take 8 lhs bytes each and v10-v13 take 8 rhs bytes each. umull
              // widens the u8 products to 16 bits; uadalp folds adjacent pairs into
              // the 32-bit sums.
   2402       "1:"
   2403 
   2404       "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n"
   2405       "ld1 {v8.8b}, [%x[lhs]], #8\n"
   2406       "umull v14.8h, v8.8b, v10.8b\n"
   2407       "ld1 {v9.8b}, [%x[lhs]], #8\n"
   2408       "umull v15.8h, v8.8b, v11.8b\n"
   2409       "prfm pldl1keep, [%x[rhs], #64]\n"
   2410       "umull v16.8h, v8.8b, v12.8b\n"
   2411       "prfm pldl1keep, [%x[lhs], #64]\n"
   2412       "umull v17.8h, v8.8b, v13.8b\n"
   2413       "umull v18.8h, v9.8b, v10.8b\n"
   2414       "uadalp v0.4s, v14.8h\n"
   2415       "uadalp v1.4s, v15.8h\n"
   2416       "uadalp v2.4s, v16.8h\n"
   2417       "umull v14.8h, v9.8b, v11.8b\n"
   2418       "umull v15.8h, v9.8b, v12.8b\n"
   2419       "umull v16.8h, v9.8b, v13.8b\n"
   2420 
   2421       // Subtract counter.
              // The flags set here are tested by the bgt at the end of the loop body.
   2422       "subs %x[count], %x[count], #8\n"
   2423 
   2424       "uadalp v3.4s, v17.8h\n"
   2425       "uadalp v4.4s, v18.8h\n"
   2426       "uadalp v5.4s, v14.8h\n"
   2427       "uadalp v6.4s, v15.8h\n"
   2428       "uadalp v7.4s, v16.8h\n"
   2429 
   2430       // Loop break.
   2431       "bgt 1b\n"
   2432 
   2433       // StaticQuantizationInt32::Prepare
              // Read the four 32-bit values that follow the byte data in each stream,
              // then broadcast lanes 0 and 1 of the lhs vector (one per output row).
   2434       "ld1 {v8.4s}, [%x[lhs]], #16\n"
   2435       "ld1 {v9.4s}, [%x[rhs]], #16\n"
   2436       "dup v10.4s, v8.s[0]\n"
   2437       "dup v8.4s, v8.s[1]\n"
   2438 
   2439       // RowMajorOutput::Prepare
              // x0 = address of the second output row.
   2440       "add x0, %x[result], %x[stride]\n"
   2441 
   2442       // Reduce aggregators.
              // v0 <- totals of v0..v3, v4 <- totals of v4..v7 (one total per lane).
   2443       "addp v0.4s, v0.4s, v1.4s\n"
   2444       "addp v2.4s, v2.4s, v3.4s\n"
   2445       "addp v0.4s, v0.4s, v2.4s\n"
   2446       "addp v4.4s, v4.4s, v5.4s\n"
   2447       "addp v6.4s, v6.4s, v7.4s\n"
   2448       "addp v4.4s, v4.4s, v6.4s\n"
   2449 
   2450       // StaticQuantizationInt32::Transform
              // Add each row's broadcast lhs value and the rhs vector to the totals.
   2451       "add v0.4s, v0.4s, v10.4s\n"
   2452       "add v4.4s, v4.4s, v8.4s\n"
   2453       "add v0.4s, v0.4s, v9.4s\n"
   2454       "add v4.4s, v4.4s, v9.4s\n"
   2455 
   2456       // RowMajorOutput::Output
   2457       "st1 {v0.4s}, [%x[result]], #16\n"
   2458       "st1 {v4.4s}, [x0], #16\n"
   2459       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2460       : [count] "r"(params.kernel.count),
   2461         [stride] "r"(params.output_stream.stride)
   2462       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   2463         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory");
   2464 }
   2465 
   2466 template <>
   2467 inline void MulKernel<
   2468     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1,
   2469     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2470                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2471                                          RowMajor>& params,
   2472                  int32_t* result) {
          // AArch64 NEON kernel for the <3, 1, 8> specialization: multiplies three
          // 8-byte lhs groups against one 8-byte rhs group per iteration and writes
          // one int32 sum to each of three output rows.
          //
          // lhs, rhs: input byte streams. Every loop iteration consumes 24 bytes of
          //   lhs and 8 bytes of rhs. After the loop, 16 more bytes (four 32-bit
          //   values) are read from each stream and added to the sums.
          // params.kernel.count: loop counter, reduced by 8 per iteration. The loop
          //   body always runs once and repeats while the counter stays above zero.
          // params.output_stream.stride: added directly to the result address to
          //   step from one output row to the next, i.e. it is a byte offset.
          // result: receives one int32; the other two rows are written at
          //   result + stride and result + 2 * stride.
          //
          // NOTE(review): [count] is bound as an input-only operand ("r") but the
          // "subs" in the loop writes to it; GCC's extended-asm documentation says
          // input-only operands must not be modified. Consider a local copy bound
          // as "+r".
   2473 #ifdef DEBUG
   2474 #ifdef DEBUG_METAGEMM_VERBOSE
   2475   std::cout << __FILE__ << "(" << __LINE__
   2476             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2477                "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, "
   2478                "8>::Multiply()"
   2479             << std::endl
   2480             << std::flush;
   2481 #endif
   2482 #endif
   2483   asm volatile(
   2484       "prfm pldl1keep, [%x[lhs]]\n"
   2485       "prfm pldl1keep, [%x[rhs]]\n"
   2486 
   2487       // Clear aggregators.
              // v0-v2 are the running sums, one per lhs group.
   2488       "movi v0.4s, #0\n"
   2489       "movi v1.4s, #0\n"
   2490       "movi v2.4s, #0\n"
   2491 
   2492       // General NxM lanes loop.
   2493       "1:"
   2494 
   2495       // Subtract counter.
              // The flags set here are tested by the bgt at the end of the loop body.
   2496       "subs %x[count], %x[count], #8\n"
   2497 
              // umull widens the u8 products to 16 bits; uadalp folds adjacent pairs
              // into the 32-bit sums.
   2498       "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n"
   2499       "ld1 {v6.2s}, [%x[rhs]], #8\n"
   2500       "prfm pldl1keep, [%x[lhs], #64]\n"
   2501       "prfm pldl1keep, [%x[rhs], #64]\n"
   2502       "umull v7.8h, v6.8b, v3.8b\n"
   2503       "umull v8.8h, v6.8b, v4.8b\n"
   2504       "umull v9.8h, v6.8b, v5.8b\n"
   2505       "uadalp v0.4s, v7.8h\n"
   2506       "uadalp v1.4s, v8.8h\n"
   2507       "uadalp v2.4s, v9.8h\n"
   2508 
   2509       // Loop break.
   2510       "bgt 1b\n"
   2511 
   2512       // StaticQuantizationInt32::Prepare
              // Read the four 32-bit values that follow the byte data in each stream,
              // then broadcast lanes 0, 1 and 2 of the lhs vector (one per output row).
   2513       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   2514       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   2515       "dup v3.4s, v4.s[0]\n"
   2516       "dup v6.4s, v4.s[1]\n"
   2517       "dup v4.4s, v4.s[2]\n"
   2518 
   2519       // RowMajorOutput::Prepare
              // x0 and x1 = addresses of the second and third output rows.
   2520       "add x0, %x[result], %x[stride]\n"
   2521       "add x1, x0, %x[stride]\n"
   2522 
   2523       // Reduce aggregators.
              // Two pairwise adds leave the full total of each register in every lane.
   2524       "addp v0.4s, v0.4s, v0.4s\n"
   2525       "addp v0.4s, v0.4s, v0.4s\n"
   2526       "addp v1.4s, v1.4s, v1.4s\n"
   2527       "addp v1.4s, v1.4s, v1.4s\n"
   2528       "addp v2.4s, v2.4s, v2.4s\n"
   2529       "addp v2.4s, v2.4s, v2.4s\n"
   2530 
   2531       // StaticQuantizationInt32::Transform
              // Add each row's broadcast lhs value and the rhs vector to the totals.
   2532       "add v0.4s, v0.4s, v3.4s\n"
   2533       "add v1.4s, v1.4s, v6.4s\n"
   2534       "add v2.4s, v2.4s, v4.4s\n"
   2535       "add v0.4s, v0.4s, v5.4s\n"
   2536       "add v1.4s, v1.4s, v5.4s\n"
   2537       "add v2.4s, v2.4s, v5.4s\n"
   2538 
   2539       // RowMajorOutput::Output
              // Only lane 0 of each row register is stored.
   2540       "st1 {v0.s}[0], [%x[result]], #4\n"
   2541       "st1 {v1.s}[0], [x0], #4\n"
   2542       "st1 {v2.s}[0], [x1], #4\n"
   2543       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2544       : [count] "r"(params.kernel.count),
   2545         [stride] "r"(params.output_stream.stride)
   2546       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
   2547         "cc", "memory");
   2548 }
   2549 
   2550 template <>
   2551 inline void MulKernel<
   2552     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2,
   2553     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2554                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2555                                          RowMajor>& params,
   2556                  int32_t* result) {
          // AArch64 NEON kernel for the <3, 2, 8> specialization: multiplies three
          // 8-byte lhs groups against two 8-byte rhs groups per iteration and writes
          // two int32 sums to each of three output rows.
          //
          // lhs, rhs: input byte streams. Every loop iteration consumes 24 bytes of
          //   lhs and 16 bytes of rhs. After the loop, 16 more bytes (four 32-bit
          //   values) are read from each stream and added to the sums.
          // params.kernel.count: loop counter, reduced by 8 per iteration. The loop
          //   body always runs once and repeats while the counter stays above zero.
          // params.output_stream.stride: added directly to the result address to
          //   step from one output row to the next, i.e. it is a byte offset.
          // result: receives two int32 values; the other two rows are written at
          //   result + stride and result + 2 * stride.
          //
          // NOTE(review): [count] is bound as an input-only operand ("r") but the
          // "subs" in the loop writes to it; GCC's extended-asm documentation says
          // input-only operands must not be modified. Consider a local copy bound
          // as "+r".
   2557 #ifdef DEBUG
   2558 #ifdef DEBUG_METAGEMM_VERBOSE
   2559   std::cout << __FILE__ << "(" << __LINE__
   2560             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2561                "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, "
   2562                "8>::Multiply()"
   2563             << std::endl
   2564             << std::flush;
   2565 #endif
   2566 #endif
   2567   asm volatile(
   2568       "prfm pldl1keep, [%x[lhs]]\n"
   2569       "prfm pldl1keep, [%x[rhs]]\n"
   2570 
   2571       // Clear aggregators.
              // v0-v5 are the six running sums, one per lhs/rhs pair.
   2572       "movi v0.4s, #0\n"
   2573       "movi v1.4s, #0\n"
   2574       "movi v2.4s, #0\n"
   2575       "mov v3.16b, v0.16b\n"
   2576       "mov v4.16b, v1.16b\n"
   2577       "mov v5.16b, v2.16b\n"
   2578 
   2579       // General NxM lanes loop.
   2580       "1:"
   2581 
   2582       // Subtract counter.
              // The flags set here are tested by the bgt at the end of the loop body.
   2583       "subs %x[count], %x[count], #8\n"
   2584 
              // umull widens the u8 products to 16 bits; uadalp folds adjacent pairs
              // into the 32-bit sums.
   2585       "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n"
   2586       "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n"
   2587       "prfm pldl1keep, [%x[lhs], #64]\n"
   2588       "prfm pldl1keep, [%x[rhs], #64]\n"
   2589       "umull v11.8h, v9.8b, v6.8b\n"
   2590       "umull v12.8h, v10.8b, v6.8b\n"
   2591       "umull v13.8h, v9.8b, v7.8b\n"
   2592       "umull v14.8h, v10.8b, v7.8b\n"
   2593       "umull v15.8h, v9.8b, v8.8b\n"
   2594       "umull v16.8h, v10.8b, v8.8b\n"
   2595       "uadalp v0.4s, v11.8h\n"
   2596       "uadalp v1.4s, v12.8h\n"
   2597       "uadalp v2.4s, v13.8h\n"
   2598       "uadalp v3.4s, v14.8h\n"
   2599       "uadalp v4.4s, v15.8h\n"
   2600       "uadalp v5.4s, v16.8h\n"
   2601 
   2602       // Loop break.
   2603       "bgt 1b\n"
   2604 
   2605       // StaticQuantizationInt32::Prepare
              // Read the four 32-bit values that follow the byte data in each stream,
              // then broadcast lanes 0, 1 and 2 of the lhs vector (one per output row).
   2606       "ld1 {v6.4s}, [%x[lhs]], #16\n"
   2607       "ld1 {v7.4s}, [%x[rhs]], #16\n"
   2608       "dup v8.4s, v6.s[0]\n"
   2609       "dup v9.4s, v6.s[1]\n"
   2610       "dup v6.4s, v6.s[2]\n"
   2611 
   2612       // RowMajorOutput::Prepare
              // x0 and x1 = addresses of the second and third output rows.
   2613       "add x0, %x[result], %x[stride]\n"
   2614       "add x1, x0, %x[stride]\n"
   2615 
   2616       // Reduce aggregators.
              // After each pair of adds, lanes 0 and 1 of v0, v2 and v4 hold the two
              // totals for their row.
   2617       "addp v0.4s, v0.4s, v1.4s\n"
   2618       "addp v0.4s, v0.4s, v0.4s\n"
   2619       "addp v2.4s, v2.4s, v3.4s\n"
   2620       "addp v2.4s, v2.4s, v2.4s\n"
   2621       "addp v4.4s, v4.4s, v5.4s\n"
   2622       "addp v4.4s, v4.4s, v4.4s\n"
   2623 
   2624       // StaticQuantizationInt32::Transform
              // Add each row's broadcast lhs value and the rhs vector to the totals.
   2625       "add v0.4s, v0.4s, v8.4s\n"
   2626       "add v2.4s, v2.4s, v9.4s\n"
   2627       "add v4.4s, v4.4s, v6.4s\n"
   2628       "add v0.4s, v0.4s, v7.4s\n"
   2629       "add v2.4s, v2.4s, v7.4s\n"
   2630       "add v4.4s, v4.4s, v7.4s\n"
   2631 
   2632       // RowMajorOutput::Output
   2633       "st1 {v0.2s}, [%x[result]], #8\n"
   2634       "st1 {v2.2s}, [x0], #8\n"
   2635       "st1 {v4.2s}, [x1], #8\n"
   2636       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2637       : [count] "r"(params.kernel.count),
   2638         [stride] "r"(params.output_stream.stride)
   2639       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
   2640         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
   2641 }
   2642 
   2643 template <>
   2644 inline void MulKernel<
   2645     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3,
   2646     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2647                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2648                                          RowMajor>& params,
   2649                  int32_t* result) {
          // AArch64 NEON kernel for the <3, 3, 8> specialization: multiplies three
          // 8-byte lhs groups against three 8-byte rhs groups per iteration and
          // writes three int32 sums to each of three output rows.
          //
          // lhs, rhs: input byte streams. Every loop iteration consumes 24 bytes of
          //   each. After the loop, 16 more bytes (four 32-bit values) are read from
          //   each stream and added to the sums.
          // params.kernel.count: loop counter, reduced by 8 per iteration. The loop
          //   body always runs once and repeats while the counter stays above zero.
          // params.output_stream.stride: added directly to the result address to
          //   step from one output row to the next, i.e. it is a byte offset.
          // result: receives three int32 values; the other two rows are written at
          //   result + stride and result + 2 * stride.
          //
          // NOTE(review): [count] is bound as an input-only operand ("r") but the
          // "subs" in the loop writes to it; GCC's extended-asm documentation says
          // input-only operands must not be modified. Consider a local copy bound
          // as "+r".
   2650 #ifdef DEBUG
   2651 #ifdef DEBUG_METAGEMM_VERBOSE
   2652   std::cout << __FILE__ << "(" << __LINE__
   2653             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2654                "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, "
   2655                "8>::Multiply()"
   2656             << std::endl
   2657             << std::flush;
   2658 #endif
   2659 #endif
   2660   asm volatile(
   2661       "prfm pldl1keep, [%x[lhs]]\n"
   2662       "prfm pldl1keep, [%x[rhs]]\n"
   2663 
   2664       // Clear aggregators.
              // v0-v8 are the nine running sums, one per lhs/rhs pair.
   2665       "movi v0.4s, #0\n"
   2666       "movi v1.4s, #0\n"
   2667       "movi v2.4s, #0\n"
   2668       "mov v3.16b, v0.16b\n"
   2669       "mov v4.16b, v1.16b\n"
   2670       "mov v5.16b, v2.16b\n"
   2671       "mov v6.16b, v3.16b\n"
   2672       "mov v7.16b, v4.16b\n"
   2673       "mov v8.16b, v5.16b\n"
   2674 
   2675       // 3x3 lanes loop.
              // v9-v11 take 8 lhs bytes each and v12-v14 take 8 rhs bytes each. umull
              // widens the u8 products to 16 bits; uadalp folds adjacent pairs into
              // the 32-bit sums. v9 is reused as a product register once its lhs
              // bytes are no longer needed.
   2676       "1:"
   2677 
   2678       "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n"
   2679       "ld1 {v9.8b}, [%x[lhs]], #8\n"
   2680       "umull v15.8h, v9.8b, v12.8b\n"
   2681       "ld1 {v10.8b}, [%x[lhs]], #8\n"
   2682       "umull v16.8h, v9.8b, v13.8b\n"
   2683       "ld1 {v11.8b}, [%x[lhs]], #8\n"
   2684       "umull v17.8h, v9.8b, v14.8b\n"
   2685       "prfm pldl1keep, [%x[lhs], #64]\n"
   2686       "umull v18.8h, v10.8b, v12.8b\n"
   2687       "prfm pldl1keep, [%x[rhs], #64]\n"
   2688       "uadalp v0.4s, v15.8h\n"
   2689       "uadalp v1.4s, v16.8h\n"
   2690       "uadalp v2.4s, v17.8h\n"
   2691       "uadalp v3.4s, v18.8h\n"
   2692       "umull v15.8h, v10.8b, v13.8b\n"
   2693       "umull v16.8h, v10.8b, v14.8b\n"
   2694       "umull v17.8h, v11.8b, v12.8b\n"
   2695       "umull v18.8h, v11.8b, v13.8b\n"
   2696 
   2697       // Subtract counter.
              // The flags set here are tested by the bgt at the end of the loop body.
   2698       "subs %x[count], %x[count], #8\n"
   2699 
   2700       "umull v9.8h, v11.8b, v14.8b\n"
   2701       "uadalp v4.4s, v15.8h\n"
   2702       "uadalp v5.4s, v16.8h\n"
   2703       "uadalp v6.4s, v17.8h\n"
   2704       "uadalp v7.4s, v18.8h\n"
   2705       "uadalp v8.4s, v9.8h\n"
   2706 
   2707       // Loop break.
   2708       "bgt 1b\n"
   2709 
   2710       // StaticQuantizationInt32::Prepare
              // Read the four 32-bit values that follow the byte data in each stream,
              // then broadcast lanes 0, 1 and 2 of the lhs vector (one per output row).
   2711       "ld1 {v9.4s}, [%x[lhs]], #16\n"
   2712       "ld1 {v10.4s}, [%x[rhs]], #16\n"
   2713       "dup v11.4s, v9.s[0]\n"
   2714       "dup v12.4s, v9.s[1]\n"
   2715       "dup v9.4s, v9.s[2]\n"
   2716 
   2717       // RowMajorOutput::Prepare
              // x0 and x1 = addresses of the second and third output rows.
   2718       "add x0, %x[result], %x[stride]\n"
   2719       "add x1, x0, %x[stride]\n"
   2720 
   2721       // Reduce aggregators.
              // Lanes 0-2 of v0, v3 and v6 end up with the three totals for their
              // row; lane 3 repeats the third total and is never stored.
   2722       "addp v0.4s, v0.4s, v1.4s\n"
   2723       "addp v2.4s, v2.4s, v2.4s\n"
   2724       "addp v0.4s, v0.4s, v2.4s\n"
   2725       "addp v3.4s, v3.4s, v4.4s\n"
   2726       "addp v5.4s, v5.4s, v5.4s\n"
   2727       "addp v3.4s, v3.4s, v5.4s\n"
   2728       "addp v6.4s, v6.4s, v7.4s\n"
   2729       "addp v8.4s, v8.4s, v8.4s\n"
   2730       "addp v6.4s, v6.4s, v8.4s\n"
   2731 
   2732       // StaticQuantizationInt32::Transform
              // Add each row's broadcast lhs value and the rhs vector to the totals.
   2733       "add v0.4s, v0.4s, v11.4s\n"
   2734       "add v3.4s, v3.4s, v12.4s\n"
   2735       "add v6.4s, v6.4s, v9.4s\n"
   2736       "add v0.4s, v0.4s, v10.4s\n"
   2737       "add v3.4s, v3.4s, v10.4s\n"
   2738       "add v6.4s, v6.4s, v10.4s\n"
   2739 
   2740       // RowMajorOutput::Output
              // Each row is written as two lanes followed by a single third lane.
   2741       "st1 {v0.2s}, [%x[result]], #8\n"
   2742       "st1 {v0.s}[2], [%x[result]], #4\n"
   2743       "st1 {v3.2s}, [x0], #8\n"
   2744       "st1 {v3.s}[2], [x0], #4\n"
   2745       "st1 {v6.2s}, [x1], #8\n"
   2746       "st1 {v6.s}[2], [x1], #4\n"
   2747       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2748       : [count] "r"(params.kernel.count),
   2749         [stride] "r"(params.output_stream.stride)
   2750       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
   2751         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc",
   2752         "memory");
   2753 }
   2754 
   2755 template <>
   2756 inline void MulKernel<
   2757     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1,
   2758     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2759                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   2760                                          RowMajor>& params,
   2761                  float* result) {
          // AArch64 NEON kernel for the <1, 1, 8> specialization: multiplies one
          // 8-byte lhs group against one 8-byte rhs group per iteration and writes a
          // single scaled float.
          //
          // lhs, rhs: input byte streams. Every loop iteration consumes 8 bytes of
          //   each. After the loop, 16 more bytes (four 32-bit values) are read from
          //   each stream and added to the sum.
          // params.kernel.count: loop counter, reduced by 8 per iteration. The loop
          //   body always runs once and repeats while the counter stays above zero.
          // params.kernel.scale: its 32-bit register value is broadcast to every
          //   lane and multiplied into the sum after the int32-to-float conversion.
          // params.output_stream.stride: passed as the [stride] operand but never
          //   referenced by the asm template.
          // result: receives one float.
          //
          // NOTE(review): [count] is bound as an input-only operand ("r") but the
          // "subs" in the loop writes to it; GCC's extended-asm documentation says
          // input-only operands must not be modified. Consider a local copy bound
          // as "+r".
   2762 #ifdef DEBUG
   2763 #ifdef DEBUG_METAGEMM_VERBOSE
   2764   std::cout << __FILE__ << "(" << __LINE__
   2765             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   2766                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, "
   2767                "8>::Multiply()"
   2768             << std::endl
   2769             << std::flush;
   2770 #endif
   2771 #endif
   2772   asm volatile(
   2773       "prfm pldl1keep, [%x[lhs]]\n"
   2774       "prfm pldl1keep, [%x[rhs]]\n"
   2775 
   2776       // Clear aggregators.
   2777       "movi v0.4s, #0\n"
   2778 
   2779       // General NxM lanes loop.
   2780       "1:"
   2781 
   2782       // Subtract counter.
              // The flags set here are tested by the bgt at the end of the loop body.
   2783       "subs %x[count], %x[count], #8\n"
   2784 
              // umull widens the u8 products to 16 bits; uadalp folds adjacent pairs
              // into the 32-bit sum in v0.
   2785       "ld1 {v1.2s}, [%x[lhs]], #8\n"
   2786       "ld1 {v2.2s}, [%x[rhs]], #8\n"
   2787       "prfm pldl1keep, [%x[lhs], #64]\n"
   2788       "prfm pldl1keep, [%x[rhs], #64]\n"
   2789       "umull v3.8h, v2.8b, v1.8b\n"
   2790       "uadalp v0.4s, v3.8h\n"
   2791 
   2792       // Loop break.
   2793       "bgt 1b\n"
   2794 
   2795       // StaticQuantizationFloat::Prepare
              // Read the four 32-bit values that follow the byte data in each stream,
              // broadcast the scale, and broadcast lane 0 of the lhs vector.
   2796       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   2797       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   2798       "dup v6.4s, %w[scale]\n"
   2799       "dup v4.4s, v4.s[0]\n"
   2800 
   2801       // RowMajorOutput::Prepare
   2802 
   2803       // Reduce aggregators.
              // Two pairwise adds leave the full total in every lane of v0.
   2804       "addp v0.4s, v0.4s, v0.4s\n"
   2805       "addp v0.4s, v0.4s, v0.4s\n"
   2806 
   2807       // StaticQuantizationFloat::Transform
              // Add the lhs and rhs values, convert signed int32 to float, then scale.
   2808       "add v0.4s, v0.4s, v4.4s\n"
   2809       "add v0.4s, v0.4s, v5.4s\n"
   2810       "scvtf v0.4s, v0.4s\n"
   2811       "fmul v0.4s, v0.4s, v6.4s\n"
   2812 
   2813       // RowMajorOutput::Output
              // Only lane 0 is stored.
   2814       "st1 {v0.s}[0], [%x[result]], #4\n"
   2815       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2816       : [count] "r"(params.kernel.count),
   2817         [stride] "r"(params.output_stream.stride),
   2818         [scale] "r"(params.kernel.scale)
   2819       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
   2820 }
   2821 
   2822 template <>
   2823 inline void MulKernel<
   2824     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2,
   2825     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2826                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   2827                                          RowMajor>& params,
   2828                  float* result) {
          // AArch64 NEON kernel for the <1, 2, 8> specialization: multiplies one
          // 8-byte lhs group against two 8-byte rhs groups per iteration and writes
          // two scaled floats.
          //
          // lhs, rhs: input byte streams. Every loop iteration consumes 8 bytes of
          //   lhs and 16 bytes of rhs. After the loop, 16 more bytes (four 32-bit
          //   values) are read from each stream and added to the sums.
          // params.kernel.count: loop counter, reduced by 8 per iteration. The loop
          //   body always runs once and repeats while the counter stays above zero.
          // params.kernel.scale: its 32-bit register value is broadcast to every
          //   lane and multiplied into the sums after the int32-to-float conversion.
          // params.output_stream.stride: passed as the [stride] operand but never
          //   referenced by the asm template.
          // result: receives two consecutive floats.
          //
          // NOTE(review): [count] is bound as an input-only operand ("r") but the
          // "subs" in the loop writes to it; GCC's extended-asm documentation says
          // input-only operands must not be modified. Consider a local copy bound
          // as "+r".
   2829 #ifdef DEBUG
   2830 #ifdef DEBUG_METAGEMM_VERBOSE
   2831   std::cout << __FILE__ << "(" << __LINE__
   2832             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   2833                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, "
   2834                "8>::Multiply()"
   2835             << std::endl
   2836             << std::flush;
   2837 #endif
   2838 #endif
   2839   asm volatile(
   2840       "prfm pldl1keep, [%x[lhs]]\n"
   2841       "prfm pldl1keep, [%x[rhs]]\n"
   2842 
   2843       // Clear aggregators.
              // v0 and v1 are the running sums, one per rhs group.
   2844       "movi v0.4s, #0\n"
   2845       "movi v1.4s, #0\n"
   2846 
   2847       // General NxM lanes loop.
   2848       "1:"
   2849 
   2850       // Subtract counter.
              // The flags set here are tested by the bgt at the end of the loop body.
   2851       "subs %x[count], %x[count], #8\n"
   2852 
              // umull widens the u8 products to 16 bits; uadalp folds adjacent pairs
              // into the 32-bit sums.
   2853       "ld1 {v2.2s}, [%x[lhs]], #8\n"
   2854       "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n"
   2855       "prfm pldl1keep, [%x[lhs], #64]\n"
   2856       "prfm pldl1keep, [%x[rhs], #64]\n"
   2857       "umull v5.8h, v3.8b, v2.8b\n"
   2858       "umull v6.8h, v4.8b, v2.8b\n"
   2859       "uadalp v0.4s, v5.8h\n"
   2860       "uadalp v1.4s, v6.8h\n"
   2861 
   2862       // Loop break.
   2863       "bgt 1b\n"
   2864 
   2865       // StaticQuantizationFloat::Prepare
              // Read the four 32-bit values that follow the byte data in each stream,
              // broadcast the scale, and broadcast lane 0 of the lhs vector.
   2866       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   2867       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   2868       "dup v6.4s, %w[scale]\n"
   2869       "dup v4.4s, v4.s[0]\n"
   2870 
   2871       // RowMajorOutput::Prepare
   2872 
   2873       // Reduce aggregators.
              // Lanes 0 and 1 of v0 end up with the totals of v0 and v1.
   2874       "addp v0.4s, v0.4s, v1.4s\n"
   2875       "addp v0.4s, v0.4s, v0.4s\n"
   2876 
   2877       // StaticQuantizationFloat::Transform
              // Add the lhs and rhs values, convert signed int32 to float, then scale.
   2878       "add v0.4s, v0.4s, v4.4s\n"
   2879       "add v0.4s, v0.4s, v5.4s\n"
   2880       "scvtf v0.4s, v0.4s\n"
   2881       "fmul v0.4s, v0.4s, v6.4s\n"
   2882 
   2883       // RowMajorOutput::Output
   2884       "st1 {v0.2s}, [%x[result]], #8\n"
   2885       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2886       : [count] "r"(params.kernel.count),
   2887         [stride] "r"(params.output_stream.stride),
   2888         [scale] "r"(params.kernel.scale)
   2889       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
   2890 }
   2891 
   2892 template <>
   2893 inline void MulKernel<
   2894     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3,
   2895     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2896                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   2897                                          RowMajor>& params,
   2898                  float* result) {
          // AArch64 NEON kernel for the <1, 3, 8> specialization: multiplies one
          // 8-byte lhs group against three 8-byte rhs groups per iteration and
          // writes three scaled floats.
          //
          // lhs, rhs: input byte streams. Every loop iteration consumes 8 bytes of
          //   lhs and 24 bytes of rhs. After the loop, 16 more bytes (four 32-bit
          //   values) are read from each stream and added to the sums.
          // params.kernel.count: loop counter, reduced by 8 per iteration. The loop
          //   body always runs once and repeats while the counter stays above zero.
          // params.kernel.scale: its 32-bit register value is broadcast to every
          //   lane and multiplied into the sums after the int32-to-float conversion.
          // params.output_stream.stride: passed as the [stride] operand but never
          //   referenced by the asm template.
          // result: receives three consecutive floats.
          //
          // NOTE(review): [count] is bound as an input-only operand ("r") but the
          // "subs" in the loop writes to it; GCC's extended-asm documentation says
          // input-only operands must not be modified. Consider a local copy bound
          // as "+r".
   2899 #ifdef DEBUG
   2900 #ifdef DEBUG_METAGEMM_VERBOSE
   2901   std::cout << __FILE__ << "(" << __LINE__
   2902             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   2903                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, "
   2904                "8>::Multiply()"
   2905             << std::endl
   2906             << std::flush;
   2907 #endif
   2908 #endif
   2909   asm volatile(
   2910       "prfm pldl1keep, [%x[lhs]]\n"
   2911       "prfm pldl1keep, [%x[rhs]]\n"
   2912 
   2913       // Clear aggregators.
              // v0-v2 are the running sums, one per rhs group.
   2914       "movi v0.4s, #0\n"
   2915       "movi v1.4s, #0\n"
   2916       "movi v2.4s, #0\n"
   2917 
   2918       // General NxM lanes loop.
   2919       "1:"
   2920 
   2921       // Subtract counter.
              // The flags set here are tested by the bgt at the end of the loop body.
   2922       "subs %x[count], %x[count], #8\n"
   2923 
              // umull widens the u8 products to 16 bits; uadalp folds adjacent pairs
              // into the 32-bit sums.
   2924       "ld1 {v3.2s}, [%x[lhs]], #8\n"
   2925       "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n"
   2926       "prfm pldl1keep, [%x[lhs], #64]\n"
   2927       "prfm pldl1keep, [%x[rhs], #64]\n"
   2928       "umull v7.8h, v4.8b, v3.8b\n"
   2929       "umull v8.8h, v5.8b, v3.8b\n"
   2930       "umull v9.8h, v6.8b, v3.8b\n"
   2931       "uadalp v0.4s, v7.8h\n"
   2932       "uadalp v1.4s, v8.8h\n"
   2933       "uadalp v2.4s, v9.8h\n"
   2934 
   2935       // Loop break.
   2936       "bgt 1b\n"
   2937 
   2938       // StaticQuantizationFloat::Prepare
              // Read the four 32-bit values that follow the byte data in each stream,
              // broadcast the scale, and broadcast lane 0 of the lhs vector.
   2939       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   2940       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   2941       "dup v6.4s, %w[scale]\n"
   2942       "dup v4.4s, v4.s[0]\n"
   2943 
   2944       // RowMajorOutput::Prepare
   2945 
   2946       // Reduce aggregators.
              // Lanes 0-2 of v0 end up with the totals of v0, v1 and v2; lane 3
              // repeats the third total and is never stored.
   2947       "addp v0.4s, v0.4s, v1.4s\n"
   2948       "addp v2.4s, v2.4s, v2.4s\n"
   2949       "addp v0.4s, v0.4s, v2.4s\n"
   2950 
   2951       // StaticQuantizationFloat::Transform
              // Add the lhs and rhs values, convert signed int32 to float, then scale.
   2952       "add v0.4s, v0.4s, v4.4s\n"
   2953       "add v0.4s, v0.4s, v5.4s\n"
   2954       "scvtf v0.4s, v0.4s\n"
   2955       "fmul v0.4s, v0.4s, v6.4s\n"
   2956 
   2957       // RowMajorOutput::Output
              // Written as two lanes followed by a single third lane.
   2958       "st1 {v0.2s}, [%x[result]], #8\n"
   2959       "st1 {v0.s}[2], [%x[result]], #4\n"
   2960       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2961       : [count] "r"(params.kernel.count),
   2962         [stride] "r"(params.output_stream.stride),
   2963         [scale] "r"(params.kernel.scale)
   2964       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc",
   2965         "memory");
   2966 }
   2967 
   2968 template <>
   2969 inline void MulKernel<
   2970     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4,
   2971     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2972                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   2973                                          RowMajor>& params,
   2974                  float* result) {
          // AArch64 NEON kernel for the <1, 4, 8> specialization: multiplies one
          // 8-byte lhs group against four 8-byte rhs groups per iteration and writes
          // four scaled floats.
          //
          // lhs, rhs: input byte streams. Every loop iteration consumes 8 bytes of
          //   lhs and 32 bytes of rhs. After the loop, 16 more bytes (four 32-bit
          //   values) are read from each stream and added to the sums.
          // params.kernel.count: loop counter, reduced by 8 per iteration. The loop
          //   body always runs once and repeats while the counter stays above zero.
          // params.kernel.scale: its 32-bit register value is broadcast to every
          //   lane and multiplied into the sums after the int32-to-float conversion.
          // params.output_stream.stride: passed as the [stride] operand but never
          //   referenced by the asm template.
          // result: receives four consecutive floats.
          //
          // NOTE(review): [count] is bound as an input-only operand ("r") but the
          // "subs" in the loop writes to it; GCC's extended-asm documentation says
          // input-only operands must not be modified. Consider a local copy bound
          // as "+r".
   2975 #ifdef DEBUG
   2976 #ifdef DEBUG_METAGEMM_VERBOSE
   2977   std::cout << __FILE__ << "(" << __LINE__
   2978             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   2979                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, "
   2980                "8>::Multiply()"
   2981             << std::endl
   2982             << std::flush;
   2983 #endif
   2984 #endif
   2985   asm volatile(
   2986       "prfm pldl1keep, [%x[lhs]]\n"
   2987       "prfm pldl1keep, [%x[rhs]]\n"
   2988 
   2989       // Clear aggregators.
              // v0-v3 are the running sums, one per rhs group.
   2990       "movi v0.4s, #0\n"
   2991       "movi v1.4s, #0\n"
   2992       "movi v2.4s, #0\n"
   2993       "mov v3.16b, v0.16b\n"
   2994 
   2995       // General NxM lanes loop.
   2996       "1:"
   2997 
   2998       // Subtract counter.
              // The flags set here are tested by the bgt at the end of the loop body.
   2999       "subs %x[count], %x[count], #8\n"
   3000 
              // umull widens the u8 products to 16 bits; uadalp folds adjacent pairs
              // into the 32-bit sums.
   3001       "ld1 {v4.2s}, [%x[lhs]], #8\n"
   3002       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
   3003       "prfm pldl1keep, [%x[lhs], #64]\n"
   3004       "prfm pldl1keep, [%x[rhs], #64]\n"
   3005       "umull v9.8h, v5.8b, v4.8b\n"
   3006       "umull v10.8h, v6.8b, v4.8b\n"
   3007       "umull v11.8h, v7.8b, v4.8b\n"
   3008       "umull v12.8h, v8.8b, v4.8b\n"
   3009       "uadalp v0.4s, v9.8h\n"
   3010       "uadalp v1.4s, v10.8h\n"
   3011       "uadalp v2.4s, v11.8h\n"
   3012       "uadalp v3.4s, v12.8h\n"
   3013 
   3014       // Loop break.
   3015       "bgt 1b\n"
   3016 
   3017       // StaticQuantizationFloat::Prepare
              // Read the four 32-bit values that follow the byte data in each stream,
              // broadcast the scale, and broadcast lane 0 of the lhs vector.
   3018       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   3019       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   3020       "dup v6.4s, %w[scale]\n"
   3021       "dup v4.4s, v4.s[0]\n"
   3022 
   3023       // RowMajorOutput::Prepare
   3024 
   3025       // Reduce aggregators.
              // The four lanes of v0 end up with the totals of v0, v1, v2 and v3.
   3026       "addp v0.4s, v0.4s, v1.4s\n"
   3027       "addp v2.4s, v2.4s, v3.4s\n"
   3028       "addp v0.4s, v0.4s, v2.4s\n"
   3029 
   3030       // StaticQuantizationFloat::Transform
              // Add the lhs and rhs values, convert signed int32 to float, then scale.
   3031       "add v0.4s, v0.4s, v4.4s\n"
   3032       "add v0.4s, v0.4s, v5.4s\n"
   3033       "scvtf v0.4s, v0.4s\n"
   3034       "fmul v0.4s, v0.4s, v6.4s\n"
   3035 
   3036       // RowMajorOutput::Output
   3037       "st1 {v0.4s}, [%x[result]], #16\n"
   3038       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3039       : [count] "r"(params.kernel.count),
   3040         [stride] "r"(params.output_stream.stride),
   3041         [scale] "r"(params.kernel.scale)
   3042       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   3043         "v11", "v12", "cc", "memory");
   3044 }
   3045 
   3046 template <>
   3047 inline void MulKernel<
   3048     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5,
   3049     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3050                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3051                                          RowMajor>& params,
   3052                  float* result) {
          // AArch64 NEON kernel for the <1, 5, 8> specialization: multiplies one
          // 8-byte lhs group against five 8-byte rhs groups per iteration and writes
          // five scaled floats.
          //
          // lhs, rhs: input byte streams. Every loop iteration consumes 8 bytes of
          //   lhs and 40 bytes of rhs (a 32-byte load plus an 8-byte load). After
          //   the loop, 16 more bytes are read from lhs and 32 more from rhs, as
          //   32-bit values, and added to the sums.
          // params.kernel.count: loop counter, reduced by 8 per iteration. The loop
          //   body always runs once and repeats while the counter stays above zero.
          // params.kernel.scale: its 32-bit register value is broadcast to every
          //   lane and multiplied into the sums after the int32-to-float conversion.
          // params.output_stream.stride: passed as the [stride] operand but never
          //   referenced by the asm template.
          // result: receives five consecutive floats.
          //
          // NOTE(review): [count] is bound as an input-only operand ("r") but the
          // "subs" in the loop writes to it; GCC's extended-asm documentation says
          // input-only operands must not be modified. Consider a local copy bound
          // as "+r".
   3053 #ifdef DEBUG
   3054 #ifdef DEBUG_METAGEMM_VERBOSE
   3055   std::cout << __FILE__ << "(" << __LINE__
   3056             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3057                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, "
   3058                "8>::Multiply()"
   3059             << std::endl
   3060             << std::flush;
   3061 #endif
   3062 #endif
   3063   asm volatile(
   3064       "prfm pldl1keep, [%x[lhs]]\n"
   3065       "prfm pldl1keep, [%x[rhs]]\n"
   3066 
   3067       // Clear aggregators.
              // v0-v4 are the running sums, one per rhs group.
   3068       "movi v0.4s, #0\n"
   3069       "movi v1.4s, #0\n"
   3070       "movi v2.4s, #0\n"
   3071       "mov v3.16b, v0.16b\n"
   3072       "mov v4.16b, v1.16b\n"
   3073 
   3074       // General 1xM lanes loop.
   3075       "1:"
   3076 
   3077       // Subtract counter.
              // The flags set here are tested by the bgt at the end of the loop body.
   3078       "subs %x[count], %x[count], #8\n"
   3079 
              // The first four rhs groups are multiplied first; v5 and v10 are then
              // reused for the fifth group. umull widens the u8 products to 16 bits;
              // uadalp folds adjacent pairs into the 32-bit sums.
   3080       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
   3081       "ld1 {v9.2s}, [%x[lhs]], #8\n"
   3082       "prfm pldl1keep, [%x[lhs], #64]\n"
   3083       "umull v10.8h, v5.8b, v9.8b\n"
   3084       "umull v11.8h, v6.8b, v9.8b\n"
   3085       "umull v12.8h, v7.8b, v9.8b\n"
   3086       "umull v13.8h, v8.8b, v9.8b\n"
   3087       "ld1 {v5.2s}, [%x[rhs]], #8\n"
   3088       "prfm pldl1keep, [%x[rhs], #128]\n"
   3089       "uadalp v0.4s, v10.8h\n"
   3090       "uadalp v1.4s, v11.8h\n"
   3091       "uadalp v2.4s, v12.8h\n"
   3092       "uadalp v3.4s, v13.8h\n"
   3093       "umull v10.8h, v5.8b, v9.8b\n"
   3094       "uadalp v4.4s, v10.8h\n"
   3095 
   3096       // Loop break.
   3097       "bgt 1b\n"
   3098 
   3099       // StaticQuantizationFloat::Prepare
              // Read the 32-bit values that follow the byte data (four from lhs, eight
              // from rhs), broadcast the scale, and broadcast lane 0 of the lhs vector.
   3100       "ld1 {v5.4s}, [%x[lhs]], #16\n"
   3101       "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n"
   3102       "dup v8.4s, %w[scale]\n"
   3103       "dup v5.4s, v5.s[0]\n"
   3104 
   3105       // RowMajorOutput::Prepare
   3106 
   3107       // Reduce aggregators.
              // v0 ends up with the totals of v0..v3; every lane of v1 holds the
              // total of v4.
   3108       "addp v0.4s, v0.4s, v1.4s\n"
   3109       "addp v2.4s, v2.4s, v3.4s\n"
   3110       "addp v4.4s, v4.4s, v4.4s\n"
   3111       "addp v0.4s, v0.4s, v2.4s\n"
   3112       "addp v1.4s, v4.4s, v4.4s\n"
   3113 
   3114       // StaticQuantizationFloat::Transform
              // Add the lhs and rhs values, convert signed int32 to float, then scale.
   3115       "add v0.4s, v0.4s, v5.4s\n"
   3116       "add v1.4s, v1.4s, v5.4s\n"
   3117       "add v0.4s, v0.4s, v6.4s\n"
   3118       "add v1.4s, v1.4s, v7.4s\n"
   3119       "scvtf v0.4s, v0.4s\n"
   3120       "scvtf v1.4s, v1.4s\n"
   3121       "fmul v0.4s, v0.4s, v8.4s\n"
   3122       "fmul v1.4s, v1.4s, v8.4s\n"
   3123 
   3124       // RowMajorOutput::Output
              // Four lanes of v0 followed by lane 0 of v1.
   3125       "st1 {v0.4s}, [%x[result]], #16\n"
   3126       "st1 {v1.s}[0], [%x[result]], #4\n"
   3127       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3128       : [count] "r"(params.kernel.count),
   3129         [stride] "r"(params.output_stream.stride),
   3130         [scale] "r"(params.kernel.scale)
   3131       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   3132         "v11", "v12", "v13", "cc", "memory");
   3133 }
   3134 
   3135 template <>
   3136 inline void MulKernel<
   3137     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6,
   3138     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3139                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3140                                          RowMajor>& params,
   3141                  float* result) {
          // AArch64 NEON kernel for the <1, 6, 8> specialization: multiplies one
          // 8-byte lhs group against six 8-byte rhs groups per iteration and writes
          // six scaled floats.
          //
          // lhs, rhs: input byte streams. Every loop iteration consumes 8 bytes of
          //   lhs and 48 bytes of rhs (a 32-byte load plus a 16-byte load). After
          //   the loop, 16 more bytes are read from lhs and 32 more from rhs, as
          //   32-bit values, and added to the sums.
          // params.kernel.count: loop counter, reduced by 8 per iteration. The loop
          //   body always runs once and repeats while the counter stays above zero.
          // params.kernel.scale: its 32-bit register value is broadcast to every
          //   lane and multiplied into the sums after the int32-to-float conversion.
          // params.output_stream.stride: passed as the [stride] operand but never
          //   referenced by the asm template.
          // result: receives six consecutive floats.
          //
          // NOTE(review): [count] is bound as an input-only operand ("r") but the
          // "subs" in the loop writes to it; GCC's extended-asm documentation says
          // input-only operands must not be modified. Consider a local copy bound
          // as "+r".
   3142 #ifdef DEBUG
   3143 #ifdef DEBUG_METAGEMM_VERBOSE
   3144   std::cout << __FILE__ << "(" << __LINE__
   3145             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3146                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, "
   3147                "8>::Multiply()"
   3148             << std::endl
   3149             << std::flush;
   3150 #endif
   3151 #endif
   3152   asm volatile(
   3153       "prfm pldl1keep, [%x[lhs]]\n"
   3154       "prfm pldl1keep, [%x[rhs]]\n"
   3155 
   3156       // Clear aggregators.
              // v0-v5 are the running sums, one per rhs group.
   3157       "movi v0.4s, #0\n"
   3158       "movi v1.4s, #0\n"
   3159       "movi v2.4s, #0\n"
   3160       "mov v3.16b, v0.16b\n"
   3161       "mov v4.16b, v1.16b\n"
   3162       "mov v5.16b, v2.16b\n"
   3163 
   3164       // General 1xM lanes loop.
   3165       "1:"
   3166 
   3167       // Subtract counter.
              // The flags set here are tested by the bgt at the end of the loop body.
   3168       "subs %x[count], %x[count], #8\n"
   3169 
              // The first four rhs groups are multiplied first; v6/v7 and v11/v12 are
              // then reused for the fifth and sixth groups. umull widens the u8
              // products to 16 bits; uadalp folds adjacent pairs into the 32-bit sums.
   3170       "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n"
   3171       "ld1 {v10.2s}, [%x[lhs]], #8\n"
   3172       "prfm pldl1keep, [%x[lhs], #64]\n"
   3173       "umull v11.8h, v6.8b, v10.8b\n"
   3174       "umull v12.8h, v7.8b, v10.8b\n"
   3175       "umull v13.8h, v8.8b, v10.8b\n"
   3176       "umull v14.8h, v9.8b, v10.8b\n"
   3177       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
   3178       "prfm pldl1keep, [%x[rhs], #128]\n"
   3179       "uadalp v0.4s, v11.8h\n"
   3180       "uadalp v1.4s, v12.8h\n"
   3181       "uadalp v2.4s, v13.8h\n"
   3182       "uadalp v3.4s, v14.8h\n"
   3183       "umull v11.8h, v6.8b, v10.8b\n"
   3184       "umull v12.8h, v7.8b, v10.8b\n"
   3185       "uadalp v4.4s, v11.8h\n"
   3186       "uadalp v5.4s, v12.8h\n"
   3187 
   3188       // Loop break.
   3189       "bgt 1b\n"
   3190 
   3191       // StaticQuantizationFloat::Prepare
              // Read the 32-bit values that follow the byte data (four from lhs, eight
              // from rhs), broadcast the scale, and broadcast lane 0 of the lhs vector.
   3192       "ld1 {v6.4s}, [%x[lhs]], #16\n"
   3193       "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n"
   3194       "dup v9.4s, %w[scale]\n"
   3195       "dup v6.4s, v6.s[0]\n"
   3196 
   3197       // RowMajorOutput::Prepare
   3198 
   3199       // Reduce aggregators.
              // v0 ends up with the totals of v0..v3; lanes 0 and 1 of v1 hold the
              // totals of v4 and v5.
   3200       "addp v0.4s, v0.4s, v1.4s\n"
   3201       "addp v2.4s, v2.4s, v3.4s\n"
   3202       "addp v4.4s, v4.4s, v5.4s\n"
   3203       "addp v0.4s, v0.4s, v2.4s\n"
   3204       "addp v1.4s, v4.4s, v4.4s\n"
   3205 
   3206       // StaticQuantizationFloat::Transform
              // Add the lhs and rhs values, convert signed int32 to float, then scale.
   3207       "add v0.4s, v0.4s, v6.4s\n"
   3208       "add v1.4s, v1.4s, v6.4s\n"
   3209       "add v0.4s, v0.4s, v7.4s\n"
   3210       "add v1.4s, v1.4s, v8.4s\n"
   3211       "scvtf v0.4s, v0.4s\n"
   3212       "scvtf v1.4s, v1.4s\n"
   3213       "fmul v0.4s, v0.4s, v9.4s\n"
   3214       "fmul v1.4s, v1.4s, v9.4s\n"
   3215 
   3216       // RowMajorOutput::Output
              // Four lanes of v0 followed by lanes 0 and 1 of v1.
   3217       "st1 {v0.4s}, [%x[result]], #16\n"
   3218       "st1 {v1.2s}, [%x[result]], #8\n"
   3219       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3220       : [count] "r"(params.kernel.count),
   3221         [stride] "r"(params.output_stream.stride),
   3222         [scale] "r"(params.kernel.scale)
   3223       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   3224         "v11", "v12", "v13", "v14", "cc", "memory");
   3225 }
   3226 
   // MulKernel<uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor,
   // 1, 7, 8>::Multiply, written as one AArch64 NEON inline-asm block.
   //
   // Each loop iteration reads 8 bytes from lhs and 7 x 8 bytes from rhs,
   // multiplies them byte-wise (umull) and accumulates the widened products
   // into v0..v6 (uadalp). count is decreased by 8 per iteration and the loop
   // repeats while it stays above zero, so the body runs at least once.
   // After the loop 16 more bytes are read from lhs and 32 from rhs and added
   // to the reduced sums as 32-bit integers; the totals are converted to
   // float, multiplied by params.kernel.scale and 7 floats are stored at
   // result.
   //
   // NOTE(review): [count] is listed as an input-only operand but the asm
   // writes to it (subs) and accesses it through the 64-bit %x form; confirm
   // the field type and consider a read-write (+r) local copy instead.
   // [stride] is passed in but not referenced by this asm block.
   3227 template <>
   3228 inline void MulKernel<
   3229     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7,
   3230     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3231                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3232                                          RowMajor>& params,
   3233                  float* result) {
   3234 #ifdef DEBUG
   3235 #ifdef DEBUG_METAGEMM_VERBOSE
   3236   std::cout << __FILE__ << "(" << __LINE__
   3237             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3238                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, "
   3239                "8>::Multiply()"
   3240             << std::endl
   3241             << std::flush;
   3242 #endif
   3243 #endif
   3244   asm volatile(
   3245       "prfm pldl1keep, [%x[lhs]]\n"
   3246       "prfm pldl1keep, [%x[rhs]]\n"
   3247 
   3248       // Clear aggregators.
              // v0..v6: one 4 x 32-bit accumulator per rhs vector; v3..v6 are
              // zeroed by copying registers that were just cleared.
   3249       "movi v0.4s, #0\n"
   3250       "movi v1.4s, #0\n"
   3251       "movi v2.4s, #0\n"
   3252       "mov v3.16b, v0.16b\n"
   3253       "mov v4.16b, v1.16b\n"
   3254       "mov v5.16b, v2.16b\n"
   3255       "mov v6.16b, v3.16b\n"
   3256 
   3257       // General 1xM lanes loop.
   3258       "1:"
   3259 
   3260       // Subtract counter.
   3261       "subs %x[count], %x[count], #8\n"
   3262 
              // First four rhs vectors in v7..v10, the single lhs vector in v11.
   3263       "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n"
   3264       "ld1 {v11.2s}, [%x[lhs]], #8\n"
   3265       "prfm pldl1keep, [%x[lhs], #64]\n"
   3266       "umull v12.8h, v7.8b, v11.8b\n"
   3267       "umull v13.8h, v8.8b, v11.8b\n"
   3268       "umull v14.8h, v9.8b, v11.8b\n"
   3269       "umull v15.8h, v10.8b, v11.8b\n"
              // Remaining three rhs vectors reuse v7..v9; v11 is still the lhs.
   3270       "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n"
   3271       "prfm pldl1keep, [%x[rhs], #128]\n"
   3272       "uadalp v0.4s, v12.8h\n"
   3273       "uadalp v1.4s, v13.8h\n"
   3274       "uadalp v2.4s, v14.8h\n"
   3275       "uadalp v3.4s, v15.8h\n"
   3276       "umull v12.8h, v7.8b, v11.8b\n"
   3277       "umull v13.8h, v8.8b, v11.8b\n"
   3278       "umull v14.8h, v9.8b, v11.8b\n"
   3279       "uadalp v4.4s, v12.8h\n"
   3280       "uadalp v5.4s, v13.8h\n"
   3281       "uadalp v6.4s, v14.8h\n"
   3282 
   3283       // Loop break.
              // Branches back while the subs result above is > 0 (signed).
   3284       "bgt 1b\n"
   3285 
   3286       // StaticQuantizationFloat::Prepare
              // v7 = word 0 of the 16 bytes read from lhs, broadcast to all
              // lanes; v8, v9 = 8 words read from rhs; v10 = scale bits in all
              // lanes. Presumably these words are offset terms appended to the
              // packed streams -- TODO confirm against the stream packing code.
   3287       "ld1 {v7.4s}, [%x[lhs]], #16\n"
   3288       "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n"
   3289       "dup v10.4s, %w[scale]\n"
   3290       "dup v7.4s, v7.s[0]\n"
   3291 
   3292       // RowMajorOutput::Prepare
   3293 
   3294       // Reduce aggregators.
              // Result: v0 = totals of v0..v3, v1 = totals of v4..v6 (lane 3 of
              // v1 repeats the v6 total and is never stored).
   3295       "addp v0.4s, v0.4s, v1.4s\n"
   3296       "addp v2.4s, v2.4s, v3.4s\n"
   3297       "addp v4.4s, v4.4s, v5.4s\n"
   3298       "addp v6.4s, v6.4s, v6.4s\n"
   3299       "addp v0.4s, v0.4s, v2.4s\n"
   3300       "addp v1.4s, v4.4s, v6.4s\n"
   3301 
   3302       // StaticQuantizationFloat::Transform
              // Integer adds first, then signed int -> float and the scale multiply.
   3303       "add v0.4s, v0.4s, v7.4s\n"
   3304       "add v1.4s, v1.4s, v7.4s\n"
   3305       "add v0.4s, v0.4s, v8.4s\n"
   3306       "add v1.4s, v1.4s, v9.4s\n"
   3307       "scvtf v0.4s, v0.4s\n"
   3308       "scvtf v1.4s, v1.4s\n"
   3309       "fmul v0.4s, v0.4s, v10.4s\n"
   3310       "fmul v1.4s, v1.4s, v10.4s\n"
   3311 
   3312       // RowMajorOutput::Output
              // 4 + 2 + 1 floats written back to back (28 bytes).
   3313       "st1 {v0.4s}, [%x[result]], #16\n"
   3314       "st1 {v1.2s}, [%x[result]], #8\n"
   3315       "st1 {v1.s}[2], [%x[result]], #4\n"
   3316       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3317       : [count] "r"(params.kernel.count),
   3318         [stride] "r"(params.output_stream.stride),
   3319         [scale] "r"(params.kernel.scale)
   3320       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   3321         "v11", "v12", "v13", "v14", "v15", "cc", "memory");
   3322 }
   3323 
   // MulKernel<uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor,
   // 1, 8, 8>::Multiply, written as one AArch64 NEON inline-asm block.
   //
   // Each loop iteration reads 8 bytes from lhs and 8 x 8 bytes from rhs (two
   // groups of four vectors), multiplies them byte-wise (umull) and
   // accumulates the widened products into v0..v7 (uadalp). count is
   // decreased by 8 per iteration and the loop repeats while it stays above
   // zero, so the body runs at least once. After the loop 16 more bytes are
   // read from lhs and 32 from rhs and added to the reduced sums as 32-bit
   // integers; the totals are converted to float, multiplied by
   // params.kernel.scale and 8 floats are stored at result.
   //
   // NOTE(review): [count] is listed as an input-only operand but the asm
   // writes to it (subs) and accesses it through the 64-bit %x form; confirm
   // the field type and consider a read-write (+r) local copy instead.
   // [stride] is passed in but not referenced by this asm block.
   3324 template <>
   3325 inline void MulKernel<
   3326     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8,
   3327     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3328                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3329                                          RowMajor>& params,
   3330                  float* result) {
   3331 #ifdef DEBUG
   3332 #ifdef DEBUG_METAGEMM_VERBOSE
   3333   std::cout << __FILE__ << "(" << __LINE__
   3334             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3335                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, "
   3336                "8>::Multiply()"
   3337             << std::endl
   3338             << std::flush;
   3339 #endif
   3340 #endif
   3341   asm volatile(
   3342       "prfm pldl1keep, [%x[lhs]]\n"
   3343       "prfm pldl1keep, [%x[rhs]]\n"
   3344 
   3345       // Clear aggregators.
              // v0..v7: one 4 x 32-bit accumulator per rhs vector; v3..v7 are
              // zeroed by copying registers that were just cleared.
   3346       "movi v0.4s, #0\n"
   3347       "movi v1.4s, #0\n"
   3348       "movi v2.4s, #0\n"
   3349       "mov v3.16b, v0.16b\n"
   3350       "mov v4.16b, v1.16b\n"
   3351       "mov v5.16b, v2.16b\n"
   3352       "mov v6.16b, v3.16b\n"
   3353       "mov v7.16b, v4.16b\n"
   3354 
   3355       // 1x8 lanes loop.
   3356       "1:"
   3357 
              // First four rhs vectors in v9..v12, the single lhs vector in v8.
   3358       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
   3359       "ld1 {v8.2s}, [%x[lhs]], #8\n"
   3360       "umull v13.8h, v8.8b, v9.8b\n"
   3361       "umull v14.8h, v8.8b, v10.8b\n"
   3362       "umull v15.8h, v8.8b, v11.8b\n"
   3363       "umull v16.8h, v8.8b, v12.8b\n"
              // Second group of four rhs vectors reuses v9..v12.
   3364       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
   3365       "uadalp v0.4s, v13.8h\n"
   3366       "uadalp v1.4s, v14.8h\n"
   3367       "uadalp v2.4s, v15.8h\n"
   3368       "uadalp v3.4s, v16.8h\n"
   3369       "prfm pldl1keep, [%x[rhs], #256]\n"
   3370       "umull v17.8h, v8.8b, v9.8b\n"
   3371       "umull v13.8h, v8.8b, v10.8b\n"
   3372       "umull v14.8h, v8.8b, v11.8b\n"
   3373       "umull v15.8h, v8.8b, v12.8b\n"
   3374       "prfm pldl1keep, [%x[lhs], #32]\n"
   3375 
   3376       // Subtract counter.
              // The flags set here are the ones tested by the bgt below.
   3377       "subs %x[count], %x[count], #8\n"
   3378 
   3379       "uadalp v4.4s, v17.8h\n"
   3380       "uadalp v5.4s, v13.8h\n"
   3381       "uadalp v6.4s, v14.8h\n"
   3382       "uadalp v7.4s, v15.8h\n"
   3383 
   3384       // Loop break.
              // Branches back while the subs result above is > 0 (signed).
   3385       "bgt 1b\n"
   3386 
   3387       // StaticQuantizationFloat::Prepare
              // v8 = word 0 of the 16 bytes read from lhs, broadcast to all
              // lanes; v9, v10 = 8 words read from rhs; v11 = scale bits in all
              // lanes. Presumably these words are offset terms appended to the
              // packed streams -- TODO confirm against the stream packing code.
   3388       "ld1 {v8.4s}, [%x[lhs]], #16\n"
   3389       "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n"
   3390       "dup v11.4s, %w[scale]\n"
   3391       "dup v8.4s, v8.s[0]\n"
   3392 
   3393       // RowMajorOutput::Prepare
   3394 
   3395       // Reduce aggregators.
              // Result: v0 = totals of v0..v3, v1 = totals of v4..v7.
   3396       "addp v0.4s, v0.4s, v1.4s\n"
   3397       "addp v2.4s, v2.4s, v3.4s\n"
   3398       "addp v4.4s, v4.4s, v5.4s\n"
   3399       "addp v6.4s, v6.4s, v7.4s\n"
   3400       "addp v0.4s, v0.4s, v2.4s\n"
   3401       "addp v1.4s, v4.4s, v6.4s\n"
   3402 
   3403       // StaticQuantizationFloat::Transform
              // Integer adds first, then signed int -> float and the scale multiply.
   3404       "add v0.4s, v0.4s, v8.4s\n"
   3405       "add v1.4s, v1.4s, v8.4s\n"
   3406       "add v0.4s, v0.4s, v9.4s\n"
   3407       "add v1.4s, v1.4s, v10.4s\n"
   3408       "scvtf v0.4s, v0.4s\n"
   3409       "scvtf v1.4s, v1.4s\n"
   3410       "fmul v0.4s, v0.4s, v11.4s\n"
   3411       "fmul v1.4s, v1.4s, v11.4s\n"
   3412 
   3413       // RowMajorOutput::Output
              // 8 floats written with a single two-register store (32 bytes).
   3414       "st1 {v0.4s, v1.4s}, [%x[result]], #32\n"
   3415       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3416       : [count] "r"(params.kernel.count),
   3417         [stride] "r"(params.output_stream.stride),
   3418         [scale] "r"(params.kernel.scale)
   3419       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   3420         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
   3421 }
   3422 
   // MulKernel<uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor,
   // 2, 1, 8>::Multiply, written as one AArch64 NEON inline-asm block.
   //
   // Each loop iteration reads 2 x 8 bytes from lhs and 8 bytes from rhs,
   // multiplies the rhs vector with each lhs vector (umull) and accumulates
   // the widened products into v0 and v1 (uadalp). count is decreased by 8
   // per iteration and the loop repeats while it stays above zero, so the
   // body runs at least once. After the loop 16 more bytes are read from each
   // of lhs and rhs and added to the reduced sums as 32-bit integers; the
   // totals are converted to float and multiplied by params.kernel.scale.
   // One float is stored at result and one at result + stride (stride is
   // added to the address as a byte offset).
   //
   // NOTE(review): [count] is listed as an input-only operand but the asm
   // writes to it (subs); [count] and [stride] are both accessed through the
   // 64-bit %x form. Confirm the field types and consider a read-write (+r)
   // local copy for count.
   3423 template <>
   3424 inline void MulKernel<
   3425     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1,
   3426     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3427                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3428                                          RowMajor>& params,
   3429                  float* result) {
   3430 #ifdef DEBUG
   3431 #ifdef DEBUG_METAGEMM_VERBOSE
   3432   std::cout << __FILE__ << "(" << __LINE__
   3433             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3434                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, "
   3435                "8>::Multiply()"
   3436             << std::endl
   3437             << std::flush;
   3438 #endif
   3439 #endif
   3440   asm volatile(
   3441       "prfm pldl1keep, [%x[lhs]]\n"
   3442       "prfm pldl1keep, [%x[rhs]]\n"
   3443 
   3444       // Clear aggregators.
              // v0, v1: one 4 x 32-bit accumulator per lhs vector.
   3445       "movi v0.4s, #0\n"
   3446       "movi v1.4s, #0\n"
   3447 
   3448       // General NxM lanes loop.
   3449       "1:"
   3450 
   3451       // Subtract counter.
   3452       "subs %x[count], %x[count], #8\n"
   3453 
              // Two lhs vectors in v2, v3; the single rhs vector in v4.
   3454       "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n"
   3455       "ld1 {v4.2s}, [%x[rhs]], #8\n"
   3456       "prfm pldl1keep, [%x[lhs], #64]\n"
   3457       "prfm pldl1keep, [%x[rhs], #64]\n"
   3458       "umull v5.8h, v4.8b, v2.8b\n"
   3459       "umull v6.8h, v4.8b, v3.8b\n"
   3460       "uadalp v0.4s, v5.8h\n"
   3461       "uadalp v1.4s, v6.8h\n"
   3462 
   3463       // Loop break.
              // Branches back while the subs result above is > 0 (signed).
   3464       "bgt 1b\n"
   3465 
   3466       // StaticQuantizationFloat::Prepare
              // v4 = 4 words read from lhs, v5 = 4 words read from rhs, v6 =
              // scale bits in all lanes. Words 0 and 1 of v4 are broadcast into
              // v2 and v4; v4 is overwritten last because both dups read it.
              // Presumably these words are offset terms appended to the packed
              // streams -- TODO confirm against the stream packing code.
   3467       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   3468       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   3469       "dup v6.4s, %w[scale]\n"
   3470       "dup v2.4s, v4.s[0]\n"
   3471       "dup v4.4s, v4.s[1]\n"
   3472 
   3473       // RowMajorOutput::Prepare
              // x0 = address of the second output row.
   3474       "add x0, %x[result], %x[stride]\n"
   3475 
   3476       // Reduce aggregators.
              // Two pairwise adds leave the full lane sum in every lane.
   3477       "addp v0.4s, v0.4s, v0.4s\n"
   3478       "addp v0.4s, v0.4s, v0.4s\n"
   3479       "addp v1.4s, v1.4s, v1.4s\n"
   3480       "addp v1.4s, v1.4s, v1.4s\n"
   3481 
   3482       // StaticQuantizationFloat::Transform
              // Integer adds first, then signed int -> float and the scale
              // multiply. Only lane 0 of each result is stored below.
   3483       "add v0.4s, v0.4s, v2.4s\n"
   3484       "add v1.4s, v1.4s, v4.4s\n"
   3485       "add v0.4s, v0.4s, v5.4s\n"
   3486       "add v1.4s, v1.4s, v5.4s\n"
   3487       "scvtf v0.4s, v0.4s\n"
   3488       "scvtf v1.4s, v1.4s\n"
   3489       "fmul v0.4s, v0.4s, v6.4s\n"
   3490       "fmul v1.4s, v1.4s, v6.4s\n"
   3491 
   3492       // RowMajorOutput::Output
              // One float per output row.
   3493       "st1 {v0.s}[0], [%x[result]], #4\n"
   3494       "st1 {v1.s}[0], [x0], #4\n"
   3495       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3496       : [count] "r"(params.kernel.count),
   3497         [stride] "r"(params.output_stream.stride),
   3498         [scale] "r"(params.kernel.scale)
   3499       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
   3500 }
   3501 
   // MulKernel<uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor,
   // 2, 2, 8>::Multiply, written as one AArch64 NEON inline-asm block.
   //
   // Each loop iteration reads 2 x 8 bytes from lhs and 2 x 8 bytes from rhs,
   // multiplies every rhs vector with every lhs vector (umull) and
   // accumulates the widened products into v0..v3 (uadalp). count is
   // decreased by 8 per iteration and the loop repeats while it stays above
   // zero, so the body runs at least once. After the loop 16 more bytes are
   // read from each of lhs and rhs and added to the reduced sums as 32-bit
   // integers; the totals are converted to float and multiplied by
   // params.kernel.scale. Two floats are stored at result and two at
   // result + stride (stride is added to the address as a byte offset).
   //
   // NOTE(review): [count] is listed as an input-only operand but the asm
   // writes to it (subs); [count] and [stride] are both accessed through the
   // 64-bit %x form. Confirm the field types and consider a read-write (+r)
   // local copy for count.
   3502 template <>
   3503 inline void MulKernel<
   3504     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2,
   3505     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3506                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3507                                          RowMajor>& params,
   3508                  float* result) {
   3509 #ifdef DEBUG
   3510 #ifdef DEBUG_METAGEMM_VERBOSE
   3511   std::cout << __FILE__ << "(" << __LINE__
   3512             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3513                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, "
   3514                "8>::Multiply()"
   3515             << std::endl
   3516             << std::flush;
   3517 #endif
   3518 #endif
   3519   asm volatile(
   3520       "prfm pldl1keep, [%x[lhs]]\n"
   3521       "prfm pldl1keep, [%x[rhs]]\n"
   3522 
   3523       // Clear aggregators.
              // v0, v1 accumulate for the first lhs vector, v2, v3 for the second.
   3524       "movi v0.4s, #0\n"
   3525       "movi v1.4s, #0\n"
   3526       "movi v2.4s, #0\n"
   3527       "mov v3.16b, v0.16b\n"
   3528 
   3529       // General NxM lanes loop.
   3530       "1:"
   3531 
   3532       // Subtract counter.
   3533       "subs %x[count], %x[count], #8\n"
   3534 
              // Two lhs vectors in v4, v5; two rhs vectors in v6, v7.
   3535       "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n"
   3536       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
   3537       "prfm pldl1keep, [%x[lhs], #64]\n"
   3538       "prfm pldl1keep, [%x[rhs], #64]\n"
   3539       "umull v8.8h, v6.8b, v4.8b\n"
   3540       "umull v9.8h, v7.8b, v4.8b\n"
   3541       "umull v10.8h, v6.8b, v5.8b\n"
   3542       "umull v11.8h, v7.8b, v5.8b\n"
   3543       "uadalp v0.4s, v8.8h\n"
   3544       "uadalp v1.4s, v9.8h\n"
   3545       "uadalp v2.4s, v10.8h\n"
   3546       "uadalp v3.4s, v11.8h\n"
   3547 
   3548       // Loop break.
              // Branches back while the subs result above is > 0 (signed).
   3549       "bgt 1b\n"
   3550 
   3551       // StaticQuantizationFloat::Prepare
              // v4 = 4 words read from lhs, v5 = 4 words read from rhs, v6 =
              // scale bits in all lanes. Words 0 and 1 of v4 are broadcast into
              // v7 and v4; v4 is overwritten last because both dups read it.
              // Presumably these words are offset terms appended to the packed
              // streams -- TODO confirm against the stream packing code.
   3552       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   3553       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   3554       "dup v6.4s, %w[scale]\n"
   3555       "dup v7.4s, v4.s[0]\n"
   3556       "dup v4.4s, v4.s[1]\n"
   3557 
   3558       // RowMajorOutput::Prepare
              // x0 = address of the second output row.
   3559       "add x0, %x[result], %x[stride]\n"
   3560 
   3561       // Reduce aggregators.
              // Result: lanes 0..1 of v0 = totals of v0, v1; lanes 0..1 of v2 =
              // totals of v2, v3 (lanes 2..3 repeat them and are not stored).
   3562       "addp v0.4s, v0.4s, v1.4s\n"
   3563       "addp v0.4s, v0.4s, v0.4s\n"
   3564       "addp v2.4s, v2.4s, v3.4s\n"
   3565       "addp v2.4s, v2.4s, v2.4s\n"
   3566 
   3567       // StaticQuantizationFloat::Transform
              // Integer adds first, then signed int -> float and the scale multiply.
   3568       "add v0.4s, v0.4s, v7.4s\n"
   3569       "add v2.4s, v2.4s, v4.4s\n"
   3570       "add v0.4s, v0.4s, v5.4s\n"
   3571       "add v2.4s, v2.4s, v5.4s\n"
   3572       "scvtf v0.4s, v0.4s\n"
   3573       "scvtf v2.4s, v2.4s\n"
   3574       "fmul v0.4s, v0.4s, v6.4s\n"
   3575       "fmul v2.4s, v2.4s, v6.4s\n"
   3576 
   3577       // RowMajorOutput::Output
              // Two floats per output row.
   3578       "st1 {v0.2s}, [%x[result]], #8\n"
   3579       "st1 {v2.2s}, [x0], #8\n"
   3580       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3581       : [count] "r"(params.kernel.count),
   3582         [stride] "r"(params.output_stream.stride),
   3583         [scale] "r"(params.kernel.scale)
   3584       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   3585         "v11", "cc", "memory");
   3586 }
   3587 
   // MulKernel<uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor,
   // 2, 3, 8>::Multiply, written as one AArch64 NEON inline-asm block.
   //
   // Each loop iteration reads 2 x 8 bytes from lhs and 3 x 8 bytes from rhs,
   // multiplies every rhs vector with every lhs vector (umull) and
   // accumulates the widened products into v0..v5 (uadalp). count is
   // decreased by 8 per iteration and the loop repeats while it stays above
   // zero, so the body runs at least once. After the loop 16 more bytes are
   // read from each of lhs and rhs and added to the reduced sums as 32-bit
   // integers; the totals are converted to float and multiplied by
   // params.kernel.scale. Three floats are stored at result and three at
   // result + stride (stride is added to the address as a byte offset).
   //
   // NOTE(review): [count] is listed as an input-only operand but the asm
   // writes to it (subs); [count] and [stride] are both accessed through the
   // 64-bit %x form. Confirm the field types and consider a read-write (+r)
   // local copy for count.
   3588 template <>
   3589 inline void MulKernel<
   3590     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3,
   3591     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3592                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3593                                          RowMajor>& params,
   3594                  float* result) {
   3595 #ifdef DEBUG
   3596 #ifdef DEBUG_METAGEMM_VERBOSE
   3597   std::cout << __FILE__ << "(" << __LINE__
   3598             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3599                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, "
   3600                "8>::Multiply()"
   3601             << std::endl
   3602             << std::flush;
   3603 #endif
   3604 #endif
   3605   asm volatile(
   3606       "prfm pldl1keep, [%x[lhs]]\n"
   3607       "prfm pldl1keep, [%x[rhs]]\n"
   3608 
   3609       // Clear aggregators.
              // v0..v2 accumulate for the first lhs vector, v3..v5 for the second.
   3610       "movi v0.4s, #0\n"
   3611       "movi v1.4s, #0\n"
   3612       "movi v2.4s, #0\n"
   3613       "mov v3.16b, v0.16b\n"
   3614       "mov v4.16b, v1.16b\n"
   3615       "mov v5.16b, v2.16b\n"
   3616 
   3617       // General NxM lanes loop.
   3618       "1:"
   3619 
   3620       // Subtract counter.
   3621       "subs %x[count], %x[count], #8\n"
   3622 
              // Two lhs vectors in v6, v7; three rhs vectors in v8..v10.
   3623       "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n"
   3624       "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n"
   3625       "prfm pldl1keep, [%x[lhs], #64]\n"
   3626       "prfm pldl1keep, [%x[rhs], #64]\n"
   3627       "umull v11.8h, v8.8b, v6.8b\n"
   3628       "umull v12.8h, v9.8b, v6.8b\n"
   3629       "umull v13.8h, v10.8b, v6.8b\n"
   3630       "umull v14.8h, v8.8b, v7.8b\n"
   3631       "umull v15.8h, v9.8b, v7.8b\n"
   3632       "umull v16.8h, v10.8b, v7.8b\n"
   3633       "uadalp v0.4s, v11.8h\n"
   3634       "uadalp v1.4s, v12.8h\n"
   3635       "uadalp v2.4s, v13.8h\n"
   3636       "uadalp v3.4s, v14.8h\n"
   3637       "uadalp v4.4s, v15.8h\n"
   3638       "uadalp v5.4s, v16.8h\n"
   3639 
   3640       // Loop break.
              // Branches back while the subs result above is > 0 (signed).
   3641       "bgt 1b\n"
   3642 
   3643       // StaticQuantizationFloat::Prepare
              // v6 = 4 words read from lhs, v7 = 4 words read from rhs, v8 =
              // scale bits in all lanes. Words 0 and 1 of v6 are broadcast into
              // v9 and v6; v6 is overwritten last because both dups read it.
              // Presumably these words are offset terms appended to the packed
              // streams -- TODO confirm against the stream packing code.
   3644       "ld1 {v6.4s}, [%x[lhs]], #16\n"
   3645       "ld1 {v7.4s}, [%x[rhs]], #16\n"
   3646       "dup v8.4s, %w[scale]\n"
   3647       "dup v9.4s, v6.s[0]\n"
   3648       "dup v6.4s, v6.s[1]\n"
   3649 
   3650       // RowMajorOutput::Prepare
              // x0 = address of the second output row.
   3651       "add x0, %x[result], %x[stride]\n"
   3652 
   3653       // Reduce aggregators.
              // Result: lanes 0..2 of v0 = totals of v0..v2; lanes 0..2 of v3 =
              // totals of v3..v5 (lane 3 repeats lane 2 and is not stored).
   3654       "addp v0.4s, v0.4s, v1.4s\n"
   3655       "addp v2.4s, v2.4s, v2.4s\n"
   3656       "addp v0.4s, v0.4s, v2.4s\n"
   3657       "addp v3.4s, v3.4s, v4.4s\n"
   3658       "addp v5.4s, v5.4s, v5.4s\n"
   3659       "addp v3.4s, v3.4s, v5.4s\n"
   3660 
   3661       // StaticQuantizationFloat::Transform
              // Integer adds first, then signed int -> float and the scale multiply.
   3662       "add v0.4s, v0.4s, v9.4s\n"
   3663       "add v3.4s, v3.4s, v6.4s\n"
   3664       "add v0.4s, v0.4s, v7.4s\n"
   3665       "add v3.4s, v3.4s, v7.4s\n"
   3666       "scvtf v0.4s, v0.4s\n"
   3667       "scvtf v3.4s, v3.4s\n"
   3668       "fmul v0.4s, v0.4s, v8.4s\n"
   3669       "fmul v3.4s, v3.4s, v8.4s\n"
   3670 
   3671       // RowMajorOutput::Output
              // Three floats per output row, written as 2 + 1.
   3672       "st1 {v0.2s}, [%x[result]], #8\n"
   3673       "st1 {v0.s}[2], [%x[result]], #4\n"
   3674       "st1 {v3.2s}, [x0], #8\n"
   3675       "st1 {v3.s}[2], [x0], #4\n"
   3676       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3677       : [count] "r"(params.kernel.count),
   3678         [stride] "r"(params.output_stream.stride),
   3679         [scale] "r"(params.kernel.scale)
   3680       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   3681         "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
   3682 }
   3683 
   // MulKernel<uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor,
   // 2, 4, 8>::Multiply, written as one AArch64 NEON inline-asm block.
   //
   // Each loop iteration reads 2 x 8 bytes from lhs and 4 x 8 bytes from rhs,
   // multiplies every lhs vector with every rhs vector (umull) and
   // accumulates the widened products into v0..v7 (uadalp), with loads,
   // prefetches and multiplies interleaved. count is decreased by 8 per
   // iteration and the loop repeats while it stays above zero, so the body
   // runs at least once. After the loop 16 more bytes are read from each of
   // lhs and rhs and added to the reduced sums as 32-bit integers; the totals
   // are converted to float and multiplied by params.kernel.scale. Four
   // floats are stored at result and four at result + stride (stride is added
   // to the address as a byte offset).
   //
   // NOTE(review): [count] is listed as an input-only operand but the asm
   // writes to it (subs); [count] and [stride] are both accessed through the
   // 64-bit %x form. Confirm the field types and consider a read-write (+r)
   // local copy for count.
   3684 template <>
   3685 inline void MulKernel<
   3686     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4,
   3687     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3688                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3689                                          RowMajor>& params,
   3690                  float* result) {
   3691 #ifdef DEBUG
   3692 #ifdef DEBUG_METAGEMM_VERBOSE
   3693   std::cout << __FILE__ << "(" << __LINE__
   3694             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3695                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, "
   3696                "8>::Multiply()"
   3697             << std::endl
   3698             << std::flush;
   3699 #endif
   3700 #endif
   3701   asm volatile(
   3702       "prfm pldl1keep, [%x[lhs]]\n"
   3703       "prfm pldl1keep, [%x[rhs]]\n"
   3704 
   3705       // Clear aggregators.
              // v0..v3 accumulate for the first lhs vector, v4..v7 for the second.
   3706       "movi v0.4s, #0\n"
   3707       "movi v1.4s, #0\n"
   3708       "movi v2.4s, #0\n"
   3709       "mov v3.16b, v0.16b\n"
   3710       "mov v4.16b, v1.16b\n"
   3711       "mov v5.16b, v2.16b\n"
   3712       "mov v6.16b, v3.16b\n"
   3713       "mov v7.16b, v4.16b\n"
   3714 
   3715       // 2x4 lanes loop.
   3716       "1:"
   3717 
              // Four rhs vectors in v10..v13; lhs vectors are loaded one at a
              // time into v8 and v9, interleaved with the first multiplies.
   3718       "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n"
   3719       "ld1 {v8.8b}, [%x[lhs]], #8\n"
   3720       "umull v14.8h, v8.8b, v10.8b\n"
   3721       "ld1 {v9.8b}, [%x[lhs]], #8\n"
   3722       "umull v15.8h, v8.8b, v11.8b\n"
   3723       "prfm pldl1keep, [%x[rhs], #64]\n"
   3724       "umull v16.8h, v8.8b, v12.8b\n"
   3725       "prfm pldl1keep, [%x[lhs], #64]\n"
   3726       "umull v17.8h, v8.8b, v13.8b\n"
   3727       "umull v18.8h, v9.8b, v10.8b\n"
   3728       "uadalp v0.4s, v14.8h\n"
   3729       "uadalp v1.4s, v15.8h\n"
   3730       "uadalp v2.4s, v16.8h\n"
              // v14..v16 were consumed above and are reused as product temps.
   3731       "umull v14.8h, v9.8b, v11.8b\n"
   3732       "umull v15.8h, v9.8b, v12.8b\n"
   3733       "umull v16.8h, v9.8b, v13.8b\n"
   3734 
   3735       // Subtract counter.
              // The flags set here are the ones tested by the bgt below.
   3736       "subs %x[count], %x[count], #8\n"
   3737 
   3738       "uadalp v3.4s, v17.8h\n"
   3739       "uadalp v4.4s, v18.8h\n"
   3740       "uadalp v5.4s, v14.8h\n"
   3741       "uadalp v6.4s, v15.8h\n"
   3742       "uadalp v7.4s, v16.8h\n"
   3743 
   3744       // Loop break.
              // Branches back while the subs result above is > 0 (signed).
   3745       "bgt 1b\n"
   3746 
   3747       // StaticQuantizationFloat::Prepare
              // v8 = 4 words read from lhs, v9 = 4 words read from rhs, v10 =
              // scale bits in all lanes. Words 0 and 1 of v8 are broadcast into
              // v11 and v8; v8 is overwritten last because both dups read it.
              // Presumably these words are offset terms appended to the packed
              // streams -- TODO confirm against the stream packing code.
   3748       "ld1 {v8.4s}, [%x[lhs]], #16\n"
   3749       "ld1 {v9.4s}, [%x[rhs]], #16\n"
   3750       "dup v10.4s, %w[scale]\n"
   3751       "dup v11.4s, v8.s[0]\n"
   3752       "dup v8.4s, v8.s[1]\n"
   3753 
   3754       // RowMajorOutput::Prepare
              // x0 = address of the second output row.
   3755       "add x0, %x[result], %x[stride]\n"
   3756 
   3757       // Reduce aggregators.
              // Result: v0 = totals of v0..v3, v4 = totals of v4..v7.
   3758       "addp v0.4s, v0.4s, v1.4s\n"
   3759       "addp v2.4s, v2.4s, v3.4s\n"
   3760       "addp v0.4s, v0.4s, v2.4s\n"
   3761       "addp v4.4s, v4.4s, v5.4s\n"
   3762       "addp v6.4s, v6.4s, v7.4s\n"
   3763       "addp v4.4s, v4.4s, v6.4s\n"
   3764 
   3765       // StaticQuantizationFloat::Transform
              // Integer adds first, then signed int -> float and the scale multiply.
   3766       "add v0.4s, v0.4s, v11.4s\n"
   3767       "add v4.4s, v4.4s, v8.4s\n"
   3768       "add v0.4s, v0.4s, v9.4s\n"
   3769       "add v4.4s, v4.4s, v9.4s\n"
   3770       "scvtf v0.4s, v0.4s\n"
   3771       "scvtf v4.4s, v4.4s\n"
   3772       "fmul v0.4s, v0.4s, v10.4s\n"
   3773       "fmul v4.4s, v4.4s, v10.4s\n"
   3774 
   3775       // RowMajorOutput::Output
              // Four floats per output row.
   3776       "st1 {v0.4s}, [%x[result]], #16\n"
   3777       "st1 {v4.4s}, [x0], #16\n"
   3778       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3779       : [count] "r"(params.kernel.count),
   3780         [stride] "r"(params.output_stream.stride),
   3781         [scale] "r"(params.kernel.scale)
   3782       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
   3783         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory");
   3784 }
   3785 
   // MulKernel<uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor,
   // 3, 1, 8>::Multiply, written as one AArch64 NEON inline-asm block.
   //
   // Each loop iteration reads 3 x 8 bytes from lhs and 8 bytes from rhs,
   // multiplies the rhs vector with each lhs vector (umull) and accumulates
   // the widened products into v0..v2 (uadalp). count is decreased by 8 per
   // iteration and the loop repeats while it stays above zero, so the body
   // runs at least once. After the loop 16 more bytes are read from each of
   // lhs and rhs and added to the reduced sums as 32-bit integers; the totals
   // are converted to float and multiplied by params.kernel.scale. One float
   // is stored at each of result, result + stride and result + 2 * stride
   // (stride is added to the address as a byte offset).
   //
   // NOTE(review): [count] is listed as an input-only operand but the asm
   // writes to it (subs); [count] and [stride] are both accessed through the
   // 64-bit %x form. Confirm the field types and consider a read-write (+r)
   // local copy for count.
   3786 template <>
   3787 inline void MulKernel<
   3788     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1,
   3789     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3790                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3791                                          RowMajor>& params,
   3792                  float* result) {
   3793 #ifdef DEBUG
   3794 #ifdef DEBUG_METAGEMM_VERBOSE
   3795   std::cout << __FILE__ << "(" << __LINE__
   3796             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3797                "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, "
   3798                "8>::Multiply()"
   3799             << std::endl
   3800             << std::flush;
   3801 #endif
   3802 #endif
   3803   asm volatile(
   3804       "prfm pldl1keep, [%x[lhs]]\n"
   3805       "prfm pldl1keep, [%x[rhs]]\n"
   3806 
   3807       // Clear aggregators.
              // v0..v2: one 4 x 32-bit accumulator per lhs vector.
   3808       "movi v0.4s, #0\n"
   3809       "movi v1.4s, #0\n"
   3810       "movi v2.4s, #0\n"
   3811 
   3812       // General NxM lanes loop.
   3813       "1:"
   3814 
   3815       // Subtract counter.
   3816       "subs %x[count], %x[count], #8\n"
   3817 
              // Three lhs vectors in v3..v5; the single rhs vector in v6.
   3818       "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n"
   3819       "ld1 {v6.2s}, [%x[rhs]], #8\n"
   3820       "prfm pldl1keep, [%x[lhs], #64]\n"
   3821       "prfm pldl1keep, [%x[rhs], #64]\n"
   3822       "umull v7.8h, v6.8b, v3.8b\n"
   3823       "umull v8.8h, v6.8b, v4.8b\n"
   3824       "umull v9.8h, v6.8b, v5.8b\n"
   3825       "uadalp v0.4s, v7.8h\n"
   3826       "uadalp v1.4s, v8.8h\n"
   3827       "uadalp v2.4s, v9.8h\n"
   3828 
   3829       // Loop break.
              // Branches back while the subs result above is > 0 (signed).
   3830       "bgt 1b\n"
   3831 
   3832       // StaticQuantizationFloat::Prepare
              // v4 = 4 words read from lhs, v5 = 4 words read from rhs, v6 =
              // scale bits in all lanes. Words 0, 1 and 2 of v4 are broadcast
              // into v3, v7 and v4; v4 is overwritten last because all three
              // dups read it. Presumably these words are offset terms appended
              // to the packed streams -- TODO confirm against the packing code.
   3833       "ld1 {v4.4s}, [%x[lhs]], #16\n"
   3834       "ld1 {v5.4s}, [%x[rhs]], #16\n"
   3835       "dup v6.4s, %w[scale]\n"
   3836       "dup v3.4s, v4.s[0]\n"
   3837       "dup v7.4s, v4.s[1]\n"
   3838       "dup v4.4s, v4.s[2]\n"
   3839 
   3840       // RowMajorOutput::Prepare
              // x0, x1 = addresses of the second and third output rows.
   3841       "add x0, %x[result], %x[stride]\n"
   3842       "add x1, x0, %x[stride]\n"
   3843 
   3844       // Reduce aggregators.
              // Two pairwise adds leave the full lane sum in every lane.
   3845       "addp v0.4s, v0.4s, v0.4s\n"
   3846       "addp v0.4s, v0.4s, v0.4s\n"
   3847       "addp v1.4s, v1.4s, v1.4s\n"
   3848       "addp v1.4s, v1.4s, v1.4s\n"
   3849       "addp v2.4s, v2.4s, v2.4s\n"
   3850       "addp v2.4s, v2.4s, v2.4s\n"
   3851 
   3852       // StaticQuantizationFloat::Transform
              // Integer adds first, then signed int -> float and the scale
              // multiply. Only lane 0 of each result is stored below.
   3853       "add v0.4s, v0.4s, v3.4s\n"
   3854       "add v1.4s, v1.4s, v7.4s\n"
   3855       "add v2.4s, v2.4s, v4.4s\n"
   3856       "add v0.4s, v0.4s, v5.4s\n"
   3857       "add v1.4s, v1.4s, v5.4s\n"
   3858       "add v2.4s, v2.4s, v5.4s\n"
   3859       "scvtf v0.4s, v0.4s\n"
   3860       "scvtf v1.4s, v1.4s\n"
   3861       "scvtf v2.4s, v2.4s\n"
   3862       "fmul v0.4s, v0.4s, v6.4s\n"
   3863       "fmul v1.4s, v1.4s, v6.4s\n"
   3864       "fmul v2.4s, v2.4s, v6.4s\n"
   3865 
   3866       // RowMajorOutput::Output
              // One float per output row.
   3867       "st1 {v0.s}[0], [%x[result]], #4\n"
   3868       "st1 {v1.s}[0], [x0], #4\n"
   3869       "st1 {v2.s}[0], [x1], #4\n"
   3870       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3871       : [count] "r"(params.kernel.count),
   3872         [stride] "r"(params.output_stream.stride),
   3873         [scale] "r"(params.kernel.scale)
   3874       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
   3875         "cc", "memory");
   3876 }
   3877 
   // MulKernel<uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor,
   // 3, 2, 8>::Multiply, written as one AArch64 NEON inline-asm block.
   //
   // Each loop iteration reads 3 x 8 bytes from lhs and 2 x 8 bytes from rhs,
   // multiplies every rhs vector with every lhs vector (umull) and
   // accumulates the widened products into v0..v5 (uadalp). count is
   // decreased by 8 per iteration and the loop repeats while it stays above
   // zero, so the body runs at least once. After the loop 16 more bytes are
   // read from each of lhs and rhs and added to the reduced sums as 32-bit
   // integers; the totals are converted to float and multiplied by
   // params.kernel.scale. Two floats are stored at each of result,
   // result + stride and result + 2 * stride (stride is added to the address
   // as a byte offset).
   //
   // NOTE(review): [count] is listed as an input-only operand but the asm
   // writes to it (subs); [count] and [stride] are both accessed through the
   // 64-bit %x form. Confirm the field types and consider a read-write (+r)
   // local copy for count.
   3878 template <>
   3879 inline void MulKernel<
   3880     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2,
   3881     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3882                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3883                                          RowMajor>& params,
   3884                  float* result) {
   3885 #ifdef DEBUG
   3886 #ifdef DEBUG_METAGEMM_VERBOSE
   3887   std::cout << __FILE__ << "(" << __LINE__
   3888             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3889                "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, "
   3890                "8>::Multiply()"
   3891             << std::endl
   3892             << std::flush;
   3893 #endif
   3894 #endif
   3895   asm volatile(
   3896       "prfm pldl1keep, [%x[lhs]]\n"
   3897       "prfm pldl1keep, [%x[rhs]]\n"
   3898 
   3899       // Clear aggregators.
              // v0, v1 / v2, v3 / v4, v5 accumulate for lhs vectors 0 / 1 / 2.
   3900       "movi v0.4s, #0\n"
   3901       "movi v1.4s, #0\n"
   3902       "movi v2.4s, #0\n"
   3903       "mov v3.16b, v0.16b\n"
   3904       "mov v4.16b, v1.16b\n"
   3905       "mov v5.16b, v2.16b\n"
   3906 
   3907       // General NxM lanes loop.
   3908       "1:"
   3909 
   3910       // Subtract counter.
   3911       "subs %x[count], %x[count], #8\n"
   3912 
              // Three lhs vectors in v6..v8; two rhs vectors in v9, v10.
   3913       "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n"
   3914       "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n"
   3915       "prfm pldl1keep, [%x[lhs], #64]\n"
   3916       "prfm pldl1keep, [%x[rhs], #64]\n"
   3917       "umull v11.8h, v9.8b, v6.8b\n"
   3918       "umull v12.8h, v10.8b, v6.8b\n"
   3919       "umull v13.8h, v9.8b, v7.8b\n"
   3920       "umull v14.8h, v10.8b, v7.8b\n"
   3921       "umull v15.8h, v9.8b, v8.8b\n"
   3922       "umull v16.8h, v10.8b, v8.8b\n"
   3923       "uadalp v0.4s, v11.8h\n"
   3924       "uadalp v1.4s, v12.8h\n"
   3925       "uadalp v2.4s, v13.8h\n"
   3926       "uadalp v3.4s, v14.8h\n"
   3927       "uadalp v4.4s, v15.8h\n"
   3928       "uadalp v5.4s, v16.8h\n"
   3929 
   3930       // Loop break.
              // Branches back while the subs result above is > 0 (signed).
   3931       "bgt 1b\n"
   3932 
   3933       // StaticQuantizationFloat::Prepare
              // v6 = 4 words read from lhs, v7 = 4 words read from rhs, v8 =
              // scale bits in all lanes. Words 0, 1 and 2 of v6 are broadcast
              // into v9, v10 and v6; v6 is overwritten last because all three
              // dups read it. Presumably these words are offset terms appended
              // to the packed streams -- TODO confirm against the packing code.
   3934       "ld1 {v6.4s}, [%x[lhs]], #16\n"
   3935       "ld1 {v7.4s}, [%x[rhs]], #16\n"
   3936       "dup v8.4s, %w[scale]\n"
   3937       "dup v9.4s, v6.s[0]\n"
   3938       "dup v10.4s, v6.s[1]\n"
   3939       "dup v6.4s, v6.s[2]\n"
   3940 
   3941       // RowMajorOutput::Prepare
              // x0, x1 = addresses of the second and third output rows.
   3942       "add x0, %x[result], %x[stride]\n"
   3943       "add x1, x0, %x[stride]\n"
   3944 
   3945       // Reduce aggregators.
              // Result: lanes 0..1 of v0, v2, v4 hold the totals of the
              // accumulator pairs (lanes 2..3 repeat them and are not stored).
   3946       "addp v0.4s, v0.4s, v1.4s\n"
   3947       "addp v0.4s, v0.4s, v0.4s\n"
   3948       "addp v2.4s, v2.4s, v3.4s\n"
   3949       "addp v2.4s, v2.4s, v2.4s\n"
   3950       "addp v4.4s, v4.4s, v5.4s\n"
   3951       "addp v4.4s, v4.4s, v4.4s\n"
   3952 
   3953       // StaticQuantizationFloat::Transform
              // Integer adds first, then signed int -> float and the scale multiply.
   3954       "add v0.4s, v0.4s, v9.4s\n"
   3955       "add v2.4s, v2.4s, v10.4s\n"
   3956       "add v4.4s, v4.4s, v6.4s\n"
   3957       "add v0.4s, v0.4s, v7.4s\n"
   3958       "add v2.4s, v2.4s, v7.4s\n"
   3959       "add v4.4s, v4.4s, v7.4s\n"
   3960       "scvtf v0.4s, v0.4s\n"
   3961       "scvtf v2.4s, v2.4s\n"
   3962       "scvtf v4.4s, v4.4s\n"
   3963       "fmul v0.4s, v0.4s, v8.4s\n"
   3964       "fmul v2.4s, v2.4s, v8.4s\n"
   3965       "fmul v4.4s, v4.4s, v8.4s\n"
   3966 
   3967       // RowMajorOutput::Output
              // Two floats per output row.
   3968       "st1 {v0.2s}, [%x[result]], #8\n"
   3969       "st1 {v2.2s}, [x0], #8\n"
   3970       "st1 {v4.2s}, [x1], #8\n"
   3971       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3972       : [count] "r"(params.kernel.count),
   3973         [stride] "r"(params.output_stream.stride),
   3974         [scale] "r"(params.kernel.scale)
   3975       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
   3976         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
   3977 }
   3978 
   3979 template <>
   3980 inline void MulKernel<
   3981     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3,
   3982     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3983                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3984                                          RowMajor>& params,
   3985                  float* result) {
   3986 #ifdef DEBUG
   3987 #ifdef DEBUG_METAGEMM_VERBOSE
   3988   std::cout << __FILE__ << "(" << __LINE__
   3989             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3990                "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, "
   3991                "8>::Multiply()"
   3992             << std::endl
   3993             << std::flush;
   3994 #endif
   3995 #endif
   3996   asm volatile(
   3997       "prfm pldl1keep, [%x[lhs]]\n"
   3998       "prfm pldl1keep, [%x[rhs]]\n"
   3999 
   4000       // Clear aggregators.
   4001       "movi v0.4s, #0\n"
   4002       "movi v1.4s, #0\n"
   4003       "movi v2.4s, #0\n"
   4004       "mov v3.16b, v0.16b\n"
   4005       "mov v4.16b, v1.16b\n"
   4006       "mov v5.16b, v2.16b\n"
   4007       "mov v6.16b, v3.16b\n"
   4008       "mov v7.16b, v4.16b\n"
   4009       "mov v8.16b, v5.16b\n"
   4010 
   4011       // 3x3 lanes loop.
   4012       "1:"
   4013 
   4014       "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n"
   4015       "ld1 {v9.8b}, [%x[lhs]], #8\n"
   4016       "umull v15.8h, v9.8b, v12.8b\n"
   4017       "ld1 {v10.8b}, [%x[lhs]], #8\n"
   4018       "umull v16.8h, v9.8b, v13.8b\n"
   4019       "ld1 {v11.8b}, [%x[lhs]], #8\n"
   4020       "umull v17.8h, v9.8b, v14.8b\n"
   4021       "prfm pldl1keep, [%x[lhs], #64]\n"
   4022       "umull v18.8h, v10.8b, v12.8b\n"
   4023       "prfm pldl1keep, [%x[rhs], #64]\n"
   4024       "uadalp v0.4s, v15.8h\n"
   4025       "uadalp v1.4s, v16.8h\n"
   4026       "uadalp v2.4s, v17.8h\n"
   4027       "uadalp v3.4s, v18.8h\n"
   4028       "umull v15.8h, v10.8b, v13.8b\n"
   4029       "umull v16.8h, v10.8b, v14.8b\n"
   4030       "umull v17.8h, v11.8b, v12.8b\n"
   4031       "umull v18.8h, v11.8b, v13.8b\n"
   4032 
   4033       // Subtract counter.
   4034       "subs %x[count], %x[count], #8\n"
   4035 
   4036       "umull v9.8h, v11.8b, v14.8b\n"
   4037       "uadalp v4.4s, v15.8h\n"
   4038       "uadalp v5.4s, v16.8h\n"
   4039       "uadalp v6.4s, v17.8h\n"
   4040       "uadalp v7.4s, v18.8h\n"
   4041       "uadalp v8.4s, v9.8h\n"
   4042 
   4043       // Loop break.
   4044       "bgt 1b\n"
   4045 
   4046       // StaticQuantizationFloat::Prepare
   4047       "ld1 {v9.4s}, [%x[lhs]], #16\n"
   4048       "ld1 {v10.4s}, [%x[rhs]], #16\n"
   4049       "dup v11.4s, %w[scale]\n"
   4050       "dup v12.4s, v9.s[0]\n"
   4051       "dup v13.4s, v9.s[1]\n"
   4052       "dup v9.4s, v9.s[2]\n"
   4053 
   4054       // RowMajorOutput::Prepare
   4055       "add x0, %x[result], %x[stride]\n"
   4056       "add x1, x0, %x[stride]\n"
   4057 
   4058       // Reduce aggregators.
   4059       "addp v0.4s, v0.4s, v1.4s\n"
   4060       "addp v2.4s, v2.4s, v2.4s\n"
   4061       "addp v0.4s, v0.4s, v2.4s\n"
   4062       "addp v3.4s, v3.4s, v4.4s\n"
   4063       "addp v5.4s, v5.4s, v5.4s\n"
   4064       "addp v3.4s, v3.4s, v5.4s\n"
   4065       "addp v6.4s, v6.4s, v7.4s\n"
   4066       "addp v8.4s, v8.4s, v8.4s\n"
   4067       "addp v6.4s, v6.4s, v8.4s\n"
   4068 
   4069       // StaticQuantizationFloat::Transform
   4070       "add v0.4s, v0.4s, v12.4s\n"
   4071       "add v3.4s, v3.4s, v13.4s\n"
   4072       "add v6.4s, v6.4s, v9.4s\n"
   4073       "add v0.4s, v0.4s, v10.4s\n"
   4074       "add v3.4s, v3.4s, v10.4s\n"
   4075       "add v6.4s, v6.4s, v10.4s\n"
   4076       "scvtf v0.4s, v0.4s\n"
   4077       "scvtf v3.4s, v3.4s\n"
   4078       "scvtf v6.4s, v6.4s\n"
   4079       "fmul v0.4s, v0.4s, v11.4s\n"
   4080       "fmul v3.4s, v3.4s, v11.4s\n"
   4081       "fmul v6.4s, v6.4s, v11.4s\n"
   4082 
   4083       // RowMajorOutput::Output
   4084       "st1 {v0.2s}, [%x[result]], #8\n"
   4085       "st1 {v0.s}[2], [%x[result]], #4\n"
   4086       "st1 {v3.2s}, [x0], #8\n"
   4087       "st1 {v3.s}[2], [x0], #4\n"
   4088       "st1 {v6.2s}, [x1], #8\n"
   4089       "st1 {v6.s}[2], [x1], #4\n"
   4090       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   4091       : [count] "r"(params.kernel.count),
   4092         [stride] "r"(params.output_stream.stride),
   4093         [scale] "r"(params.kernel.scale)
   4094       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
   4095         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc",
   4096         "memory");
   4097 }
   4098 
   4099 }  // namespace meta
   4100 }  // namespace gemmlowp
   4101 
   4102 #else
   4103 #warning "Meta gemm for arm64 requires: GEMMLOWP_NEON_64!"
   4104 #endif
   4105 
   4106 #endif  // GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_
   4107