Home | History | Annotate | Download | only in meta
      1 // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #ifndef GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_
     16 #define GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_
     17 
     18 #ifdef GEMMLOWP_NEON_32
     19 
     20 #include <cassert>
     21 #include <cstdint>
     22 
     23 namespace gemmlowp {
     24 namespace meta {
     25 
     26 template <>
     27 inline void
     28 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 1,
     29           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
     30                        const FusedKernelParams<QuantizedStaticPreprocessed,
     31                                                RowMajor>& params,
     32                        uint8_t* result) {
     33 #ifdef DEBUG
     34 #ifdef DEBUG_METAGEMM_VERBOSE
     35   std::cout << __FILE__ << "(" << __LINE__
     36             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
     37                "QuantizedStaticPreprocessed, RowMajor, 1, 1, 8>::Multiply()"
     38             << std::endl
     39             << std::flush;
     40 #endif
     41 #endif
     42   asm volatile(
     43       "pld [%[lhs]]\n"
     44       "pld [%[rhs]]\n"
     45 
     46       // Clear aggregators.
     47       "vmov.i32 q0, #0\n"
     48 
     49       // General NxM lanes loop.
     50       "1:"
     51 
     52       // Subtract counter.
     53       "subs %[count], %[count], #8\n"
     54 
     55       "vld1.32 {d2}, [%[lhs]:64]!\n"
     56       "vld1.32 {d3}, [%[rhs]:64]!\n"
     57       "pld [%[lhs], #64]\n"
     58       "pld [%[rhs], #64]\n"
     59       "vmull.u8 q2, d3, d2\n"
     60       "vpadal.u16 q0, q2\n"
     61 
     62       // Loop break.
     63       "bgt 1b\n"
     64 
     65       // StaticQuantization::Prepare
     66       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
     67       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
     68       "vdup.32 q6, %[multiplicative_offset]\n"
     69       "vdup.32 q7, %[rounding_offset]\n"
     70       "vdup.32 q8, %[shift]\n"
     71       "vdup.32 q4, d8[0]\n"
     72 
     73       // RowMajorOutput::Prepare
     74 
     75       // Reduce aggregators.
     76       "vpadd.u32 d0, d0, d1\n"
     77       "vpadd.u32 d0, d0, d0\n"
     78 
     79       // StaticQuantization::Transform
     80       "vadd.s32 q0, q0, q4\n"
     81       "vadd.s32 q0, q0, q5\n"
     82       "vmul.i32 q0, q0, q6\n"
     83       "vadd.i32 q0, q0, q7\n"
     84       "vshl.s32 q0, q0, q8\n"
     85       "vqmovn.s32 d0, q0\n"
     86       "vqmovun.s16 d0, q0\n"
     87 
     88       // RowMajorOutput::Output
     89       "vst1.8 {d0[0]}, [%[result]]!\n"
     90       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
     91       : [count] "r"(params.kernel.count),
     92         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
     93         [shift] "r"(params.kernel.shift),
     94         [stride] "r"(params.output_stream.stride),
     95         [rounding_offset] "r"(params.kernel.rounding_offset)
     96       : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "d12",
     97         "d13", "d14", "d15", "d16", "d17", "cc", "memory");
     98 }
     99 
    100 template <>
    101 inline void
    102 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 2,
    103           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    104                        const FusedKernelParams<QuantizedStaticPreprocessed,
    105                                                RowMajor>& params,
    106                        uint8_t* result) {
    107 #ifdef DEBUG
    108 #ifdef DEBUG_METAGEMM_VERBOSE
    109   std::cout << __FILE__ << "(" << __LINE__
    110             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    111                "QuantizedStaticPreprocessed, RowMajor, 1, 2, 8>::Multiply()"
    112             << std::endl
    113             << std::flush;
    114 #endif
    115 #endif
    116   asm volatile(
    117       "pld [%[lhs]]\n"
    118       "pld [%[rhs]]\n"
    119 
    120       // Clear aggregators.
    121       "vmov.i32 q0, #0\n"
    122       "vmov.i32 q1, #0\n"
    123 
    124       // General NxM lanes loop.
    125       "1:"
    126 
    127       // Subtract counter.
    128       "subs %[count], %[count], #8\n"
    129 
    130       "vld1.32 {d4}, [%[lhs]:64]!\n"
    131       "vld1.32 {d5, d6}, [%[rhs]:64]!\n"
    132       "pld [%[lhs], #64]\n"
    133       "pld [%[rhs], #64]\n"
    134       "vmull.u8 q4, d5, d4\n"
    135       "vmull.u8 q5, d6, d4\n"
    136       "vpadal.u16 q0, q4\n"
    137       "vpadal.u16 q1, q5\n"
    138 
    139       // Loop break.
    140       "bgt 1b\n"
    141 
    142       // StaticQuantization::Prepare
    143       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
    144       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
    145       "vdup.32 q6, %[multiplicative_offset]\n"
    146       "vdup.32 q7, %[rounding_offset]\n"
    147       "vdup.32 q8, %[shift]\n"
    148       "vdup.32 q4, d8[0]\n"
    149 
    150       // RowMajorOutput::Prepare
    151 
    152       // Reduce aggregators.
    153       "vpadd.u32 d0, d0, d1\n"
    154       "vpadd.u32 d2, d2, d3\n"
    155       "vpadd.u32 d0, d0, d2\n"
    156 
    157       // StaticQuantization::Transform
    158       "vadd.s32 q0, q0, q4\n"
    159       "vadd.s32 q0, q0, q5\n"
    160       "vmul.i32 q0, q0, q6\n"
    161       "vadd.i32 q0, q0, q7\n"
    162       "vshl.s32 q0, q0, q8\n"
    163       "vqmovn.s32 d0, q0\n"
    164       "vqmovun.s16 d0, q0\n"
    165 
    166       // RowMajorOutput::Output
    167       "vst1.16 {d0[0]}, [%[result]]!\n"
    168       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    169       : [count] "r"(params.kernel.count),
    170         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    171         [shift] "r"(params.kernel.shift),
    172         [stride] "r"(params.output_stream.stride),
    173         [rounding_offset] "r"(params.kernel.rounding_offset)
    174       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11",
    175         "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
    176 }
    177 
    178 template <>
    179 inline void
    180 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 3,
    181           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    182                        const FusedKernelParams<QuantizedStaticPreprocessed,
    183                                                RowMajor>& params,
    184                        uint8_t* result) {
    185 #ifdef DEBUG
    186 #ifdef DEBUG_METAGEMM_VERBOSE
    187   std::cout << __FILE__ << "(" << __LINE__
    188             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    189                "QuantizedStaticPreprocessed, RowMajor, 1, 3, 8>::Multiply()"
    190             << std::endl
    191             << std::flush;
    192 #endif
    193 #endif
    194   asm volatile(
    195       "pld [%[lhs]]\n"
    196       "pld [%[rhs]]\n"
    197 
    198       // Clear aggregators.
    199       "vmov.i32 q0, #0\n"
    200       "vmov.i32 q1, #0\n"
    201       "vmov.i32 q2, #0\n"
    202 
    203       // General NxM lanes loop.
    204       "1:"
    205 
    206       // Subtract counter.
    207       "subs %[count], %[count], #8\n"
    208 
    209       "vld1.32 {d6}, [%[lhs]:64]!\n"
    210       "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n"
    211       "pld [%[lhs], #64]\n"
    212       "pld [%[rhs], #64]\n"
    213       "vmull.u8 q5, d7, d6\n"
    214       "vmull.u8 q6, d8, d6\n"
    215       "vmull.u8 q7, d9, d6\n"
    216       "vpadal.u16 q0, q5\n"
    217       "vpadal.u16 q1, q6\n"
    218       "vpadal.u16 q2, q7\n"
    219 
    220       // Loop break.
    221       "bgt 1b\n"
    222 
    223       // StaticQuantization::Prepare
    224       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
    225       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
    226       "vdup.32 q6, %[multiplicative_offset]\n"
    227       "vdup.32 q7, %[rounding_offset]\n"
    228       "vdup.32 q8, %[shift]\n"
    229       "vdup.32 q4, d8[0]\n"
    230 
    231       // RowMajorOutput::Prepare
    232 
    233       // Reduce aggregators.
    234       "vpadd.u32 d0, d0, d1\n"
    235       "vpadd.u32 d2, d2, d3\n"
    236       "vpadd.u32 d4, d4, d5\n"
    237       "vpadd.u32 d0, d0, d2\n"
    238       "vpadd.u32 d1, d4, d4\n"
    239 
    240       // StaticQuantization::Transform
    241       "vadd.s32 q0, q0, q4\n"
    242       "vadd.s32 q0, q0, q5\n"
    243       "vmul.i32 q0, q0, q6\n"
    244       "vadd.i32 q0, q0, q7\n"
    245       "vshl.s32 q0, q0, q8\n"
    246       "vqmovn.s32 d0, q0\n"
    247       "vqmovun.s16 d0, q0\n"
    248 
    249       // RowMajorOutput::Output
    250       "vst1.16 {d0[0]}, [%[result]]!\n"
    251       "vst1.8 {d0[2]}, [%[result]]!\n"
    252       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    253       : [count] "r"(params.kernel.count),
    254         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    255         [shift] "r"(params.kernel.shift),
    256         [stride] "r"(params.output_stream.stride),
    257         [rounding_offset] "r"(params.kernel.rounding_offset)
    258       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
    259         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
    260 }
    261 
    262 template <>
    263 inline void
    264 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 4,
    265           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    266                        const FusedKernelParams<QuantizedStaticPreprocessed,
    267                                                RowMajor>& params,
    268                        uint8_t* result) {
    269 #ifdef DEBUG
    270 #ifdef DEBUG_METAGEMM_VERBOSE
    271   std::cout << __FILE__ << "(" << __LINE__
    272             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    273                "QuantizedStaticPreprocessed, RowMajor, 1, 4, 8>::Multiply()"
    274             << std::endl
    275             << std::flush;
    276 #endif
    277 #endif
    278   asm volatile(
    279       "pld [%[lhs]]\n"
    280       "pld [%[rhs]]\n"
    281 
    282       // Clear aggregators.
    283       "vmov.i32 q0, #0\n"
    284       "vmov.i32 q1, #0\n"
    285       "vmov.i32 q2, #0\n"
    286       "vmov.i32 q3, q0\n"
    287 
    288       // General NxM lanes loop.
    289       "1:"
    290 
    291       // Subtract counter.
    292       "subs %[count], %[count], #8\n"
    293 
    294       "vld1.32 {d8}, [%[lhs]:64]!\n"
    295       "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n"
    296       "pld [%[lhs], #64]\n"
    297       "pld [%[rhs], #64]\n"
    298       "vmull.u8 q7, d9, d8\n"
    299       "vmull.u8 q8, d10, d8\n"
    300       "vmull.u8 q9, d11, d8\n"
    301       "vmull.u8 q10, d12, d8\n"
    302       "vpadal.u16 q0, q7\n"
    303       "vpadal.u16 q1, q8\n"
    304       "vpadal.u16 q2, q9\n"
    305       "vpadal.u16 q3, q10\n"
    306 
    307       // Loop break.
    308       "bgt 1b\n"
    309 
    310       // StaticQuantization::Prepare
    311       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
    312       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
    313       "vdup.32 q6, %[multiplicative_offset]\n"
    314       "vdup.32 q7, %[rounding_offset]\n"
    315       "vdup.32 q8, %[shift]\n"
    316       "vdup.32 q4, d8[0]\n"
    317 
    318       // RowMajorOutput::Prepare
    319 
    320       // Reduce aggregators.
    321       "vpadd.u32 d0, d0, d1\n"
    322       "vpadd.u32 d2, d2, d3\n"
    323       "vpadd.u32 d4, d4, d5\n"
    324       "vpadd.u32 d6, d6, d7\n"
    325       "vpadd.u32 d0, d0, d2\n"
    326       "vpadd.u32 d1, d4, d6\n"
    327 
    328       // StaticQuantization::Transform
    329       "vadd.s32 q0, q0, q4\n"
    330       "vadd.s32 q0, q0, q5\n"
    331       "vmul.i32 q0, q0, q6\n"
    332       "vadd.i32 q0, q0, q7\n"
    333       "vshl.s32 q0, q0, q8\n"
    334       "vqmovn.s32 d0, q0\n"
    335       "vqmovun.s16 d0, q0\n"
    336 
    337       // RowMajorOutput::Output
    338       "vst1.32 {d0[0]}, [%[result]]!\n"
    339       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    340       : [count] "r"(params.kernel.count),
    341         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    342         [shift] "r"(params.kernel.shift),
    343         [stride] "r"(params.output_stream.stride),
    344         [rounding_offset] "r"(params.kernel.rounding_offset)
    345       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
    346         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
    347         "d21", "cc", "memory");
    348 }
    349 
    350 template <>
    351 inline void
    352 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 5,
    353           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    354                        const FusedKernelParams<QuantizedStaticPreprocessed,
    355                                                RowMajor>& params,
    356                        uint8_t* result) {
    357 #ifdef DEBUG
    358 #ifdef DEBUG_METAGEMM_VERBOSE
    359   std::cout << __FILE__ << "(" << __LINE__
    360             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    361                "QuantizedStaticPreprocessed, RowMajor, 1, 5, 8>::Multiply()"
    362             << std::endl
    363             << std::flush;
    364 #endif
    365 #endif
    366   asm volatile(
    367       "pld [%[lhs]]\n"
    368       "pld [%[rhs]]\n"
    369 
    370       // Clear aggregators.
    371       "vmov.i32 q0, #0\n"
    372       "vmov.i32 q1, #0\n"
    373       "vmov.i32 q2, #0\n"
    374       "vmov.i32 q3, q0\n"
    375       "vmov.i32 q4, q1\n"
    376 
    377       // General 1xM lanes loop.
    378       "1:"
    379 
    380       // Subtract counter.
    381       "subs %[count], %[count], #8\n"
    382 
    383       "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n"
    384       "vld1.32 {d14}, [%[lhs]:64]!\n"
    385       "pld [%[lhs], #64]\n"
    386       "vmull.u8 q8, d10, d14\n"
    387       "vmull.u8 q9, d11, d14\n"
    388       "vmull.u8 q10, d12, d14\n"
    389       "vmull.u8 q11, d13, d14\n"
    390       "vld1.32 {d10}, [%[rhs]:64]!\n"
    391       "pld [%[rhs], #128]\n"
    392       "vpadal.u16 q0, q8\n"
    393       "vpadal.u16 q1, q9\n"
    394       "vpadal.u16 q2, q10\n"
    395       "vpadal.u16 q3, q11\n"
    396       "vmull.u8 q8, d10, d14\n"
    397       "vpadal.u16 q4, q8\n"
    398 
    399       // Loop break.
    400       "bgt 1b\n"
    401 
    402       // StaticQuantization::Prepare
    403       "vld1.32 {d10, d11}, [%[lhs]:64]!\n"
    404       "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
    405       "vdup.32 q8, %[multiplicative_offset]\n"
    406       "vdup.32 q9, %[rounding_offset]\n"
    407       "vdup.32 q10, %[shift]\n"
    408       "vdup.32 q5, d10[0]\n"
    409 
    410       // RowMajorOutput::Prepare
    411 
    412       // Reduce aggregators.
    413       "vpadd.u32 d0, d0, d1\n"
    414       "vpadd.u32 d2, d2, d3\n"
    415       "vpadd.u32 d4, d4, d5\n"
    416       "vpadd.u32 d6, d6, d7\n"
    417       "vpadd.u32 d8, d8, d9\n"
    418       "vpadd.u32 d0, d0, d2\n"
    419       "vpadd.u32 d1, d4, d6\n"
    420       "vpadd.u32 d2, d8, d8\n"
    421 
    422       // StaticQuantization::Transform
    423       "vadd.s32 q0, q0, q5\n"
    424       "vadd.s32 q1, q1, q5\n"
    425       "vadd.s32 q0, q0, q6\n"
    426       "vadd.s32 q1, q1, q7\n"
    427       "vmul.i32 q0, q0, q8\n"
    428       "vmul.i32 q1, q1, q8\n"
    429       "vadd.i32 q0, q0, q9\n"
    430       "vadd.i32 q1, q1, q9\n"
    431       "vshl.s32 q0, q0, q10\n"
    432       "vshl.s32 q1, q1, q10\n"
    433       "vqmovn.s32 d0, q0\n"
    434       "vqmovn.s32 d1, q1\n"
    435       "vqmovun.s16 d0, q0\n"
    436 
    437       // RowMajorOutput::Output
    438       "vst1.32 {d0[0]}, [%[result]]!\n"
    439       "vst1.8 {d0[4]}, [%[result]]!\n"
    440       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    441       : [count] "r"(params.kernel.count),
    442         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    443         [shift] "r"(params.kernel.shift),
    444         [stride] "r"(params.output_stream.stride),
    445         [rounding_offset] "r"(params.kernel.rounding_offset)
    446       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
    447         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
    448         "d21", "d22", "d23", "cc", "memory");
    449 }
    450 
    451 template <>
    452 inline void
    453 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 6,
    454           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    455                        const FusedKernelParams<QuantizedStaticPreprocessed,
    456                                                RowMajor>& params,
    457                        uint8_t* result) {
    458 #ifdef DEBUG
    459 #ifdef DEBUG_METAGEMM_VERBOSE
    460   std::cout << __FILE__ << "(" << __LINE__
    461             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    462                "QuantizedStaticPreprocessed, RowMajor, 1, 6, 8>::Multiply()"
    463             << std::endl
    464             << std::flush;
    465 #endif
    466 #endif
    467   asm volatile(
    468       "pld [%[lhs]]\n"
    469       "pld [%[rhs]]\n"
    470 
    471       // Clear aggregators.
    472       "vmov.i32 q0, #0\n"
    473       "vmov.i32 q1, #0\n"
    474       "vmov.i32 q2, #0\n"
    475       "vmov.i32 q3, q0\n"
    476       "vmov.i32 q4, q1\n"
    477       "vmov.i32 q5, q2\n"
    478 
    479       // General 1xM lanes loop.
    480       "1:"
    481 
    482       // Subtract counter.
    483       "subs %[count], %[count], #8\n"
    484 
    485       "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
    486       "vld1.32 {d16}, [%[lhs]:64]!\n"
    487       "pld [%[lhs], #64]\n"
    488       "vmull.u8 q9, d12, d16\n"
    489       "vmull.u8 q10, d13, d16\n"
    490       "vmull.u8 q11, d14, d16\n"
    491       "vmull.u8 q12, d15, d16\n"
    492       "vld1.32 {d12, d13}, [%[rhs]:64]!\n"
    493       "pld [%[rhs], #128]\n"
    494       "vpadal.u16 q0, q9\n"
    495       "vpadal.u16 q1, q10\n"
    496       "vpadal.u16 q2, q11\n"
    497       "vpadal.u16 q3, q12\n"
    498       "vmull.u8 q9, d12, d16\n"
    499       "vmull.u8 q10, d13, d16\n"
    500       "vpadal.u16 q4, q9\n"
    501       "vpadal.u16 q5, q10\n"
    502 
    503       // Loop break.
    504       "bgt 1b\n"
    505 
    506       // StaticQuantization::Prepare
    507       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
    508       "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
    509       "vdup.32 q9, %[multiplicative_offset]\n"
    510       "vdup.32 q10, %[rounding_offset]\n"
    511       "vdup.32 q11, %[shift]\n"
    512       "vdup.32 q6, d12[0]\n"
    513 
    514       // RowMajorOutput::Prepare
    515 
    516       // Reduce aggregators.
    517       "vpadd.u32 d0, d0, d1\n"
    518       "vpadd.u32 d2, d2, d3\n"
    519       "vpadd.u32 d4, d4, d5\n"
    520       "vpadd.u32 d6, d6, d7\n"
    521       "vpadd.u32 d8, d8, d9\n"
    522       "vpadd.u32 d10, d10, d11\n"
    523       "vpadd.u32 d0, d0, d2\n"
    524       "vpadd.u32 d1, d4, d6\n"
    525       "vpadd.u32 d2, d8, d10\n"
    526 
    527       // StaticQuantization::Transform
    528       "vadd.s32 q0, q0, q6\n"
    529       "vadd.s32 q1, q1, q6\n"
    530       "vadd.s32 q0, q0, q7\n"
    531       "vadd.s32 q1, q1, q8\n"
    532       "vmul.i32 q0, q0, q9\n"
    533       "vmul.i32 q1, q1, q9\n"
    534       "vadd.i32 q0, q0, q10\n"
    535       "vadd.i32 q1, q1, q10\n"
    536       "vshl.s32 q0, q0, q11\n"
    537       "vshl.s32 q1, q1, q11\n"
    538       "vqmovn.s32 d0, q0\n"
    539       "vqmovn.s32 d1, q1\n"
    540       "vqmovun.s16 d0, q0\n"
    541 
    542       // RowMajorOutput::Output
    543       "vst1.32 {d0[0]}, [%[result]]!\n"
    544       "vst1.16 {d0[2]}, [%[result]]!\n"
    545       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    546       : [count] "r"(params.kernel.count),
    547         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    548         [shift] "r"(params.kernel.shift),
    549         [stride] "r"(params.output_stream.stride),
    550         [rounding_offset] "r"(params.kernel.rounding_offset)
    551       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
    552         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
    553         "d21", "d22", "d23", "d24", "d25", "cc", "memory");
    554 }
    555 
    556 template <>
    557 inline void
    558 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 7,
    559           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    560                        const FusedKernelParams<QuantizedStaticPreprocessed,
    561                                                RowMajor>& params,
    562                        uint8_t* result) {
    563 #ifdef DEBUG
    564 #ifdef DEBUG_METAGEMM_VERBOSE
    565   std::cout << __FILE__ << "(" << __LINE__
    566             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    567                "QuantizedStaticPreprocessed, RowMajor, 1, 7, 8>::Multiply()"
    568             << std::endl
    569             << std::flush;
    570 #endif
    571 #endif
    572   asm volatile(
    573       "pld [%[lhs]]\n"
    574       "pld [%[rhs]]\n"
    575 
    576       // Clear aggregators.
    577       "vmov.i32 q0, #0\n"
    578       "vmov.i32 q1, #0\n"
    579       "vmov.i32 q2, #0\n"
    580       "vmov.i32 q3, q0\n"
    581       "vmov.i32 q4, q1\n"
    582       "vmov.i32 q5, q2\n"
    583       "vmov.i32 q6, q3\n"
    584 
    585       // General 1xM lanes loop.
    586       "1:"
    587 
    588       // Subtract counter.
    589       "subs %[count], %[count], #8\n"
    590 
    591       "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
    592       "vld1.32 {d18}, [%[lhs]:64]!\n"
    593       "pld [%[lhs], #64]\n"
    594       "vmull.u8 q10, d14, d18\n"
    595       "vmull.u8 q11, d15, d18\n"
    596       "vmull.u8 q12, d16, d18\n"
    597       "vmull.u8 q13, d17, d18\n"
    598       "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
    599       "pld [%[rhs], #128]\n"
    600       "vpadal.u16 q0, q10\n"
    601       "vpadal.u16 q1, q11\n"
    602       "vpadal.u16 q2, q12\n"
    603       "vpadal.u16 q3, q13\n"
    604       "vmull.u8 q10, d14, d18\n"
    605       "vmull.u8 q11, d15, d18\n"
    606       "vmull.u8 q12, d16, d18\n"
    607       "vpadal.u16 q4, q10\n"
    608       "vpadal.u16 q5, q11\n"
    609       "vpadal.u16 q6, q12\n"
    610 
    611       // Loop break.
    612       "bgt 1b\n"
    613 
    614       // StaticQuantization::Prepare
    615       "vld1.32 {d14, d15}, [%[lhs]:64]!\n"
    616       "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n"
    617       "vdup.32 q10, %[multiplicative_offset]\n"
    618       "vdup.32 q11, %[rounding_offset]\n"
    619       "vdup.32 q12, %[shift]\n"
    620       "vdup.32 q7, d14[0]\n"
    621 
    622       // RowMajorOutput::Prepare
    623 
    624       // Reduce aggregators.
    625       "vpadd.u32 d0, d0, d1\n"
    626       "vpadd.u32 d2, d2, d3\n"
    627       "vpadd.u32 d4, d4, d5\n"
    628       "vpadd.u32 d6, d6, d7\n"
    629       "vpadd.u32 d8, d8, d9\n"
    630       "vpadd.u32 d10, d10, d11\n"
    631       "vpadd.u32 d12, d12, d13\n"
    632       "vpadd.u32 d0, d0, d2\n"
    633       "vpadd.u32 d1, d4, d6\n"
    634       "vpadd.u32 d2, d8, d10\n"
    635       "vpadd.u32 d3, d12, d12\n"
    636 
    637       // StaticQuantization::Transform
    638       "vadd.s32 q0, q0, q7\n"
    639       "vadd.s32 q1, q1, q7\n"
    640       "vadd.s32 q0, q0, q8\n"
    641       "vadd.s32 q1, q1, q9\n"
    642       "vmul.i32 q0, q0, q10\n"
    643       "vmul.i32 q1, q1, q10\n"
    644       "vadd.i32 q0, q0, q11\n"
    645       "vadd.i32 q1, q1, q11\n"
    646       "vshl.s32 q0, q0, q12\n"
    647       "vshl.s32 q1, q1, q12\n"
    648       "vqmovn.s32 d0, q0\n"
    649       "vqmovn.s32 d1, q1\n"
    650       "vqmovun.s16 d0, q0\n"
    651 
    652       // RowMajorOutput::Output
    653       "vst1.32 {d0[0]}, [%[result]]!\n"
    654       "vst1.16 {d0[2]}, [%[result]]!\n"
    655       "vst1.8 {d0[6]}, [%[result]]!\n"
    656       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    657       : [count] "r"(params.kernel.count),
    658         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    659         [shift] "r"(params.kernel.shift),
    660         [stride] "r"(params.output_stream.stride),
    661         [rounding_offset] "r"(params.kernel.rounding_offset)
    662       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
    663         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
    664         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory");
    665 }
    666 
    667 template <>
    668 inline void
    669 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 8,
    670           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    671                        const FusedKernelParams<QuantizedStaticPreprocessed,
    672                                                RowMajor>& params,
    673                        uint8_t* result) {
    674 #ifdef DEBUG
    675 #ifdef DEBUG_METAGEMM_VERBOSE
    676   std::cout << __FILE__ << "(" << __LINE__
    677             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    678                "QuantizedStaticPreprocessed, RowMajor, 1, 8, 8>::Multiply()"
    679             << std::endl
    680             << std::flush;
    681 #endif
    682 #endif
    683   asm volatile(
    684       "pld [%[lhs]]\n"
    685       "pld [%[rhs]]\n"
    686 
    687       // Clear aggregators.
    688       "vmov.i32 q0, #0\n"
    689       "vmov.i32 q1, #0\n"
    690       "vmov.i32 q2, #0\n"
    691       "vmov.i32 q3, q0\n"
    692       "vmov.i32 q4, q1\n"
    693       "vmov.i32 q5, q2\n"
    694       "vmov.i32 q6, q3\n"
    695       "vmov.i32 q7, q4\n"
    696 
    697       // 1x8 lanes loop.
    698       "1:"
    699 
    700       "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
    701       "vld1.32 {d16}, [%[lhs]:64]!\n"
    702       "vmull.u8 q11, d16, d17\n"
    703       "vmull.u8 q12, d16, d18\n"
    704       "vmull.u8 q13, d16, d19\n"
    705       "vmull.u8 q14, d16, d20\n"
    706       "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
    707       "vpadal.u16 q0, q11\n"
    708       "vpadal.u16 q1, q12\n"
    709       "vpadal.u16 q2, q13\n"
    710       "vpadal.u16 q3, q14\n"
    711       "pld [%[rhs], #256]\n"
    712       "vmull.u8 q15, d16, d17\n"
    713       "vmull.u8 q11, d16, d18\n"
    714       "vmull.u8 q12, d16, d19\n"
    715       "vmull.u8 q13, d16, d20\n"
    716       "pld [%[lhs], #32]\n"
    717 
    718       // Subtract counter.
    719       "subs %[count], %[count], #8\n"
    720 
    721       "vpadal.u16 q4, q15\n"
    722       "vpadal.u16 q5, q11\n"
    723       "vpadal.u16 q6, q12\n"
    724       "vpadal.u16 q7, q13\n"
    725 
    726       // Loop break.
    727       "bgt 1b\n"
    728 
    729       // StaticQuantization::Prepare
    730       "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
    731       "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n"
    732       "vdup.32 q11, %[multiplicative_offset]\n"
    733       "vdup.32 q12, %[rounding_offset]\n"
    734       "vdup.32 q13, %[shift]\n"
    735       "vdup.32 q8, d16[0]\n"
    736 
    737       // RowMajorOutput::Prepare
    738 
    739       // Reduce aggregators.
    740       "vpadd.u32 d0, d0, d1\n"
    741       "vpadd.u32 d2, d2, d3\n"
    742       "vpadd.u32 d4, d4, d5\n"
    743       "vpadd.u32 d6, d6, d7\n"
    744       "vpadd.u32 d8, d8, d9\n"
    745       "vpadd.u32 d10, d10, d11\n"
    746       "vpadd.u32 d12, d12, d13\n"
    747       "vpadd.u32 d14, d14, d15\n"
    748       "vpadd.u32 d0, d0, d2\n"
    749       "vpadd.u32 d1, d4, d6\n"
    750       "vpadd.u32 d2, d8, d10\n"
    751       "vpadd.u32 d3, d12, d14\n"
    752 
    753       // StaticQuantization::Transform
    754       "vadd.s32 q0, q0, q8\n"
    755       "vadd.s32 q1, q1, q8\n"
    756       "vadd.s32 q0, q0, q9\n"
    757       "vadd.s32 q1, q1, q10\n"
    758       "vmul.i32 q0, q0, q11\n"
    759       "vmul.i32 q1, q1, q11\n"
    760       "vadd.i32 q0, q0, q12\n"
    761       "vadd.i32 q1, q1, q12\n"
    762       "vshl.s32 q0, q0, q13\n"
    763       "vshl.s32 q1, q1, q13\n"
    764       "vqmovn.s32 d0, q0\n"
    765       "vqmovn.s32 d1, q1\n"
    766       "vqmovun.s16 d0, q0\n"
    767 
    768       // RowMajorOutput::Output
    769       "vst1.32 {d0}, [%[result]]!\n"
    770       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    771       : [count] "r"(params.kernel.count),
    772         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    773         [shift] "r"(params.kernel.shift),
    774         [stride] "r"(params.output_stream.stride),
    775         [rounding_offset] "r"(params.kernel.rounding_offset)
    776       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
    777         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
    778         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
    779         "d31", "cc", "memory");
    780 }
    781 
    782 template <>
    783 inline void
    784 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 1,
    785           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    786                        const FusedKernelParams<QuantizedStaticPreprocessed,
    787                                                RowMajor>& params,
    788                        uint8_t* result) {
    789 #ifdef DEBUG
    790 #ifdef DEBUG_METAGEMM_VERBOSE
    791   std::cout << __FILE__ << "(" << __LINE__
    792             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    793                "QuantizedStaticPreprocessed, RowMajor, 2, 1, 8>::Multiply()"
    794             << std::endl
    795             << std::flush;
    796 #endif
    797 #endif
    798   asm volatile(
    799       "pld [%[lhs]]\n"
    800       "pld [%[rhs]]\n"
    801 
    802       // Clear aggregators.
    803       "vmov.i32 q0, #0\n"
    804       "vmov.i32 q1, #0\n"
    805 
    806       // General NxM lanes loop.
    807       "1:"
    808 
    809       // Subtract counter.
    810       "subs %[count], %[count], #8\n"
    811 
    812       "vld1.32 {d4, d5}, [%[lhs]:64]!\n"
    813       "vld1.32 {d6}, [%[rhs]:64]!\n"
    814       "pld [%[lhs], #64]\n"
    815       "pld [%[rhs], #64]\n"
    816       "vmull.u8 q4, d6, d4\n"
    817       "vmull.u8 q5, d6, d5\n"
    818       "vpadal.u16 q0, q4\n"
    819       "vpadal.u16 q1, q5\n"
    820 
    821       // Loop break.
    822       "bgt 1b\n"
    823 
    824       // StaticQuantization::Prepare
    825       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
    826       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
    827       "vdup.32 q6, %[multiplicative_offset]\n"
    828       "vdup.32 q7, %[rounding_offset]\n"
    829       "vdup.32 q8, %[shift]\n"
    830       "vdup.32 q2, d8[0]\n"
    831       "vdup.32 q4, d8[1]\n"
    832 
    833       // RowMajorOutput::Prepare
    834       "add r0, %[result], %[stride]\n"
    835 
    836       // Reduce aggregators.
    837       "vpadd.u32 d0, d0, d1\n"
    838       "vpadd.u32 d0, d0, d0\n"
    839       "vpadd.u32 d2, d2, d3\n"
    840       "vpadd.u32 d2, d2, d2\n"
    841 
    842       // StaticQuantization::Transform
    843       "vadd.s32 q0, q0, q2\n"
    844       "vadd.s32 q1, q1, q4\n"
    845       "vadd.s32 q0, q0, q5\n"
    846       "vadd.s32 q1, q1, q5\n"
    847       "vmul.i32 q0, q0, q6\n"
    848       "vmul.i32 q1, q1, q6\n"
    849       "vadd.i32 q0, q0, q7\n"
    850       "vadd.i32 q1, q1, q7\n"
    851       "vshl.s32 q0, q0, q8\n"
    852       "vshl.s32 q1, q1, q8\n"
    853       "vqmovn.s32 d0, q0\n"
    854       "vqmovn.s32 d2, q1\n"
    855       "vqmovun.s16 d0, q0\n"
    856       "vqmovun.s16 d2, q1\n"
    857 
    858       // RowMajorOutput::Output
    859       "vst1.8 {d0[0]}, [%[result]]!\n"
    860       "vst1.8 {d2[0]}, [r0]!\n"
    861       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    862       : [count] "r"(params.kernel.count),
    863         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    864         [shift] "r"(params.kernel.shift),
    865         [stride] "r"(params.output_stream.stride),
    866         [rounding_offset] "r"(params.kernel.rounding_offset)
    867       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10",
    868         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
    869 }
    870 
    871 template <>
    872 inline void
    873 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 2,
    874           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    875                        const FusedKernelParams<QuantizedStaticPreprocessed,
    876                                                RowMajor>& params,
    877                        uint8_t* result) {
    878 #ifdef DEBUG
    879 #ifdef DEBUG_METAGEMM_VERBOSE
    880   std::cout << __FILE__ << "(" << __LINE__
    881             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    882                "QuantizedStaticPreprocessed, RowMajor, 2, 2, 8>::Multiply()"
    883             << std::endl
    884             << std::flush;
    885 #endif
    886 #endif
    887   asm volatile(
    888       "pld [%[lhs]]\n"
    889       "pld [%[rhs]]\n"
    890 
    891       // Clear aggregators.
    892       "vmov.i32 q0, #0\n"
    893       "vmov.i32 q1, #0\n"
    894       "vmov.i32 q2, #0\n"
    895       "vmov.i32 q3, q0\n"
    896 
    897       // General NxM lanes loop.
    898       "1:"
    899 
    900       // Subtract counter.
    901       "subs %[count], %[count], #8\n"
    902 
    903       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
    904       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
    905       "pld [%[lhs], #64]\n"
    906       "pld [%[rhs], #64]\n"
    907       "vmull.u8 q6, d10, d8\n"
    908       "vmull.u8 q7, d11, d8\n"
    909       "vmull.u8 q8, d10, d9\n"
    910       "vmull.u8 q9, d11, d9\n"
    911       "vpadal.u16 q0, q6\n"
    912       "vpadal.u16 q1, q7\n"
    913       "vpadal.u16 q2, q8\n"
    914       "vpadal.u16 q3, q9\n"
    915 
    916       // Loop break.
    917       "bgt 1b\n"
    918 
    919       // StaticQuantization::Prepare
    920       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
    921       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
    922       "vdup.32 q6, %[multiplicative_offset]\n"
    923       "vdup.32 q7, %[rounding_offset]\n"
    924       "vdup.32 q8, %[shift]\n"
    925       "vdup.32 q9, d8[0]\n"
    926       "vdup.32 q4, d8[1]\n"
    927 
    928       // RowMajorOutput::Prepare
    929       "add r0, %[result], %[stride]\n"
    930 
    931       // Reduce aggregators.
    932       "vpadd.u32 d0, d0, d1\n"
    933       "vpadd.u32 d2, d2, d3\n"
    934       "vpadd.u32 d0, d0, d2\n"
    935       "vpadd.u32 d4, d4, d5\n"
    936       "vpadd.u32 d6, d6, d7\n"
    937       "vpadd.u32 d4, d4, d6\n"
    938 
    939       // StaticQuantization::Transform
    940       "vadd.s32 q0, q0, q9\n"
    941       "vadd.s32 q2, q2, q4\n"
    942       "vadd.s32 q0, q0, q5\n"
    943       "vadd.s32 q2, q2, q5\n"
    944       "vmul.i32 q0, q0, q6\n"
    945       "vmul.i32 q2, q2, q6\n"
    946       "vadd.i32 q0, q0, q7\n"
    947       "vadd.i32 q2, q2, q7\n"
    948       "vshl.s32 q0, q0, q8\n"
    949       "vshl.s32 q2, q2, q8\n"
    950       "vqmovn.s32 d0, q0\n"
    951       "vqmovn.s32 d4, q2\n"
    952       "vqmovun.s16 d0, q0\n"
    953       "vqmovun.s16 d4, q2\n"
    954 
    955       // RowMajorOutput::Output
    956       "vst1.16 {d0[0]}, [%[result]]!\n"
    957       "vst1.16 {d4[0]}, [r0]!\n"
    958       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
    959       : [count] "r"(params.kernel.count),
    960         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
    961         [shift] "r"(params.kernel.shift),
    962         [stride] "r"(params.output_stream.stride),
    963         [rounding_offset] "r"(params.kernel.rounding_offset)
    964       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
    965         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc",
    966         "memory");
    967 }
    968 
    969 template <>
    970 inline void
    971 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 3,
    972           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
    973                        const FusedKernelParams<QuantizedStaticPreprocessed,
    974                                                RowMajor>& params,
    975                        uint8_t* result) {
    976 #ifdef DEBUG
    977 #ifdef DEBUG_METAGEMM_VERBOSE
    978   std::cout << __FILE__ << "(" << __LINE__
    979             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
    980                "QuantizedStaticPreprocessed, RowMajor, 2, 3, 8>::Multiply()"
    981             << std::endl
    982             << std::flush;
    983 #endif
    984 #endif
    985   asm volatile(
    986       "pld [%[lhs]]\n"
    987       "pld [%[rhs]]\n"
    988 
    989       // Clear aggregators.
    990       "vmov.i32 q0, #0\n"
    991       "vmov.i32 q1, #0\n"
    992       "vmov.i32 q2, #0\n"
    993       "vmov.i32 q3, q0\n"
    994       "vmov.i32 q4, q1\n"
    995       "vmov.i32 q5, q2\n"
    996 
    997       // General NxM lanes loop.
    998       "1:"
    999 
   1000       // Subtract counter.
   1001       "subs %[count], %[count], #8\n"
   1002 
   1003       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
   1004       "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
   1005       "pld [%[lhs], #64]\n"
   1006       "pld [%[rhs], #64]\n"
   1007       "vmull.u8 q9, d14, d12\n"
   1008       "vmull.u8 q10, d15, d12\n"
   1009       "vmull.u8 q11, d16, d12\n"
   1010       "vmull.u8 q12, d14, d13\n"
   1011       "vmull.u8 q13, d15, d13\n"
   1012       "vmull.u8 q14, d16, d13\n"
   1013       "vpadal.u16 q0, q9\n"
   1014       "vpadal.u16 q1, q10\n"
   1015       "vpadal.u16 q2, q11\n"
   1016       "vpadal.u16 q3, q12\n"
   1017       "vpadal.u16 q4, q13\n"
   1018       "vpadal.u16 q5, q14\n"
   1019 
   1020       // Loop break.
   1021       "bgt 1b\n"
   1022 
   1023       // StaticQuantization::Prepare
   1024       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
   1025       "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
   1026       "vdup.32 q8, %[multiplicative_offset]\n"
   1027       "vdup.32 q9, %[rounding_offset]\n"
   1028       "vdup.32 q10, %[shift]\n"
   1029       "vdup.32 q11, d12[0]\n"
   1030       "vdup.32 q6, d12[1]\n"
   1031 
   1032       // RowMajorOutput::Prepare
   1033       "add r0, %[result], %[stride]\n"
   1034 
   1035       // Reduce aggregators.
   1036       "vpadd.u32 d0, d0, d1\n"
   1037       "vpadd.u32 d2, d2, d3\n"
   1038       "vpadd.u32 d4, d4, d5\n"
   1039       "vpadd.u32 d0, d0, d2\n"
   1040       "vpadd.u32 d1, d4, d4\n"
   1041       "vpadd.u32 d6, d6, d7\n"
   1042       "vpadd.u32 d8, d8, d9\n"
   1043       "vpadd.u32 d10, d10, d11\n"
   1044       "vpadd.u32 d6, d6, d8\n"
   1045       "vpadd.u32 d7, d10, d10\n"
   1046 
   1047       // StaticQuantization::Transform
   1048       "vadd.s32 q0, q0, q11\n"
   1049       "vadd.s32 q3, q3, q6\n"
   1050       "vadd.s32 q0, q0, q7\n"
   1051       "vadd.s32 q3, q3, q7\n"
   1052       "vmul.i32 q0, q0, q8\n"
   1053       "vmul.i32 q3, q3, q8\n"
   1054       "vadd.i32 q0, q0, q9\n"
   1055       "vadd.i32 q3, q3, q9\n"
   1056       "vshl.s32 q0, q0, q10\n"
   1057       "vshl.s32 q3, q3, q10\n"
   1058       "vqmovn.s32 d0, q0\n"
   1059       "vqmovn.s32 d6, q3\n"
   1060       "vqmovun.s16 d0, q0\n"
   1061       "vqmovun.s16 d6, q3\n"
   1062 
   1063       // RowMajorOutput::Output
   1064       "vst1.16 {d0[0]}, [%[result]]!\n"
   1065       "vst1.8 {d0[2]}, [%[result]]!\n"
   1066       "vst1.16 {d6[0]}, [r0]!\n"
   1067       "vst1.8 {d6[2]}, [r0]!\n"
   1068       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1069       : [count] "r"(params.kernel.count),
   1070         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
   1071         [shift] "r"(params.kernel.shift),
   1072         [stride] "r"(params.output_stream.stride),
   1073         [rounding_offset] "r"(params.kernel.rounding_offset)
   1074       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   1075         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   1076         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc",
   1077         "memory");
   1078 }
   1079 
   1080 template <>
   1081 inline void
   1082 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 4,
   1083           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1084                        const FusedKernelParams<QuantizedStaticPreprocessed,
   1085                                                RowMajor>& params,
   1086                        uint8_t* result) {
   1087 #ifdef DEBUG
   1088 #ifdef DEBUG_METAGEMM_VERBOSE
   1089   std::cout << __FILE__ << "(" << __LINE__
   1090             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
   1091                "QuantizedStaticPreprocessed, RowMajor, 2, 4, 8>::Multiply()"
   1092             << std::endl
   1093             << std::flush;
   1094 #endif
   1095 #endif
   1096   asm volatile(
   1097       "pld [%[lhs]]\n"
   1098       "pld [%[rhs]]\n"
   1099 
   1100       // Clear aggregators.
   1101       "vmov.i32 q0, #0\n"
   1102       "vmov.i32 q1, #0\n"
   1103       "vmov.i32 q2, #0\n"
   1104       "vmov.i32 q3, q0\n"
   1105       "vmov.i32 q4, q1\n"
   1106       "vmov.i32 q5, q2\n"
   1107       "vmov.i32 q6, q3\n"
   1108       "vmov.i32 q7, q4\n"
   1109 
   1110       // 2x4 lanes loop.
   1111       "1:"
   1112 
   1113       "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n"
   1114       "vld1.8 {d16}, [%[lhs]:64]!\n"
   1115       "vmull.u8 q11, d16, d18\n"
   1116       "vld1.8 {d17}, [%[lhs]:64]!\n"
   1117       "vmull.u8 q12, d16, d19\n"
   1118       "pld [%[rhs], #64]\n"
   1119       "vmull.u8 q13, d16, d20\n"
   1120       "pld [%[lhs], #64]\n"
   1121       "vmull.u8 q14, d16, d21\n"
   1122       "vmull.u8 q15, d17, d18\n"
   1123       "vpadal.u16 q0, q11\n"
   1124       "vpadal.u16 q1, q12\n"
   1125       "vpadal.u16 q2, q13\n"
   1126       "vmull.u8 q11, d17, d19\n"
   1127       "vmull.u8 q12, d17, d20\n"
   1128       "vmull.u8 q13, d17, d21\n"
   1129 
   1130       // Subtract counter.
   1131       "subs %[count], %[count], #8\n"
   1132 
   1133       "vpadal.u16 q3, q14\n"
   1134       "vpadal.u16 q4, q15\n"
   1135       "vpadal.u16 q5, q11\n"
   1136       "vpadal.u16 q6, q12\n"
   1137       "vpadal.u16 q7, q13\n"
   1138 
   1139       // Loop break.
   1140       "bgt 1b\n"
   1141 
   1142       // StaticQuantization::Prepare
   1143       "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
   1144       "vld1.32 {d18, d19}, [%[rhs]:64]!\n"
   1145       "vdup.32 q10, %[multiplicative_offset]\n"
   1146       "vdup.32 q11, %[rounding_offset]\n"
   1147       "vdup.32 q12, %[shift]\n"
   1148       "vdup.32 q13, d16[0]\n"
   1149       "vdup.32 q8, d16[1]\n"
   1150 
   1151       // RowMajorOutput::Prepare
   1152       "add r0, %[result], %[stride]\n"
   1153 
   1154       // Reduce aggregators.
   1155       "vpadd.u32 d0, d0, d1\n"
   1156       "vpadd.u32 d2, d2, d3\n"
   1157       "vpadd.u32 d4, d4, d5\n"
   1158       "vpadd.u32 d6, d6, d7\n"
   1159       "vpadd.u32 d0, d0, d2\n"
   1160       "vpadd.u32 d1, d4, d6\n"
   1161       "vpadd.u32 d8, d8, d9\n"
   1162       "vpadd.u32 d10, d10, d11\n"
   1163       "vpadd.u32 d12, d12, d13\n"
   1164       "vpadd.u32 d14, d14, d15\n"
   1165       "vpadd.u32 d8, d8, d10\n"
   1166       "vpadd.u32 d9, d12, d14\n"
   1167 
   1168       // StaticQuantization::Transform
   1169       "vadd.s32 q0, q0, q13\n"
   1170       "vadd.s32 q4, q4, q8\n"
   1171       "vadd.s32 q0, q0, q9\n"
   1172       "vadd.s32 q4, q4, q9\n"
   1173       "vmul.i32 q0, q0, q10\n"
   1174       "vmul.i32 q4, q4, q10\n"
   1175       "vadd.i32 q0, q0, q11\n"
   1176       "vadd.i32 q4, q4, q11\n"
   1177       "vshl.s32 q0, q0, q12\n"
   1178       "vshl.s32 q4, q4, q12\n"
   1179       "vqmovn.s32 d0, q0\n"
   1180       "vqmovn.s32 d8, q4\n"
   1181       "vqmovun.s16 d0, q0\n"
   1182       "vqmovun.s16 d8, q4\n"
   1183 
   1184       // RowMajorOutput::Output
   1185       "vst1.32 {d0[0]}, [%[result]]!\n"
   1186       "vst1.32 {d8[0]}, [r0]!\n"
   1187       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1188       : [count] "r"(params.kernel.count),
   1189         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
   1190         [shift] "r"(params.kernel.shift),
   1191         [stride] "r"(params.output_stream.stride),
   1192         [rounding_offset] "r"(params.kernel.rounding_offset)
   1193       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   1194         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   1195         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
   1196         "d31", "cc", "memory");
   1197 }
   1198 
   1199 template <>
   1200 inline void
   1201 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 1,
   1202           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1203                        const FusedKernelParams<QuantizedStaticPreprocessed,
   1204                                                RowMajor>& params,
   1205                        uint8_t* result) {
   1206 #ifdef DEBUG
   1207 #ifdef DEBUG_METAGEMM_VERBOSE
   1208   std::cout << __FILE__ << "(" << __LINE__
   1209             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
   1210                "QuantizedStaticPreprocessed, RowMajor, 3, 1, 8>::Multiply()"
   1211             << std::endl
   1212             << std::flush;
   1213 #endif
   1214 #endif
   1215   asm volatile(
   1216       "pld [%[lhs]]\n"
   1217       "pld [%[rhs]]\n"
   1218 
   1219       // Clear aggregators.
   1220       "vmov.i32 q0, #0\n"
   1221       "vmov.i32 q1, #0\n"
   1222       "vmov.i32 q2, #0\n"
   1223 
   1224       // General NxM lanes loop.
   1225       "1:"
   1226 
   1227       // Subtract counter.
   1228       "subs %[count], %[count], #8\n"
   1229 
   1230       "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n"
   1231       "vld1.32 {d9}, [%[rhs]:64]!\n"
   1232       "pld [%[lhs], #64]\n"
   1233       "pld [%[rhs], #64]\n"
   1234       "vmull.u8 q5, d9, d6\n"
   1235       "vmull.u8 q6, d9, d7\n"
   1236       "vmull.u8 q7, d9, d8\n"
   1237       "vpadal.u16 q0, q5\n"
   1238       "vpadal.u16 q1, q6\n"
   1239       "vpadal.u16 q2, q7\n"
   1240 
   1241       // Loop break.
   1242       "bgt 1b\n"
   1243 
   1244       // StaticQuantization::Prepare
   1245       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   1246       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   1247       "vdup.32 q6, %[multiplicative_offset]\n"
   1248       "vdup.32 q7, %[rounding_offset]\n"
   1249       "vdup.32 q8, %[shift]\n"
   1250       "vdup.32 q3, d8[0]\n"
   1251       "vdup.32 q9, d8[1]\n"
   1252       "vdup.32 q4, d9[0]\n"
   1253 
   1254       // RowMajorOutput::Prepare
   1255       "add r0, %[result], %[stride]\n"
   1256       "add r1, r0, %[stride]\n"
   1257 
   1258       // Reduce aggregators.
   1259       "vpadd.u32 d0, d0, d1\n"
   1260       "vpadd.u32 d0, d0, d0\n"
   1261       "vpadd.u32 d2, d2, d3\n"
   1262       "vpadd.u32 d2, d2, d2\n"
   1263       "vpadd.u32 d4, d4, d5\n"
   1264       "vpadd.u32 d4, d4, d4\n"
   1265 
   1266       // StaticQuantization::Transform
   1267       "vadd.s32 q0, q0, q3\n"
   1268       "vadd.s32 q1, q1, q9\n"
   1269       "vadd.s32 q2, q2, q4\n"
   1270       "vadd.s32 q0, q0, q5\n"
   1271       "vadd.s32 q1, q1, q5\n"
   1272       "vadd.s32 q2, q2, q5\n"
   1273       "vmul.i32 q0, q0, q6\n"
   1274       "vmul.i32 q1, q1, q6\n"
   1275       "vmul.i32 q2, q2, q6\n"
   1276       "vadd.i32 q0, q0, q7\n"
   1277       "vadd.i32 q1, q1, q7\n"
   1278       "vadd.i32 q2, q2, q7\n"
   1279       "vshl.s32 q0, q0, q8\n"
   1280       "vshl.s32 q1, q1, q8\n"
   1281       "vshl.s32 q2, q2, q8\n"
   1282       "vqmovn.s32 d0, q0\n"
   1283       "vqmovn.s32 d2, q1\n"
   1284       "vqmovn.s32 d4, q2\n"
   1285       "vqmovun.s16 d0, q0\n"
   1286       "vqmovun.s16 d2, q1\n"
   1287       "vqmovun.s16 d4, q2\n"
   1288 
   1289       // RowMajorOutput::Output
   1290       "vst1.8 {d0[0]}, [%[result]]!\n"
   1291       "vst1.8 {d2[0]}, [r0]!\n"
   1292       "vst1.8 {d4[0]}, [r1]!\n"
   1293       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1294       : [count] "r"(params.kernel.count),
   1295         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
   1296         [shift] "r"(params.kernel.shift),
   1297         [stride] "r"(params.output_stream.stride),
   1298         [rounding_offset] "r"(params.kernel.rounding_offset)
   1299       : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
   1300         "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
   1301         "cc", "memory");
   1302 }
   1303 
   1304 template <>
   1305 inline void
   1306 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 2,
   1307           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1308                        const FusedKernelParams<QuantizedStaticPreprocessed,
   1309                                                RowMajor>& params,
   1310                        uint8_t* result) {
   1311 #ifdef DEBUG
   1312 #ifdef DEBUG_METAGEMM_VERBOSE
   1313   std::cout << __FILE__ << "(" << __LINE__
   1314             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
   1315                "QuantizedStaticPreprocessed, RowMajor, 3, 2, 8>::Multiply()"
   1316             << std::endl
   1317             << std::flush;
   1318 #endif
   1319 #endif
   1320   asm volatile(
   1321       "pld [%[lhs]]\n"
   1322       "pld [%[rhs]]\n"
   1323 
   1324       // Clear aggregators.
   1325       "vmov.i32 q0, #0\n"
   1326       "vmov.i32 q1, #0\n"
   1327       "vmov.i32 q2, #0\n"
   1328       "vmov.i32 q3, q0\n"
   1329       "vmov.i32 q4, q1\n"
   1330       "vmov.i32 q5, q2\n"
   1331 
   1332       // General NxM lanes loop.
   1333       "1:"
   1334 
   1335       // Subtract counter.
   1336       "subs %[count], %[count], #8\n"
   1337 
   1338       "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n"
   1339       "vld1.32 {d15, d16}, [%[rhs]:64]!\n"
   1340       "pld [%[lhs], #64]\n"
   1341       "pld [%[rhs], #64]\n"
   1342       "vmull.u8 q9, d15, d12\n"
   1343       "vmull.u8 q10, d16, d12\n"
   1344       "vmull.u8 q11, d15, d13\n"
   1345       "vmull.u8 q12, d16, d13\n"
   1346       "vmull.u8 q13, d15, d14\n"
   1347       "vmull.u8 q14, d16, d14\n"
   1348       "vpadal.u16 q0, q9\n"
   1349       "vpadal.u16 q1, q10\n"
   1350       "vpadal.u16 q2, q11\n"
   1351       "vpadal.u16 q3, q12\n"
   1352       "vpadal.u16 q4, q13\n"
   1353       "vpadal.u16 q5, q14\n"
   1354 
   1355       // Loop break.
   1356       "bgt 1b\n"
   1357 
   1358       // StaticQuantization::Prepare
   1359       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
   1360       "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
   1361       "vdup.32 q8, %[multiplicative_offset]\n"
   1362       "vdup.32 q9, %[rounding_offset]\n"
   1363       "vdup.32 q10, %[shift]\n"
   1364       "vdup.32 q11, d12[0]\n"
   1365       "vdup.32 q12, d12[1]\n"
   1366       "vdup.32 q6, d13[0]\n"
   1367 
   1368       // RowMajorOutput::Prepare
   1369       "add r0, %[result], %[stride]\n"
   1370       "add r1, r0, %[stride]\n"
   1371 
   1372       // Reduce aggregators.
   1373       "vpadd.u32 d0, d0, d1\n"
   1374       "vpadd.u32 d2, d2, d3\n"
   1375       "vpadd.u32 d0, d0, d2\n"
   1376       "vpadd.u32 d4, d4, d5\n"
   1377       "vpadd.u32 d6, d6, d7\n"
   1378       "vpadd.u32 d4, d4, d6\n"
   1379       "vpadd.u32 d8, d8, d9\n"
   1380       "vpadd.u32 d10, d10, d11\n"
   1381       "vpadd.u32 d8, d8, d10\n"
   1382 
   1383       // StaticQuantization::Transform
   1384       "vadd.s32 q0, q0, q11\n"
   1385       "vadd.s32 q2, q2, q12\n"
   1386       "vadd.s32 q4, q4, q6\n"
   1387       "vadd.s32 q0, q0, q7\n"
   1388       "vadd.s32 q2, q2, q7\n"
   1389       "vadd.s32 q4, q4, q7\n"
   1390       "vmul.i32 q0, q0, q8\n"
   1391       "vmul.i32 q2, q2, q8\n"
   1392       "vmul.i32 q4, q4, q8\n"
   1393       "vadd.i32 q0, q0, q9\n"
   1394       "vadd.i32 q2, q2, q9\n"
   1395       "vadd.i32 q4, q4, q9\n"
   1396       "vshl.s32 q0, q0, q10\n"
   1397       "vshl.s32 q2, q2, q10\n"
   1398       "vshl.s32 q4, q4, q10\n"
   1399       "vqmovn.s32 d0, q0\n"
   1400       "vqmovn.s32 d4, q2\n"
   1401       "vqmovn.s32 d8, q4\n"
   1402       "vqmovun.s16 d0, q0\n"
   1403       "vqmovun.s16 d4, q2\n"
   1404       "vqmovun.s16 d8, q4\n"
   1405 
   1406       // RowMajorOutput::Output
   1407       "vst1.16 {d0[0]}, [%[result]]!\n"
   1408       "vst1.16 {d4[0]}, [r0]!\n"
   1409       "vst1.16 {d8[0]}, [r1]!\n"
   1410       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1411       : [count] "r"(params.kernel.count),
   1412         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
   1413         [shift] "r"(params.kernel.shift),
   1414         [stride] "r"(params.output_stream.stride),
   1415         [rounding_offset] "r"(params.kernel.rounding_offset)
   1416       : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
   1417         "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
   1418         "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
   1419         "cc", "memory");
   1420 }
   1421 
   1422 template <>
   1423 inline void
   1424 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 3,
   1425           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1426                        const FusedKernelParams<QuantizedStaticPreprocessed,
   1427                                                RowMajor>& params,
   1428                        uint8_t* result) {
   1429 #ifdef DEBUG
   1430 #ifdef DEBUG_METAGEMM_VERBOSE
   1431   std::cout << __FILE__ << "(" << __LINE__
   1432             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
   1433                "QuantizedStaticPreprocessed, RowMajor, 3, 3, 8>::Multiply()"
   1434             << std::endl
   1435             << std::flush;
   1436 #endif
   1437 #endif
   1438   asm volatile(
   1439       "pld [%[lhs]]\n"
   1440       "pld [%[rhs]]\n"
   1441 
   1442       // Clear aggregators.
   1443       "vmov.i32 q0, #0\n"
   1444       "vmov.i32 q1, #0\n"
   1445       "vmov.i32 q2, #0\n"
   1446       "vmov.i32 q3, q0\n"
   1447       "vmov.i32 q4, q1\n"
   1448       "vmov.i32 q5, q2\n"
   1449       "vmov.i32 q6, q3\n"
   1450       "vmov.i32 q7, q4\n"
   1451       "vmov.i32 q8, q5\n"
   1452 
   1453       // 3x3 lanes loop.
   1454       "1:"
   1455 
   1456       "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n"
   1457       "vld1.8 {d18}, [%[lhs]:64]!\n"
   1458       "vmull.u8 q12, d18, d21\n"
   1459       "vld1.8 {d19}, [%[lhs]:64]!\n"
   1460       "vmull.u8 q13, d18, d22\n"
   1461       "vld1.8 {d20}, [%[lhs]:64]!\n"
   1462       "vmull.u8 q14, d18, d23\n"
   1463       "pld [%[lhs], #64]\n"
   1464       "vmull.u8 q15, d19, d21\n"
   1465       "pld [%[rhs], #64]\n"
   1466       "vpadal.u16 q0, q12\n"
   1467       "vpadal.u16 q1, q13\n"
   1468       "vpadal.u16 q2, q14\n"
   1469       "vpadal.u16 q3, q15\n"
   1470       "vmull.u8 q12, d19, d22\n"
   1471       "vmull.u8 q13, d19, d23\n"
   1472       "vmull.u8 q14, d20, d21\n"
   1473       "vmull.u8 q15, d20, d22\n"
   1474 
   1475       // Subtract counter.
   1476       "subs %[count], %[count], #8\n"
   1477 
   1478       "vmull.u8 q9, d20, d23\n"
   1479       "vpadal.u16 q4, q12\n"
   1480       "vpadal.u16 q5, q13\n"
   1481       "vpadal.u16 q6, q14\n"
   1482       "vpadal.u16 q7, q15\n"
   1483       "vpadal.u16 q8, q9\n"
   1484 
   1485       // Loop break.
   1486       "bgt 1b\n"
   1487 
   1488       // StaticQuantization::Prepare
   1489       "vld1.32 {d18, d19}, [%[lhs]:64]!\n"
   1490       "vld1.32 {d20, d21}, [%[rhs]:64]!\n"
   1491       "vdup.32 q11, %[multiplicative_offset]\n"
   1492       "vdup.32 q12, %[rounding_offset]\n"
   1493       "vdup.32 q13, %[shift]\n"
   1494       "vdup.32 q14, d18[0]\n"
   1495       "vdup.32 q15, d18[1]\n"
   1496       "vdup.32 q9, d19[0]\n"
   1497 
   1498       // RowMajorOutput::Prepare
   1499       "add r0, %[result], %[stride]\n"
   1500       "add r1, r0, %[stride]\n"
   1501 
   1502       // Reduce aggregators.
   1503       "vpadd.u32 d0, d0, d1\n"
   1504       "vpadd.u32 d2, d2, d3\n"
   1505       "vpadd.u32 d4, d4, d5\n"
   1506       "vpadd.u32 d0, d0, d2\n"
   1507       "vpadd.u32 d1, d4, d4\n"
   1508       "vpadd.u32 d6, d6, d7\n"
   1509       "vpadd.u32 d8, d8, d9\n"
   1510       "vpadd.u32 d10, d10, d11\n"
   1511       "vpadd.u32 d6, d6, d8\n"
   1512       "vpadd.u32 d7, d10, d10\n"
   1513       "vpadd.u32 d12, d12, d13\n"
   1514       "vpadd.u32 d14, d14, d15\n"
   1515       "vpadd.u32 d16, d16, d17\n"
   1516       "vpadd.u32 d12, d12, d14\n"
   1517       "vpadd.u32 d13, d16, d16\n"
   1518 
   1519       // StaticQuantization::Transform
   1520       "vadd.s32 q0, q0, q14\n"
   1521       "vadd.s32 q3, q3, q15\n"
   1522       "vadd.s32 q6, q6, q9\n"
   1523       "vadd.s32 q0, q0, q10\n"
   1524       "vadd.s32 q3, q3, q10\n"
   1525       "vadd.s32 q6, q6, q10\n"
   1526       "vmul.i32 q0, q0, q11\n"
   1527       "vmul.i32 q3, q3, q11\n"
   1528       "vmul.i32 q6, q6, q11\n"
   1529       "vadd.i32 q0, q0, q12\n"
   1530       "vadd.i32 q3, q3, q12\n"
   1531       "vadd.i32 q6, q6, q12\n"
   1532       "vshl.s32 q0, q0, q13\n"
   1533       "vshl.s32 q3, q3, q13\n"
   1534       "vshl.s32 q6, q6, q13\n"
   1535       "vqmovn.s32 d0, q0\n"
   1536       "vqmovn.s32 d6, q3\n"
   1537       "vqmovn.s32 d12, q6\n"
   1538       "vqmovun.s16 d0, q0\n"
   1539       "vqmovun.s16 d6, q3\n"
   1540       "vqmovun.s16 d12, q6\n"
   1541 
   1542       // RowMajorOutput::Output
   1543       "vst1.16 {d0[0]}, [%[result]]!\n"
   1544       "vst1.8 {d0[2]}, [%[result]]!\n"
   1545       "vst1.16 {d6[0]}, [r0]!\n"
   1546       "vst1.8 {d6[2]}, [r0]!\n"
   1547       "vst1.16 {d12[0]}, [r1]!\n"
   1548       "vst1.8 {d12[2]}, [r1]!\n"
   1549       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1550       : [count] "r"(params.kernel.count),
   1551         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
   1552         [shift] "r"(params.kernel.shift),
   1553         [stride] "r"(params.output_stream.stride),
   1554         [rounding_offset] "r"(params.kernel.rounding_offset)
   1555       : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
   1556         "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
   1557         "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
   1558         "d30", "d31", "cc", "memory");
   1559 }
   1560 
   1561 template <>
   1562 inline void MulKernel<
   1563     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1,
   1564     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1565                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1566                                          RowMajor>& params,
   1567                  int32_t* result) {
   1568 #ifdef DEBUG
   1569 #ifdef DEBUG_METAGEMM_VERBOSE
   1570   std::cout << __FILE__ << "(" << __LINE__
   1571             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1572                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, "
   1573                "8>::Multiply()"
   1574             << std::endl
   1575             << std::flush;
   1576 #endif
   1577 #endif
   1578   asm volatile(
   1579       "pld [%[lhs]]\n"
   1580       "pld [%[rhs]]\n"
   1581 
   1582       // Clear aggregators.
   1583       "vmov.i32 q0, #0\n"
   1584 
   1585       // General NxM lanes loop.
   1586       "1:"
   1587 
   1588       // Subtract counter.
   1589       "subs %[count], %[count], #8\n"
   1590 
   1591       "vld1.32 {d2}, [%[lhs]:64]!\n"
   1592       "vld1.32 {d3}, [%[rhs]:64]!\n"
   1593       "pld [%[lhs], #64]\n"
   1594       "pld [%[rhs], #64]\n"
   1595       "vmull.u8 q2, d3, d2\n"
   1596       "vpadal.u16 q0, q2\n"
   1597 
   1598       // Loop break.
   1599       "bgt 1b\n"
   1600 
   1601       // StaticQuantizationInt32::Prepare
   1602       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   1603       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   1604       "vdup.32 q4, d8[0]\n"
   1605 
   1606       // RowMajorOutput::Prepare
   1607 
   1608       // Reduce aggregators.
   1609       "vpadd.u32 d0, d0, d1\n"
   1610       "vpadd.u32 d0, d0, d0\n"
   1611 
   1612       // StaticQuantizationInt32::Transform
   1613       "vadd.s32 q0, q0, q4\n"
   1614       "vadd.s32 q0, q0, q5\n"
   1615 
   1616       // RowMajorOutput::Output
   1617       "vst1.32 {d0[0]}, [%[result]]!\n"
   1618       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1619       : [count] "r"(params.kernel.count),
   1620         [stride] "r"(params.output_stream.stride)
   1621       : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "cc",
   1622         "memory");
   1623 }
   1624 
   1625 template <>
   1626 inline void MulKernel<
   1627     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2,
   1628     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1629                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1630                                          RowMajor>& params,
   1631                  int32_t* result) {
   1632 #ifdef DEBUG
   1633 #ifdef DEBUG_METAGEMM_VERBOSE
   1634   std::cout << __FILE__ << "(" << __LINE__
   1635             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1636                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, "
   1637                "8>::Multiply()"
   1638             << std::endl
   1639             << std::flush;
   1640 #endif
   1641 #endif
   1642   asm volatile(
   1643       "pld [%[lhs]]\n"
   1644       "pld [%[rhs]]\n"
   1645 
   1646       // Clear aggregators.
   1647       "vmov.i32 q0, #0\n"
   1648       "vmov.i32 q1, #0\n"
   1649 
   1650       // General NxM lanes loop.
   1651       "1:"
   1652 
   1653       // Subtract counter.
   1654       "subs %[count], %[count], #8\n"
   1655 
   1656       "vld1.32 {d4}, [%[lhs]:64]!\n"
   1657       "vld1.32 {d5, d6}, [%[rhs]:64]!\n"
   1658       "pld [%[lhs], #64]\n"
   1659       "pld [%[rhs], #64]\n"
   1660       "vmull.u8 q4, d5, d4\n"
   1661       "vmull.u8 q5, d6, d4\n"
   1662       "vpadal.u16 q0, q4\n"
   1663       "vpadal.u16 q1, q5\n"
   1664 
   1665       // Loop break.
   1666       "bgt 1b\n"
   1667 
   1668       // StaticQuantizationInt32::Prepare
   1669       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   1670       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   1671       "vdup.32 q4, d8[0]\n"
   1672 
   1673       // RowMajorOutput::Prepare
   1674 
   1675       // Reduce aggregators.
   1676       "vpadd.u32 d0, d0, d1\n"
   1677       "vpadd.u32 d2, d2, d3\n"
   1678       "vpadd.u32 d0, d0, d2\n"
   1679 
   1680       // StaticQuantizationInt32::Transform
   1681       "vadd.s32 q0, q0, q4\n"
   1682       "vadd.s32 q0, q0, q5\n"
   1683 
   1684       // RowMajorOutput::Output
   1685       "vst1.32 {d0}, [%[result]]!\n"
   1686       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1687       : [count] "r"(params.kernel.count),
   1688         [stride] "r"(params.output_stream.stride)
   1689       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11",
   1690         "cc", "memory");
   1691 }
   1692 
   1693 template <>
   1694 inline void MulKernel<
   1695     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3,
   1696     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1697                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1698                                          RowMajor>& params,
   1699                  int32_t* result) {
   1700 #ifdef DEBUG
   1701 #ifdef DEBUG_METAGEMM_VERBOSE
   1702   std::cout << __FILE__ << "(" << __LINE__
   1703             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1704                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, "
   1705                "8>::Multiply()"
   1706             << std::endl
   1707             << std::flush;
   1708 #endif
   1709 #endif
   1710   asm volatile(
   1711       "pld [%[lhs]]\n"
   1712       "pld [%[rhs]]\n"
   1713 
   1714       // Clear aggregators.
   1715       "vmov.i32 q0, #0\n"
   1716       "vmov.i32 q1, #0\n"
   1717       "vmov.i32 q2, #0\n"
   1718 
   1719       // General NxM lanes loop.
   1720       "1:"
   1721 
   1722       // Subtract counter.
   1723       "subs %[count], %[count], #8\n"
   1724 
   1725       "vld1.32 {d6}, [%[lhs]:64]!\n"
   1726       "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n"
   1727       "pld [%[lhs], #64]\n"
   1728       "pld [%[rhs], #64]\n"
   1729       "vmull.u8 q5, d7, d6\n"
   1730       "vmull.u8 q6, d8, d6\n"
   1731       "vmull.u8 q7, d9, d6\n"
   1732       "vpadal.u16 q0, q5\n"
   1733       "vpadal.u16 q1, q6\n"
   1734       "vpadal.u16 q2, q7\n"
   1735 
   1736       // Loop break.
   1737       "bgt 1b\n"
   1738 
   1739       // StaticQuantizationInt32::Prepare
   1740       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   1741       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   1742       "vdup.32 q4, d8[0]\n"
   1743 
   1744       // RowMajorOutput::Prepare
   1745 
   1746       // Reduce aggregators.
   1747       "vpadd.u32 d0, d0, d1\n"
   1748       "vpadd.u32 d2, d2, d3\n"
   1749       "vpadd.u32 d4, d4, d5\n"
   1750       "vpadd.u32 d0, d0, d2\n"
   1751       "vpadd.u32 d1, d4, d4\n"
   1752 
   1753       // StaticQuantizationInt32::Transform
   1754       "vadd.s32 q0, q0, q4\n"
   1755       "vadd.s32 q0, q0, q5\n"
   1756 
   1757       // RowMajorOutput::Output
   1758       "vst1.32 {d0}, [%[result]]!\n"
   1759       "vst1.32 {d1[0]}, [%[result]]!\n"
   1760       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1761       : [count] "r"(params.kernel.count),
   1762         [stride] "r"(params.output_stream.stride)
   1763       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   1764         "d11", "d12", "d13", "d14", "d15", "cc", "memory");
   1765 }
   1766 
   1767 template <>
   1768 inline void MulKernel<
   1769     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4,
   1770     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1771                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1772                                          RowMajor>& params,
   1773                  int32_t* result) {
   1774 #ifdef DEBUG
   1775 #ifdef DEBUG_METAGEMM_VERBOSE
   1776   std::cout << __FILE__ << "(" << __LINE__
   1777             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1778                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, "
   1779                "8>::Multiply()"
   1780             << std::endl
   1781             << std::flush;
   1782 #endif
   1783 #endif
   1784   asm volatile(
   1785       "pld [%[lhs]]\n"
   1786       "pld [%[rhs]]\n"
   1787 
   1788       // Clear aggregators.
   1789       "vmov.i32 q0, #0\n"
   1790       "vmov.i32 q1, #0\n"
   1791       "vmov.i32 q2, #0\n"
   1792       "vmov.i32 q3, q0\n"
   1793 
   1794       // General NxM lanes loop.
   1795       "1:"
   1796 
   1797       // Subtract counter.
   1798       "subs %[count], %[count], #8\n"
   1799 
   1800       "vld1.32 {d8}, [%[lhs]:64]!\n"
   1801       "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n"
   1802       "pld [%[lhs], #64]\n"
   1803       "pld [%[rhs], #64]\n"
   1804       "vmull.u8 q7, d9, d8\n"
   1805       "vmull.u8 q8, d10, d8\n"
   1806       "vmull.u8 q9, d11, d8\n"
   1807       "vmull.u8 q10, d12, d8\n"
   1808       "vpadal.u16 q0, q7\n"
   1809       "vpadal.u16 q1, q8\n"
   1810       "vpadal.u16 q2, q9\n"
   1811       "vpadal.u16 q3, q10\n"
   1812 
   1813       // Loop break.
   1814       "bgt 1b\n"
   1815 
   1816       // StaticQuantizationInt32::Prepare
   1817       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   1818       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   1819       "vdup.32 q4, d8[0]\n"
   1820 
   1821       // RowMajorOutput::Prepare
   1822 
   1823       // Reduce aggregators.
   1824       "vpadd.u32 d0, d0, d1\n"
   1825       "vpadd.u32 d2, d2, d3\n"
   1826       "vpadd.u32 d4, d4, d5\n"
   1827       "vpadd.u32 d6, d6, d7\n"
   1828       "vpadd.u32 d0, d0, d2\n"
   1829       "vpadd.u32 d1, d4, d6\n"
   1830 
   1831       // StaticQuantizationInt32::Transform
   1832       "vadd.s32 q0, q0, q4\n"
   1833       "vadd.s32 q0, q0, q5\n"
   1834 
   1835       // RowMajorOutput::Output
   1836       "vst1.32 {d0, d1}, [%[result]]!\n"
   1837       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1838       : [count] "r"(params.kernel.count),
   1839         [stride] "r"(params.output_stream.stride)
   1840       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   1841         "d11", "d12", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21",
   1842         "cc", "memory");
   1843 }
   1844 
   1845 template <>
   1846 inline void MulKernel<
   1847     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5,
   1848     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1849                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1850                                          RowMajor>& params,
   1851                  int32_t* result) {
   1852 #ifdef DEBUG
   1853 #ifdef DEBUG_METAGEMM_VERBOSE
   1854   std::cout << __FILE__ << "(" << __LINE__
   1855             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1856                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, "
   1857                "8>::Multiply()"
   1858             << std::endl
   1859             << std::flush;
   1860 #endif
   1861 #endif
   1862   asm volatile(
   1863       "pld [%[lhs]]\n"
   1864       "pld [%[rhs]]\n"
   1865 
   1866       // Clear aggregators.
   1867       "vmov.i32 q0, #0\n"
   1868       "vmov.i32 q1, #0\n"
   1869       "vmov.i32 q2, #0\n"
   1870       "vmov.i32 q3, q0\n"
   1871       "vmov.i32 q4, q1\n"
   1872 
   1873       // General 1xM lanes loop.
   1874       "1:"
   1875 
   1876       // Subtract counter.
   1877       "subs %[count], %[count], #8\n"
   1878 
   1879       "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n"
   1880       "vld1.32 {d14}, [%[lhs]:64]!\n"
   1881       "pld [%[lhs], #64]\n"
   1882       "vmull.u8 q8, d10, d14\n"
   1883       "vmull.u8 q9, d11, d14\n"
   1884       "vmull.u8 q10, d12, d14\n"
   1885       "vmull.u8 q11, d13, d14\n"
   1886       "vld1.32 {d10}, [%[rhs]:64]!\n"
   1887       "pld [%[rhs], #128]\n"
   1888       "vpadal.u16 q0, q8\n"
   1889       "vpadal.u16 q1, q9\n"
   1890       "vpadal.u16 q2, q10\n"
   1891       "vpadal.u16 q3, q11\n"
   1892       "vmull.u8 q8, d10, d14\n"
   1893       "vpadal.u16 q4, q8\n"
   1894 
   1895       // Loop break.
   1896       "bgt 1b\n"
   1897 
   1898       // StaticQuantizationInt32::Prepare
   1899       "vld1.32 {d10, d11}, [%[lhs]:64]!\n"
   1900       "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
   1901       "vdup.32 q5, d10[0]\n"
   1902 
   1903       // RowMajorOutput::Prepare
   1904 
   1905       // Reduce aggregators.
   1906       "vpadd.u32 d0, d0, d1\n"
   1907       "vpadd.u32 d2, d2, d3\n"
   1908       "vpadd.u32 d4, d4, d5\n"
   1909       "vpadd.u32 d6, d6, d7\n"
   1910       "vpadd.u32 d8, d8, d9\n"
   1911       "vpadd.u32 d0, d0, d2\n"
   1912       "vpadd.u32 d1, d4, d6\n"
   1913       "vpadd.u32 d2, d8, d8\n"
   1914 
   1915       // StaticQuantizationInt32::Transform
   1916       "vadd.s32 q0, q0, q5\n"
   1917       "vadd.s32 q1, q1, q5\n"
   1918       "vadd.s32 q0, q0, q6\n"
   1919       "vadd.s32 q1, q1, q7\n"
   1920 
   1921       // RowMajorOutput::Output
   1922       "vst1.32 {d0, d1}, [%[result]]!\n"
   1923       "vst1.32 {d2[0]}, [%[result]]!\n"
   1924       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   1925       : [count] "r"(params.kernel.count),
   1926         [stride] "r"(params.output_stream.stride)
   1927       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   1928         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   1929         "d21", "d22", "d23", "cc", "memory");
   1930 }
   1931 
   1932 template <>
   1933 inline void MulKernel<
   1934     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6,
   1935     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   1936                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   1937                                          RowMajor>& params,
   1938                  int32_t* result) {
   1939 #ifdef DEBUG
   1940 #ifdef DEBUG_METAGEMM_VERBOSE
   1941   std::cout << __FILE__ << "(" << __LINE__
   1942             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   1943                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, "
   1944                "8>::Multiply()"
   1945             << std::endl
   1946             << std::flush;
   1947 #endif
   1948 #endif
   1949   asm volatile(
   1950       "pld [%[lhs]]\n"
   1951       "pld [%[rhs]]\n"
   1952 
   1953       // Clear aggregators.
   1954       "vmov.i32 q0, #0\n"
   1955       "vmov.i32 q1, #0\n"
   1956       "vmov.i32 q2, #0\n"
   1957       "vmov.i32 q3, q0\n"
   1958       "vmov.i32 q4, q1\n"
   1959       "vmov.i32 q5, q2\n"
   1960 
   1961       // General 1xM lanes loop.
   1962       "1:"
   1963 
   1964       // Subtract counter.
   1965       "subs %[count], %[count], #8\n"
   1966 
   1967       "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
   1968       "vld1.32 {d16}, [%[lhs]:64]!\n"
   1969       "pld [%[lhs], #64]\n"
   1970       "vmull.u8 q9, d12, d16\n"
   1971       "vmull.u8 q10, d13, d16\n"
   1972       "vmull.u8 q11, d14, d16\n"
   1973       "vmull.u8 q12, d15, d16\n"
   1974       "vld1.32 {d12, d13}, [%[rhs]:64]!\n"
   1975       "pld [%[rhs], #128]\n"
   1976       "vpadal.u16 q0, q9\n"
   1977       "vpadal.u16 q1, q10\n"
   1978       "vpadal.u16 q2, q11\n"
   1979       "vpadal.u16 q3, q12\n"
   1980       "vmull.u8 q9, d12, d16\n"
   1981       "vmull.u8 q10, d13, d16\n"
   1982       "vpadal.u16 q4, q9\n"
   1983       "vpadal.u16 q5, q10\n"
   1984 
   1985       // Loop break.
   1986       "bgt 1b\n"
   1987 
   1988       // StaticQuantizationInt32::Prepare
   1989       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
   1990       "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
   1991       "vdup.32 q6, d12[0]\n"
   1992 
   1993       // RowMajorOutput::Prepare
   1994 
   1995       // Reduce aggregators.
   1996       "vpadd.u32 d0, d0, d1\n"
   1997       "vpadd.u32 d2, d2, d3\n"
   1998       "vpadd.u32 d4, d4, d5\n"
   1999       "vpadd.u32 d6, d6, d7\n"
   2000       "vpadd.u32 d8, d8, d9\n"
   2001       "vpadd.u32 d10, d10, d11\n"
   2002       "vpadd.u32 d0, d0, d2\n"
   2003       "vpadd.u32 d1, d4, d6\n"
   2004       "vpadd.u32 d2, d8, d10\n"
   2005 
   2006       // StaticQuantizationInt32::Transform
   2007       "vadd.s32 q0, q0, q6\n"
   2008       "vadd.s32 q1, q1, q6\n"
   2009       "vadd.s32 q0, q0, q7\n"
   2010       "vadd.s32 q1, q1, q8\n"
   2011 
   2012       // RowMajorOutput::Output
   2013       "vst1.32 {d0, d1, d2}, [%[result]]!\n"
   2014       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2015       : [count] "r"(params.kernel.count),
   2016         [stride] "r"(params.output_stream.stride)
   2017       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   2018         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   2019         "d21", "d22", "d23", "d24", "d25", "cc", "memory");
   2020 }
   2021 
   2022 template <>
   2023 inline void MulKernel<
   2024     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7,
   2025     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2026                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2027                                          RowMajor>& params,
   2028                  int32_t* result) {
   2029 #ifdef DEBUG
   2030 #ifdef DEBUG_METAGEMM_VERBOSE
   2031   std::cout << __FILE__ << "(" << __LINE__
   2032             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2033                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, "
   2034                "8>::Multiply()"
   2035             << std::endl
   2036             << std::flush;
   2037 #endif
   2038 #endif
   2039   asm volatile(
   2040       "pld [%[lhs]]\n"
   2041       "pld [%[rhs]]\n"
   2042 
   2043       // Clear aggregators.
   2044       "vmov.i32 q0, #0\n"
   2045       "vmov.i32 q1, #0\n"
   2046       "vmov.i32 q2, #0\n"
   2047       "vmov.i32 q3, q0\n"
   2048       "vmov.i32 q4, q1\n"
   2049       "vmov.i32 q5, q2\n"
   2050       "vmov.i32 q6, q3\n"
   2051 
   2052       // General 1xM lanes loop.
   2053       "1:"
   2054 
   2055       // Subtract counter.
   2056       "subs %[count], %[count], #8\n"
   2057 
   2058       "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
   2059       "vld1.32 {d18}, [%[lhs]:64]!\n"
   2060       "pld [%[lhs], #64]\n"
   2061       "vmull.u8 q10, d14, d18\n"
   2062       "vmull.u8 q11, d15, d18\n"
   2063       "vmull.u8 q12, d16, d18\n"
   2064       "vmull.u8 q13, d17, d18\n"
   2065       "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
   2066       "pld [%[rhs], #128]\n"
   2067       "vpadal.u16 q0, q10\n"
   2068       "vpadal.u16 q1, q11\n"
   2069       "vpadal.u16 q2, q12\n"
   2070       "vpadal.u16 q3, q13\n"
   2071       "vmull.u8 q10, d14, d18\n"
   2072       "vmull.u8 q11, d15, d18\n"
   2073       "vmull.u8 q12, d16, d18\n"
   2074       "vpadal.u16 q4, q10\n"
   2075       "vpadal.u16 q5, q11\n"
   2076       "vpadal.u16 q6, q12\n"
   2077 
   2078       // Loop break.
   2079       "bgt 1b\n"
   2080 
   2081       // StaticQuantizationInt32::Prepare
   2082       "vld1.32 {d14, d15}, [%[lhs]:64]!\n"
   2083       "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n"
   2084       "vdup.32 q7, d14[0]\n"
   2085 
   2086       // RowMajorOutput::Prepare
   2087 
   2088       // Reduce aggregators.
   2089       "vpadd.u32 d0, d0, d1\n"
   2090       "vpadd.u32 d2, d2, d3\n"
   2091       "vpadd.u32 d4, d4, d5\n"
   2092       "vpadd.u32 d6, d6, d7\n"
   2093       "vpadd.u32 d8, d8, d9\n"
   2094       "vpadd.u32 d10, d10, d11\n"
   2095       "vpadd.u32 d12, d12, d13\n"
   2096       "vpadd.u32 d0, d0, d2\n"
   2097       "vpadd.u32 d1, d4, d6\n"
   2098       "vpadd.u32 d2, d8, d10\n"
   2099       "vpadd.u32 d3, d12, d12\n"
   2100 
   2101       // StaticQuantizationInt32::Transform
   2102       "vadd.s32 q0, q0, q7\n"
   2103       "vadd.s32 q1, q1, q7\n"
   2104       "vadd.s32 q0, q0, q8\n"
   2105       "vadd.s32 q1, q1, q9\n"
   2106 
   2107       // RowMajorOutput::Output
   2108       "vst1.32 {d0, d1, d2}, [%[result]]!\n"
   2109       "vst1.32 {d3[0]}, [%[result]]!\n"
   2110       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2111       : [count] "r"(params.kernel.count),
   2112         [stride] "r"(params.output_stream.stride)
   2113       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   2114         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   2115         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory");
   2116 }
   2117 
   2118 template <>
   2119 inline void MulKernel<
   2120     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8,
   2121     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2122                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2123                                          RowMajor>& params,
   2124                  int32_t* result) {
   2125 #ifdef DEBUG
   2126 #ifdef DEBUG_METAGEMM_VERBOSE
   2127   std::cout << __FILE__ << "(" << __LINE__
   2128             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2129                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, "
   2130                "8>::Multiply()"
   2131             << std::endl
   2132             << std::flush;
   2133 #endif
   2134 #endif
   2135   asm volatile(
   2136       "pld [%[lhs]]\n"
   2137       "pld [%[rhs]]\n"
   2138 
   2139       // Clear aggregators.
   2140       "vmov.i32 q0, #0\n"
   2141       "vmov.i32 q1, #0\n"
   2142       "vmov.i32 q2, #0\n"
   2143       "vmov.i32 q3, q0\n"
   2144       "vmov.i32 q4, q1\n"
   2145       "vmov.i32 q5, q2\n"
   2146       "vmov.i32 q6, q3\n"
   2147       "vmov.i32 q7, q4\n"
   2148 
   2149       // 1x8 lanes loop.
   2150       "1:"
   2151 
   2152       "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
   2153       "vld1.32 {d16}, [%[lhs]:64]!\n"
   2154       "vmull.u8 q11, d16, d17\n"
   2155       "vmull.u8 q12, d16, d18\n"
   2156       "vmull.u8 q13, d16, d19\n"
   2157       "vmull.u8 q14, d16, d20\n"
   2158       "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
   2159       "vpadal.u16 q0, q11\n"
   2160       "vpadal.u16 q1, q12\n"
   2161       "vpadal.u16 q2, q13\n"
   2162       "vpadal.u16 q3, q14\n"
   2163       "pld [%[rhs], #256]\n"
   2164       "vmull.u8 q15, d16, d17\n"
   2165       "vmull.u8 q11, d16, d18\n"
   2166       "vmull.u8 q12, d16, d19\n"
   2167       "vmull.u8 q13, d16, d20\n"
   2168       "pld [%[lhs], #32]\n"
   2169 
   2170       // Subtract counter.
   2171       "subs %[count], %[count], #8\n"
   2172 
   2173       "vpadal.u16 q4, q15\n"
   2174       "vpadal.u16 q5, q11\n"
   2175       "vpadal.u16 q6, q12\n"
   2176       "vpadal.u16 q7, q13\n"
   2177 
   2178       // Loop break.
   2179       "bgt 1b\n"
   2180 
   2181       // StaticQuantizationInt32::Prepare
   2182       "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
   2183       "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n"
   2184       "vdup.32 q8, d16[0]\n"
   2185 
   2186       // RowMajorOutput::Prepare
   2187 
   2188       // Reduce aggregators.
   2189       "vpadd.u32 d0, d0, d1\n"
   2190       "vpadd.u32 d2, d2, d3\n"
   2191       "vpadd.u32 d4, d4, d5\n"
   2192       "vpadd.u32 d6, d6, d7\n"
   2193       "vpadd.u32 d8, d8, d9\n"
   2194       "vpadd.u32 d10, d10, d11\n"
   2195       "vpadd.u32 d12, d12, d13\n"
   2196       "vpadd.u32 d14, d14, d15\n"
   2197       "vpadd.u32 d0, d0, d2\n"
   2198       "vpadd.u32 d1, d4, d6\n"
   2199       "vpadd.u32 d2, d8, d10\n"
   2200       "vpadd.u32 d3, d12, d14\n"
   2201 
   2202       // StaticQuantizationInt32::Transform
   2203       "vadd.s32 q0, q0, q8\n"
   2204       "vadd.s32 q1, q1, q8\n"
   2205       "vadd.s32 q0, q0, q9\n"
   2206       "vadd.s32 q1, q1, q10\n"
   2207 
   2208       // RowMajorOutput::Output
   2209       "vst1.32 {d0, d1, d2, d3}, [%[result]]!\n"
   2210       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2211       : [count] "r"(params.kernel.count),
   2212         [stride] "r"(params.output_stream.stride)
   2213       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   2214         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   2215         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
   2216         "d31", "cc", "memory");
   2217 }
   2218 
   2219 template <>
   2220 inline void MulKernel<
   2221     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1,
   2222     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2223                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2224                                          RowMajor>& params,
   2225                  int32_t* result) {
   2226 #ifdef DEBUG
   2227 #ifdef DEBUG_METAGEMM_VERBOSE
   2228   std::cout << __FILE__ << "(" << __LINE__
   2229             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2230                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, "
   2231                "8>::Multiply()"
   2232             << std::endl
   2233             << std::flush;
   2234 #endif
   2235 #endif
   2236   asm volatile(
   2237       "pld [%[lhs]]\n"
   2238       "pld [%[rhs]]\n"
   2239 
   2240       // Clear aggregators.
   2241       "vmov.i32 q0, #0\n"
   2242       "vmov.i32 q1, #0\n"
   2243 
   2244       // General NxM lanes loop.
   2245       "1:"
   2246 
   2247       // Subtract counter.
   2248       "subs %[count], %[count], #8\n"
   2249 
   2250       "vld1.32 {d4, d5}, [%[lhs]:64]!\n"
   2251       "vld1.32 {d6}, [%[rhs]:64]!\n"
   2252       "pld [%[lhs], #64]\n"
   2253       "pld [%[rhs], #64]\n"
   2254       "vmull.u8 q4, d6, d4\n"
   2255       "vmull.u8 q5, d6, d5\n"
   2256       "vpadal.u16 q0, q4\n"
   2257       "vpadal.u16 q1, q5\n"
   2258 
   2259       // Loop break.
   2260       "bgt 1b\n"
   2261 
   2262       // StaticQuantizationInt32::Prepare
   2263       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   2264       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   2265       "vdup.32 q2, d8[0]\n"
   2266       "vdup.32 q4, d8[1]\n"
   2267 
   2268       // RowMajorOutput::Prepare
   2269       "add r0, %[result], %[stride]\n"
   2270 
   2271       // Reduce aggregators.
   2272       "vpadd.u32 d0, d0, d1\n"
   2273       "vpadd.u32 d0, d0, d0\n"
   2274       "vpadd.u32 d2, d2, d3\n"
   2275       "vpadd.u32 d2, d2, d2\n"
   2276 
   2277       // StaticQuantizationInt32::Transform
   2278       "vadd.s32 q0, q0, q2\n"
   2279       "vadd.s32 q1, q1, q4\n"
   2280       "vadd.s32 q0, q0, q5\n"
   2281       "vadd.s32 q1, q1, q5\n"
   2282 
   2283       // RowMajorOutput::Output
   2284       "vst1.32 {d0[0]}, [%[result]]!\n"
   2285       "vst1.32 {d2[0]}, [r0]!\n"
   2286       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2287       : [count] "r"(params.kernel.count),
   2288         [stride] "r"(params.output_stream.stride)
   2289       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10",
   2290         "d11", "cc", "memory");
   2291 }
   2292 
   2293 template <>
   2294 inline void MulKernel<
   2295     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2,
   2296     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2297                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2298                                          RowMajor>& params,
   2299                  int32_t* result) {
   2300 #ifdef DEBUG
   2301 #ifdef DEBUG_METAGEMM_VERBOSE
   2302   std::cout << __FILE__ << "(" << __LINE__
   2303             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2304                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, "
   2305                "8>::Multiply()"
   2306             << std::endl
   2307             << std::flush;
   2308 #endif
   2309 #endif
   2310   asm volatile(
   2311       "pld [%[lhs]]\n"
   2312       "pld [%[rhs]]\n"
   2313 
   2314       // Clear aggregators.
   2315       "vmov.i32 q0, #0\n"
   2316       "vmov.i32 q1, #0\n"
   2317       "vmov.i32 q2, #0\n"
   2318       "vmov.i32 q3, q0\n"
   2319 
   2320       // General NxM lanes loop.
   2321       "1:"
   2322 
   2323       // Subtract counter.
   2324       "subs %[count], %[count], #8\n"
   2325 
   2326       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   2327       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   2328       "pld [%[lhs], #64]\n"
   2329       "pld [%[rhs], #64]\n"
   2330       "vmull.u8 q6, d10, d8\n"
   2331       "vmull.u8 q7, d11, d8\n"
   2332       "vmull.u8 q8, d10, d9\n"
   2333       "vmull.u8 q9, d11, d9\n"
   2334       "vpadal.u16 q0, q6\n"
   2335       "vpadal.u16 q1, q7\n"
   2336       "vpadal.u16 q2, q8\n"
   2337       "vpadal.u16 q3, q9\n"
   2338 
   2339       // Loop break.
   2340       "bgt 1b\n"
   2341 
   2342       // StaticQuantizationInt32::Prepare
   2343       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   2344       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   2345       "vdup.32 q6, d8[0]\n"
   2346       "vdup.32 q4, d8[1]\n"
   2347 
   2348       // RowMajorOutput::Prepare
   2349       "add r0, %[result], %[stride]\n"
   2350 
   2351       // Reduce aggregators.
   2352       "vpadd.u32 d0, d0, d1\n"
   2353       "vpadd.u32 d2, d2, d3\n"
   2354       "vpadd.u32 d0, d0, d2\n"
   2355       "vpadd.u32 d4, d4, d5\n"
   2356       "vpadd.u32 d6, d6, d7\n"
   2357       "vpadd.u32 d4, d4, d6\n"
   2358 
   2359       // StaticQuantizationInt32::Transform
   2360       "vadd.s32 q0, q0, q6\n"
   2361       "vadd.s32 q2, q2, q4\n"
   2362       "vadd.s32 q0, q0, q5\n"
   2363       "vadd.s32 q2, q2, q5\n"
   2364 
   2365       // RowMajorOutput::Output
   2366       "vst1.32 {d0}, [%[result]]!\n"
   2367       "vst1.32 {d4}, [r0]!\n"
   2368       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2369       : [count] "r"(params.kernel.count),
   2370         [stride] "r"(params.output_stream.stride)
   2371       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   2372         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc",
   2373         "memory");
   2374 }
   2375 
   2376 template <>
   2377 inline void MulKernel<
   2378     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3,
   2379     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2380                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2381                                          RowMajor>& params,
   2382                  int32_t* result) {
   2383 #ifdef DEBUG
   2384 #ifdef DEBUG_METAGEMM_VERBOSE
   2385   std::cout << __FILE__ << "(" << __LINE__
   2386             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2387                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, "
   2388                "8>::Multiply()"
   2389             << std::endl
   2390             << std::flush;
   2391 #endif
   2392 #endif
   2393   asm volatile(
   2394       "pld [%[lhs]]\n"
   2395       "pld [%[rhs]]\n"
   2396 
   2397       // Clear aggregators.
   2398       "vmov.i32 q0, #0\n"
   2399       "vmov.i32 q1, #0\n"
   2400       "vmov.i32 q2, #0\n"
   2401       "vmov.i32 q3, q0\n"
   2402       "vmov.i32 q4, q1\n"
   2403       "vmov.i32 q5, q2\n"
   2404 
   2405       // General NxM lanes loop.
   2406       "1:"
   2407 
   2408       // Subtract counter.
   2409       "subs %[count], %[count], #8\n"
   2410 
   2411       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
   2412       "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
   2413       "pld [%[lhs], #64]\n"
   2414       "pld [%[rhs], #64]\n"
   2415       "vmull.u8 q9, d14, d12\n"
   2416       "vmull.u8 q10, d15, d12\n"
   2417       "vmull.u8 q11, d16, d12\n"
   2418       "vmull.u8 q12, d14, d13\n"
   2419       "vmull.u8 q13, d15, d13\n"
   2420       "vmull.u8 q14, d16, d13\n"
   2421       "vpadal.u16 q0, q9\n"
   2422       "vpadal.u16 q1, q10\n"
   2423       "vpadal.u16 q2, q11\n"
   2424       "vpadal.u16 q3, q12\n"
   2425       "vpadal.u16 q4, q13\n"
   2426       "vpadal.u16 q5, q14\n"
   2427 
   2428       // Loop break.
   2429       "bgt 1b\n"
   2430 
   2431       // StaticQuantizationInt32::Prepare
   2432       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
   2433       "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
   2434       "vdup.32 q8, d12[0]\n"
   2435       "vdup.32 q6, d12[1]\n"
   2436 
   2437       // RowMajorOutput::Prepare
   2438       "add r0, %[result], %[stride]\n"
   2439 
   2440       // Reduce aggregators.
   2441       "vpadd.u32 d0, d0, d1\n"
   2442       "vpadd.u32 d2, d2, d3\n"
   2443       "vpadd.u32 d4, d4, d5\n"
   2444       "vpadd.u32 d0, d0, d2\n"
   2445       "vpadd.u32 d1, d4, d4\n"
   2446       "vpadd.u32 d6, d6, d7\n"
   2447       "vpadd.u32 d8, d8, d9\n"
   2448       "vpadd.u32 d10, d10, d11\n"
   2449       "vpadd.u32 d6, d6, d8\n"
   2450       "vpadd.u32 d7, d10, d10\n"
   2451 
   2452       // StaticQuantizationInt32::Transform
   2453       "vadd.s32 q0, q0, q8\n"
   2454       "vadd.s32 q3, q3, q6\n"
   2455       "vadd.s32 q0, q0, q7\n"
   2456       "vadd.s32 q3, q3, q7\n"
   2457 
   2458       // RowMajorOutput::Output
   2459       "vst1.32 {d0}, [%[result]]!\n"
   2460       "vst1.32 {d1[0]}, [%[result]]!\n"
   2461       "vst1.32 {d6}, [r0]!\n"
   2462       "vst1.32 {d7[0]}, [r0]!\n"
   2463       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2464       : [count] "r"(params.kernel.count),
   2465         [stride] "r"(params.output_stream.stride)
   2466       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   2467         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   2468         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc",
   2469         "memory");
   2470 }
   2471 
   2472 template <>
   2473 inline void MulKernel<
   2474     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4,
   2475     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2476                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2477                                          RowMajor>& params,
   2478                  int32_t* result) {
   2479 #ifdef DEBUG
   2480 #ifdef DEBUG_METAGEMM_VERBOSE
   2481   std::cout << __FILE__ << "(" << __LINE__
   2482             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2483                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, "
   2484                "8>::Multiply()"
   2485             << std::endl
   2486             << std::flush;
   2487 #endif
   2488 #endif
   2489   asm volatile(
   2490       "pld [%[lhs]]\n"
   2491       "pld [%[rhs]]\n"
   2492 
   2493       // Clear aggregators.
   2494       "vmov.i32 q0, #0\n"
   2495       "vmov.i32 q1, #0\n"
   2496       "vmov.i32 q2, #0\n"
   2497       "vmov.i32 q3, q0\n"
   2498       "vmov.i32 q4, q1\n"
   2499       "vmov.i32 q5, q2\n"
   2500       "vmov.i32 q6, q3\n"
   2501       "vmov.i32 q7, q4\n"
   2502 
   2503       // 2x4 lanes loop.
   2504       "1:"
   2505 
   2506       "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n"
   2507       "vld1.8 {d16}, [%[lhs]:64]!\n"
   2508       "vmull.u8 q11, d16, d18\n"
   2509       "vld1.8 {d17}, [%[lhs]:64]!\n"
   2510       "vmull.u8 q12, d16, d19\n"
   2511       "pld [%[rhs], #64]\n"
   2512       "vmull.u8 q13, d16, d20\n"
   2513       "pld [%[lhs], #64]\n"
   2514       "vmull.u8 q14, d16, d21\n"
   2515       "vmull.u8 q15, d17, d18\n"
   2516       "vpadal.u16 q0, q11\n"
   2517       "vpadal.u16 q1, q12\n"
   2518       "vpadal.u16 q2, q13\n"
   2519       "vmull.u8 q11, d17, d19\n"
   2520       "vmull.u8 q12, d17, d20\n"
   2521       "vmull.u8 q13, d17, d21\n"
   2522 
   2523       // Subtract counter.
   2524       "subs %[count], %[count], #8\n"
   2525 
   2526       "vpadal.u16 q3, q14\n"
   2527       "vpadal.u16 q4, q15\n"
   2528       "vpadal.u16 q5, q11\n"
   2529       "vpadal.u16 q6, q12\n"
   2530       "vpadal.u16 q7, q13\n"
   2531 
   2532       // Loop break.
   2533       "bgt 1b\n"
   2534 
   2535       // StaticQuantizationInt32::Prepare
   2536       "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
   2537       "vld1.32 {d18, d19}, [%[rhs]:64]!\n"
   2538       "vdup.32 q10, d16[0]\n"
   2539       "vdup.32 q8, d16[1]\n"
   2540 
   2541       // RowMajorOutput::Prepare
   2542       "add r0, %[result], %[stride]\n"
   2543 
   2544       // Reduce aggregators.
   2545       "vpadd.u32 d0, d0, d1\n"
   2546       "vpadd.u32 d2, d2, d3\n"
   2547       "vpadd.u32 d4, d4, d5\n"
   2548       "vpadd.u32 d6, d6, d7\n"
   2549       "vpadd.u32 d0, d0, d2\n"
   2550       "vpadd.u32 d1, d4, d6\n"
   2551       "vpadd.u32 d8, d8, d9\n"
   2552       "vpadd.u32 d10, d10, d11\n"
   2553       "vpadd.u32 d12, d12, d13\n"
   2554       "vpadd.u32 d14, d14, d15\n"
   2555       "vpadd.u32 d8, d8, d10\n"
   2556       "vpadd.u32 d9, d12, d14\n"
   2557 
   2558       // StaticQuantizationInt32::Transform
   2559       "vadd.s32 q0, q0, q10\n"
   2560       "vadd.s32 q4, q4, q8\n"
   2561       "vadd.s32 q0, q0, q9\n"
   2562       "vadd.s32 q4, q4, q9\n"
   2563 
   2564       // RowMajorOutput::Output
   2565       "vst1.32 {d0, d1}, [%[result]]!\n"
   2566       "vst1.32 {d8, d9}, [r0]!\n"
   2567       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2568       : [count] "r"(params.kernel.count),
   2569         [stride] "r"(params.output_stream.stride)
   2570       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   2571         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   2572         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
   2573         "d31", "cc", "memory");
   2574 }
   2575 
   2576 template <>
   2577 inline void MulKernel<
   2578     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1,
   2579     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2580                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2581                                          RowMajor>& params,
   2582                  int32_t* result) {
   2583 #ifdef DEBUG
   2584 #ifdef DEBUG_METAGEMM_VERBOSE
   2585   std::cout << __FILE__ << "(" << __LINE__
   2586             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2587                "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, "
   2588                "8>::Multiply()"
   2589             << std::endl
   2590             << std::flush;
   2591 #endif
   2592 #endif
   2593   asm volatile(
   2594       "pld [%[lhs]]\n"
   2595       "pld [%[rhs]]\n"
   2596 
   2597       // Clear aggregators.
   2598       "vmov.i32 q0, #0\n"
   2599       "vmov.i32 q1, #0\n"
   2600       "vmov.i32 q2, #0\n"
   2601 
   2602       // General NxM lanes loop.
   2603       "1:"
   2604 
   2605       // Subtract counter.
   2606       "subs %[count], %[count], #8\n"
   2607 
   2608       "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n"
   2609       "vld1.32 {d9}, [%[rhs]:64]!\n"
   2610       "pld [%[lhs], #64]\n"
   2611       "pld [%[rhs], #64]\n"
   2612       "vmull.u8 q5, d9, d6\n"
   2613       "vmull.u8 q6, d9, d7\n"
   2614       "vmull.u8 q7, d9, d8\n"
   2615       "vpadal.u16 q0, q5\n"
   2616       "vpadal.u16 q1, q6\n"
   2617       "vpadal.u16 q2, q7\n"
   2618 
   2619       // Loop break.
   2620       "bgt 1b\n"
   2621 
   2622       // StaticQuantizationInt32::Prepare
   2623       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   2624       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   2625       "vdup.32 q3, d8[0]\n"
   2626       "vdup.32 q6, d8[1]\n"
   2627       "vdup.32 q4, d9[0]\n"
   2628 
   2629       // RowMajorOutput::Prepare
   2630       "add r0, %[result], %[stride]\n"
   2631       "add r1, r0, %[stride]\n"
   2632 
   2633       // Reduce aggregators.
   2634       "vpadd.u32 d0, d0, d1\n"
   2635       "vpadd.u32 d0, d0, d0\n"
   2636       "vpadd.u32 d2, d2, d3\n"
   2637       "vpadd.u32 d2, d2, d2\n"
   2638       "vpadd.u32 d4, d4, d5\n"
   2639       "vpadd.u32 d4, d4, d4\n"
   2640 
   2641       // StaticQuantizationInt32::Transform
   2642       "vadd.s32 q0, q0, q3\n"
   2643       "vadd.s32 q1, q1, q6\n"
   2644       "vadd.s32 q2, q2, q4\n"
   2645       "vadd.s32 q0, q0, q5\n"
   2646       "vadd.s32 q1, q1, q5\n"
   2647       "vadd.s32 q2, q2, q5\n"
   2648 
   2649       // RowMajorOutput::Output
   2650       "vst1.32 {d0[0]}, [%[result]]!\n"
   2651       "vst1.32 {d2[0]}, [r0]!\n"
   2652       "vst1.32 {d4[0]}, [r1]!\n"
   2653       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2654       : [count] "r"(params.kernel.count),
   2655         [stride] "r"(params.output_stream.stride)
   2656       : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
   2657         "d10", "d11", "d12", "d13", "d14", "d15", "cc", "memory");
   2658 }
   2659 
   2660 template <>
   2661 inline void MulKernel<
   2662     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2,
   2663     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2664                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2665                                          RowMajor>& params,
   2666                  int32_t* result) {
   2667 #ifdef DEBUG
   2668 #ifdef DEBUG_METAGEMM_VERBOSE
   2669   std::cout << __FILE__ << "(" << __LINE__
   2670             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2671                "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, "
   2672                "8>::Multiply()"
   2673             << std::endl
   2674             << std::flush;
   2675 #endif
   2676 #endif
   2677   asm volatile(
   2678       "pld [%[lhs]]\n"
   2679       "pld [%[rhs]]\n"
   2680 
   2681       // Clear aggregators.
   2682       "vmov.i32 q0, #0\n"
   2683       "vmov.i32 q1, #0\n"
   2684       "vmov.i32 q2, #0\n"
   2685       "vmov.i32 q3, q0\n"
   2686       "vmov.i32 q4, q1\n"
   2687       "vmov.i32 q5, q2\n"
   2688 
   2689       // General NxM lanes loop.
   2690       "1:"
   2691 
   2692       // Subtract counter.
   2693       "subs %[count], %[count], #8\n"
   2694 
   2695       "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n"
   2696       "vld1.32 {d15, d16}, [%[rhs]:64]!\n"
   2697       "pld [%[lhs], #64]\n"
   2698       "pld [%[rhs], #64]\n"
   2699       "vmull.u8 q9, d15, d12\n"
   2700       "vmull.u8 q10, d16, d12\n"
   2701       "vmull.u8 q11, d15, d13\n"
   2702       "vmull.u8 q12, d16, d13\n"
   2703       "vmull.u8 q13, d15, d14\n"
   2704       "vmull.u8 q14, d16, d14\n"
   2705       "vpadal.u16 q0, q9\n"
   2706       "vpadal.u16 q1, q10\n"
   2707       "vpadal.u16 q2, q11\n"
   2708       "vpadal.u16 q3, q12\n"
   2709       "vpadal.u16 q4, q13\n"
   2710       "vpadal.u16 q5, q14\n"
   2711 
   2712       // Loop break.
   2713       "bgt 1b\n"
   2714 
   2715       // StaticQuantizationInt32::Prepare
   2716       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
   2717       "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
   2718       "vdup.32 q8, d12[0]\n"
   2719       "vdup.32 q9, d12[1]\n"
   2720       "vdup.32 q6, d13[0]\n"
   2721 
   2722       // RowMajorOutput::Prepare
   2723       "add r0, %[result], %[stride]\n"
   2724       "add r1, r0, %[stride]\n"
   2725 
   2726       // Reduce aggregators.
   2727       "vpadd.u32 d0, d0, d1\n"
   2728       "vpadd.u32 d2, d2, d3\n"
   2729       "vpadd.u32 d0, d0, d2\n"
   2730       "vpadd.u32 d4, d4, d5\n"
   2731       "vpadd.u32 d6, d6, d7\n"
   2732       "vpadd.u32 d4, d4, d6\n"
   2733       "vpadd.u32 d8, d8, d9\n"
   2734       "vpadd.u32 d10, d10, d11\n"
   2735       "vpadd.u32 d8, d8, d10\n"
   2736 
   2737       // StaticQuantizationInt32::Transform
   2738       "vadd.s32 q0, q0, q8\n"
   2739       "vadd.s32 q2, q2, q9\n"
   2740       "vadd.s32 q4, q4, q6\n"
   2741       "vadd.s32 q0, q0, q7\n"
   2742       "vadd.s32 q2, q2, q7\n"
   2743       "vadd.s32 q4, q4, q7\n"
   2744 
   2745       // RowMajorOutput::Output
   2746       "vst1.32 {d0}, [%[result]]!\n"
   2747       "vst1.32 {d4}, [r0]!\n"
   2748       "vst1.32 {d8}, [r1]!\n"
   2749       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2750       : [count] "r"(params.kernel.count),
   2751         [stride] "r"(params.output_stream.stride)
   2752       : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
   2753         "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
   2754         "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
   2755         "cc", "memory");
   2756 }
   2757 
   2758 template <>
   2759 inline void MulKernel<
   2760     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3,
   2761     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2762                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
   2763                                          RowMajor>& params,
   2764                  int32_t* result) {
   2765 #ifdef DEBUG
   2766 #ifdef DEBUG_METAGEMM_VERBOSE
   2767   std::cout << __FILE__ << "(" << __LINE__
   2768             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
   2769                "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, "
   2770                "8>::Multiply()"
   2771             << std::endl
   2772             << std::flush;
   2773 #endif
   2774 #endif
   2775   asm volatile(
   2776       "pld [%[lhs]]\n"
   2777       "pld [%[rhs]]\n"
   2778 
   2779       // Clear aggregators.
   2780       "vmov.i32 q0, #0\n"
   2781       "vmov.i32 q1, #0\n"
   2782       "vmov.i32 q2, #0\n"
   2783       "vmov.i32 q3, q0\n"
   2784       "vmov.i32 q4, q1\n"
   2785       "vmov.i32 q5, q2\n"
   2786       "vmov.i32 q6, q3\n"
   2787       "vmov.i32 q7, q4\n"
   2788       "vmov.i32 q8, q5\n"
   2789 
   2790       // 3x3 lanes loop.
   2791       "1:"
   2792 
   2793       "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n"
   2794       "vld1.8 {d18}, [%[lhs]:64]!\n"
   2795       "vmull.u8 q12, d18, d21\n"
   2796       "vld1.8 {d19}, [%[lhs]:64]!\n"
   2797       "vmull.u8 q13, d18, d22\n"
   2798       "vld1.8 {d20}, [%[lhs]:64]!\n"
   2799       "vmull.u8 q14, d18, d23\n"
   2800       "pld [%[lhs], #64]\n"
   2801       "vmull.u8 q15, d19, d21\n"
   2802       "pld [%[rhs], #64]\n"
   2803       "vpadal.u16 q0, q12\n"
   2804       "vpadal.u16 q1, q13\n"
   2805       "vpadal.u16 q2, q14\n"
   2806       "vpadal.u16 q3, q15\n"
   2807       "vmull.u8 q12, d19, d22\n"
   2808       "vmull.u8 q13, d19, d23\n"
   2809       "vmull.u8 q14, d20, d21\n"
   2810       "vmull.u8 q15, d20, d22\n"
   2811 
   2812       // Subtract counter.
   2813       "subs %[count], %[count], #8\n"
   2814 
   2815       "vmull.u8 q9, d20, d23\n"
   2816       "vpadal.u16 q4, q12\n"
   2817       "vpadal.u16 q5, q13\n"
   2818       "vpadal.u16 q6, q14\n"
   2819       "vpadal.u16 q7, q15\n"
   2820       "vpadal.u16 q8, q9\n"
   2821 
   2822       // Loop break.
   2823       "bgt 1b\n"
   2824 
   2825       // StaticQuantizationInt32::Prepare
   2826       "vld1.32 {d18, d19}, [%[lhs]:64]!\n"
   2827       "vld1.32 {d20, d21}, [%[rhs]:64]!\n"
   2828       "vdup.32 q11, d18[0]\n"
   2829       "vdup.32 q12, d18[1]\n"
   2830       "vdup.32 q9, d19[0]\n"
   2831 
   2832       // RowMajorOutput::Prepare
   2833       "add r0, %[result], %[stride]\n"
   2834       "add r1, r0, %[stride]\n"
   2835 
   2836       // Reduce aggregators.
   2837       "vpadd.u32 d0, d0, d1\n"
   2838       "vpadd.u32 d2, d2, d3\n"
   2839       "vpadd.u32 d4, d4, d5\n"
   2840       "vpadd.u32 d0, d0, d2\n"
   2841       "vpadd.u32 d1, d4, d4\n"
   2842       "vpadd.u32 d6, d6, d7\n"
   2843       "vpadd.u32 d8, d8, d9\n"
   2844       "vpadd.u32 d10, d10, d11\n"
   2845       "vpadd.u32 d6, d6, d8\n"
   2846       "vpadd.u32 d7, d10, d10\n"
   2847       "vpadd.u32 d12, d12, d13\n"
   2848       "vpadd.u32 d14, d14, d15\n"
   2849       "vpadd.u32 d16, d16, d17\n"
   2850       "vpadd.u32 d12, d12, d14\n"
   2851       "vpadd.u32 d13, d16, d16\n"
   2852 
   2853       // StaticQuantizationInt32::Transform
   2854       "vadd.s32 q0, q0, q11\n"
   2855       "vadd.s32 q3, q3, q12\n"
   2856       "vadd.s32 q6, q6, q9\n"
   2857       "vadd.s32 q0, q0, q10\n"
   2858       "vadd.s32 q3, q3, q10\n"
   2859       "vadd.s32 q6, q6, q10\n"
   2860 
   2861       // RowMajorOutput::Output
   2862       "vst1.32 {d0}, [%[result]]!\n"
   2863       "vst1.32 {d1[0]}, [%[result]]!\n"
   2864       "vst1.32 {d6}, [r0]!\n"
   2865       "vst1.32 {d7[0]}, [r0]!\n"
   2866       "vst1.32 {d12}, [r1]!\n"
   2867       "vst1.32 {d13[0]}, [r1]!\n"
   2868       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2869       : [count] "r"(params.kernel.count),
   2870         [stride] "r"(params.output_stream.stride)
   2871       : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
   2872         "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
   2873         "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
   2874         "d30", "d31", "cc", "memory");
   2875 }
   2876 
   2877 template <>
   2878 inline void MulKernel<
   2879     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1,
   2880     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2881                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   2882                                          RowMajor>& params,
   2883                  float* result) {
   2884 #ifdef DEBUG
   2885 #ifdef DEBUG_METAGEMM_VERBOSE
   2886   std::cout << __FILE__ << "(" << __LINE__
   2887             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   2888                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, "
   2889                "8>::Multiply()"
   2890             << std::endl
   2891             << std::flush;
   2892 #endif
   2893 #endif
   2894   asm volatile(
   2895       "pld [%[lhs]]\n"
   2896       "pld [%[rhs]]\n"
   2897 
   2898       // Clear aggregators.
   2899       "vmov.i32 q0, #0\n"
   2900 
   2901       // General NxM lanes loop.
   2902       "1:"
   2903 
   2904       // Subtract counter.
   2905       "subs %[count], %[count], #8\n"
   2906 
   2907       "vld1.32 {d2}, [%[lhs]:64]!\n"
   2908       "vld1.32 {d3}, [%[rhs]:64]!\n"
   2909       "pld [%[lhs], #64]\n"
   2910       "pld [%[rhs], #64]\n"
   2911       "vmull.u8 q2, d3, d2\n"
   2912       "vpadal.u16 q0, q2\n"
   2913 
   2914       // Loop break.
   2915       "bgt 1b\n"
   2916 
   2917       // StaticQuantizationFloat::Prepare
   2918       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   2919       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   2920       "vdup.32 q6, %[scale]\n"
   2921       "vdup.32 q4, d8[0]\n"
   2922 
   2923       // RowMajorOutput::Prepare
   2924 
   2925       // Reduce aggregators.
   2926       "vpadd.u32 d0, d0, d1\n"
   2927       "vpadd.u32 d0, d0, d0\n"
   2928 
   2929       // StaticQuantizationFloat::Transform
   2930       "vadd.s32 q0, q0, q4\n"
   2931       "vadd.s32 q0, q0, q5\n"
   2932       "vcvt.f32.s32 q0, q0\n"
   2933       "vmul.f32 q0, q0, q6\n"
   2934 
   2935       // RowMajorOutput::Output
   2936       "vst1.32 {d0[0]}, [%[result]]!\n"
   2937       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   2938       : [count] "r"(params.kernel.count),
   2939         [stride] "r"(params.output_stream.stride),
   2940         [scale] "r"(params.kernel.scale)
   2941       : "d0", "d1", "d2", "d3", "d4", "d5", "d8", "d9", "d10", "d11", "d12",
   2942         "d13", "cc", "memory");
   2943 }
   2944 
   2945 template <>
   2946 inline void MulKernel<
   2947     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2,
   2948     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   2949                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   2950                                          RowMajor>& params,
   2951                  float* result) {
   2952 #ifdef DEBUG
   2953 #ifdef DEBUG_METAGEMM_VERBOSE
   2954   std::cout << __FILE__ << "(" << __LINE__
   2955             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   2956                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, "
   2957                "8>::Multiply()"
   2958             << std::endl
   2959             << std::flush;
   2960 #endif
   2961 #endif
   2962   asm volatile(
   2963       "pld [%[lhs]]\n"
   2964       "pld [%[rhs]]\n"
   2965 
   2966       // Clear aggregators.
   2967       "vmov.i32 q0, #0\n"
   2968       "vmov.i32 q1, #0\n"
   2969 
   2970       // General NxM lanes loop.
   2971       "1:"
   2972 
   2973       // Subtract counter.
   2974       "subs %[count], %[count], #8\n"
   2975 
   2976       "vld1.32 {d4}, [%[lhs]:64]!\n"
   2977       "vld1.32 {d5, d6}, [%[rhs]:64]!\n"
   2978       "pld [%[lhs], #64]\n"
   2979       "pld [%[rhs], #64]\n"
   2980       "vmull.u8 q4, d5, d4\n"
   2981       "vmull.u8 q5, d6, d4\n"
   2982       "vpadal.u16 q0, q4\n"
   2983       "vpadal.u16 q1, q5\n"
   2984 
   2985       // Loop break.
   2986       "bgt 1b\n"
   2987 
   2988       // StaticQuantizationFloat::Prepare
   2989       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   2990       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   2991       "vdup.32 q6, %[scale]\n"
   2992       "vdup.32 q4, d8[0]\n"
   2993 
   2994       // RowMajorOutput::Prepare
   2995 
   2996       // Reduce aggregators.
   2997       "vpadd.u32 d0, d0, d1\n"
   2998       "vpadd.u32 d2, d2, d3\n"
   2999       "vpadd.u32 d0, d0, d2\n"
   3000 
   3001       // StaticQuantizationFloat::Transform
   3002       "vadd.s32 q0, q0, q4\n"
   3003       "vadd.s32 q0, q0, q5\n"
   3004       "vcvt.f32.s32 q0, q0\n"
   3005       "vmul.f32 q0, q0, q6\n"
   3006 
   3007       // RowMajorOutput::Output
   3008       "vst1.32 {d0}, [%[result]]!\n"
   3009       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3010       : [count] "r"(params.kernel.count),
   3011         [stride] "r"(params.output_stream.stride),
   3012         [scale] "r"(params.kernel.scale)
   3013       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10", "d11",
   3014         "d12", "d13", "cc", "memory");
   3015 }
   3016 
   3017 template <>
   3018 inline void MulKernel<
   3019     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3,
   3020     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3021                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3022                                          RowMajor>& params,
   3023                  float* result) {
   3024 #ifdef DEBUG
   3025 #ifdef DEBUG_METAGEMM_VERBOSE
   3026   std::cout << __FILE__ << "(" << __LINE__
   3027             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3028                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, "
   3029                "8>::Multiply()"
   3030             << std::endl
   3031             << std::flush;
   3032 #endif
   3033 #endif
   3034   asm volatile(
   3035       "pld [%[lhs]]\n"
   3036       "pld [%[rhs]]\n"
   3037 
   3038       // Clear aggregators.
   3039       "vmov.i32 q0, #0\n"
   3040       "vmov.i32 q1, #0\n"
   3041       "vmov.i32 q2, #0\n"
   3042 
   3043       // General NxM lanes loop.
   3044       "1:"
   3045 
   3046       // Subtract counter.
   3047       "subs %[count], %[count], #8\n"
   3048 
   3049       "vld1.32 {d6}, [%[lhs]:64]!\n"
   3050       "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n"
   3051       "pld [%[lhs], #64]\n"
   3052       "pld [%[rhs], #64]\n"
   3053       "vmull.u8 q5, d7, d6\n"
   3054       "vmull.u8 q6, d8, d6\n"
   3055       "vmull.u8 q7, d9, d6\n"
   3056       "vpadal.u16 q0, q5\n"
   3057       "vpadal.u16 q1, q6\n"
   3058       "vpadal.u16 q2, q7\n"
   3059 
   3060       // Loop break.
   3061       "bgt 1b\n"
   3062 
   3063       // StaticQuantizationFloat::Prepare
   3064       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   3065       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   3066       "vdup.32 q6, %[scale]\n"
   3067       "vdup.32 q4, d8[0]\n"
   3068 
   3069       // RowMajorOutput::Prepare
   3070 
   3071       // Reduce aggregators.
   3072       "vpadd.u32 d0, d0, d1\n"
   3073       "vpadd.u32 d2, d2, d3\n"
   3074       "vpadd.u32 d4, d4, d5\n"
   3075       "vpadd.u32 d0, d0, d2\n"
   3076       "vpadd.u32 d1, d4, d4\n"
   3077 
   3078       // StaticQuantizationFloat::Transform
   3079       "vadd.s32 q0, q0, q4\n"
   3080       "vadd.s32 q0, q0, q5\n"
   3081       "vcvt.f32.s32 q0, q0\n"
   3082       "vmul.f32 q0, q0, q6\n"
   3083 
   3084       // RowMajorOutput::Output
   3085       "vst1.32 {d0}, [%[result]]!\n"
   3086       "vst1.32 {d1[0]}, [%[result]]!\n"
   3087       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3088       : [count] "r"(params.kernel.count),
   3089         [stride] "r"(params.output_stream.stride),
   3090         [scale] "r"(params.kernel.scale)
   3091       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   3092         "d11", "d12", "d13", "d14", "d15", "cc", "memory");
   3093 }
   3094 
   3095 template <>
   3096 inline void MulKernel<
   3097     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4,
   3098     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3099                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3100                                          RowMajor>& params,
   3101                  float* result) {
   3102 #ifdef DEBUG
   3103 #ifdef DEBUG_METAGEMM_VERBOSE
   3104   std::cout << __FILE__ << "(" << __LINE__
   3105             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3106                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, "
   3107                "8>::Multiply()"
   3108             << std::endl
   3109             << std::flush;
   3110 #endif
   3111 #endif
   3112   asm volatile(
   3113       "pld [%[lhs]]\n"
   3114       "pld [%[rhs]]\n"
   3115 
   3116       // Clear aggregators.
   3117       "vmov.i32 q0, #0\n"
   3118       "vmov.i32 q1, #0\n"
   3119       "vmov.i32 q2, #0\n"
   3120       "vmov.i32 q3, q0\n"
   3121 
   3122       // General NxM lanes loop.
   3123       "1:"
   3124 
   3125       // Subtract counter.
   3126       "subs %[count], %[count], #8\n"
   3127 
   3128       "vld1.32 {d8}, [%[lhs]:64]!\n"
   3129       "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n"
   3130       "pld [%[lhs], #64]\n"
   3131       "pld [%[rhs], #64]\n"
   3132       "vmull.u8 q7, d9, d8\n"
   3133       "vmull.u8 q8, d10, d8\n"
   3134       "vmull.u8 q9, d11, d8\n"
   3135       "vmull.u8 q10, d12, d8\n"
   3136       "vpadal.u16 q0, q7\n"
   3137       "vpadal.u16 q1, q8\n"
   3138       "vpadal.u16 q2, q9\n"
   3139       "vpadal.u16 q3, q10\n"
   3140 
   3141       // Loop break.
   3142       "bgt 1b\n"
   3143 
   3144       // StaticQuantizationFloat::Prepare
   3145       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   3146       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   3147       "vdup.32 q6, %[scale]\n"
   3148       "vdup.32 q4, d8[0]\n"
   3149 
   3150       // RowMajorOutput::Prepare
   3151 
   3152       // Reduce aggregators.
   3153       "vpadd.u32 d0, d0, d1\n"
   3154       "vpadd.u32 d2, d2, d3\n"
   3155       "vpadd.u32 d4, d4, d5\n"
   3156       "vpadd.u32 d6, d6, d7\n"
   3157       "vpadd.u32 d0, d0, d2\n"
   3158       "vpadd.u32 d1, d4, d6\n"
   3159 
   3160       // StaticQuantizationFloat::Transform
   3161       "vadd.s32 q0, q0, q4\n"
   3162       "vadd.s32 q0, q0, q5\n"
   3163       "vcvt.f32.s32 q0, q0\n"
   3164       "vmul.f32 q0, q0, q6\n"
   3165 
   3166       // RowMajorOutput::Output
   3167       "vst1.32 {d0, d1}, [%[result]]!\n"
   3168       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3169       : [count] "r"(params.kernel.count),
   3170         [stride] "r"(params.output_stream.stride),
   3171         [scale] "r"(params.kernel.scale)
   3172       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   3173         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   3174         "d21", "cc", "memory");
   3175 }
   3176 
   3177 template <>
   3178 inline void MulKernel<
   3179     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5,
   3180     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3181                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3182                                          RowMajor>& params,
   3183                  float* result) {
   3184 #ifdef DEBUG
   3185 #ifdef DEBUG_METAGEMM_VERBOSE
   3186   std::cout << __FILE__ << "(" << __LINE__
   3187             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3188                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, "
   3189                "8>::Multiply()"
   3190             << std::endl
   3191             << std::flush;
   3192 #endif
   3193 #endif
   3194   asm volatile(
   3195       "pld [%[lhs]]\n"
   3196       "pld [%[rhs]]\n"
   3197 
   3198       // Clear aggregators.
   3199       "vmov.i32 q0, #0\n"
   3200       "vmov.i32 q1, #0\n"
   3201       "vmov.i32 q2, #0\n"
   3202       "vmov.i32 q3, q0\n"
   3203       "vmov.i32 q4, q1\n"
   3204 
   3205       // General 1xM lanes loop.
   3206       "1:"
   3207 
   3208       // Subtract counter.
   3209       "subs %[count], %[count], #8\n"
   3210 
   3211       "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n"
   3212       "vld1.32 {d14}, [%[lhs]:64]!\n"
   3213       "pld [%[lhs], #64]\n"
   3214       "vmull.u8 q8, d10, d14\n"
   3215       "vmull.u8 q9, d11, d14\n"
   3216       "vmull.u8 q10, d12, d14\n"
   3217       "vmull.u8 q11, d13, d14\n"
   3218       "vld1.32 {d10}, [%[rhs]:64]!\n"
   3219       "pld [%[rhs], #128]\n"
   3220       "vpadal.u16 q0, q8\n"
   3221       "vpadal.u16 q1, q9\n"
   3222       "vpadal.u16 q2, q10\n"
   3223       "vpadal.u16 q3, q11\n"
   3224       "vmull.u8 q8, d10, d14\n"
   3225       "vpadal.u16 q4, q8\n"
   3226 
   3227       // Loop break.
   3228       "bgt 1b\n"
   3229 
   3230       // StaticQuantizationFloat::Prepare
   3231       "vld1.32 {d10, d11}, [%[lhs]:64]!\n"
   3232       "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
   3233       "vdup.32 q8, %[scale]\n"
   3234       "vdup.32 q5, d10[0]\n"
   3235 
   3236       // RowMajorOutput::Prepare
   3237 
   3238       // Reduce aggregators.
   3239       "vpadd.u32 d0, d0, d1\n"
   3240       "vpadd.u32 d2, d2, d3\n"
   3241       "vpadd.u32 d4, d4, d5\n"
   3242       "vpadd.u32 d6, d6, d7\n"
   3243       "vpadd.u32 d8, d8, d9\n"
   3244       "vpadd.u32 d0, d0, d2\n"
   3245       "vpadd.u32 d1, d4, d6\n"
   3246       "vpadd.u32 d2, d8, d8\n"
   3247 
   3248       // StaticQuantizationFloat::Transform
   3249       "vadd.s32 q0, q0, q5\n"
   3250       "vadd.s32 q1, q1, q5\n"
   3251       "vadd.s32 q0, q0, q6\n"
   3252       "vadd.s32 q1, q1, q7\n"
   3253       "vcvt.f32.s32 q0, q0\n"
   3254       "vcvt.f32.s32 q1, q1\n"
   3255       "vmul.f32 q0, q0, q8\n"
   3256       "vmul.f32 q1, q1, q8\n"
   3257 
   3258       // RowMajorOutput::Output
   3259       "vst1.32 {d0, d1}, [%[result]]!\n"
   3260       "vst1.32 {d2[0]}, [%[result]]!\n"
   3261       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3262       : [count] "r"(params.kernel.count),
   3263         [stride] "r"(params.output_stream.stride),
   3264         [scale] "r"(params.kernel.scale)
   3265       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   3266         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   3267         "d21", "d22", "d23", "cc", "memory");
   3268 }
   3269 
   3270 template <>
   3271 inline void MulKernel<
   3272     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6,
   3273     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3274                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3275                                          RowMajor>& params,
   3276                  float* result) {
   3277 #ifdef DEBUG
   3278 #ifdef DEBUG_METAGEMM_VERBOSE
   3279   std::cout << __FILE__ << "(" << __LINE__
   3280             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3281                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, "
   3282                "8>::Multiply()"
   3283             << std::endl
   3284             << std::flush;
   3285 #endif
   3286 #endif
   3287   asm volatile(
   3288       "pld [%[lhs]]\n"
   3289       "pld [%[rhs]]\n"
   3290 
   3291       // Clear aggregators.
   3292       "vmov.i32 q0, #0\n"
   3293       "vmov.i32 q1, #0\n"
   3294       "vmov.i32 q2, #0\n"
   3295       "vmov.i32 q3, q0\n"
   3296       "vmov.i32 q4, q1\n"
   3297       "vmov.i32 q5, q2\n"
   3298 
   3299       // General 1xM lanes loop.
   3300       "1:"
   3301 
   3302       // Subtract counter.
   3303       "subs %[count], %[count], #8\n"
   3304 
   3305       "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n"
   3306       "vld1.32 {d16}, [%[lhs]:64]!\n"
   3307       "pld [%[lhs], #64]\n"
   3308       "vmull.u8 q9, d12, d16\n"
   3309       "vmull.u8 q10, d13, d16\n"
   3310       "vmull.u8 q11, d14, d16\n"
   3311       "vmull.u8 q12, d15, d16\n"
   3312       "vld1.32 {d12, d13}, [%[rhs]:64]!\n"
   3313       "pld [%[rhs], #128]\n"
   3314       "vpadal.u16 q0, q9\n"
   3315       "vpadal.u16 q1, q10\n"
   3316       "vpadal.u16 q2, q11\n"
   3317       "vpadal.u16 q3, q12\n"
   3318       "vmull.u8 q9, d12, d16\n"
   3319       "vmull.u8 q10, d13, d16\n"
   3320       "vpadal.u16 q4, q9\n"
   3321       "vpadal.u16 q5, q10\n"
   3322 
   3323       // Loop break.
   3324       "bgt 1b\n"
   3325 
   3326       // StaticQuantizationFloat::Prepare
   3327       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
   3328       "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
   3329       "vdup.32 q9, %[scale]\n"
   3330       "vdup.32 q6, d12[0]\n"
   3331 
   3332       // RowMajorOutput::Prepare
   3333 
   3334       // Reduce aggregators.
   3335       "vpadd.u32 d0, d0, d1\n"
   3336       "vpadd.u32 d2, d2, d3\n"
   3337       "vpadd.u32 d4, d4, d5\n"
   3338       "vpadd.u32 d6, d6, d7\n"
   3339       "vpadd.u32 d8, d8, d9\n"
   3340       "vpadd.u32 d10, d10, d11\n"
   3341       "vpadd.u32 d0, d0, d2\n"
   3342       "vpadd.u32 d1, d4, d6\n"
   3343       "vpadd.u32 d2, d8, d10\n"
   3344 
   3345       // StaticQuantizationFloat::Transform
   3346       "vadd.s32 q0, q0, q6\n"
   3347       "vadd.s32 q1, q1, q6\n"
   3348       "vadd.s32 q0, q0, q7\n"
   3349       "vadd.s32 q1, q1, q8\n"
   3350       "vcvt.f32.s32 q0, q0\n"
   3351       "vcvt.f32.s32 q1, q1\n"
   3352       "vmul.f32 q0, q0, q9\n"
   3353       "vmul.f32 q1, q1, q9\n"
   3354 
   3355       // RowMajorOutput::Output
   3356       "vst1.32 {d0, d1, d2}, [%[result]]!\n"
   3357       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3358       : [count] "r"(params.kernel.count),
   3359         [stride] "r"(params.output_stream.stride),
   3360         [scale] "r"(params.kernel.scale)
   3361       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   3362         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   3363         "d21", "d22", "d23", "d24", "d25", "cc", "memory");
   3364 }
   3365 
   3366 template <>
   3367 inline void MulKernel<
   3368     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7,
   3369     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3370                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3371                                          RowMajor>& params,
   3372                  float* result) {
   3373 #ifdef DEBUG
   3374 #ifdef DEBUG_METAGEMM_VERBOSE
   3375   std::cout << __FILE__ << "(" << __LINE__
   3376             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3377                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, "
   3378                "8>::Multiply()"
   3379             << std::endl
   3380             << std::flush;
   3381 #endif
   3382 #endif
   3383   asm volatile(
   3384       "pld [%[lhs]]\n"
   3385       "pld [%[rhs]]\n"
   3386 
   3387       // Clear aggregators.
   3388       "vmov.i32 q0, #0\n"
   3389       "vmov.i32 q1, #0\n"
   3390       "vmov.i32 q2, #0\n"
   3391       "vmov.i32 q3, q0\n"
   3392       "vmov.i32 q4, q1\n"
   3393       "vmov.i32 q5, q2\n"
   3394       "vmov.i32 q6, q3\n"
   3395 
   3396       // General 1xM lanes loop.
   3397       "1:"
   3398 
   3399       // Subtract counter.
   3400       "subs %[count], %[count], #8\n"
   3401 
   3402       "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n"
   3403       "vld1.32 {d18}, [%[lhs]:64]!\n"
   3404       "pld [%[lhs], #64]\n"
   3405       "vmull.u8 q10, d14, d18\n"
   3406       "vmull.u8 q11, d15, d18\n"
   3407       "vmull.u8 q12, d16, d18\n"
   3408       "vmull.u8 q13, d17, d18\n"
   3409       "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
   3410       "pld [%[rhs], #128]\n"
   3411       "vpadal.u16 q0, q10\n"
   3412       "vpadal.u16 q1, q11\n"
   3413       "vpadal.u16 q2, q12\n"
   3414       "vpadal.u16 q3, q13\n"
   3415       "vmull.u8 q10, d14, d18\n"
   3416       "vmull.u8 q11, d15, d18\n"
   3417       "vmull.u8 q12, d16, d18\n"
   3418       "vpadal.u16 q4, q10\n"
   3419       "vpadal.u16 q5, q11\n"
   3420       "vpadal.u16 q6, q12\n"
   3421 
   3422       // Loop break.
   3423       "bgt 1b\n"
   3424 
   3425       // StaticQuantizationFloat::Prepare
   3426       "vld1.32 {d14, d15}, [%[lhs]:64]!\n"
   3427       "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n"
   3428       "vdup.32 q10, %[scale]\n"
   3429       "vdup.32 q7, d14[0]\n"
   3430 
   3431       // RowMajorOutput::Prepare
   3432 
   3433       // Reduce aggregators.
   3434       "vpadd.u32 d0, d0, d1\n"
   3435       "vpadd.u32 d2, d2, d3\n"
   3436       "vpadd.u32 d4, d4, d5\n"
   3437       "vpadd.u32 d6, d6, d7\n"
   3438       "vpadd.u32 d8, d8, d9\n"
   3439       "vpadd.u32 d10, d10, d11\n"
   3440       "vpadd.u32 d12, d12, d13\n"
   3441       "vpadd.u32 d0, d0, d2\n"
   3442       "vpadd.u32 d1, d4, d6\n"
   3443       "vpadd.u32 d2, d8, d10\n"
   3444       "vpadd.u32 d3, d12, d12\n"
   3445 
   3446       // StaticQuantizationFloat::Transform
   3447       "vadd.s32 q0, q0, q7\n"
   3448       "vadd.s32 q1, q1, q7\n"
   3449       "vadd.s32 q0, q0, q8\n"
   3450       "vadd.s32 q1, q1, q9\n"
   3451       "vcvt.f32.s32 q0, q0\n"
   3452       "vcvt.f32.s32 q1, q1\n"
   3453       "vmul.f32 q0, q0, q10\n"
   3454       "vmul.f32 q1, q1, q10\n"
   3455 
   3456       // RowMajorOutput::Output
   3457       "vst1.32 {d0, d1, d2}, [%[result]]!\n"
   3458       "vst1.32 {d3[0]}, [%[result]]!\n"
   3459       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3460       : [count] "r"(params.kernel.count),
   3461         [stride] "r"(params.output_stream.stride),
   3462         [scale] "r"(params.kernel.scale)
   3463       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   3464         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   3465         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "cc", "memory");
   3466 }
   3467 
   3468 template <>
   3469 inline void MulKernel<
   3470     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8,
   3471     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3472                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3473                                          RowMajor>& params,
   3474                  float* result) {
   3475 #ifdef DEBUG
   3476 #ifdef DEBUG_METAGEMM_VERBOSE
   3477   std::cout << __FILE__ << "(" << __LINE__
   3478             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3479                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, "
   3480                "8>::Multiply()"
   3481             << std::endl
   3482             << std::flush;
   3483 #endif
   3484 #endif
   3485   asm volatile(
   3486       "pld [%[lhs]]\n"
   3487       "pld [%[rhs]]\n"
   3488 
   3489       // Clear aggregators.
   3490       "vmov.i32 q0, #0\n"
   3491       "vmov.i32 q1, #0\n"
   3492       "vmov.i32 q2, #0\n"
   3493       "vmov.i32 q3, q0\n"
   3494       "vmov.i32 q4, q1\n"
   3495       "vmov.i32 q5, q2\n"
   3496       "vmov.i32 q6, q3\n"
   3497       "vmov.i32 q7, q4\n"
   3498 
   3499       // 1x8 lanes loop.
   3500       "1:"
   3501 
   3502       "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
   3503       "vld1.32 {d16}, [%[lhs]:64]!\n"
   3504       "vmull.u8 q11, d16, d17\n"
   3505       "vmull.u8 q12, d16, d18\n"
   3506       "vmull.u8 q13, d16, d19\n"
   3507       "vmull.u8 q14, d16, d20\n"
   3508       "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n"
   3509       "vpadal.u16 q0, q11\n"
   3510       "vpadal.u16 q1, q12\n"
   3511       "vpadal.u16 q2, q13\n"
   3512       "vpadal.u16 q3, q14\n"
   3513       "pld [%[rhs], #256]\n"
   3514       "vmull.u8 q15, d16, d17\n"
   3515       "vmull.u8 q11, d16, d18\n"
   3516       "vmull.u8 q12, d16, d19\n"
   3517       "vmull.u8 q13, d16, d20\n"
   3518       "pld [%[lhs], #32]\n"
   3519 
   3520       // Subtract counter.
   3521       "subs %[count], %[count], #8\n"
   3522 
   3523       "vpadal.u16 q4, q15\n"
   3524       "vpadal.u16 q5, q11\n"
   3525       "vpadal.u16 q6, q12\n"
   3526       "vpadal.u16 q7, q13\n"
   3527 
   3528       // Loop break.
   3529       "bgt 1b\n"
   3530 
   3531       // StaticQuantizationFloat::Prepare
   3532       "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
   3533       "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n"
   3534       "vdup.32 q11, %[scale]\n"
   3535       "vdup.32 q8, d16[0]\n"
   3536 
   3537       // RowMajorOutput::Prepare
   3538 
   3539       // Reduce aggregators.
   3540       "vpadd.u32 d0, d0, d1\n"
   3541       "vpadd.u32 d2, d2, d3\n"
   3542       "vpadd.u32 d4, d4, d5\n"
   3543       "vpadd.u32 d6, d6, d7\n"
   3544       "vpadd.u32 d8, d8, d9\n"
   3545       "vpadd.u32 d10, d10, d11\n"
   3546       "vpadd.u32 d12, d12, d13\n"
   3547       "vpadd.u32 d14, d14, d15\n"
   3548       "vpadd.u32 d0, d0, d2\n"
   3549       "vpadd.u32 d1, d4, d6\n"
   3550       "vpadd.u32 d2, d8, d10\n"
   3551       "vpadd.u32 d3, d12, d14\n"
   3552 
   3553       // StaticQuantizationFloat::Transform
   3554       "vadd.s32 q0, q0, q8\n"
   3555       "vadd.s32 q1, q1, q8\n"
   3556       "vadd.s32 q0, q0, q9\n"
   3557       "vadd.s32 q1, q1, q10\n"
   3558       "vcvt.f32.s32 q0, q0\n"
   3559       "vcvt.f32.s32 q1, q1\n"
   3560       "vmul.f32 q0, q0, q11\n"
   3561       "vmul.f32 q1, q1, q11\n"
   3562 
   3563       // RowMajorOutput::Output
   3564       "vst1.32 {d0, d1, d2, d3}, [%[result]]!\n"
   3565       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3566       : [count] "r"(params.kernel.count),
   3567         [stride] "r"(params.output_stream.stride),
   3568         [scale] "r"(params.kernel.scale)
   3569       : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   3570         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   3571         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
   3572         "d31", "cc", "memory");
   3573 }
   3574 
   3575 template <>
   3576 inline void MulKernel<
   3577     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1,
   3578     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3579                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3580                                          RowMajor>& params,
   3581                  float* result) {
   3582 #ifdef DEBUG
   3583 #ifdef DEBUG_METAGEMM_VERBOSE
   3584   std::cout << __FILE__ << "(" << __LINE__
   3585             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3586                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, "
   3587                "8>::Multiply()"
   3588             << std::endl
   3589             << std::flush;
   3590 #endif
   3591 #endif
   3592   asm volatile(
   3593       "pld [%[lhs]]\n"
   3594       "pld [%[rhs]]\n"
   3595 
   3596       // Clear aggregators.
   3597       "vmov.i32 q0, #0\n"
   3598       "vmov.i32 q1, #0\n"
   3599 
   3600       // General NxM lanes loop.
   3601       "1:"
   3602 
   3603       // Subtract counter.
   3604       "subs %[count], %[count], #8\n"
   3605 
   3606       "vld1.32 {d4, d5}, [%[lhs]:64]!\n"
   3607       "vld1.32 {d6}, [%[rhs]:64]!\n"
   3608       "pld [%[lhs], #64]\n"
   3609       "pld [%[rhs], #64]\n"
   3610       "vmull.u8 q4, d6, d4\n"
   3611       "vmull.u8 q5, d6, d5\n"
   3612       "vpadal.u16 q0, q4\n"
   3613       "vpadal.u16 q1, q5\n"
   3614 
   3615       // Loop break.
   3616       "bgt 1b\n"
   3617 
   3618       // StaticQuantizationFloat::Prepare
   3619       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   3620       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   3621       "vdup.32 q6, %[scale]\n"
   3622       "vdup.32 q2, d8[0]\n"
   3623       "vdup.32 q4, d8[1]\n"
   3624 
   3625       // RowMajorOutput::Prepare
   3626       "add r0, %[result], %[stride]\n"
   3627 
   3628       // Reduce aggregators.
   3629       "vpadd.u32 d0, d0, d1\n"
   3630       "vpadd.u32 d0, d0, d0\n"
   3631       "vpadd.u32 d2, d2, d3\n"
   3632       "vpadd.u32 d2, d2, d2\n"
   3633 
   3634       // StaticQuantizationFloat::Transform
   3635       "vadd.s32 q0, q0, q2\n"
   3636       "vadd.s32 q1, q1, q4\n"
   3637       "vadd.s32 q0, q0, q5\n"
   3638       "vadd.s32 q1, q1, q5\n"
   3639       "vcvt.f32.s32 q0, q0\n"
   3640       "vcvt.f32.s32 q1, q1\n"
   3641       "vmul.f32 q0, q0, q6\n"
   3642       "vmul.f32 q1, q1, q6\n"
   3643 
   3644       // RowMajorOutput::Output
   3645       "vst1.32 {d0[0]}, [%[result]]!\n"
   3646       "vst1.32 {d2[0]}, [r0]!\n"
   3647       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3648       : [count] "r"(params.kernel.count),
   3649         [stride] "r"(params.output_stream.stride),
   3650         [scale] "r"(params.kernel.scale)
   3651       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d8", "d9", "d10",
   3652         "d11", "d12", "d13", "cc", "memory");
   3653 }
   3654 
   3655 template <>
   3656 inline void MulKernel<
   3657     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2,
   3658     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3659                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3660                                          RowMajor>& params,
   3661                  float* result) {
   3662 #ifdef DEBUG
   3663 #ifdef DEBUG_METAGEMM_VERBOSE
   3664   std::cout << __FILE__ << "(" << __LINE__
   3665             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3666                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, "
   3667                "8>::Multiply()"
   3668             << std::endl
   3669             << std::flush;
   3670 #endif
   3671 #endif
   3672   asm volatile(
   3673       "pld [%[lhs]]\n"
   3674       "pld [%[rhs]]\n"
   3675 
   3676       // Clear aggregators.
   3677       "vmov.i32 q0, #0\n"
   3678       "vmov.i32 q1, #0\n"
   3679       "vmov.i32 q2, #0\n"
   3680       "vmov.i32 q3, q0\n"
   3681 
   3682       // General NxM lanes loop.
   3683       "1:"
   3684 
   3685       // Subtract counter.
   3686       "subs %[count], %[count], #8\n"
   3687 
   3688       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   3689       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   3690       "pld [%[lhs], #64]\n"
   3691       "pld [%[rhs], #64]\n"
   3692       "vmull.u8 q6, d10, d8\n"
   3693       "vmull.u8 q7, d11, d8\n"
   3694       "vmull.u8 q8, d10, d9\n"
   3695       "vmull.u8 q9, d11, d9\n"
   3696       "vpadal.u16 q0, q6\n"
   3697       "vpadal.u16 q1, q7\n"
   3698       "vpadal.u16 q2, q8\n"
   3699       "vpadal.u16 q3, q9\n"
   3700 
   3701       // Loop break.
   3702       "bgt 1b\n"
   3703 
   3704       // StaticQuantizationFloat::Prepare
   3705       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   3706       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   3707       "vdup.32 q6, %[scale]\n"
   3708       "vdup.32 q7, d8[0]\n"
   3709       "vdup.32 q4, d8[1]\n"
   3710 
   3711       // RowMajorOutput::Prepare
   3712       "add r0, %[result], %[stride]\n"
   3713 
   3714       // Reduce aggregators.
   3715       "vpadd.u32 d0, d0, d1\n"
   3716       "vpadd.u32 d2, d2, d3\n"
   3717       "vpadd.u32 d0, d0, d2\n"
   3718       "vpadd.u32 d4, d4, d5\n"
   3719       "vpadd.u32 d6, d6, d7\n"
   3720       "vpadd.u32 d4, d4, d6\n"
   3721 
   3722       // StaticQuantizationFloat::Transform
   3723       "vadd.s32 q0, q0, q7\n"
   3724       "vadd.s32 q2, q2, q4\n"
   3725       "vadd.s32 q0, q0, q5\n"
   3726       "vadd.s32 q2, q2, q5\n"
   3727       "vcvt.f32.s32 q0, q0\n"
   3728       "vcvt.f32.s32 q2, q2\n"
   3729       "vmul.f32 q0, q0, q6\n"
   3730       "vmul.f32 q2, q2, q6\n"
   3731 
   3732       // RowMajorOutput::Output
   3733       "vst1.32 {d0}, [%[result]]!\n"
   3734       "vst1.32 {d4}, [r0]!\n"
   3735       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3736       : [count] "r"(params.kernel.count),
   3737         [stride] "r"(params.output_stream.stride),
   3738         [scale] "r"(params.kernel.scale)
   3739       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   3740         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "cc",
   3741         "memory");
   3742 }
   3743 
   3744 template <>
   3745 inline void MulKernel<
   3746     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3,
   3747     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3748                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3749                                          RowMajor>& params,
   3750                  float* result) {
   3751 #ifdef DEBUG
   3752 #ifdef DEBUG_METAGEMM_VERBOSE
   3753   std::cout << __FILE__ << "(" << __LINE__
   3754             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3755                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, "
   3756                "8>::Multiply()"
   3757             << std::endl
   3758             << std::flush;
   3759 #endif
   3760 #endif
   3761   asm volatile(
   3762       "pld [%[lhs]]\n"
   3763       "pld [%[rhs]]\n"
   3764 
   3765       // Clear aggregators.
   3766       "vmov.i32 q0, #0\n"
   3767       "vmov.i32 q1, #0\n"
   3768       "vmov.i32 q2, #0\n"
   3769       "vmov.i32 q3, q0\n"
   3770       "vmov.i32 q4, q1\n"
   3771       "vmov.i32 q5, q2\n"
   3772 
   3773       // General NxM lanes loop.
   3774       "1:"
   3775 
   3776       // Subtract counter.
   3777       "subs %[count], %[count], #8\n"
   3778 
   3779       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
   3780       "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n"
   3781       "pld [%[lhs], #64]\n"
   3782       "pld [%[rhs], #64]\n"
   3783       "vmull.u8 q9, d14, d12\n"
   3784       "vmull.u8 q10, d15, d12\n"
   3785       "vmull.u8 q11, d16, d12\n"
   3786       "vmull.u8 q12, d14, d13\n"
   3787       "vmull.u8 q13, d15, d13\n"
   3788       "vmull.u8 q14, d16, d13\n"
   3789       "vpadal.u16 q0, q9\n"
   3790       "vpadal.u16 q1, q10\n"
   3791       "vpadal.u16 q2, q11\n"
   3792       "vpadal.u16 q3, q12\n"
   3793       "vpadal.u16 q4, q13\n"
   3794       "vpadal.u16 q5, q14\n"
   3795 
   3796       // Loop break.
   3797       "bgt 1b\n"
   3798 
   3799       // StaticQuantizationFloat::Prepare
   3800       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
   3801       "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
   3802       "vdup.32 q8, %[scale]\n"
   3803       "vdup.32 q9, d12[0]\n"
   3804       "vdup.32 q6, d12[1]\n"
   3805 
   3806       // RowMajorOutput::Prepare
   3807       "add r0, %[result], %[stride]\n"
   3808 
   3809       // Reduce aggregators.
   3810       "vpadd.u32 d0, d0, d1\n"
   3811       "vpadd.u32 d2, d2, d3\n"
   3812       "vpadd.u32 d4, d4, d5\n"
   3813       "vpadd.u32 d0, d0, d2\n"
   3814       "vpadd.u32 d1, d4, d4\n"
   3815       "vpadd.u32 d6, d6, d7\n"
   3816       "vpadd.u32 d8, d8, d9\n"
   3817       "vpadd.u32 d10, d10, d11\n"
   3818       "vpadd.u32 d6, d6, d8\n"
   3819       "vpadd.u32 d7, d10, d10\n"
   3820 
   3821       // StaticQuantizationFloat::Transform
   3822       "vadd.s32 q0, q0, q9\n"
   3823       "vadd.s32 q3, q3, q6\n"
   3824       "vadd.s32 q0, q0, q7\n"
   3825       "vadd.s32 q3, q3, q7\n"
   3826       "vcvt.f32.s32 q0, q0\n"
   3827       "vcvt.f32.s32 q3, q3\n"
   3828       "vmul.f32 q0, q0, q8\n"
   3829       "vmul.f32 q3, q3, q8\n"
   3830 
   3831       // RowMajorOutput::Output
   3832       "vst1.32 {d0}, [%[result]]!\n"
   3833       "vst1.32 {d1[0]}, [%[result]]!\n"
   3834       "vst1.32 {d6}, [r0]!\n"
   3835       "vst1.32 {d7[0]}, [r0]!\n"
   3836       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3837       : [count] "r"(params.kernel.count),
   3838         [stride] "r"(params.output_stream.stride),
   3839         [scale] "r"(params.kernel.scale)
   3840       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   3841         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   3842         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "cc",
   3843         "memory");
   3844 }
   3845 
   3846 template <>
   3847 inline void MulKernel<
   3848     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4,
   3849     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3850                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3851                                          RowMajor>& params,
   3852                  float* result) {
   3853 #ifdef DEBUG
   3854 #ifdef DEBUG_METAGEMM_VERBOSE
   3855   std::cout << __FILE__ << "(" << __LINE__
   3856             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3857                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, "
   3858                "8>::Multiply()"
   3859             << std::endl
   3860             << std::flush;
   3861 #endif
   3862 #endif
   3863   asm volatile(
   3864       "pld [%[lhs]]\n"
   3865       "pld [%[rhs]]\n"
   3866 
   3867       // Clear aggregators.
   3868       "vmov.i32 q0, #0\n"
   3869       "vmov.i32 q1, #0\n"
   3870       "vmov.i32 q2, #0\n"
   3871       "vmov.i32 q3, q0\n"
   3872       "vmov.i32 q4, q1\n"
   3873       "vmov.i32 q5, q2\n"
   3874       "vmov.i32 q6, q3\n"
   3875       "vmov.i32 q7, q4\n"
   3876 
   3877       // 2x4 lanes loop.
   3878       "1:"
   3879 
   3880       "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n"
   3881       "vld1.8 {d16}, [%[lhs]:64]!\n"
   3882       "vmull.u8 q11, d16, d18\n"
   3883       "vld1.8 {d17}, [%[lhs]:64]!\n"
   3884       "vmull.u8 q12, d16, d19\n"
   3885       "pld [%[rhs], #64]\n"
   3886       "vmull.u8 q13, d16, d20\n"
   3887       "pld [%[lhs], #64]\n"
   3888       "vmull.u8 q14, d16, d21\n"
   3889       "vmull.u8 q15, d17, d18\n"
   3890       "vpadal.u16 q0, q11\n"
   3891       "vpadal.u16 q1, q12\n"
   3892       "vpadal.u16 q2, q13\n"
   3893       "vmull.u8 q11, d17, d19\n"
   3894       "vmull.u8 q12, d17, d20\n"
   3895       "vmull.u8 q13, d17, d21\n"
   3896 
   3897       // Subtract counter.
   3898       "subs %[count], %[count], #8\n"
   3899 
   3900       "vpadal.u16 q3, q14\n"
   3901       "vpadal.u16 q4, q15\n"
   3902       "vpadal.u16 q5, q11\n"
   3903       "vpadal.u16 q6, q12\n"
   3904       "vpadal.u16 q7, q13\n"
   3905 
   3906       // Loop break.
   3907       "bgt 1b\n"
   3908 
   3909       // StaticQuantizationFloat::Prepare
   3910       "vld1.32 {d16, d17}, [%[lhs]:64]!\n"
   3911       "vld1.32 {d18, d19}, [%[rhs]:64]!\n"
   3912       "vdup.32 q10, %[scale]\n"
   3913       "vdup.32 q11, d16[0]\n"
   3914       "vdup.32 q8, d16[1]\n"
   3915 
   3916       // RowMajorOutput::Prepare
   3917       "add r0, %[result], %[stride]\n"
   3918 
   3919       // Reduce aggregators.
   3920       "vpadd.u32 d0, d0, d1\n"
   3921       "vpadd.u32 d2, d2, d3\n"
   3922       "vpadd.u32 d4, d4, d5\n"
   3923       "vpadd.u32 d6, d6, d7\n"
   3924       "vpadd.u32 d0, d0, d2\n"
   3925       "vpadd.u32 d1, d4, d6\n"
   3926       "vpadd.u32 d8, d8, d9\n"
   3927       "vpadd.u32 d10, d10, d11\n"
   3928       "vpadd.u32 d12, d12, d13\n"
   3929       "vpadd.u32 d14, d14, d15\n"
   3930       "vpadd.u32 d8, d8, d10\n"
   3931       "vpadd.u32 d9, d12, d14\n"
   3932 
   3933       // StaticQuantizationFloat::Transform
   3934       "vadd.s32 q0, q0, q11\n"
   3935       "vadd.s32 q4, q4, q8\n"
   3936       "vadd.s32 q0, q0, q9\n"
   3937       "vadd.s32 q4, q4, q9\n"
   3938       "vcvt.f32.s32 q0, q0\n"
   3939       "vcvt.f32.s32 q4, q4\n"
   3940       "vmul.f32 q0, q0, q10\n"
   3941       "vmul.f32 q4, q4, q10\n"
   3942 
   3943       // RowMajorOutput::Output
   3944       "vst1.32 {d0, d1}, [%[result]]!\n"
   3945       "vst1.32 {d8, d9}, [r0]!\n"
   3946       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   3947       : [count] "r"(params.kernel.count),
   3948         [stride] "r"(params.output_stream.stride),
   3949         [scale] "r"(params.kernel.scale)
   3950       : "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
   3951         "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
   3952         "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
   3953         "d31", "cc", "memory");
   3954 }
   3955 
   3956 template <>
   3957 inline void MulKernel<
   3958     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1,
   3959     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   3960                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   3961                                          RowMajor>& params,
   3962                  float* result) {
   3963 #ifdef DEBUG
   3964 #ifdef DEBUG_METAGEMM_VERBOSE
   3965   std::cout << __FILE__ << "(" << __LINE__
   3966             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   3967                "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, "
   3968                "8>::Multiply()"
   3969             << std::endl
   3970             << std::flush;
   3971 #endif
   3972 #endif
   3973   asm volatile(
   3974       "pld [%[lhs]]\n"
   3975       "pld [%[rhs]]\n"
   3976 
   3977       // Clear aggregators.
   3978       "vmov.i32 q0, #0\n"
   3979       "vmov.i32 q1, #0\n"
   3980       "vmov.i32 q2, #0\n"
   3981 
   3982       // General NxM lanes loop.
   3983       "1:"
   3984 
   3985       // Subtract counter.
   3986       "subs %[count], %[count], #8\n"
   3987 
   3988       "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n"
   3989       "vld1.32 {d9}, [%[rhs]:64]!\n"
   3990       "pld [%[lhs], #64]\n"
   3991       "pld [%[rhs], #64]\n"
   3992       "vmull.u8 q5, d9, d6\n"
   3993       "vmull.u8 q6, d9, d7\n"
   3994       "vmull.u8 q7, d9, d8\n"
   3995       "vpadal.u16 q0, q5\n"
   3996       "vpadal.u16 q1, q6\n"
   3997       "vpadal.u16 q2, q7\n"
   3998 
   3999       // Loop break.
   4000       "bgt 1b\n"
   4001 
   4002       // StaticQuantizationFloat::Prepare
   4003       "vld1.32 {d8, d9}, [%[lhs]:64]!\n"
   4004       "vld1.32 {d10, d11}, [%[rhs]:64]!\n"
   4005       "vdup.32 q6, %[scale]\n"
   4006       "vdup.32 q3, d8[0]\n"
   4007       "vdup.32 q7, d8[1]\n"
   4008       "vdup.32 q4, d9[0]\n"
   4009 
   4010       // RowMajorOutput::Prepare
   4011       "add r0, %[result], %[stride]\n"
   4012       "add r1, r0, %[stride]\n"
   4013 
   4014       // Reduce aggregators.
   4015       "vpadd.u32 d0, d0, d1\n"
   4016       "vpadd.u32 d0, d0, d0\n"
   4017       "vpadd.u32 d2, d2, d3\n"
   4018       "vpadd.u32 d2, d2, d2\n"
   4019       "vpadd.u32 d4, d4, d5\n"
   4020       "vpadd.u32 d4, d4, d4\n"
   4021 
   4022       // StaticQuantizationFloat::Transform
   4023       "vadd.s32 q0, q0, q3\n"
   4024       "vadd.s32 q1, q1, q7\n"
   4025       "vadd.s32 q2, q2, q4\n"
   4026       "vadd.s32 q0, q0, q5\n"
   4027       "vadd.s32 q1, q1, q5\n"
   4028       "vadd.s32 q2, q2, q5\n"
   4029       "vcvt.f32.s32 q0, q0\n"
   4030       "vcvt.f32.s32 q1, q1\n"
   4031       "vcvt.f32.s32 q2, q2\n"
   4032       "vmul.f32 q0, q0, q6\n"
   4033       "vmul.f32 q1, q1, q6\n"
   4034       "vmul.f32 q2, q2, q6\n"
   4035 
   4036       // RowMajorOutput::Output
   4037       "vst1.32 {d0[0]}, [%[result]]!\n"
   4038       "vst1.32 {d2[0]}, [r0]!\n"
   4039       "vst1.32 {d4[0]}, [r1]!\n"
   4040       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   4041       : [count] "r"(params.kernel.count),
   4042         [stride] "r"(params.output_stream.stride),
   4043         [scale] "r"(params.kernel.scale)
   4044       : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
   4045         "d10", "d11", "d12", "d13", "d14", "d15", "cc", "memory");
   4046 }
   4047 
   4048 template <>
   4049 inline void MulKernel<
   4050     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2,
   4051     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   4052                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   4053                                          RowMajor>& params,
   4054                  float* result) {
   4055 #ifdef DEBUG
   4056 #ifdef DEBUG_METAGEMM_VERBOSE
   4057   std::cout << __FILE__ << "(" << __LINE__
   4058             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   4059                "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, "
   4060                "8>::Multiply()"
   4061             << std::endl
   4062             << std::flush;
   4063 #endif
   4064 #endif
   4065   asm volatile(
   4066       "pld [%[lhs]]\n"
   4067       "pld [%[rhs]]\n"
   4068 
   4069       // Clear aggregators.
   4070       "vmov.i32 q0, #0\n"
   4071       "vmov.i32 q1, #0\n"
   4072       "vmov.i32 q2, #0\n"
   4073       "vmov.i32 q3, q0\n"
   4074       "vmov.i32 q4, q1\n"
   4075       "vmov.i32 q5, q2\n"
   4076 
   4077       // General NxM lanes loop.
   4078       "1:"
   4079 
   4080       // Subtract counter.
   4081       "subs %[count], %[count], #8\n"
   4082 
   4083       "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n"
   4084       "vld1.32 {d15, d16}, [%[rhs]:64]!\n"
   4085       "pld [%[lhs], #64]\n"
   4086       "pld [%[rhs], #64]\n"
   4087       "vmull.u8 q9, d15, d12\n"
   4088       "vmull.u8 q10, d16, d12\n"
   4089       "vmull.u8 q11, d15, d13\n"
   4090       "vmull.u8 q12, d16, d13\n"
   4091       "vmull.u8 q13, d15, d14\n"
   4092       "vmull.u8 q14, d16, d14\n"
   4093       "vpadal.u16 q0, q9\n"
   4094       "vpadal.u16 q1, q10\n"
   4095       "vpadal.u16 q2, q11\n"
   4096       "vpadal.u16 q3, q12\n"
   4097       "vpadal.u16 q4, q13\n"
   4098       "vpadal.u16 q5, q14\n"
   4099 
   4100       // Loop break.
   4101       "bgt 1b\n"
   4102 
   4103       // StaticQuantizationFloat::Prepare
   4104       "vld1.32 {d12, d13}, [%[lhs]:64]!\n"
   4105       "vld1.32 {d14, d15}, [%[rhs]:64]!\n"
   4106       "vdup.32 q8, %[scale]\n"
   4107       "vdup.32 q9, d12[0]\n"
   4108       "vdup.32 q10, d12[1]\n"
   4109       "vdup.32 q6, d13[0]\n"
   4110 
   4111       // RowMajorOutput::Prepare
   4112       "add r0, %[result], %[stride]\n"
   4113       "add r1, r0, %[stride]\n"
   4114 
   4115       // Reduce aggregators.
   4116       "vpadd.u32 d0, d0, d1\n"
   4117       "vpadd.u32 d2, d2, d3\n"
   4118       "vpadd.u32 d0, d0, d2\n"
   4119       "vpadd.u32 d4, d4, d5\n"
   4120       "vpadd.u32 d6, d6, d7\n"
   4121       "vpadd.u32 d4, d4, d6\n"
   4122       "vpadd.u32 d8, d8, d9\n"
   4123       "vpadd.u32 d10, d10, d11\n"
   4124       "vpadd.u32 d8, d8, d10\n"
   4125 
   4126       // StaticQuantizationFloat::Transform
   4127       "vadd.s32 q0, q0, q9\n"
   4128       "vadd.s32 q2, q2, q10\n"
   4129       "vadd.s32 q4, q4, q6\n"
   4130       "vadd.s32 q0, q0, q7\n"
   4131       "vadd.s32 q2, q2, q7\n"
   4132       "vadd.s32 q4, q4, q7\n"
   4133       "vcvt.f32.s32 q0, q0\n"
   4134       "vcvt.f32.s32 q2, q2\n"
   4135       "vcvt.f32.s32 q4, q4\n"
   4136       "vmul.f32 q0, q0, q8\n"
   4137       "vmul.f32 q2, q2, q8\n"
   4138       "vmul.f32 q4, q4, q8\n"
   4139 
   4140       // RowMajorOutput::Output
   4141       "vst1.32 {d0}, [%[result]]!\n"
   4142       "vst1.32 {d4}, [r0]!\n"
   4143       "vst1.32 {d8}, [r1]!\n"
   4144       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
   4145       : [count] "r"(params.kernel.count),
   4146         [stride] "r"(params.output_stream.stride),
   4147         [scale] "r"(params.kernel.scale)
   4148       : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
   4149         "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
   4150         "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
   4151         "cc", "memory");
   4152 }
   4153 
   4154 template <>
   4155 inline void MulKernel<
   4156     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3,
   4157     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
   4158                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
   4159                                          RowMajor>& params,
   4160                  float* result) {
          // 3x3 quantized multiply kernel with float output (ARM32 NEON).
          //
          // lhs, rhs: each stream is consumed as three 8-byte lanes per loop
          //   iteration; after the loop four 32-bit words are read from each
          //   stream. Every load carries a :64 alignment qualifier, so both
          //   pointers must be 8-byte aligned.
          // params: kernel.count is decremented by 8 per iteration and the loop
          //   repeats while the remainder is > 0 (the body always runs once);
          //   kernel.scale multiplies the converted sums; output_stream.stride
          //   is added to `result` as a byte offset to reach rows 1 and 2.
          // result: receives three floats for each of three rows.
   4161 #ifdef DEBUG
   4162 #ifdef DEBUG_METAGEMM_VERBOSE
   4163   std::cout << __FILE__ << "(" << __LINE__
   4164             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
   4165                "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, "
   4166                "8>::Multiply()"
   4167             << std::endl
   4168             << std::flush;
   4169 #endif
   4170 #endif
          // The loop counter is decremented inside the asm, so it has to be a
          // read-write operand. Binding params.kernel.count as an input-only
          // operand and then modifying it breaks the extended-asm contract: the
          // compiler may assume the register still holds the original value.
          auto count = params.kernel.count;
   4171   asm volatile(
   4172       "pld [%[lhs]]\n"  // Preload hints for both input streams.
   4173       "pld [%[rhs]]\n"
   4174
   4175       // Clear aggregators: q0..q8 hold the nine (lhs row, rhs column) sums.
                // q3..q8 are zeroed by copying registers cleared just above.
   4176       "vmov.i32 q0, #0\n"
   4177       "vmov.i32 q1, #0\n"
   4178       "vmov.i32 q2, #0\n"
   4179       "vmov.i32 q3, q0\n"
   4180       "vmov.i32 q4, q1\n"
   4181       "vmov.i32 q5, q2\n"
   4182       "vmov.i32 q6, q3\n"
   4183       "vmov.i32 q7, q4\n"
   4184       "vmov.i32 q8, q5\n"
   4185
   4186       // 3x3 lanes loop. Each pass consumes 8 bytes from each of the three
                // lhs lanes (d18..d20) and three rhs lanes (d21..d23). vmull.u8
                // widens the byte products to u16 and vpadal.u16 pairwise-adds
                // them into the u32 aggregators:
                //   q0..q2 = lhs0 x rhs{0,1,2}
                //   q3..q5 = lhs1 x rhs{0,1,2}
                //   q6..q8 = lhs2 x rhs{0,1,2}
   4187       "1:"
   4188
   4189       "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n"  // 24 rhs bytes at once.
   4190       "vld1.8 {d18}, [%[lhs]:64]!\n"
   4191       "vmull.u8 q12, d18, d21\n"
   4192       "vld1.8 {d19}, [%[lhs]:64]!\n"
   4193       "vmull.u8 q13, d18, d22\n"
   4194       "vld1.8 {d20}, [%[lhs]:64]!\n"
   4195       "vmull.u8 q14, d18, d23\n"
   4196       "pld [%[lhs], #64]\n"  // Preload 64 bytes ahead of each stream.
   4197       "vmull.u8 q15, d19, d21\n"
   4198       "pld [%[rhs], #64]\n"
   4199       "vpadal.u16 q0, q12\n"
   4200       "vpadal.u16 q1, q13\n"
   4201       "vpadal.u16 q2, q14\n"
   4202       "vpadal.u16 q3, q15\n"
   4203       "vmull.u8 q12, d19, d22\n"
   4204       "vmull.u8 q13, d19, d23\n"
   4205       "vmull.u8 q14, d20, d21\n"
   4206       "vmull.u8 q15, d20, d22\n"
   4207
   4208       // Subtract counter. The flags set here are consumed by the bgt below.
   4209       "subs %[count], %[count], #8\n"
   4210
   4211       "vmull.u8 q9, d20, d23\n"  // q9 aliases d18/d19, no longer needed.
   4212       "vpadal.u16 q4, q12\n"
   4213       "vpadal.u16 q5, q13\n"
   4214       "vpadal.u16 q6, q14\n"
   4215       "vpadal.u16 q7, q15\n"
   4216       "vpadal.u16 q8, q9\n"
   4217
   4218       // Loop break.
   4219       "bgt 1b\n"
   4220
   4221       // StaticQuantizationFloat::Prepare
                // Read the four 32-bit words that follow the lanes in each stream,
                // then broadcast the scale and the first three lhs words.
   4222       "vld1.32 {d18, d19}, [%[lhs]:64]!\n"
   4223       "vld1.32 {d20, d21}, [%[rhs]:64]!\n"  // q10: per-column terms.
   4224       "vdup.32 q11, %[scale]\n"
   4225       "vdup.32 q12, d18[0]\n"  // Term added to row 0.
   4226       "vdup.32 q13, d18[1]\n"  // Term added to row 1.
   4227       "vdup.32 q9, d19[0]\n"   // Term added to row 2.
   4228
   4229       // RowMajorOutput::Prepare
                // r0 and r1 address output rows 1 and 2.
   4230       "add r0, %[result], %[stride]\n"
   4231       "add r1, r0, %[stride]\n"
   4232
   4233       // Reduce aggregators.
                // Fold every 4-lane aggregator to one sum and pack one output row
                // per q register: q0 = row 0, q3 = row 1, q6 = row 2. Lane 3 of
                // each duplicates lane 2 and is never stored.
   4234       "vpadd.u32 d0, d0, d1\n"
   4235       "vpadd.u32 d2, d2, d3\n"
   4236       "vpadd.u32 d4, d4, d5\n"
   4237       "vpadd.u32 d0, d0, d2\n"
   4238       "vpadd.u32 d1, d4, d4\n"
   4239       "vpadd.u32 d6, d6, d7\n"
   4240       "vpadd.u32 d8, d8, d9\n"
   4241       "vpadd.u32 d10, d10, d11\n"
   4242       "vpadd.u32 d6, d6, d8\n"
   4243       "vpadd.u32 d7, d10, d10\n"
   4244       "vpadd.u32 d12, d12, d13\n"
   4245       "vpadd.u32 d14, d14, d15\n"
   4246       "vpadd.u32 d16, d16, d17\n"
   4247       "vpadd.u32 d12, d12, d14\n"
   4248       "vpadd.u32 d13, d16, d16\n"
   4249
   4250       // StaticQuantizationFloat::Transform
                // Add the per-row and per-column terms, convert the integer sums
                // to float and multiply by the scale.
   4251       "vadd.s32 q0, q0, q12\n"
   4252       "vadd.s32 q3, q3, q13\n"
   4253       "vadd.s32 q6, q6, q9\n"
   4254       "vadd.s32 q0, q0, q10\n"
   4255       "vadd.s32 q3, q3, q10\n"
   4256       "vadd.s32 q6, q6, q10\n"
   4257       "vcvt.f32.s32 q0, q0\n"
   4258       "vcvt.f32.s32 q3, q3\n"
   4259       "vcvt.f32.s32 q6, q6\n"
   4260       "vmul.f32 q0, q0, q11\n"
   4261       "vmul.f32 q3, q3, q11\n"
   4262       "vmul.f32 q6, q6, q11\n"
   4263
   4264       // RowMajorOutput::Output
                // Three floats per row: a 2-lane store followed by a 1-lane store.
   4265       "vst1.32 {d0}, [%[result]]!\n"
   4266       "vst1.32 {d1[0]}, [%[result]]!\n"
   4267       "vst1.32 {d6}, [r0]!\n"
   4268       "vst1.32 {d7[0]}, [r0]!\n"
   4269       "vst1.32 {d12}, [r1]!\n"
   4270       "vst1.32 {d13[0]}, [r1]!\n"
   4271       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result),
   4272         [count] "+r"(count)
   4273       : [stride] "r"(params.output_stream.stride),
   4274         [scale] "r"(params.kernel.scale)
   4275       : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
   4276         "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
   4277         "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
   4278         "d30", "d31", "cc", "memory");
   4279 }
   4280 
   4281 }  // namespace meta
   4282 }  // namespace gemmlowp
   4283 
   4284 #else
   4285 #warning "Meta gemm for arm32 requires: GEMMLOWP_NEON_32!"
   4286 #endif
   4287 
   4288 #endif  // GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_32_H_
   4289