Home | History | Annotate | Download | only in internal
      1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // kernel_SSE.h: a collection of Intel SSE optimized kernels.
     16 // Check in kernel_default.h which one(s) are actually used by default.
     17 // Others are mere experiments; they are still covered by tests
     18 // in case they might be useful some day.
     19 //
     20 
     21 #ifndef GEMMLOWP_INTERNAL_KERNEL_SSE_H_
     22 #define GEMMLOWP_INTERNAL_KERNEL_SSE_H_
     23 
     24 #include "kernel.h"
     25 
     26 #include <string.h>
     27 #include <cassert>
     28 
     29 namespace gemmlowp {
     30 
     31 #ifdef GEMMLOWP_SSE4_32
     32 struct SSE4_32_Kernel4x4Depth2 : KernelBase {
     33   typedef KernelFormat<
     34       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1>,
     35       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
     36       Format;
     37 
     38   const char* Name() const override { return "SSE, 4x4, depth 2"; }
     39 
     40   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
     41            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
     42            const std::uint8_t* rhs_ptr, std::size_t start_depth,
     43            std::size_t run_depth) const override {
     44     ScopedProfilingLabel label("optimized kernel");
     45     assert(dst_row_stride == 1);
     46     std::int32_t run_depth_cells = run_depth / Format::kDepth;
     47     /* Main loop */
     48 
     49     // A 2x4 cell of Rhs is stored in 16bit in xmm1 .
     50     // A 4x2 block Lhs is stored in 16bit in xmm0.
     51     // A 4x4 block of accumulators is stored in 32bit in xmm4--xmm7.
     52     //
     53     //                   +-------+-------+-------+-------+
     54     //                   |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]|
     55     //              Rhs  +-------+---------------+-------+
     56     //                   |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]|
     57     //                   +-------+-------+-------+-------+
     58     //
     59     //                   |       |       |       |       |
     60     //
     61     //    Lhs            |       |       |       |       |
     62     //
     63     //  +--+--+ - - - -  +-------+-------+-------+-------+
     64     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
     65     //  |xmm0 | (Iter1)  | xmm4  | xmm5  | xmm6  | xmm7  |
     66     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
     67     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
     68     //  +--+--+ - - - -  +-------+-------+-------+-------+
     69     //
     70     //                              Accumulator
     71 
     72     asm volatile(
     73 
     74         // set accumulators to zero.
     75         "pxor %%xmm4  , %%xmm4 \n\t"
     76         "pxor %%xmm5  , %%xmm5 \n\t"
     77         "pxor %%xmm6  , %%xmm6 \n\t"
     78         "pxor %%xmm7  , %%xmm7 \n\t"
     79 
     80         "movl  %[run_depth_cells], %%eax\n\t"
     81         "subl $2, %%eax\n\t"
     82         "js outerLoop1%=\n\t"
     83 
     84         // Loop for K unrolled by 4
     85         "outerLoop2%=:\n\t"
     86 
     87         // K = 1,2
     88         // RHS cell to xmm1
     89         "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
     90 
     91         // LHS cell
     92         "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
     93         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
     94         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
     95         "pmaddwd %%xmm0, %%xmm2         \n\t"
     96         "pmaddwd %%xmm0, %%xmm3         \n\t"
     97         "paddd %%xmm2, %%xmm4           \n\t"
     98         "paddd %%xmm3, %%xmm5           \n\t"
     99 
    100         "prefetcht0 0x80(%[lhs_ptr]) \n\t"
    101 
    102         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    103         "pmaddwd %%xmm0, %%xmm2         \n\t"
    104         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    105         "pmaddwd %%xmm0, %%xmm3         \n\t"
    106 
    107         "prefetcht0 0x80(%[rhs_ptr]) \n\t"
    108 
    109         // K = 3,4
    110         // RHS cell to xmm1
    111         "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t"
    112 
    113         "paddd %%xmm2, %%xmm6           \n\t"
    114         "paddd %%xmm3, %%xmm7           \n\t"
    115 
    116         // LHS cell
    117         "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
    118         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    119         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    120         "pmaddwd %%xmm0, %%xmm2         \n\t"
    121         "pmaddwd %%xmm0, %%xmm3         \n\t"
    122         "paddd %%xmm2, %%xmm4           \n\t"
    123         "paddd %%xmm3, %%xmm5           \n\t"
    124         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    125         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    126 
    127         "addl $0x10, %[lhs_ptr]         \n\t"
    128         "addl $0x10, %[rhs_ptr]         \n\t"
    129 
    130         "pmaddwd %%xmm0, %%xmm3         \n\t"
    131         "paddd %%xmm3, %%xmm7           \n\t"
    132         "pmaddwd %%xmm0, %%xmm2         \n\t"
    133         "paddd %%xmm2, %%xmm6           \n\t"
    134 
    135         "subl $2, %[run_depth_cells]\n\t"
    136         "ja outerLoop2%=\n\t"
    137 
    138         "movl %[run_depth_cells], %%eax\n\t"
    139         "decl %%eax\n\t"
    140         "js finish%=\n\t"
    141 
    142         // Loop for K unrolled by 2
    143         "outerLoop1%=:\n\t"
    144 
    145         // RHS cell to xmm1
    146         "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
    147 
    148         // LHS cell
    149         "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
    150         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    151         "pmaddwd %%xmm0, %%xmm2         \n\t"
    152         "paddd %%xmm2, %%xmm4           \n\t"
    153         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    154         "pmaddwd %%xmm0, %%xmm3         \n\t"
    155         "paddd %%xmm3, %%xmm5           \n\t"
    156 
    157         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    158         "pmaddwd %%xmm0, %%xmm2         \n\t"
    159         "paddd %%xmm2, %%xmm6           \n\t"
    160         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    161         "pmaddwd %%xmm0, %%xmm3         \n\t"
    162         "paddd %%xmm3, %%xmm7           \n\t"
    163 
    164         "addl $0x08, %[lhs_ptr]\n\t"
    165         "addl $0x08, %[rhs_ptr]\n\t"
    166 
    167         "decl %[run_depth_cells]\n\t"
    168         "jnz outerLoop1%=\n\t"
    169 
    170         "finish%=:\n\t"
    171 
    172         "movl  %[dst_col_stride], %%eax\n\t"
    173         "shll $2, %%eax\n\t"
    174 
    175         "movl  %[start_depth], %%ecx\n\t"
    176         "test %%ecx, %%ecx\n\t"
    177         "jz storeDst%=\n\t"
    178 
    179         "leal (%%eax,%%eax,0x2), %%ecx\n\t"
    180         "paddd 0x00(%[dst_ptr])           , %%xmm4 \n\t"
    181         "paddd 0x00(%[dst_ptr], %%eax, 1) , %%xmm5 \n\t"
    182         "paddd 0x00(%[dst_ptr], %%eax, 2) , %%xmm6 \n\t"
    183         "paddd 0x00(%[dst_ptr], %%ecx, 1) , %%xmm7 \n\t"
    184 
    185         "storeDst%=:\n\t"
    186 
    187         "leal (%%eax,%%eax,0x2), %%ecx\n\t"
    188         "movdqu %%xmm4  , 0x00(%[dst_ptr])          \n\t"
    189         "movdqu %%xmm5  , 0x00(%[dst_ptr], %%eax, 1)\n\t"
    190         "movdqu %%xmm6  , 0x00(%[dst_ptr], %%eax, 2)\n\t"
    191         "movdqu %%xmm7  , 0x00(%[dst_ptr], %%ecx, 1)\n\t"
    192 
    193         :  // outputs
    194         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
    195         [dst_ptr] "+r"(dst_ptr)
    196         :  // inputs
    197         [start_depth] "g"(start_depth), [dst_col_stride] "g"(dst_col_stride),
    198         [run_depth_cells] "g"(run_depth_cells)
    199         :  // clobbers
    200         "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5",
    201         "%xmm6", "%xmm7", "%eax", "%ecx");
    202   }
    203 };
    204 #endif
    205 #ifdef GEMMLOWP_SSE4_64
    206 struct SSE4_64_Kernel12x4Depth2 : KernelBase {
    207   typedef KernelFormat<
    208       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>,
    209       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
    210       Format;
    211 
    212   const char* Name() const override { return "SSE, 12x4, depth 2"; }
    213 
    214   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
    215            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
    216            const std::uint8_t* rhs_ptr, std::size_t start_depth,
    217            std::size_t run_depth) const override {
    218     ScopedProfilingLabel label("optimized kernel");
    219     assert(dst_row_stride == 1);
    220     const std::int64_t run_depth_cells = run_depth / Format::kDepth;
    221     const std::int64_t dst_col_stride_q = dst_col_stride;
    222 
    223     /* Main loop */
    224 
    225     // A 2x4 cell of Rhs is stored in 16bit in xmm1 .
    226     // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in xmm0, replaced
    227     // every Iteration.
    228     // A 12x4 block of accumulators is stored in 32bit in xmm4--xmm15.
    229     //
    230     //                   +-------+-------+-------+-------+
    231     //                   |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]|
    232     //              Rhs  +-------+---------------+-------+
    233     //                   |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]|
    234     //                   +-------+-------+-------+-------+
    235     //
    236     //                   |       |       |       |       |
    237     //
    238     //    Lhs            |       |       |       |       |
    239     //
    240     //  +--+--+ - - - -  +-------+-------+-------+-------+
    241     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
    242     //  |xmm0 | (Iter1)  | xmm4  | xmm5  | xmm6  | xmm7  |
    243     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
    244     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
    245     //  +--+--+ - - - -  +-------+-------+-------+-------+
    246     //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
    247     //  |xmm0 | (Iter2)  | xmm8  | xmm9  | xmm10 | xmm11 |
    248     //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
    249     //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
    250     //  +--+--+ - - - -  +-------+-------+-------+-------+
    251     //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
    252     //  |xmm0 | (Iter3)  | xmm12 | xmm13 | xmm14 | xmm15 |
    253     //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
    254     //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
    255     //  +--+--+ - - - -  +-------+-------+-------+-------+
    256     //
    257     //                              Accumulator
    258 
    259     asm volatile(
    260 
    261         // Set registers for destination
    262         "movq  %[dst_col_stride_q], %%r12\n\t"
    263         "shlq $2, %%r12\n\t"
    264         "leaq (%%r12,%%r12,0x2), %%r13\n\t"
    265 
    266         // Set accumulators to zero.
    267         "pxor %%xmm4  , %%xmm4 \n\t"
    268         "pxor %%xmm5  , %%xmm5 \n\t"
    269         "pxor %%xmm6  , %%xmm6 \n\t"
    270         "pxor %%xmm7  , %%xmm7 \n\t"
    271         "pxor %%xmm8  , %%xmm8 \n\t"
    272         "pxor %%xmm9  , %%xmm9 \n\t"
    273         "pxor %%xmm10 , %%xmm10\n\t"
    274         "pxor %%xmm11 , %%xmm11\n\t"
    275         "pxor %%xmm12 , %%xmm12\n\t"
    276         "pxor %%xmm13 , %%xmm13\n\t"
    277         "pxor %%xmm14 , %%xmm14\n\t"
    278         "pxor %%xmm15 , %%xmm15\n\t"
    279 
    280         "movq  %[run_depth_cells], %%r14\n\t"
    281         "subq $2, %%r14\n\t"
    282         "js outerLoop1%=\n\t"
    283 
    284         // Loop for K unrolled by 4
    285         "outerLoop2%=:\n\t"
    286 
    287         // K = 1,2
    288         // RHS cell to xmm1
    289 
    290         "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
    291 
    292         // LHS cell
    293         "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
    294         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    295         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    296         "pmaddwd %%xmm0, %%xmm2         \n\t"
    297         "pmaddwd %%xmm0, %%xmm3         \n\t"
    298         "paddd %%xmm2, %%xmm4           \n\t"
    299         "paddd %%xmm3, %%xmm5           \n\t"
    300 
    301         "prefetcht0 0x80(%[lhs_ptr]) \n\t"
    302 
    303         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    304         "pmaddwd %%xmm0, %%xmm2         \n\t"
    305         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    306         "pmaddwd %%xmm0, %%xmm3         \n\t"
    307 
    308         // next LHS cell
    309         "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
    310 
    311         "paddd %%xmm2, %%xmm6           \n\t"
    312         "paddd %%xmm3, %%xmm7           \n\t"
    313 
    314         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    315         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    316         "pmaddwd %%xmm0, %%xmm2         \n\t"
    317         "pmaddwd %%xmm0, %%xmm3         \n\t"
    318         "paddd %%xmm2, %%xmm8           \n\t"
    319         "paddd %%xmm3, %%xmm9           \n\t"
    320 
    321         "prefetcht0 0x80(%[rhs_ptr]) \n\t"
    322 
    323         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    324         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    325         "pmaddwd %%xmm0, %%xmm2         \n\t"
    326         "pmaddwd %%xmm0, %%xmm3         \n\t"
    327         "paddd %%xmm2, %%xmm10          \n\t"
    328         "paddd %%xmm3, %%xmm11          \n\t"
    329 
    330         // next LHS cell
    331         "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t"
    332         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    333         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    334         "pmaddwd %%xmm0, %%xmm2         \n\t"
    335         "pmaddwd %%xmm0, %%xmm3         \n\t"
    336         "paddd %%xmm2, %%xmm12          \n\t"
    337         "paddd %%xmm3, %%xmm13          \n\t"
    338 
    339         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    340         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    341         "pmaddwd %%xmm0, %%xmm2         \n\t"
    342         "pmaddwd %%xmm0, %%xmm3         \n\t"
    343         "paddd %%xmm2, %%xmm14          \n\t"
    344         "paddd %%xmm3, %%xmm15          \n\t"
    345 
    346         // K = 3,4
    347         // RHS cell to xmm1
    348         "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t"
    349 
    350         // LHS cell
    351         "pmovzxbw 0x18(%[lhs_ptr]), %%xmm0\n\t"
    352         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    353         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    354         "pmaddwd %%xmm0, %%xmm2         \n\t"
    355         "pmaddwd %%xmm0, %%xmm3         \n\t"
    356         "paddd %%xmm2, %%xmm4           \n\t"
    357         "paddd %%xmm3, %%xmm5           \n\t"
    358 
    359         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    360         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    361         "pmaddwd %%xmm0, %%xmm2         \n\t"
    362         "pmaddwd %%xmm0, %%xmm3         \n\t"
    363         "paddd %%xmm2, %%xmm6           \n\t"
    364         "paddd %%xmm3, %%xmm7           \n\t"
    365 
    366         // next LHS cell
    367         "pmovzxbw 0x20(%[lhs_ptr]), %%xmm0\n\t"
    368         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    369         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    370         "pmaddwd %%xmm0, %%xmm2         \n\t"
    371         "pmaddwd %%xmm0, %%xmm3         \n\t"
    372         "paddd %%xmm2, %%xmm8           \n\t"
    373         "paddd %%xmm3, %%xmm9           \n\t"
    374 
    375         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    376         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    377         "pmaddwd %%xmm0, %%xmm2         \n\t"
    378         "pmaddwd %%xmm0, %%xmm3         \n\t"
    379         "paddd %%xmm2, %%xmm10          \n\t"
    380         "paddd %%xmm3, %%xmm11          \n\t"
    381 
    382         // next LHS cell
    383         "pmovzxbw 0x28(%[lhs_ptr]), %%xmm0\n\t"
    384 
    385         "addq $0x30, %[lhs_ptr]         \n\t"
    386         "addq $0x10, %[rhs_ptr]         \n\t"
    387 
    388         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    389         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    390         "pmaddwd %%xmm0, %%xmm2         \n\t"
    391         "pmaddwd %%xmm0, %%xmm3         \n\t"
    392         "paddd %%xmm2, %%xmm12          \n\t"
    393         "paddd %%xmm3, %%xmm13          \n\t"
    394 
    395         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    396         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    397         "pmaddwd %%xmm0, %%xmm2         \n\t"
    398         "pmaddwd %%xmm0, %%xmm3         \n\t"
    399         "paddd %%xmm2, %%xmm14          \n\t"
    400         "paddd %%xmm3, %%xmm15          \n\t"
    401 
    402         "subq $2, %[run_depth_cells]\n\t"
    403         "ja outerLoop2%=\n\t"
    404 
    405         "movq %[run_depth_cells], %%r14\n\t"
    406         "decq %%r14\n\t"
    407         "js finish%=\n\t"
    408 
    409         // Loop for K unrolled by 2
    410         "outerLoop1%=:\n\t"
    411 
    412         // RHS cell to xmm1
    413         "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
    414 
    415         // LHS cell
    416         "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
    417         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    418         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    419         "pmaddwd %%xmm0, %%xmm2         \n\t"
    420         "pmaddwd %%xmm0, %%xmm3         \n\t"
    421         "paddd %%xmm2, %%xmm4           \n\t"
    422         "paddd %%xmm3, %%xmm5           \n\t"
    423         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    424         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    425         "pmaddwd %%xmm0, %%xmm2         \n\t"
    426         "pmaddwd %%xmm0, %%xmm3         \n\t"
    427         "paddd %%xmm2, %%xmm6           \n\t"
    428         "paddd %%xmm3, %%xmm7           \n\t"
    429 
    430         // next LHS cell
    431         "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
    432         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    433         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    434         "pmaddwd %%xmm0, %%xmm2         \n\t"
    435         "pmaddwd %%xmm0, %%xmm3         \n\t"
    436         "paddd %%xmm2, %%xmm8           \n\t"
    437         "paddd %%xmm3, %%xmm9           \n\t"
    438         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    439         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    440         "pmaddwd %%xmm0, %%xmm2         \n\t"
    441         "pmaddwd %%xmm0, %%xmm3         \n\t"
    442         "paddd %%xmm2, %%xmm10          \n\t"
    443         "paddd %%xmm3, %%xmm11          \n\t"
    444 
    445         // next LHS cell
    446         "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t"
    447 
    448         "addq $0x18, %[lhs_ptr]         \n\t"
    449         "addq $0x08, %[rhs_ptr]         \n\t"
    450 
    451         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    452         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    453         "pmaddwd %%xmm0, %%xmm2         \n\t"
    454         "pmaddwd %%xmm0, %%xmm3         \n\t"
    455         "paddd %%xmm2, %%xmm12          \n\t"
    456         "paddd %%xmm3, %%xmm13          \n\t"
    457         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    458         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    459         "pmaddwd %%xmm0, %%xmm2         \n\t"
    460         "pmaddwd %%xmm0, %%xmm3         \n\t"
    461         "paddd %%xmm2, %%xmm14          \n\t"
    462         "paddd %%xmm3, %%xmm15          \n\t"
    463 
    464         "decq %[run_depth_cells]\n\t"
    465         "jnz outerLoop1%=\n\t"
    466 
    467         "finish%=:\n\t"
    468 
    469         "test %[start_depth], %[start_depth]\n\t"
    470         "jz storeDst%=\n\t"
    471 
    472         "paddd 0x00(%[dst_ptr])           , %%xmm4 \n\t"
    473         "paddd 0x10(%[dst_ptr])           , %%xmm8 \n\t"
    474         "paddd 0x20(%[dst_ptr])           , %%xmm12\n\t"
    475         "paddd 0x00(%[dst_ptr], %%r12, 1) , %%xmm5 \n\t"
    476         "paddd 0x10(%[dst_ptr], %%r12, 1) , %%xmm9 \n\t"
    477         "paddd 0x20(%[dst_ptr], %%r12, 1) , %%xmm13\n\t"
    478         "paddd 0x00(%[dst_ptr], %%r12, 2) , %%xmm6 \n\t"
    479         "paddd 0x10(%[dst_ptr], %%r12, 2) , %%xmm10\n\t"
    480         "paddd 0x20(%[dst_ptr], %%r12, 2) , %%xmm14\n\t"
    481         "paddd 0x00(%[dst_ptr], %%r13, 1) , %%xmm7 \n\t"
    482         "paddd 0x10(%[dst_ptr], %%r13, 1) , %%xmm11\n\t"
    483         "paddd 0x20(%[dst_ptr], %%r13, 1) , %%xmm15\n\t"
    484 
    485         "storeDst%=:\n\t"
    486 
    487         "movdqu %%xmm4  , 0x00(%[dst_ptr])          \n\t"
    488         "movdqu %%xmm8  , 0x10(%[dst_ptr])          \n\t"
    489         "movdqu %%xmm12 , 0x20(%[dst_ptr])          \n\t"
    490         "movdqu %%xmm5  , 0x00(%[dst_ptr], %%r12, 1)\n\t"
    491         "movdqu %%xmm9  , 0x10(%[dst_ptr], %%r12, 1)\n\t"
    492         "movdqu %%xmm13 , 0x20(%[dst_ptr], %%r12, 1)\n\t"
    493         "movdqu %%xmm6  , 0x00(%[dst_ptr], %%r12, 2)\n\t"
    494         "movdqu %%xmm10 , 0x10(%[dst_ptr], %%r12, 2)\n\t"
    495         "movdqu %%xmm14 , 0x20(%[dst_ptr], %%r12, 2)\n\t"
    496         "movdqu %%xmm7  , 0x00(%[dst_ptr], %%r13, 1)\n\t"
    497         "movdqu %%xmm11 , 0x10(%[dst_ptr], %%r13, 1)\n\t"
    498         "movdqu %%xmm15 , 0x20(%[dst_ptr], %%r13, 1)\n\t"
    499 
    500         :  // outputs
    501         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
    502         [dst_ptr] "+r"(dst_ptr)
    503         :  // inputs
    504         [start_depth] "r"(start_depth),
    505         [dst_col_stride_q] "r"(dst_col_stride_q),
    506         [run_depth_cells] "r"(run_depth_cells)
    507         :  // clobbers
    508         "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5",
    509         "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%r12", "%r13", "%r14",
    510         "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15");
    511   }
    512 };
    513 #endif
    514 
    515 }  // namespace gemmlowp
    516 
    517 #endif  // GEMMLOWP_INTERNAL_KERNEL_SSE_H_
    518