Home | History | Annotate | Download | only in internal
      1 // Copyright 2015 Google Inc. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // kernel_SSE.h: a collection of Intel SSE optimized kernels.
     16 // Check in kernel_default.h which one(s) are actually used by default.
     17 // Others are mere experiments; they are still covered by tests
     18 // in case they might be useful some day.
     19 //
     20 
     21 #ifndef GEMMLOWP_INTERNAL_KERNEL_SSE_H_
     22 #define GEMMLOWP_INTERNAL_KERNEL_SSE_H_
     23 
     24 #include "kernel.h"
     25 
     26 #include <string.h>
     27 #include <cassert>
     28 
     29 namespace gemmlowp {
     30 
     31 #ifdef GEMMLOWP_SSE4_32
     32 struct SSE4_32_Kernel4x4Depth2 : KernelBase {
     33   typedef KernelFormat<
     34       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1>,
     35       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
     36       Format;
     37 
     38   const char* Name() const override { return "SSE, 4x4, depth 2"; }
     39 
     40   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
     41            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
     42            const std::uint8_t* rhs_ptr, std::size_t start_depth,
     43            std::size_t run_depth) const override {
     44     ScopedProfilingLabel label("optimized kernel");
     45     assert(dst_row_stride == 1);
     46     std::int32_t run_depth_cells = run_depth / Format::kDepth;
     47     /* Main loop */
     48 
     49     // A 2x4 cell of Rhs is stored in 16bit in xmm1 .
     50     // A 4x2 block Lhs is stored in 16bit in xmm0.
     51     // A 4x4 block of accumulators is stored in 32bit in xmm4--xmm7.
     52     //
     53     //                   +-------+-------+-------+-------+
     54     //                   |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]|
     55     //              Rhs  +-------+---------------+-------+
     56     //                   |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]|
     57     //                   +-------+-------+-------+-------+
     58     //
     59     //                   |       |       |       |       |
     60     //
     61     //    Lhs            |       |       |       |       |
     62     //
     63     //  +--+--+ - - - -  +-------+-------+-------+-------+
     64     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
     65     //  |xmm0 | (Iter1)  | xmm4  | xmm5  | xmm6  | xmm7  |
     66     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
     67     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
     68     //  +--+--+ - - - -  +-------+-------+-------+-------+
     69     //
     70     //                              Accumulator
     71 
     72     asm volatile(
     73 
     74         // set accumulators to zero.
     75         "pxor %%xmm4  , %%xmm4 \n\t"
     76         "pxor %%xmm5  , %%xmm5 \n\t"
     77         "pxor %%xmm6  , %%xmm6 \n\t"
     78         "pxor %%xmm7  , %%xmm7 \n\t"
     79 
     80         "movl  %[run_depth_cells], %%eax\n\t"
     81         "subl $2, %%eax\n\t"
     82         "js outerLoop1%=\n\t"
     83 
     84         // Loop for K unrolled by 4
     85         "outerLoop2%=:\n\t"
     86 
     87         // K = 1,2
     88         // RHS cell to xmm1
     89         "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
     90 
     91         // LHS cell
     92         "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
     93         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
     94         "pmaddwd %%xmm0, %%xmm2         \n\t"
     95         "paddd %%xmm2, %%xmm4           \n\t"
     96         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
     97         "pmaddwd %%xmm0, %%xmm3         \n\t"
     98         "paddd %%xmm3, %%xmm5           \n\t"
     99 
    100         "prefetcht0 0x80(%[lhs_ptr]) \n\t"
    101 
    102         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    103         "pmaddwd %%xmm0, %%xmm2         \n\t"
    104         "paddd %%xmm2, %%xmm6           \n\t"
    105         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    106         "pmaddwd %%xmm0, %%xmm3         \n\t"
    107         "paddd %%xmm3, %%xmm7           \n\t"
    108 
    109         "prefetcht0 0x80(%[rhs_ptr]) \n\t"
    110 
    111         // K = 3,4
    112         // RHS cell to xmm1
    113         "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t"
    114 
    115         // LHS cell
    116         "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
    117         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    118         "pmaddwd %%xmm0, %%xmm2         \n\t"
    119         "paddd %%xmm2, %%xmm4           \n\t"
    120         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    121         "pmaddwd %%xmm0, %%xmm3         \n\t"
    122         "paddd %%xmm3, %%xmm5           \n\t"
    123 
    124         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    125         "pmaddwd %%xmm0, %%xmm2         \n\t"
    126         "paddd %%xmm2, %%xmm6           \n\t"
    127         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    128         "pmaddwd %%xmm0, %%xmm3         \n\t"
    129         "paddd %%xmm3, %%xmm7           \n\t"
    130 
    131         "addl $0x10, %[lhs_ptr]\n\t"
    132         "addl $0x10, %[rhs_ptr]\n\t"
    133 
    134         "subl $2, %[run_depth_cells]\n\t"
    135         "jnz outerLoop2%=\n\t"
    136 
    137         "movl %[run_depth_cells], %%eax\n\t"
    138         "decl %%eax\n\t"
    139         "js finish%=\n\t"
    140 
    141         // Loop for K unrolled by 2
    142         "outerLoop1%=:\n\t"
    143 
    144         // RHS cell to xmm1
    145         "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
    146 
    147         // LHS cell
    148         "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
    149         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    150         "pmaddwd %%xmm0, %%xmm2         \n\t"
    151         "paddd %%xmm2, %%xmm4           \n\t"
    152         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    153         "pmaddwd %%xmm0, %%xmm3         \n\t"
    154         "paddd %%xmm3, %%xmm5           \n\t"
    155 
    156         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    157         "pmaddwd %%xmm0, %%xmm2         \n\t"
    158         "paddd %%xmm2, %%xmm6           \n\t"
    159         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    160         "pmaddwd %%xmm0, %%xmm3         \n\t"
    161         "paddd %%xmm3, %%xmm7           \n\t"
    162 
    163         "addl $0x08, %[lhs_ptr]\n\t"
    164         "addl $0x08, %[rhs_ptr]\n\t"
    165 
    166         "decl %[run_depth_cells]\n\t"
    167         "jnz outerLoop1%=\n\t"
    168 
    169         "finish%=:\n\t"
    170 
    171         "movl  %[dst_col_stride], %%eax\n\t"
    172         "shll $2, %%eax\n\t"
    173 
    174         "movl  %[start_depth], %%ecx\n\t"
    175         "test %%ecx, %%ecx\n\t"
    176         "jz storeDst%=\n\t"
    177 
    178         "leal (%%eax,%%eax,0x2), %%ecx\n\t"
    179         "paddd 0x00(%[dst_ptr])           , %%xmm4 \n\t"
    180         "paddd 0x00(%[dst_ptr], %%eax, 1) , %%xmm5 \n\t"
    181         "paddd 0x00(%[dst_ptr], %%eax, 2) , %%xmm6 \n\t"
    182         "paddd 0x00(%[dst_ptr], %%ecx, 1) , %%xmm7 \n\t"
    183 
    184         "storeDst%=:\n\t"
    185 
    186         "leal (%%eax,%%eax,0x2), %%ecx\n\t"
    187         "movdqu %%xmm4  , 0x00(%[dst_ptr])          \n\t"
    188         "movdqu %%xmm5  , 0x00(%[dst_ptr], %%eax, 1)\n\t"
    189         "movdqu %%xmm6  , 0x00(%[dst_ptr], %%eax, 2)\n\t"
    190         "movdqu %%xmm7  , 0x00(%[dst_ptr], %%ecx, 1)\n\t"
    191 
    192         :  // outputs
    193         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
    194         [dst_ptr] "+r"(dst_ptr)
    195         :  // inputs
    196         [start_depth] "g"(start_depth), [dst_col_stride] "g"(dst_col_stride),
    197         [run_depth_cells] "g"(run_depth_cells)
    198         :  // clobbers
    199         "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5",
    200         "%xmm6", "%xmm7", "%eax", "%ecx");
    201   }
    202 };
    203 #endif
    204 #ifdef GEMMLOWP_SSE4_64
    205 struct SSE4_64_Kernel12x4Depth2 : KernelBase {
    206   typedef KernelFormat<
    207       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>,
    208       KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
    209       Format;
    210 
    211   const char* Name() const override { return "SSE, 12x4, depth 2"; }
    212 
    213   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
    214            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
    215            const std::uint8_t* rhs_ptr, std::size_t start_depth,
    216            std::size_t run_depth) const override {
    217     ScopedProfilingLabel label("optimized kernel");
    218     assert(dst_row_stride == 1);
    219     const std::int64_t run_depth_cells = run_depth / Format::kDepth;
    220     const std::int64_t dst_col_stride_q = dst_col_stride;
    221 
    222     /* Main loop */
    223 
    224     // A 2x4 cell of Rhs is stored in 16bit in xmm1 .
    225     // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in xmm0, replaced
    226     // every Iteration.
    227     // A 12x4 block of accumulators is stored in 32bit in xmm4--xmm15.
    228     //
    229     //                   +-------+-------+-------+-------+
    230     //                   |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]|
    231     //              Rhs  +-------+---------------+-------+
    232     //                   |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]|
    233     //                   +-------+-------+-------+-------+
    234     //
    235     //                   |       |       |       |       |
    236     //
    237     //    Lhs            |       |       |       |       |
    238     //
    239     //  +--+--+ - - - -  +-------+-------+-------+-------+
    240     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
    241     //  |xmm0 | (Iter1)  | xmm4  | xmm5  | xmm6  | xmm7  |
    242     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
    243     //  |xmm0 |          | xmm4  | xmm5  | xmm6  | xmm7  |
    244     //  +--+--+ - - - -  +-------+-------+-------+-------+
    245     //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
    246     //  |xmm0 | (Iter2)  | xmm8  | xmm9  | xmm10 | xmm11 |
    247     //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
    248     //  |xmm0 |          | xmm8  | xmm9  | xmm10 | xmm11 |
    249     //  +--+--+ - - - -  +-------+-------+-------+-------+
    250     //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
    251     //  |xmm0 | (Iter3)  | xmm12 | xmm13 | xmm14 | xmm15 |
    252     //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
    253     //  |xmm0 |          | xmm12 | xmm13 | xmm14 | xmm15 |
    254     //  +--+--+ - - - -  +-------+-------+-------+-------+
    255     //
    256     //                              Accumulator
    257 
    258     asm volatile(
    259 
    260         // Set registers for destination
    261         "movq  %[dst_col_stride_q], %%r12\n\t"
    262         "shlq $2, %%r12\n\t"
    263         "leaq (%%r12,%%r12,0x2), %%r13\n\t"
    264 
    265         // Set accumulators to zero.
    266         "pxor %%xmm4  , %%xmm4 \n\t"
    267         "pxor %%xmm5  , %%xmm5 \n\t"
    268         "pxor %%xmm6  , %%xmm6 \n\t"
    269         "pxor %%xmm7  , %%xmm7 \n\t"
    270         "pxor %%xmm8  , %%xmm8 \n\t"
    271         "pxor %%xmm9  , %%xmm9 \n\t"
    272         "pxor %%xmm10 , %%xmm10\n\t"
    273         "pxor %%xmm11 , %%xmm11\n\t"
    274         "pxor %%xmm12 , %%xmm12\n\t"
    275         "pxor %%xmm13 , %%xmm13\n\t"
    276         "pxor %%xmm14 , %%xmm14\n\t"
    277         "pxor %%xmm15 , %%xmm15\n\t"
    278 
    279         "movq  %[run_depth_cells], %%r14\n\t"
    280         "subq $2, %%r14\n\t"
    281         "js outerLoop1%=\n\t"
    282 
    283         // Loop for K unrolled by 4
    284         "outerLoop2%=:\n\t"
    285 
    286         // K = 1,2
    287         // RHS cell to xmm1
    288 
    289         "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
    290 
    291         // LHS cell
    292         "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
    293         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    294         "pmaddwd %%xmm0, %%xmm2         \n\t"
    295         "paddd %%xmm2, %%xmm4           \n\t"
    296         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    297         "pmaddwd %%xmm0, %%xmm3         \n\t"
    298         "paddd %%xmm3, %%xmm5           \n\t"
    299 
    300         "prefetcht0 0x80(%[lhs_ptr]) \n\t"
    301 
    302         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    303         "pmaddwd %%xmm0, %%xmm2         \n\t"
    304         "paddd %%xmm2, %%xmm6           \n\t"
    305         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    306         "pmaddwd %%xmm0, %%xmm3         \n\t"
    307         "paddd %%xmm3, %%xmm7           \n\t"
    308 
    309         // next LHS cell
    310         "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
    311         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    312         "pmaddwd %%xmm0, %%xmm2         \n\t"
    313         "paddd %%xmm2, %%xmm8           \n\t"
    314         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    315         "pmaddwd %%xmm0, %%xmm3         \n\t"
    316         "paddd %%xmm3, %%xmm9           \n\t"
    317 
    318         "prefetcht0 0x80(%[rhs_ptr]) \n\t"
    319 
    320         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    321         "pmaddwd %%xmm0, %%xmm2         \n\t"
    322         "paddd %%xmm2, %%xmm10          \n\t"
    323         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    324         "pmaddwd %%xmm0, %%xmm3         \n\t"
    325         "paddd %%xmm3, %%xmm11          \n\t"
    326 
    327         // next LHS cell
    328         "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t"
    329         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    330         "pmaddwd %%xmm0, %%xmm2         \n\t"
    331         "paddd %%xmm2, %%xmm12          \n\t"
    332         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    333         "pmaddwd %%xmm0, %%xmm3         \n\t"
    334         "paddd %%xmm3, %%xmm13          \n\t"
    335 
    336         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    337         "pmaddwd %%xmm0, %%xmm2         \n\t"
    338         "paddd %%xmm2, %%xmm14          \n\t"
    339         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    340         "pmaddwd %%xmm0, %%xmm3         \n\t"
    341         "paddd %%xmm3, %%xmm15          \n\t"
    342 
    343         // K = 3,4
    344         // RHS cell to xmm1
    345         "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t"
    346 
    347         // LHS cell
    348         "pmovzxbw 0x18(%[lhs_ptr]), %%xmm0\n\t"
    349         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    350         "pmaddwd %%xmm0, %%xmm2         \n\t"
    351         "paddd %%xmm2, %%xmm4           \n\t"
    352         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    353         "pmaddwd %%xmm0, %%xmm3         \n\t"
    354         "paddd %%xmm3, %%xmm5           \n\t"
    355 
    356         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    357         "pmaddwd %%xmm0, %%xmm2         \n\t"
    358         "paddd %%xmm2, %%xmm6           \n\t"
    359         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    360         "pmaddwd %%xmm0, %%xmm3         \n\t"
    361         "paddd %%xmm3, %%xmm7           \n\t"
    362 
    363         // next LHS cell
    364         "pmovzxbw 0x20(%[lhs_ptr]), %%xmm0\n\t"
    365         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    366         "pmaddwd %%xmm0, %%xmm2         \n\t"
    367         "paddd %%xmm2, %%xmm8           \n\t"
    368         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    369         "pmaddwd %%xmm0, %%xmm3         \n\t"
    370         "paddd %%xmm3, %%xmm9           \n\t"
    371 
    372         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    373         "pmaddwd %%xmm0, %%xmm2         \n\t"
    374         "paddd %%xmm2, %%xmm10          \n\t"
    375         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    376         "pmaddwd %%xmm0, %%xmm3         \n\t"
    377         "paddd %%xmm3, %%xmm11          \n\t"
    378 
    379         // next LHS cell
    380         "pmovzxbw 0x28(%[lhs_ptr]), %%xmm0\n\t"
    381         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    382         "pmaddwd %%xmm0, %%xmm2         \n\t"
    383         "paddd %%xmm2, %%xmm12          \n\t"
    384         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    385         "pmaddwd %%xmm0, %%xmm3         \n\t"
    386         "paddd %%xmm3, %%xmm13          \n\t"
    387 
    388         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    389         "pmaddwd %%xmm0, %%xmm2         \n\t"
    390         "paddd %%xmm2, %%xmm14          \n\t"
    391         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    392         "pmaddwd %%xmm0, %%xmm3         \n\t"
    393         "paddd %%xmm3, %%xmm15          \n\t"
    394 
    395         "addq $0x30, %[lhs_ptr]\n\t"
    396         "addq $0x10, %[rhs_ptr]\n\t"
    397 
    398         "subq $2, %[run_depth_cells]\n\t"
    399         "jnz outerLoop2%=\n\t"
    400 
    401         "movq %[run_depth_cells], %%r14\n\t"
    402         "decq %%r14\n\t"
    403         "js finish%=\n\t"
    404 
    405         // Loop for K unrolled by 2
    406         "outerLoop1%=:\n\t"
    407 
    408         // RHS cell to xmm1
    409         "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
    410 
    411         // LHS cell
    412         "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
    413         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    414         "pmaddwd %%xmm0, %%xmm2         \n\t"
    415         "paddd %%xmm2, %%xmm4           \n\t"
    416         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    417         "pmaddwd %%xmm0, %%xmm3         \n\t"
    418         "paddd %%xmm3, %%xmm5           \n\t"
    419         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    420         "pmaddwd %%xmm0, %%xmm2         \n\t"
    421         "paddd %%xmm2, %%xmm6           \n\t"
    422         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    423         "pmaddwd %%xmm0, %%xmm3         \n\t"
    424         "paddd %%xmm3, %%xmm7           \n\t"
    425 
    426         // next LHS cell
    427         "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
    428         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    429         "pmaddwd %%xmm0, %%xmm2         \n\t"
    430         "paddd %%xmm2, %%xmm8           \n\t"
    431         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    432         "pmaddwd %%xmm0, %%xmm3         \n\t"
    433         "paddd %%xmm3, %%xmm9           \n\t"
    434         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    435         "pmaddwd %%xmm0, %%xmm2         \n\t"
    436         "paddd %%xmm2, %%xmm10          \n\t"
    437         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    438         "pmaddwd %%xmm0, %%xmm3         \n\t"
    439         "paddd %%xmm3, %%xmm11          \n\t"
    440 
    441         // next LHS cell
    442         "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t"
    443         "pshufd $0x00,%%xmm1,%%xmm2     \n\t"
    444         "pmaddwd %%xmm0, %%xmm2         \n\t"
    445         "paddd %%xmm2, %%xmm12          \n\t"
    446         "pshufd $0x55,%%xmm1,%%xmm3     \n\t"
    447         "pmaddwd %%xmm0, %%xmm3         \n\t"
    448         "paddd %%xmm3, %%xmm13          \n\t"
    449         "pshufd $0xaa,%%xmm1,%%xmm2     \n\t"
    450         "pmaddwd %%xmm0, %%xmm2         \n\t"
    451         "paddd %%xmm2, %%xmm14          \n\t"
    452         "pshufd $0xff,%%xmm1,%%xmm3     \n\t"
    453         "pmaddwd %%xmm0, %%xmm3         \n\t"
    454         "paddd %%xmm3, %%xmm15          \n\t"
    455 
    456         "addq $0x18, %[lhs_ptr]\n\t"
    457         "addq $0x08, %[rhs_ptr]\n\t"
    458 
    459         "decq %[run_depth_cells]\n\t"
    460         "jnz outerLoop1%=\n\t"
    461 
    462         "finish%=:\n\t"
    463 
    464         "test %[start_depth], %[start_depth]\n\t"
    465         "jz storeDst%=\n\t"
    466 
    467         "paddd 0x00(%[dst_ptr])           , %%xmm4 \n\t"
    468         "paddd 0x10(%[dst_ptr])           , %%xmm8 \n\t"
    469         "paddd 0x20(%[dst_ptr])           , %%xmm12\n\t"
    470         "paddd 0x00(%[dst_ptr], %%r12, 1) , %%xmm5 \n\t"
    471         "paddd 0x10(%[dst_ptr], %%r12, 1) , %%xmm9 \n\t"
    472         "paddd 0x20(%[dst_ptr], %%r12, 1) , %%xmm13\n\t"
    473         "paddd 0x00(%[dst_ptr], %%r12, 2) , %%xmm6 \n\t"
    474         "paddd 0x10(%[dst_ptr], %%r12, 2) , %%xmm10\n\t"
    475         "paddd 0x20(%[dst_ptr], %%r12, 2) , %%xmm14\n\t"
    476         "paddd 0x00(%[dst_ptr], %%r13, 1) , %%xmm7 \n\t"
    477         "paddd 0x10(%[dst_ptr], %%r13, 1) , %%xmm11\n\t"
    478         "paddd 0x20(%[dst_ptr], %%r13, 1) , %%xmm15\n\t"
    479 
    480         "storeDst%=:\n\t"
    481 
    482         "movdqu %%xmm4  , 0x00(%[dst_ptr])          \n\t"
    483         "movdqu %%xmm8  , 0x10(%[dst_ptr])          \n\t"
    484         "movdqu %%xmm12 , 0x20(%[dst_ptr])          \n\t"
    485         "movdqu %%xmm5  , 0x00(%[dst_ptr], %%r12, 1)\n\t"
    486         "movdqu %%xmm9  , 0x10(%[dst_ptr], %%r12, 1)\n\t"
    487         "movdqu %%xmm13 , 0x20(%[dst_ptr], %%r12, 1)\n\t"
    488         "movdqu %%xmm6  , 0x00(%[dst_ptr], %%r12, 2)\n\t"
    489         "movdqu %%xmm10 , 0x10(%[dst_ptr], %%r12, 2)\n\t"
    490         "movdqu %%xmm14 , 0x20(%[dst_ptr], %%r12, 2)\n\t"
    491         "movdqu %%xmm7  , 0x00(%[dst_ptr], %%r13, 1)\n\t"
    492         "movdqu %%xmm11 , 0x10(%[dst_ptr], %%r13, 1)\n\t"
    493         "movdqu %%xmm15 , 0x20(%[dst_ptr], %%r13, 1)\n\t"
    494 
    495         :  // outputs
    496         [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
    497         [dst_ptr] "+r"(dst_ptr)
    498         :  // inputs
    499         [start_depth] "r"(start_depth),
    500         [dst_col_stride_q] "r"(dst_col_stride_q),
    501         [run_depth_cells] "r"(run_depth_cells)
    502         :  // clobbers
    503         "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5",
    504         "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%r12", "%r13", "%r14",
    505         "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15");
    506   }
    507 };
    508 #endif
    509 
    510 }  // namespace gemmlowp
    511 
    512 #endif  // GEMMLOWP_INTERNAL_KERNEL_SSE_H_
    513