Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_
     12 #define VPX_DSP_X86_TRANSPOSE_SSE2_H_
     13 
     14 #include <emmintrin.h>  // SSE2
     15 
     16 #include "./vpx_config.h"
     17 
     18 static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
     19   // Unpack 16 bit elements. Goes from:
     20   // in[0]: 00 01 02 03
     21   // in[1]: 10 11 12 13
     22   // in[2]: 20 21 22 23
     23   // in[3]: 30 31 32 33
     24   // to:
     25   // a0:    00 10 01 11  02 12 03 13
     26   // a1:    20 30 21 31  22 32 23 33
     27   const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
     28   const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
     29 
     30   // Unpack 32 bit elements resulting in:
     31   // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
     32   return _mm_unpacklo_epi16(a0, a1);
     33 }
     34 
     35 static INLINE void transpose_8bit_8x8(const __m128i *const in,
     36                                       __m128i *const out) {
     37   // Unpack 8 bit elements. Goes from:
     38   // in[0]: 00 01 02 03 04 05 06 07
     39   // in[1]: 10 11 12 13 14 15 16 17
     40   // in[2]: 20 21 22 23 24 25 26 27
     41   // in[3]: 30 31 32 33 34 35 36 37
     42   // in[4]: 40 41 42 43 44 45 46 47
     43   // in[5]: 50 51 52 53 54 55 56 57
     44   // in[6]: 60 61 62 63 64 65 66 67
     45   // in[7]: 70 71 72 73 74 75 76 77
     46   // to:
     47   // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
     48   // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
     49   // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
     50   // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
     51   const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
     52   const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
     53   const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
     54   const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
     55 
     56   // Unpack 16 bit elements resulting in:
     57   // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
     58   // b1: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
     59   // b2: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
     60   // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
     61   const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
     62   const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
     63   const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
     64   const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
     65 
     66   // Unpack 32 bit elements resulting in:
     67   // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
     68   // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
     69   // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
     70   // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
     71   const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
     72   const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
     73   const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
     74   const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
     75 
     76   // Unpack 64 bit elements resulting in:
     77   // out[0]: 00 10 20 30 40 50 60 70
     78   // out[1]: 01 11 21 31 41 51 61 71
     79   // out[2]: 02 12 22 32 42 52 62 72
     80   // out[3]: 03 13 23 33 43 53 63 73
     81   // out[4]: 04 14 24 34 44 54 64 74
     82   // out[5]: 05 15 25 35 45 55 65 75
     83   // out[6]: 06 16 26 36 46 56 66 76
     84   // out[7]: 07 17 27 37 47 57 67 77
     85   out[0] = _mm_unpacklo_epi64(c0, c0);
     86   out[1] = _mm_unpackhi_epi64(c0, c0);
     87   out[2] = _mm_unpacklo_epi64(c1, c1);
     88   out[3] = _mm_unpackhi_epi64(c1, c1);
     89   out[4] = _mm_unpacklo_epi64(c2, c2);
     90   out[5] = _mm_unpackhi_epi64(c2, c2);
     91   out[6] = _mm_unpacklo_epi64(c3, c3);
     92   out[7] = _mm_unpackhi_epi64(c3, c3);
     93 }
     94 
     95 static INLINE void transpose_16bit_4x4(const __m128i *const in,
     96                                        __m128i *const out) {
     97   // Unpack 16 bit elements. Goes from:
     98   // in[0]: 00 01 02 03  XX XX XX XX
     99   // in[1]: 10 11 12 13  XX XX XX XX
    100   // in[2]: 20 21 22 23  XX XX XX XX
    101   // in[3]: 30 31 32 33  XX XX XX XX
    102   // to:
    103   // a0:    00 10 01 11  02 12 03 13
    104   // a1:    20 30 21 31  22 32 23 33
    105   const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
    106   const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
    107 
    108   // Unpack 32 bit elements resulting in:
    109   // out[0]: 00 10 20 30  01 11 21 31
    110   // out[1]: 02 12 22 32  03 13 23 33
    111   out[0] = _mm_unpacklo_epi32(a0, a1);
    112   out[1] = _mm_unpackhi_epi32(a0, a1);
    113 }
    114 
    115 static INLINE void transpose_16bit_4x8(const __m128i *const in,
    116                                        __m128i *const out) {
    117   // Unpack 16 bit elements. Goes from:
    118   // in[0]: 00 01 02 03  XX XX XX XX
    119   // in[1]: 10 11 12 13  XX XX XX XX
    120   // in[2]: 20 21 22 23  XX XX XX XX
    121   // in[3]: 30 31 32 33  XX XX XX XX
    122   // in[4]: 40 41 42 43  XX XX XX XX
    123   // in[5]: 50 51 52 53  XX XX XX XX
    124   // in[6]: 60 61 62 63  XX XX XX XX
    125   // in[7]: 70 71 72 73  XX XX XX XX
    126   // to:
    127   // a0:    00 10 01 11  02 12 03 13
    128   // a1:    20 30 21 31  22 32 23 33
    129   // a2:    40 50 41 51  42 52 43 53
    130   // a3:    60 70 61 71  62 72 63 73
    131   const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
    132   const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
    133   const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
    134   const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
    135 
    136   // Unpack 32 bit elements resulting in:
    137   // b0: 00 10 20 30  01 11 21 31
    138   // b1: 40 50 60 70  41 51 61 71
    139   // b2: 02 12 22 32  03 13 23 33
    140   // b3: 42 52 62 72  43 53 63 73
    141   const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
    142   const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
    143   const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
    144   const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
    145 
    146   // Unpack 64 bit elements resulting in:
    147   // out[0]: 00 10 20 30  40 50 60 70
    148   // out[1]: 01 11 21 31  41 51 61 71
    149   // out[2]: 02 12 22 32  42 52 62 72
    150   // out[3]: 03 13 23 33  43 53 63 73
    151   out[0] = _mm_unpacklo_epi64(b0, b1);
    152   out[1] = _mm_unpackhi_epi64(b0, b1);
    153   out[2] = _mm_unpacklo_epi64(b2, b3);
    154   out[3] = _mm_unpackhi_epi64(b2, b3);
    155 }
    156 
    157 static INLINE void transpose_16bit_8x8(const __m128i *const in,
    158                                        __m128i *const out) {
    159   // Unpack 16 bit elements. Goes from:
    160   // in[0]: 00 01 02 03  04 05 06 07
    161   // in[1]: 10 11 12 13  14 15 16 17
    162   // in[2]: 20 21 22 23  24 25 26 27
    163   // in[3]: 30 31 32 33  34 35 36 37
    164   // in[4]: 40 41 42 43  44 45 46 47
    165   // in[5]: 50 51 52 53  54 55 56 57
    166   // in[6]: 60 61 62 63  64 65 66 67
    167   // in[7]: 70 71 72 73  74 75 76 77
    168   // to:
    169   // a0:    00 10 01 11  02 12 03 13
    170   // a1:    20 30 21 31  22 32 23 33
    171   // a2:    40 50 41 51  42 52 43 53
    172   // a3:    60 70 61 71  62 72 63 73
    173   // a4:    04 14 05 15  06 16 07 17
    174   // a5:    24 34 25 35  26 36 27 37
    175   // a6:    44 54 45 55  46 56 47 57
    176   // a7:    64 74 65 75  66 76 67 77
    177   const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
    178   const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
    179   const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
    180   const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
    181   const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
    182   const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
    183   const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
    184   const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
    185 
    186   // Unpack 32 bit elements resulting in:
    187   // b0: 00 10 20 30  01 11 21 31
    188   // b1: 40 50 60 70  41 51 61 71
    189   // b2: 04 14 24 34  05 15 25 35
    190   // b3: 44 54 64 74  45 55 65 75
    191   // b4: 02 12 22 32  03 13 23 33
    192   // b5: 42 52 62 72  43 53 63 73
    193   // b6: 06 16 26 36  07 17 27 37
    194   // b7: 46 56 66 76  47 57 67 77
    195   const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
    196   const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
    197   const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
    198   const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
    199   const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
    200   const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
    201   const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
    202   const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
    203 
    204   // Unpack 64 bit elements resulting in:
    205   // out[0]: 00 10 20 30  40 50 60 70
    206   // out[1]: 01 11 21 31  41 51 61 71
    207   // out[2]: 02 12 22 32  42 52 62 72
    208   // out[3]: 03 13 23 33  43 53 63 73
    209   // out[4]: 04 14 24 34  44 54 64 74
    210   // out[5]: 05 15 25 35  45 55 65 75
    211   // out[6]: 06 16 26 36  46 56 66 76
    212   // out[7]: 07 17 27 37  47 57 67 77
    213   out[0] = _mm_unpacklo_epi64(b0, b1);
    214   out[1] = _mm_unpackhi_epi64(b0, b1);
    215   out[2] = _mm_unpacklo_epi64(b4, b5);
    216   out[3] = _mm_unpackhi_epi64(b4, b5);
    217   out[4] = _mm_unpacklo_epi64(b2, b3);
    218   out[5] = _mm_unpackhi_epi64(b2, b3);
    219   out[6] = _mm_unpacklo_epi64(b6, b7);
    220   out[7] = _mm_unpackhi_epi64(b6, b7);
    221 }
    222 
    223 // Transpose in-place
    224 static INLINE void transpose_16bit_16x16(__m128i *const left,
    225                                          __m128i *const right) {
    226   __m128i tbuf[8];
    227   transpose_16bit_8x8(left, left);
    228   transpose_16bit_8x8(right, tbuf);
    229   transpose_16bit_8x8(left + 8, right);
    230   transpose_16bit_8x8(right + 8, right + 8);
    231 
    232   left[8] = tbuf[0];
    233   left[9] = tbuf[1];
    234   left[10] = tbuf[2];
    235   left[11] = tbuf[3];
    236   left[12] = tbuf[4];
    237   left[13] = tbuf[5];
    238   left[14] = tbuf[6];
    239   left[15] = tbuf[7];
    240 }
    241 
    242 static INLINE void transpose_32bit_4x4(const __m128i *const in,
    243                                        __m128i *const out) {
    244   // Unpack 32 bit elements. Goes from:
    245   // in[0]: 00 01 02 03
    246   // in[1]: 10 11 12 13
    247   // in[2]: 20 21 22 23
    248   // in[3]: 30 31 32 33
    249   // to:
    250   // a0:    00 10 01 11
    251   // a1:    20 30 21 31
    252   // a2:    02 12 03 13
    253   // a3:    22 32 23 33
    254 
    255   const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
    256   const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
    257   const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
    258   const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
    259 
    260   // Unpack 64 bit elements resulting in:
    261   // out[0]: 00 10 20 30
    262   // out[1]: 01 11 21 31
    263   // out[2]: 02 12 22 32
    264   // out[3]: 03 13 23 33
    265   out[0] = _mm_unpacklo_epi64(a0, a1);
    266   out[1] = _mm_unpackhi_epi64(a0, a1);
    267   out[2] = _mm_unpacklo_epi64(a2, a3);
    268   out[3] = _mm_unpackhi_epi64(a2, a3);
    269 }
    270 
    271 static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
    272                                          __m128i *const out) {
    273   // Unpack 32 bit elements. Goes from:
    274   // in[0]: 00 01 02 03
    275   // in[1]: 10 11 12 13
    276   // in[2]: 20 21 22 23
    277   // in[3]: 30 31 32 33
    278   // in[4]: 04 05 06 07
    279   // in[5]: 14 15 16 17
    280   // in[6]: 24 25 26 27
    281   // in[7]: 34 35 36 37
    282   // to:
    283   // a0:    00 10 01 11
    284   // a1:    20 30 21 31
    285   // a2:    02 12 03 13
    286   // a3:    22 32 23 33
    287   // a4:    04 14 05 15
    288   // a5:    24 34 25 35
    289   // a6:    06 16 07 17
    290   // a7:    26 36 27 37
    291   const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
    292   const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
    293   const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
    294   const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
    295   const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
    296   const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
    297   const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
    298   const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
    299 
    300   // Unpack 64 bit elements resulting in:
    301   // out[0]: 00 10 20 30
    302   // out[1]: 01 11 21 31
    303   // out[2]: 02 12 22 32
    304   // out[3]: 03 13 23 33
    305   // out[4]: 04 14 24 34
    306   // out[5]: 05 15 25 35
    307   // out[6]: 06 16 26 36
    308   // out[7]: 07 17 27 37
    309   out[0] = _mm_unpacklo_epi64(a0, a1);
    310   out[1] = _mm_unpackhi_epi64(a0, a1);
    311   out[2] = _mm_unpacklo_epi64(a2, a3);
    312   out[3] = _mm_unpackhi_epi64(a2, a3);
    313   out[4] = _mm_unpacklo_epi64(a4, a5);
    314   out[5] = _mm_unpackhi_epi64(a4, a5);
    315   out[6] = _mm_unpacklo_epi64(a6, a7);
    316   out[7] = _mm_unpackhi_epi64(a6, a7);
    317 }
    318 
    319 static INLINE void transpose_32bit_8x4(const __m128i *const in,
    320                                        __m128i *const out) {
    321   // Unpack 32 bit elements. Goes from:
    322   // in[0]: 00 01 02 03
    323   // in[1]: 04 05 06 07
    324   // in[2]: 10 11 12 13
    325   // in[3]: 14 15 16 17
    326   // in[4]: 20 21 22 23
    327   // in[5]: 24 25 26 27
    328   // in[6]: 30 31 32 33
    329   // in[7]: 34 35 36 37
    330   // to:
    331   // a0: 00 10 01 11
    332   // a1: 20 30 21 31
    333   // a2: 02 12 03 13
    334   // a3: 22 32 23 33
    335   // a4: 04 14 05 15
    336   // a5: 24 34 25 35
    337   // a6: 06 16 07 17
    338   // a7: 26 36 27 37
    339   const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
    340   const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
    341   const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
    342   const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
    343   const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
    344   const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
    345   const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
    346   const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
    347 
    348   // Unpack 64 bit elements resulting in:
    349   // out[0]: 00 10 20 30
    350   // out[1]: 01 11 21 31
    351   // out[2]: 02 12 22 32
    352   // out[3]: 03 13 23 33
    353   // out[4]: 04 14 24 34
    354   // out[5]: 05 15 25 35
    355   // out[6]: 06 16 26 36
    356   // out[7]: 07 17 27 37
    357   out[0] = _mm_unpacklo_epi64(a0, a1);
    358   out[1] = _mm_unpackhi_epi64(a0, a1);
    359   out[2] = _mm_unpacklo_epi64(a2, a3);
    360   out[3] = _mm_unpackhi_epi64(a2, a3);
    361   out[4] = _mm_unpacklo_epi64(a4, a5);
    362   out[5] = _mm_unpackhi_epi64(a4, a5);
    363   out[6] = _mm_unpacklo_epi64(a6, a7);
    364   out[7] = _mm_unpackhi_epi64(a6, a7);
    365 }
    366 
    367 #endif  // VPX_DSP_X86_TRANSPOSE_SSE2_H_
    368