// Home | History | Annotate | Download | only in source  (code-browser header)
/*
 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#if defined(_MSC_VER)
#include <intrin.h>  // For __popcnt
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
     27 
     28 uint32_t HammingDistance_SSE42(const uint8_t* src_a,
     29                                const uint8_t* src_b,
     30                                int count) {
     31   uint32_t diff = 0u;
     32 
     33   int i;
     34   for (i = 0; i < count - 3; i += 4) {
     35     uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b);  // NOLINT
     36     src_a += 4;
     37     src_b += 4;
     38     diff += __popcnt(x);
     39   }
     40   return diff;
     41 }
     42 
// uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b,
//                              int count)
// Sum of squared byte differences over two buffers, 16 bytes per pass.
// count is expected to be a multiple of 16 (loop subtracts 16 per pass).
// Naked cdecl: args read directly off the stack; returns in eax.
__declspec(naked) uint32_t
    SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
    mov        ecx, [esp + 12]  // count
    pxor       xmm0, xmm0  // running sum of squared differences
    pxor       xmm5, xmm5  // constant zero for byte->word unpack

  wloop:
    movdqu     xmm1, [eax]  // 16 bytes of src_a (unaligned load)
    lea        eax,  [eax + 16]
    movdqu     xmm2, [edx]  // 16 bytes of src_b (unaligned load)
    lea        edx,  [edx + 16]
    movdqa     xmm3, xmm1  // abs trick: |a-b| = (a-b sat) | (b-a sat)
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
    por        xmm1, xmm2  // xmm1 = 16 absolute differences (u8)
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm5  // widen low 8 diffs to u16
    punpckhbw  xmm2, xmm5  // widen high 8 diffs to u16
    pmaddwd    xmm1, xmm1  // square each u16, pairwise add to u32
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1  // accumulate into sum
    paddd      xmm0, xmm2
    sub        ecx, 16
    jg         wloop

    pshufd     xmm1, xmm0, 0xee  // horizontal add: lanes 3,2 onto 1,0
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 0x01  // lane 1 onto lane 0
    paddd      xmm0, xmm1
    movd       eax, xmm0  // return total
    ret
  }
}
     79 
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable : 4752)
// uint32_t SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b,
//                              int count)
// AVX2 version of SumSquareError: 32 bytes per pass.
// count is expected to be a multiple of 32 (loop subtracts 32 per pass).
// Naked cdecl: args read directly off the stack; returns in eax.
__declspec(naked) uint32_t
    SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
    mov        ecx, [esp + 12]  // count
    vpxor      ymm0, ymm0, ymm0  // sum
    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
    sub        edx, eax  // edx = src_b - src_a; one pointer advances both

  wloop:
    vmovdqu    ymm1, [eax]  // 32 bytes of src_a
    vmovdqu    ymm2, [eax + edx]  // 32 bytes of src_b
    lea        eax,  [eax + 32]
    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick: (a-b sat)|(b-a sat)
    vpsubusb   ymm2, ymm2, ymm1
    vpor       ymm1, ymm2, ymm3
    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order (per-lane unpack).
    vpunpckhbw ymm1, ymm1, ymm5
    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
    vpmaddwd   ymm1, ymm1, ymm1
    vpaddd     ymm0, ymm0, ymm1
    vpaddd     ymm0, ymm0, ymm2
    sub        ecx, 32
    jg         wloop

    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpermq     ymm1, ymm0, 0x02  // high + low lane.
    vpaddd     ymm0, ymm0, ymm1
    vmovd      eax, xmm0  // return total
    vzeroupper  // avoid AVX-SSE transition penalty in caller
    ret
  }
}
#endif  // _MSC_VER >= 1700

// Multiplier tables for a vectorized DJB2 hash (hash = hash * 33 + c).
// Processing 16 bytes per pass: the running hash is scaled by 33^16 and
// each of the 16 bytes is weighted by a descending power of 33.
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
uvec32 kHashMul0 = {
    0x0c3525e1,  // 33 ^ 15
    0xa3476dc1,  // 33 ^ 14
    0x3b4039a1,  // 33 ^ 13
    0x4f5f0981,  // 33 ^ 12
};
uvec32 kHashMul1 = {
    0x30f35d61,  // 33 ^ 11
    0x855cb541,  // 33 ^ 10
    0x040a9121,  // 33 ^ 9
    0x747c7101,  // 33 ^ 8
};
uvec32 kHashMul2 = {
    0xec41d4e1,  // 33 ^ 7
    0x4cfa3cc1,  // 33 ^ 6
    0x025528a1,  // 33 ^ 5
    0x00121881,  // 33 ^ 4
};
uvec32 kHashMul3 = {
    0x00008c61,  // 33 ^ 3
    0x00000441,  // 33 ^ 2
    0x00000021,  // 33 ^ 1
    0x00000001,  // 33 ^ 0
};
    148 
// uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed)
// DJB2 hash (hash = hash * 33 + c) over count bytes, 16 bytes per pass,
// using the kHash16x33 / kHashMul0-3 power-of-33 tables above.
// count is expected to be a multiple of 16.
// Naked cdecl: args read directly off the stack; returns in eax.
__declspec(naked) uint32_t
    HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
  __asm {
    mov        eax, [esp + 4]  // src
    mov        ecx, [esp + 8]  // count
    movd       xmm0, [esp + 12]  // seed

    pxor       xmm7, xmm7  // constant 0 for unpck
    movdqa     xmm6, xmmword ptr kHash16x33

  wloop:
    movdqu     xmm1, [eax]  // src[0-15]
    lea        eax, [eax + 16]
    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
    movdqa     xmm5, xmmword ptr kHashMul0
    movdqa     xmm2, xmm1
    punpcklbw  xmm2, xmm7  // src[0-7] as u16
    movdqa     xmm3, xmm2
    punpcklwd  xmm3, xmm7  // src[0-3] as u32
    pmulld     xmm3, xmm5  // weight by 33^15..33^12
    movdqa     xmm5, xmmword ptr kHashMul1
    movdqa     xmm4, xmm2
    punpckhwd  xmm4, xmm7  // src[4-7] as u32
    pmulld     xmm4, xmm5  // weight by 33^11..33^8
    movdqa     xmm5, xmmword ptr kHashMul2
    punpckhbw  xmm1, xmm7  // src[8-15] as u16
    movdqa     xmm2, xmm1
    punpcklwd  xmm2, xmm7  // src[8-11] as u32
    pmulld     xmm2, xmm5  // weight by 33^7..33^4
    movdqa     xmm5, xmmword ptr kHashMul3
    punpckhwd  xmm1, xmm7  // src[12-15] as u32
    pmulld     xmm1, xmm5  // weight by 33^3..33^0
    paddd      xmm3, xmm4  // add 16 results
    paddd      xmm1, xmm2
    paddd      xmm1, xmm3

    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
    paddd      xmm1, xmm2
    pshufd     xmm2, xmm1, 0x01
    paddd      xmm1, xmm2
    paddd      xmm0, xmm1  // fold the 16-byte contribution into hash
    sub        ecx, 16
    jg         wloop

    movd       eax, xmm0  // return hash
    ret
  }
}

// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
    200 __declspec(naked) uint32_t
    201     HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
    202   __asm {
    203     mov        eax, [esp + 4]  // src
    204     mov        ecx, [esp + 8]  // count
    205     vmovd      xmm0, [esp + 12]  // seed
    206 
    207   wloop:
    208     vpmovzxbd  xmm3, [eax]  // src[0-3]
    209     vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
    210     vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
    211     vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
    212     vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
    213     vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
    214     vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
    215     vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
    216     lea        eax, [eax + 16]
    217     vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
    218     vpaddd     xmm3, xmm3, xmm4  // add 16 results
    219     vpaddd     xmm1, xmm1, xmm2
    220     vpaddd     xmm1, xmm1, xmm3
    221     vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
    222     vpaddd     xmm1, xmm1,xmm2
    223     vpshufd    xmm2, xmm1, 0x01
    224     vpaddd     xmm1, xmm1, xmm2
    225     vpaddd     xmm0, xmm0, xmm1
    226     sub        ecx, 16
    227     jg         wloop
    228 
    229     vmovd      eax, xmm0  // return hash
    230     vzeroupper
    231     ret
    232   }
    233 }
#endif  // _MSC_VER >= 1700

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif