// compare_win.cc — libyuv compare/hash kernels for 32-bit Visual C / clang-cl.
      1 /*
      2  *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/basic_types.h"
     12 
     13 #include "libyuv/compare_row.h"
     14 #include "libyuv/row.h"
     15 
     16 #ifdef __cplusplus
     17 namespace libyuv {
     18 extern "C" {
     19 #endif
     20 
     21 // This module is for 32 bit Visual C x86 and clangcl
     22 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
     23 
// uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count)
// Returns the sum of squared byte differences over count bytes.
// Processes 16 bytes per iteration with unaligned loads, so pointers need
// not be aligned; count is expected to be a multiple of 16 — TODO confirm
// against callers (the loop always consumes full 16-byte chunks).
// cdecl naked function: arguments read directly off the stack, result in eax.
__declspec(naked) uint32
    SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
    mov        ecx, [esp + 12]  // count
    pxor       xmm0, xmm0  // xmm0 = running sum, 4 parallel u32 lanes
    pxor       xmm5, xmm5  // xmm5 = constant zero for byte->word unpack

  wloop:
    movdqu     xmm1, [eax]  // 16 bytes of src_a
    lea        eax,  [eax + 16]
    movdqu     xmm2, [edx]  // 16 bytes of src_b
    lea        edx,  [edx + 16]
    movdqa     xmm3, xmm1  // abs trick: |a-b| = (a -sat b) | (b -sat a)
    psubusb    xmm1, xmm2  // saturates to 0 where b >= a
    psubusb    xmm2, xmm3  // saturates to 0 where a >= b
    por        xmm1, xmm2  // xmm1 = 16 absolute byte differences
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm5  // low 8 diffs widened to u16
    punpckhbw  xmm2, xmm5  // high 8 diffs widened to u16
    pmaddwd    xmm1, xmm1  // square each u16, pairwise add -> 4 x u32
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1  // accumulate into the 4 sum lanes
    paddd      xmm0, xmm2
    sub        ecx, 16
    jg         wloop

    pshufd     xmm1, xmm0, 0xee  // horizontal add: dwords 3,2 onto 1,0
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 0x01  // dword 1 onto dword 0
    paddd      xmm0, xmm1
    movd       eax, xmm0  // return total in eax
    ret
  }
}
     60 
     61 // Visual C 2012 required for AVX2.
     62 #if _MSC_VER >= 1700
     63 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
     64 #pragma warning(disable : 4752)
// uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count)
// AVX2 variant of SumSquareError: sum of squared byte differences over
// count bytes, 32 bytes per iteration.  count is expected to be a multiple
// of 32 — TODO confirm against callers (loop consumes full 32-byte chunks).
// cdecl naked function: arguments read off the stack, result in eax.
__declspec(naked) uint32
    SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
    mov        ecx, [esp + 12]  // count
    vpxor      ymm0, ymm0, ymm0  // sum
    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
    sub        edx, eax  // edx = src_b - src_a; lets one pointer index both

  wloop:
    vmovdqu    ymm1, [eax]  // 32 bytes of src_a
    vmovdqu    ymm2, [eax + edx]  // corresponding 32 bytes of src_b
    lea        eax,  [eax + 32]
    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick (see SSE2 version)
    vpsubusb   ymm2, ymm2, ymm1
    vpor       ymm1, ymm2, ymm3  // ymm1 = 32 absolute byte differences
    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order (per-lane interleave);
    vpunpckhbw ymm1, ymm1, ymm5  // order is irrelevant since we only sum.
    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
    vpmaddwd   ymm1, ymm1, ymm1
    vpaddd     ymm0, ymm0, ymm1
    vpaddd     ymm0, ymm0, ymm2
    sub        ecx, 32
    jg         wloop

    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpermq     ymm1, ymm0, 0x02  // high + low lane.
    vpaddd     ymm0, ymm0, ymm1
    vmovd      eax, xmm0  // return total in eax
    vzeroupper  // avoid AVX->SSE transition penalty in caller
    ret
  }
}
    102 #endif  // _MSC_VER >= 1700
    103 
// Constants for the vectorized djb2 hash (hash = hash * 33 + byte),
// unrolled 16 bytes per iteration.  kHash16x33 scales the running hash by
// 33^16 each pass; kHashMul0..3 hold the descending powers of 33 (mod 2^32)
// that weight each of the 16 input bytes.
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
uvec32 kHashMul0 = {
    0x0c3525e1,  // 33 ^ 15
    0xa3476dc1,  // 33 ^ 14
    0x3b4039a1,  // 33 ^ 13
    0x4f5f0981,  // 33 ^ 12
};
uvec32 kHashMul1 = {
    0x30f35d61,  // 33 ^ 11
    0x855cb541,  // 33 ^ 10
    0x040a9121,  // 33 ^ 9
    0x747c7101,  // 33 ^ 8
};
uvec32 kHashMul2 = {
    0xec41d4e1,  // 33 ^ 7
    0x4cfa3cc1,  // 33 ^ 6
    0x025528a1,  // 33 ^ 5
    0x00121881,  // 33 ^ 4
};
uvec32 kHashMul3 = {
    0x00008c61,  // 33 ^ 3
    0x00000441,  // 33 ^ 2
    0x00000021,  // 33 ^ 1
    0x00000001,  // 33 ^ 0
};
    129 
// uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed)
// djb2 hash (hash = hash * 33 + byte) of count bytes, vectorized 16 bytes
// per iteration using the kHashMul constants above.  count is expected to
// be a multiple of 16 — TODO confirm against callers.  Requires SSE4.1
// (pmulld).  cdecl naked function; returns the hash in eax.
__declspec(naked) uint32
    HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]  // src
    mov        ecx, [esp + 8]  // count
    movd       xmm0, [esp + 12]  // seed

    pxor       xmm7, xmm7  // constant 0 for unpck
    movdqa     xmm6, xmmword ptr kHash16x33  // 33^16, hash scale per pass

  wloop:
    movdqu     xmm1, [eax]  // src[0-15]
    lea        eax, [eax + 16]
    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
    movdqa     xmm5, xmmword ptr kHashMul0
    movdqa     xmm2, xmm1
    punpcklbw  xmm2, xmm7  // src[0-7] widened to u16
    movdqa     xmm3, xmm2
    punpcklwd  xmm3, xmm7  // src[0-3] widened to u32
    pmulld     xmm3, xmm5  // * 33^15..33^12
    movdqa     xmm5, xmmword ptr kHashMul1
    movdqa     xmm4, xmm2
    punpckhwd  xmm4, xmm7  // src[4-7]
    pmulld     xmm4, xmm5  // * 33^11..33^8
    movdqa     xmm5, xmmword ptr kHashMul2
    punpckhbw  xmm1, xmm7  // src[8-15] widened to u16
    movdqa     xmm2, xmm1
    punpcklwd  xmm2, xmm7  // src[8-11]
    pmulld     xmm2, xmm5  // * 33^7..33^4
    movdqa     xmm5, xmmword ptr kHashMul3
    punpckhwd  xmm1, xmm7  // src[12-15]
    pmulld     xmm1, xmm5  // * 33^3..33^0
    paddd      xmm3, xmm4  // add 16 results
    paddd      xmm1, xmm2
    paddd      xmm1, xmm3

    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
    paddd      xmm1, xmm2
    pshufd     xmm2, xmm1, 0x01  // dword 1 onto dword 0
    paddd      xmm1, xmm2
    paddd      xmm0, xmm1  // fold block sum into the hash
    sub        ecx, 16
    jg         wloop

    movd       eax, xmm0  // return hash
    ret
  }
}
    178 
    179 // Visual C 2012 required for AVX2.
    180 #if _MSC_VER >= 1700
    181 __declspec(naked) uint32
    182     HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
    183   __asm {
    184     mov        eax, [esp + 4]  // src
    185     mov        ecx, [esp + 8]  // count
    186     vmovd      xmm0, [esp + 12]  // seed
    187 
    188   wloop:
    189     vpmovzxbd  xmm3, [eax]  // src[0-3]
    190     vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
    191     vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
    192     vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
    193     vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
    194     vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
    195     vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
    196     vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
    197     lea        eax, [eax + 16]
    198     vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
    199     vpaddd     xmm3, xmm3, xmm4  // add 16 results
    200     vpaddd     xmm1, xmm1, xmm2
    201     vpaddd     xmm1, xmm1, xmm3
    202     vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
    203     vpaddd     xmm1, xmm1,xmm2
    204     vpshufd    xmm2, xmm1, 0x01
    205     vpaddd     xmm1, xmm1, xmm2
    206     vpaddd     xmm0, xmm0, xmm1
    207     sub        ecx, 16
    208     jg         wloop
    209 
    210     vmovd      eax, xmm0  // return hash
    211     vzeroupper
    212     ret
    213   }
    214 }
    215 #endif  // _MSC_VER >= 1700
    216 
    217 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
    218 
    219 #ifdef __cplusplus
    220 }  // extern "C"
    221 }  // namespace libyuv
    222 #endif
    223