Home | History | Annotate | Download | only in include
      1 /**
      2  * This file has no copyright assigned and is placed in the Public Domain.
      3  * This file is part of the mingw-w64 runtime package.
      4  * No warranty is given; refer to the file DISCLAIMER.PD within this package.
      5  */
      6 #ifndef _DVEC_H_INCLUDED
      7 #define _DVEC_H_INCLUDED
      8 #ifndef RC_INVOKED
      9 
     10 #if !defined __cplusplus
     11 #error This file is only supported in C++ compilations!
     12 #endif
     13 
     14 #include <intrin.h>
     15 #include <assert.h>
     16 #include <fvec.h>
     17 #include <crtdefs.h>
     18 
     19 #pragma pack(push,_CRT_PACKING)
     20 
     21 #if defined(_ENABLE_VEC_DEBUG)
     22 #include <iostream>
     23 #endif
     24 
     25 #ifdef __SSE__
     26 
     27 #pragma pack(push,16)
     28 
     29 #define EXPLICIT explicit
     30 
     31 class I8vec16;
     32 class Is8vec16;
     33 class Iu8vec16;
     34 class I16vec8;
     35 class Is16vec8;
     36 class Iu16vec8;
     37 class I32vec4;
     38 class Is32vec4;
     39 class Iu32vec4;
     40 class I64vec2;
     41 class I128vec1;
     42 
     43 #define _MM_16UB(element,vector) (*((unsigned char*)&(vector) + (element)))
     44 #define _MM_16B(element,vector) (*((signed char*)&(vector) + (element)))
     45 
     46 #define _MM_8UW(element,vector) (*((unsigned short*)&(vector) + (element)))
     47 #define _MM_8W(element,vector) (*((short*)&(vector) + (element)))
     48 
     49 #define _MM_4UDW(element,vector) (*((unsigned int*)&(vector) + (element)))
     50 #define _MM_4DW(element,vector) (*((int*)&(vector) + (element)))
     51 
     52 #define _MM_2QW(element,vector) (*((__int64*)&(vector) + (element)))
     53 
     54 __MINGW_EXTENSION inline const __m128i get_mask128()
     55 {
     56   static const __m128i mask128 = _mm_set1_epi64(M64((__int64)0xffffffffffffffffll));
     57   return mask128;
     58 }
     59 
     60 class M128
     61 {
     62 protected:
     63   __m128i vec;
     64 
     65 public:
     66   M128() { }
     67   M128(__m128i mm) { vec = mm; }
     68 
     69   operator __m128i() const { return vec; }
     70 
     71   M128& operator&=(const M128 &a) { return *this = (M128) _mm_and_si128(vec,a); }
     72   M128& operator|=(const M128 &a) { return *this = (M128) _mm_or_si128(vec,a); }
     73   M128& operator^=(const M128 &a) { return *this = (M128) _mm_xor_si128(vec,a); }
     74 
     75 };
     76 
     77 inline M128 operator&(const M128 &a,const M128 &b) { return _mm_and_si128(a,b); }
     78 inline M128 operator|(const M128 &a,const M128 &b) { return _mm_or_si128(a,b); }
     79 inline M128 operator^(const M128 &a,const M128 &b) { return _mm_xor_si128(a,b); }
     80 inline M128 andnot(const M128 &a,const M128 &b) { return _mm_andnot_si128(a,b); }
     81 
     82 class I128vec1 : public M128
     83 {
     84 public:
     85   I128vec1() { }
     86   I128vec1(__m128i mm) : M128(mm) { }
     87 
     88   I128vec1& operator= (const M128 &a) { return *this = (I128vec1) a; }
     89   I128vec1& operator&=(const M128 &a) { return *this = (I128vec1) _mm_and_si128(vec,a); }
     90   I128vec1& operator|=(const M128 &a) { return *this = (I128vec1) _mm_or_si128(vec,a); }
     91   I128vec1& operator^=(const M128 &a) { return *this = (I128vec1) _mm_xor_si128(vec,a); }
     92 
     93 };
     94 
     95 class I64vec2 : public M128
     96 {
     97 public:
     98   I64vec2() { }
     99   I64vec2(__m128i mm) : M128(mm) { }
    100 
    101   __MINGW_EXTENSION I64vec2(__m64 q1,__m64 q0)
    102   {
    103     _MM_2QW(0,vec) = *(__int64*)&q0;
    104     _MM_2QW(1,vec) = *(__int64*)&q1;
    105   }
    106 
    107   I64vec2& operator= (const M128 &a) { return *this = (I64vec2) a; }
    108 
    109   I64vec2& operator&=(const M128 &a) { return *this = (I64vec2) _mm_and_si128(vec,a); }
    110   I64vec2& operator|=(const M128 &a) { return *this = (I64vec2) _mm_or_si128(vec,a); }
    111   I64vec2& operator^=(const M128 &a) { return *this = (I64vec2) _mm_xor_si128(vec,a); }
    112 
    113   I64vec2& operator +=(const I64vec2 &a) { return *this = (I64vec2) _mm_add_epi64(vec,a); }
    114   I64vec2& operator -=(const I64vec2 &a) { return *this = (I64vec2) _mm_sub_epi64(vec,a); }
    115 
    116   I64vec2 operator<<(const I64vec2 &a) { return _mm_sll_epi64(vec,a); }
    117   I64vec2 operator<<(int count) { return _mm_slli_epi64(vec,count); }
    118   I64vec2& operator<<=(const I64vec2 &a) { return *this = (I64vec2) _mm_sll_epi64(vec,a); }
    119   I64vec2& operator<<=(int count) { return *this = (I64vec2) _mm_slli_epi64(vec,count); }
    120   I64vec2 operator>>(const I64vec2 &a) { return _mm_srl_epi64(vec,a); }
    121   I64vec2 operator>>(int count) { return _mm_srli_epi64(vec,count); }
    122   I64vec2& operator>>=(const I64vec2 &a) { return *this = (I64vec2) _mm_srl_epi64(vec,a); }
    123   I64vec2& operator>>=(int count) { return *this = (I64vec2) _mm_srli_epi64(vec,count); }
    124 
    125   __MINGW_EXTENSION const __int64& operator[](int i)const
    126   {
    127     assert(static_cast<unsigned int>(i) < 2);
    128     return _MM_2QW(i,vec);
    129   }
    130 
    131   __MINGW_EXTENSION __int64& operator[](int i)
    132   {
    133     assert(static_cast<unsigned int>(i) < 2);
    134     return _MM_2QW(i,vec);
    135   }
    136 
    137 };
    138 
    139 inline I64vec2 unpack_low(const I64vec2 &a,const I64vec2 &b) {return _mm_unpacklo_epi64(a,b); }
    140 inline I64vec2 unpack_high(const I64vec2 &a,const I64vec2 &b) {return _mm_unpackhi_epi64(a,b); }
    141 
    142 class I32vec4 : public M128
    143 {
    144 public:
    145   I32vec4() { }
    146   I32vec4(__m128i mm) : M128(mm) { }
    147 
    148   I32vec4& operator= (const M128 &a) { return *this = (I32vec4) a; }
    149 
    150   I32vec4& operator&=(const M128 &a) { return *this = (I32vec4) _mm_and_si128(vec,a); }
    151   I32vec4& operator|=(const M128 &a) { return *this = (I32vec4) _mm_or_si128(vec,a); }
    152   I32vec4& operator^=(const M128 &a) { return *this = (I32vec4) _mm_xor_si128(vec,a); }
    153 
    154   I32vec4& operator +=(const I32vec4 &a) { return *this = (I32vec4)_mm_add_epi32(vec,a); }
    155   I32vec4& operator -=(const I32vec4 &a) { return *this = (I32vec4)_mm_sub_epi32(vec,a); }
    156 
    157   I32vec4 operator<<(const I32vec4 &a) { return _mm_sll_epi32(vec,a); }
    158   I32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
    159   I32vec4& operator<<=(const I32vec4 &a) { return *this = (I32vec4)_mm_sll_epi32(vec,a); }
    160   I32vec4& operator<<=(int count) { return *this = (I32vec4)_mm_slli_epi32(vec,count); }
    161 
    162 };
    163 
    164 inline I32vec4 cmpeq(const I32vec4 &a,const I32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
    165 inline I32vec4 cmpneq(const I32vec4 &a,const I32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
    166 
    167 inline I32vec4 unpack_low(const I32vec4 &a,const I32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
    168 inline I32vec4 unpack_high(const I32vec4 &a,const I32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
    169 
    170 class Is32vec4 : public I32vec4
    171 {
    172 public:
    173   Is32vec4() { }
    174   Is32vec4(__m128i mm) : I32vec4(mm) { }
    175   Is32vec4(int i3,int i2,int i1,int i0)
    176   {
    177     _MM_4DW(0,vec) = i0;
    178     _MM_4DW(1,vec) = i1;
    179     _MM_4DW(2,vec) = i2;
    180     _MM_4DW(3,vec) = i3;
    181   }
    182 
    183   Is32vec4& operator= (const M128 &a) { return *this = (Is32vec4) a; }
    184 
    185   Is32vec4& operator&=(const M128 &a) { return *this = (Is32vec4) _mm_and_si128(vec,a); }
    186   Is32vec4& operator|=(const M128 &a) { return *this = (Is32vec4) _mm_or_si128(vec,a); }
    187   Is32vec4& operator^=(const M128 &a) { return *this = (Is32vec4) _mm_xor_si128(vec,a); }
    188 
    189   Is32vec4& operator +=(const I32vec4 &a) { return *this = (Is32vec4)_mm_add_epi32(vec,a); }
    190   Is32vec4& operator -=(const I32vec4 &a) { return *this = (Is32vec4)_mm_sub_epi32(vec,a); }
    191 
    192   Is32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
    193   Is32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
    194   Is32vec4& operator<<=(const M128 &a) { return *this = (Is32vec4)_mm_sll_epi32(vec,a); }
    195   Is32vec4& operator<<=(int count) { return *this = (Is32vec4)_mm_slli_epi32(vec,count); }
    196 
    197   Is32vec4 operator>>(const M128 &a) { return _mm_sra_epi32(vec,a); }
    198   Is32vec4 operator>>(int count) { return _mm_srai_epi32(vec,count); }
    199   Is32vec4& operator>>=(const M128 &a) { return *this = (Is32vec4) _mm_sra_epi32(vec,a); }
    200   Is32vec4& operator>>=(int count) { return *this = (Is32vec4) _mm_srai_epi32(vec,count); }
    201 
    202 #if defined(_ENABLE_VEC_DEBUG)
    203 
    204   friend std::ostream& operator<< (std::ostream &os,const Is32vec4 &a)
    205   {
    206     os << "[3]:" << _MM_4DW(3,a)
    207       << " [2]:" << _MM_4DW(2,a)
    208       << " [1]:" << _MM_4DW(1,a)
    209       << " [0]:" << _MM_4DW(0,a);
    210     return os;
    211   }
    212 #endif
    213 
    214   const int& operator[](int i)const
    215   {
    216     assert(static_cast<unsigned int>(i) < 4);
    217     return _MM_4DW(i,vec);
    218   }
    219 
    220   int& operator[](int i)
    221   {
    222     assert(static_cast<unsigned int>(i) < 4);
    223     return _MM_4DW(i,vec);
    224   }
    225 };
    226 
    227 inline Is32vec4 cmpeq(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
    228 inline Is32vec4 cmpneq(const Is32vec4 &a,const Is32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
    229 inline Is32vec4 cmpgt(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpgt_epi32(a,b); }
    230 inline Is32vec4 cmplt(const Is32vec4 &a,const Is32vec4 &b) { return _mm_cmpgt_epi32(b,a); }
    231 
    232 inline Is32vec4 unpack_low(const Is32vec4 &a,const Is32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
    233 inline Is32vec4 unpack_high(const Is32vec4 &a,const Is32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
    234 
    235 class Iu32vec4 : public I32vec4
    236 {
    237 public:
    238   Iu32vec4() { }
    239   Iu32vec4(__m128i mm) : I32vec4(mm) { }
    240   Iu32vec4(unsigned int ui3,unsigned int ui2,unsigned int ui1,unsigned int ui0)
    241   {
    242     _MM_4UDW(0,vec) = ui0;
    243     _MM_4UDW(1,vec) = ui1;
    244     _MM_4UDW(2,vec) = ui2;
    245     _MM_4UDW(3,vec) = ui3;
    246   }
    247 
    248   Iu32vec4& operator= (const M128 &a) { return *this = (Iu32vec4) a; }
    249 
    250   Iu32vec4& operator&=(const M128 &a) { return *this = (Iu32vec4) _mm_and_si128(vec,a); }
    251   Iu32vec4& operator|=(const M128 &a) { return *this = (Iu32vec4) _mm_or_si128(vec,a); }
    252   Iu32vec4& operator^=(const M128 &a) { return *this = (Iu32vec4) _mm_xor_si128(vec,a); }
    253 
    254   Iu32vec4& operator +=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_add_epi32(vec,a); }
    255   Iu32vec4& operator -=(const I32vec4 &a) { return *this = (Iu32vec4)_mm_sub_epi32(vec,a); }
    256 
    257   Iu32vec4 operator<<(const M128 &a) { return _mm_sll_epi32(vec,a); }
    258   Iu32vec4 operator<<(int count) { return _mm_slli_epi32(vec,count); }
    259   Iu32vec4& operator<<=(const M128 &a) { return *this = (Iu32vec4)_mm_sll_epi32(vec,a); }
    260   Iu32vec4& operator<<=(int count) { return *this = (Iu32vec4)_mm_slli_epi32(vec,count); }
    261   Iu32vec4 operator>>(const M128 &a) { return _mm_srl_epi32(vec,a); }
    262   Iu32vec4 operator>>(int count) { return _mm_srli_epi32(vec,count); }
    263   Iu32vec4& operator>>=(const M128 &a) { return *this = (Iu32vec4) _mm_srl_epi32(vec,a); }
    264   Iu32vec4& operator>>=(int count) { return *this = (Iu32vec4) _mm_srli_epi32(vec,count); }
    265 
    266 #if defined(_ENABLE_VEC_DEBUG)
    267 
    268   friend std::ostream& operator<< (std::ostream &os,const Iu32vec4 &a)
    269   {
    270     os << "[3]:" << _MM_4UDW(3,a)
    271       << " [2]:" << _MM_4UDW(2,a)
    272       << " [1]:" << _MM_4UDW(1,a)
    273       << " [0]:" << _MM_4UDW(0,a);
    274     return os;
    275   }
    276 #endif
    277 
    278   const unsigned int& operator[](int i)const
    279   {
    280     assert(static_cast<unsigned int>(i) < 4);
    281     return _MM_4UDW(i,vec);
    282   }
    283 
    284   unsigned int& operator[](int i)
    285   {
    286     assert(static_cast<unsigned int>(i) < 4);
    287     return _MM_4UDW(i,vec);
    288   }
    289 };
    290 
    291 inline I64vec2 operator*(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_mul_epu32(a,b); }
    292 inline Iu32vec4 cmpeq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_cmpeq_epi32(a,b); }
    293 inline Iu32vec4 cmpneq(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b),get_mask128()); }
    294 
    295 inline Iu32vec4 unpack_low(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpacklo_epi32(a,b); }
    296 inline Iu32vec4 unpack_high(const Iu32vec4 &a,const Iu32vec4 &b) { return _mm_unpackhi_epi32(a,b); }
    297 
    298 class I16vec8 : public M128
    299 {
    300 public:
    301   I16vec8() { }
    302   I16vec8(__m128i mm) : M128(mm) { }
    303 
    304   I16vec8& operator= (const M128 &a) { return *this = (I16vec8) a; }
    305 
    306   I16vec8& operator&=(const M128 &a) { return *this = (I16vec8) _mm_and_si128(vec,a); }
    307   I16vec8& operator|=(const M128 &a) { return *this = (I16vec8) _mm_or_si128(vec,a); }
    308   I16vec8& operator^=(const M128 &a) { return *this = (I16vec8) _mm_xor_si128(vec,a); }
    309 
    310   I16vec8& operator +=(const I16vec8 &a) { return *this = (I16vec8) _mm_add_epi16(vec,a); }
    311   I16vec8& operator -=(const I16vec8 &a) { return *this = (I16vec8) _mm_sub_epi16(vec,a); }
    312   I16vec8& operator *=(const I16vec8 &a) { return *this = (I16vec8) _mm_mullo_epi16(vec,a); }
    313 
    314   I16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
    315   I16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
    316   I16vec8& operator<<=(const M128 &a) { return *this = (I16vec8)_mm_sll_epi16(vec,a); }
    317   I16vec8& operator<<=(int count) { return *this = (I16vec8)_mm_slli_epi16(vec,count); }
    318 
    319 };
    320 
    321 inline I16vec8 operator*(const I16vec8 &a,const I16vec8 &b) { return _mm_mullo_epi16(a,b); }
    322 
    323 inline I16vec8 cmpeq(const I16vec8 &a,const I16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
    324 inline I16vec8 cmpneq(const I16vec8 &a,const I16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
    325 
    326 inline I16vec8 unpack_low(const I16vec8 &a,const I16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
    327 inline I16vec8 unpack_high(const I16vec8 &a,const I16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
    328 
    329 class Is16vec8 : public I16vec8
    330 {
    331 public:
    332   Is16vec8() { }
    333   Is16vec8(__m128i mm) : I16vec8(mm) { }
    334   Is16vec8(signed short s7,signed short s6,signed short s5,signed short s4,signed short s3,signed short s2,signed short s1,signed short s0)
    335   {
    336     _MM_8W(0,vec) = s0;
    337     _MM_8W(1,vec) = s1;
    338     _MM_8W(2,vec) = s2;
    339     _MM_8W(3,vec) = s3;
    340     _MM_8W(4,vec) = s4;
    341     _MM_8W(5,vec) = s5;
    342     _MM_8W(6,vec) = s6;
    343     _MM_8W(7,vec) = s7;
    344   }
    345 
    346   Is16vec8& operator= (const M128 &a) { return *this = (Is16vec8) a; }
    347 
    348   Is16vec8& operator&=(const M128 &a) { return *this = (Is16vec8) _mm_and_si128(vec,a); }
    349   Is16vec8& operator|=(const M128 &a) { return *this = (Is16vec8) _mm_or_si128(vec,a); }
    350   Is16vec8& operator^=(const M128 &a) { return *this = (Is16vec8) _mm_xor_si128(vec,a); }
    351 
    352   Is16vec8& operator +=(const I16vec8 &a) { return *this = (Is16vec8) _mm_add_epi16(vec,a); }
    353   Is16vec8& operator -=(const I16vec8 &a) { return *this = (Is16vec8) _mm_sub_epi16(vec,a); }
    354   Is16vec8& operator *=(const I16vec8 &a) { return *this = (Is16vec8) _mm_mullo_epi16(vec,a); }
    355 
    356   Is16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
    357   Is16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
    358   Is16vec8& operator<<=(const M128 &a) { return *this = (Is16vec8)_mm_sll_epi16(vec,a); }
    359   Is16vec8& operator<<=(int count) { return *this = (Is16vec8)_mm_slli_epi16(vec,count); }
    360 
    361   Is16vec8 operator>>(const M128 &a) { return _mm_sra_epi16(vec,a); }
    362   Is16vec8 operator>>(int count) { return _mm_srai_epi16(vec,count); }
    363   Is16vec8& operator>>=(const M128 &a) { return *this = (Is16vec8)_mm_sra_epi16(vec,a); }
    364   Is16vec8& operator>>=(int count) { return *this = (Is16vec8)_mm_srai_epi16(vec,count); }
    365 
    366 #if defined(_ENABLE_VEC_DEBUG)
    367 
    368   friend std::ostream& operator<< (std::ostream &os,const Is16vec8 &a)
    369   {
    370     os << "[7]:" << _MM_8W(7,a)
    371       << " [6]:" << _MM_8W(6,a)
    372       << " [5]:" << _MM_8W(5,a)
    373       << " [4]:" << _MM_8W(4,a)
    374       << " [3]:" << _MM_8W(3,a)
    375       << " [2]:" << _MM_8W(2,a)
    376       << " [1]:" << _MM_8W(1,a)
    377       << " [0]:" << _MM_8W(0,a);
    378     return os;
    379   }
    380 #endif
    381 
    382   const signed short& operator[](int i)const
    383   {
    384     assert(static_cast<unsigned int>(i) < 8);
    385     return _MM_8W(i,vec);
    386   }
    387 
    388   signed short& operator[](int i)
    389   {
    390     assert(static_cast<unsigned int>(i) < 8);
    391     return _MM_8W(i,vec);
    392   }
    393 };
    394 
    395 inline Is16vec8 operator*(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mullo_epi16(a,b); }
    396 
    397 inline Is16vec8 cmpeq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
    398 inline Is16vec8 cmpneq(const Is16vec8 &a,const Is16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
    399 inline Is16vec8 cmpgt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(a,b); }
    400 inline Is16vec8 cmplt(const Is16vec8 &a,const Is16vec8 &b) { return _mm_cmpgt_epi16(b,a); }
    401 
    402 inline Is16vec8 unpack_low(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
    403 inline Is16vec8 unpack_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
    404 
    405 inline Is16vec8 mul_high(const Is16vec8 &a,const Is16vec8 &b) { return _mm_mulhi_epi16(a,b); }
    406 inline Is32vec4 mul_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_madd_epi16(a,b);}
    407 
    408 inline Is16vec8 sat_add(const Is16vec8 &a,const Is16vec8 &b) { return _mm_adds_epi16(a,b); }
    409 inline Is16vec8 sat_sub(const Is16vec8 &a,const Is16vec8 &b) { return _mm_subs_epi16(a,b); }
    410 
    411 inline Is16vec8 simd_max(const Is16vec8 &a,const Is16vec8 &b) { return _mm_max_epi16(a,b); }
    412 inline Is16vec8 simd_min(const Is16vec8 &a,const Is16vec8 &b) { return _mm_min_epi16(a,b); }
    413 
    414 class Iu16vec8 : public I16vec8
    415 {
    416 public:
    417   Iu16vec8() { }
    418   Iu16vec8(__m128i mm) : I16vec8(mm) { }
    419   Iu16vec8(unsigned short s7,unsigned short s6,unsigned short s5,unsigned short s4,unsigned short s3,unsigned short s2,unsigned short s1,unsigned short s0)
    420   {
    421     _MM_8UW(0,vec) = s0;
    422     _MM_8UW(1,vec) = s1;
    423     _MM_8UW(2,vec) = s2;
    424     _MM_8UW(3,vec) = s3;
    425     _MM_8UW(4,vec) = s4;
    426     _MM_8UW(5,vec) = s5;
    427     _MM_8UW(6,vec) = s6;
    428     _MM_8UW(7,vec) = s7;
    429   }
    430 
    431   Iu16vec8& operator= (const M128 &a) { return *this = (Iu16vec8) a; }
    432 
    433   Iu16vec8& operator&=(const M128 &a) { return *this = (Iu16vec8) _mm_and_si128(vec,a); }
    434   Iu16vec8& operator|=(const M128 &a) { return *this = (Iu16vec8) _mm_or_si128(vec,a); }
    435   Iu16vec8& operator^=(const M128 &a) { return *this = (Iu16vec8) _mm_xor_si128(vec,a); }
    436 
    437   Iu16vec8& operator +=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_add_epi16(vec,a); }
    438   Iu16vec8& operator -=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_sub_epi16(vec,a); }
    439   Iu16vec8& operator *=(const I16vec8 &a) { return *this = (Iu16vec8) _mm_mullo_epi16(vec,a); }
    440 
    441   Iu16vec8 operator<<(const M128 &a) { return _mm_sll_epi16(vec,a); }
    442   Iu16vec8 operator<<(int count) { return _mm_slli_epi16(vec,count); }
    443   Iu16vec8& operator<<=(const M128 &a) { return *this = (Iu16vec8)_mm_sll_epi16(vec,a); }
    444   Iu16vec8& operator<<=(int count) { return *this = (Iu16vec8)_mm_slli_epi16(vec,count); }
    445   Iu16vec8 operator>>(const M128 &a) { return _mm_srl_epi16(vec,a); }
    446   Iu16vec8 operator>>(int count) { return _mm_srli_epi16(vec,count); }
    447   Iu16vec8& operator>>=(const M128 &a) { return *this = (Iu16vec8) _mm_srl_epi16(vec,a); }
    448   Iu16vec8& operator>>=(int count) { return *this = (Iu16vec8) _mm_srli_epi16(vec,count); }
    449 
    450 #if defined(_ENABLE_VEC_DEBUG)
    451 
    452   friend std::ostream& operator << (std::ostream &os,const Iu16vec8 &a)
    453   {
    454     os << "[7]:" << unsigned short(_MM_8UW(7,a))
    455       << " [6]:" << unsigned short(_MM_8UW(6,a))
    456       << " [5]:" << unsigned short(_MM_8UW(5,a))
    457       << " [4]:" << unsigned short(_MM_8UW(4,a))
    458       << " [3]:" << unsigned short(_MM_8UW(3,a))
    459       << " [2]:" << unsigned short(_MM_8UW(2,a))
    460       << " [1]:" << unsigned short(_MM_8UW(1,a))
    461       << " [0]:" << unsigned short(_MM_8UW(0,a));
    462     return os;
    463   }
    464 #endif
    465 
    466   const unsigned short& operator[](int i)const
    467   {
    468     assert(static_cast<unsigned int>(i) < 8);
    469     return _MM_8UW(i,vec);
    470   }
    471 
    472   unsigned short& operator[](int i)
    473   {
    474     assert(static_cast<unsigned int>(i) < 8);
    475     return _MM_8UW(i,vec);
    476   }
    477 };
    478 
    479 inline Iu16vec8 operator*(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mullo_epi16(a,b); }
    480 
    481 inline Iu16vec8 cmpeq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_cmpeq_epi16(a,b); }
    482 inline Iu16vec8 cmpneq(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_andnot_si128(_mm_cmpeq_epi16(a,b),get_mask128()); }
    483 
    484 inline Iu16vec8 unpack_low(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpacklo_epi16(a,b); }
    485 inline Iu16vec8 unpack_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_unpackhi_epi16(a,b); }
    486 
    487 inline Iu16vec8 sat_add(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_adds_epu16(a,b); }
    488 inline Iu16vec8 sat_sub(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_subs_epu16(a,b); }
    489 
    490 inline Iu16vec8 simd_avg(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_avg_epu16(a,b); }
    491 inline I16vec8 mul_high(const Iu16vec8 &a,const Iu16vec8 &b) { return _mm_mulhi_epu16(a,b); }
    492 
    493 class I8vec16 : public M128
    494 {
    495 public:
    496   I8vec16() { }
    497   I8vec16(__m128i mm) : M128(mm) { }
    498 
    499   I8vec16& operator= (const M128 &a) { return *this = (I8vec16) a; }
    500 
    501   I8vec16& operator&=(const M128 &a) { return *this = (I8vec16) _mm_and_si128(vec,a); }
    502   I8vec16& operator|=(const M128 &a) { return *this = (I8vec16) _mm_or_si128(vec,a); }
    503   I8vec16& operator^=(const M128 &a) { return *this = (I8vec16) _mm_xor_si128(vec,a); }
    504 
    505   I8vec16& operator +=(const I8vec16 &a) { return *this = (I8vec16) _mm_add_epi8(vec,a); }
    506   I8vec16& operator -=(const I8vec16 &a) { return *this = (I8vec16) _mm_sub_epi8(vec,a); }
    507 
    508 };
    509 
    510 inline I8vec16 cmpeq(const I8vec16 &a,const I8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
    511 inline I8vec16 cmpneq(const I8vec16 &a,const I8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
    512 
    513 inline I8vec16 unpack_low(const I8vec16 &a,const I8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
    514 inline I8vec16 unpack_high(const I8vec16 &a,const I8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
    515 
    516 class Is8vec16 : public I8vec16
    517 {
    518 public:
    519   Is8vec16() { }
    520   Is8vec16(__m128i mm) : I8vec16(mm) { }
    521 
    522   Is8vec16& operator= (const M128 &a) { return *this = (Is8vec16) a; }
    523 
    524   Is8vec16& operator&=(const M128 &a) { return *this = (Is8vec16) _mm_and_si128(vec,a); }
    525   Is8vec16& operator|=(const M128 &a) { return *this = (Is8vec16) _mm_or_si128(vec,a); }
    526   Is8vec16& operator^=(const M128 &a) { return *this = (Is8vec16) _mm_xor_si128(vec,a); }
    527 
    528   Is8vec16& operator +=(const I8vec16 &a) { return *this = (Is8vec16) _mm_add_epi8(vec,a); }
    529   Is8vec16& operator -=(const I8vec16 &a) { return *this = (Is8vec16) _mm_sub_epi8(vec,a); }
    530 
    531 #if defined(_ENABLE_VEC_DEBUG)
    532 
    533   friend std::ostream& operator << (std::ostream &os,const Is8vec16 &a)
    534   {
    535     os << "[15]:" << short(_MM_16B(15,a))
    536       << " [14]:" << short(_MM_16B(14,a))
    537       << " [13]:" << short(_MM_16B(13,a))
    538       << " [12]:" << short(_MM_16B(12,a))
    539       << " [11]:" << short(_MM_16B(11,a))
    540       << " [10]:" << short(_MM_16B(10,a))
    541       << " [9]:" << short(_MM_16B(9,a))
    542       << " [8]:" << short(_MM_16B(8,a))
    543       << " [7]:" << short(_MM_16B(7,a))
    544       << " [6]:" << short(_MM_16B(6,a))
    545       << " [5]:" << short(_MM_16B(5,a))
    546       << " [4]:" << short(_MM_16B(4,a))
    547       << " [3]:" << short(_MM_16B(3,a))
    548       << " [2]:" << short(_MM_16B(2,a))
    549       << " [1]:" << short(_MM_16B(1,a))
    550       << " [0]:" << short(_MM_16B(0,a));
    551     return os;
    552   }
    553 #endif
    554 
    555   const signed char& operator[](int i)const
    556   {
    557     assert(static_cast<unsigned int>(i) < 16);
    558     return _MM_16B(i,vec);
    559   }
    560 
    561   signed char& operator[](int i)
    562   {
    563     assert(static_cast<unsigned int>(i) < 16);
    564     return _MM_16B(i,vec);
    565   }
    566 
    567 };
    568 
    569 inline Is8vec16 cmpeq(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
    570 inline Is8vec16 cmpneq(const Is8vec16 &a,const Is8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
    571 inline Is8vec16 cmpgt(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmpgt_epi8(a,b); }
    572 inline Is8vec16 cmplt(const Is8vec16 &a,const Is8vec16 &b) { return _mm_cmplt_epi8(a,b); }
    573 
    574 inline Is8vec16 unpack_low(const Is8vec16 &a,const Is8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
    575 inline Is8vec16 unpack_high(const Is8vec16 &a,const Is8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
    576 
    577 inline Is8vec16 sat_add(const Is8vec16 &a,const Is8vec16 &b) { return _mm_adds_epi8(a,b); }
    578 inline Is8vec16 sat_sub(const Is8vec16 &a,const Is8vec16 &b) { return _mm_subs_epi8(a,b); }
    579 
    580 class Iu8vec16 : public I8vec16
    581 {
    582 public:
    583   Iu8vec16() { }
    584   Iu8vec16(__m128i mm) : I8vec16(mm) { }
    585 
    586   Iu8vec16& operator= (const M128 &a) { return *this = (Iu8vec16) a; }
    587 
    588   Iu8vec16& operator&=(const M128 &a) { return *this = (Iu8vec16) _mm_and_si128(vec,a); }
    589   Iu8vec16& operator|=(const M128 &a) { return *this = (Iu8vec16) _mm_or_si128(vec,a); }
    590   Iu8vec16& operator^=(const M128 &a) { return *this = (Iu8vec16) _mm_xor_si128(vec,a); }
    591 
    592   Iu8vec16& operator +=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_add_epi8(vec,a); }
    593   Iu8vec16& operator -=(const I8vec16 &a) { return *this = (Iu8vec16) _mm_sub_epi8(vec,a); }
    594 
    595 #if defined(_ENABLE_VEC_DEBUG)
    596 
    597   friend std::ostream& operator << (std::ostream &os,const Iu8vec16 &a)
    598   {
    599     os << "[15]:" << unsigned short(_MM_16UB(15,a))
    600       << " [14]:" << unsigned short(_MM_16UB(14,a))
    601       << " [13]:" << unsigned short(_MM_16UB(13,a))
    602       << " [12]:" << unsigned short(_MM_16UB(12,a))
    603       << " [11]:" << unsigned short(_MM_16UB(11,a))
    604       << " [10]:" << unsigned short(_MM_16UB(10,a))
    605       << " [9]:" << unsigned short(_MM_16UB(9,a))
    606       << " [8]:" << unsigned short(_MM_16UB(8,a))
    607       << " [7]:" << unsigned short(_MM_16UB(7,a))
    608       << " [6]:" << unsigned short(_MM_16UB(6,a))
    609       << " [5]:" << unsigned short(_MM_16UB(5,a))
    610       << " [4]:" << unsigned short(_MM_16UB(4,a))
    611       << " [3]:" << unsigned short(_MM_16UB(3,a))
    612       << " [2]:" << unsigned short(_MM_16UB(2,a))
    613       << " [1]:" << unsigned short(_MM_16UB(1,a))
    614       << " [0]:" << unsigned short(_MM_16UB(0,a));
    615     return os;
    616   }
    617 #endif
    618 
    619   const unsigned char& operator[](int i)const
    620   {
    621     assert(static_cast<unsigned int>(i) < 16);
    622     return _MM_16UB(i,vec);
    623   }
    624 
    625   unsigned char& operator[](int i)
    626   {
    627     assert(static_cast<unsigned int>(i) < 16);
    628     return _MM_16UB(i,vec);
    629   }
    630 
    631 };
    632 
    633 inline Iu8vec16 cmpeq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_cmpeq_epi8(a,b); }
    634 inline Iu8vec16 cmpneq(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_andnot_si128(_mm_cmpeq_epi8(a,b),get_mask128()); }
    635 
    636 inline Iu8vec16 unpack_low(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpacklo_epi8(a,b); }
    637 inline Iu8vec16 unpack_high(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_unpackhi_epi8(a,b); }
    638 
    639 inline Iu8vec16 sat_add(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_adds_epu8(a,b); }
    640 inline Iu8vec16 sat_sub(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_subs_epu8(a,b); }
    641 
    642 inline I64vec2 sum_abs(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_sad_epu8(a,b); }
    643 
    644 inline Iu8vec16 simd_avg(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_avg_epu8(a,b); }
    645 inline Iu8vec16 simd_max(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_max_epu8(a,b); }
    646 inline Iu8vec16 simd_min(const Iu8vec16 &a,const Iu8vec16 &b) { return _mm_min_epu8(a,b); }
    647 
    648 inline Is16vec8 pack_sat(const Is32vec4 &a,const Is32vec4 &b) { return _mm_packs_epi32(a,b); }
    649 inline Is8vec16 pack_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packs_epi16(a,b); }
    650 inline Iu8vec16 packu_sat(const Is16vec8 &a,const Is16vec8 &b) { return _mm_packus_epi16(a,b);}
    651 
    652 #define IVEC128_LOGICALS(vect,element) inline I##vect##vec##element operator& (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_and_si128(a,b); } inline I##vect##vec##element operator| (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_or_si128(a,b); } inline I##vect##vec##element operator^ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_xor_si128(a,b); } inline I##vect##vec##element andnot (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_andnot_si128(a,b); }
    653 
    654 IVEC128_LOGICALS(8,16)
    655 IVEC128_LOGICALS(u8,16)
    656 IVEC128_LOGICALS(s8,16)
    657 IVEC128_LOGICALS(16,8)
    658 IVEC128_LOGICALS(u16,8)
    659 IVEC128_LOGICALS(s16,8)
    660 IVEC128_LOGICALS(32,4)
    661 IVEC128_LOGICALS(u32,4)
    662 IVEC128_LOGICALS(s32,4)
    663 IVEC128_LOGICALS(64,2)
    664 IVEC128_LOGICALS(128,1)
    665 #undef IVEC128_LOGICALS
    666 
    667 #define IVEC128_ADD_SUB(vect,element,opsize) inline I##vect##vec##element operator+ (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_add_##opsize(a,b); } inline I##vect##vec##element operator- (const I##vect##vec##element &a,const I##vect##vec##element &b) { return _mm_sub_##opsize(a,b); }
    668 
    669 IVEC128_ADD_SUB(8,16,epi8)
    670 IVEC128_ADD_SUB(u8,16,epi8)
    671 IVEC128_ADD_SUB(s8,16,epi8)
    672 IVEC128_ADD_SUB(16,8,epi16)
    673 IVEC128_ADD_SUB(u16,8,epi16)
    674 IVEC128_ADD_SUB(s16,8,epi16)
    675 IVEC128_ADD_SUB(32,4,epi32)
    676 IVEC128_ADD_SUB(u32,4,epi32)
    677 IVEC128_ADD_SUB(s32,4,epi32)
    678 IVEC128_ADD_SUB(64,2,epi64)
    679 #undef IVEC128_ADD_SUB
    680 
    681 #define IVEC128_SELECT(vect12,vect34,element,selop,arg1,arg2) inline I##vect34##vec##element select_##selop (const I##vect12##vec##element &a,const I##vect12##vec##element &b,const I##vect34##vec##element &c,const I##vect34##vec##element &d) { I##vect12##vec##element mask = cmp##selop(a,b); return(I##vect34##vec##element ((mask & arg1) | I##vect12##vec##element ((_mm_andnot_si128(mask,arg2))))); }
    682 IVEC128_SELECT(8,s8,16,eq,c,d)
    683 IVEC128_SELECT(8,u8,16,eq,c,d)
    684 IVEC128_SELECT(8,8,16,eq,c,d)
    685 IVEC128_SELECT(8,s8,16,neq,c,d)
    686 IVEC128_SELECT(8,u8,16,neq,c,d)
    687 IVEC128_SELECT(8,8,16,neq,c,d)
    688 
    689 IVEC128_SELECT(16,s16,8,eq,c,d)
    690 IVEC128_SELECT(16,u16,8,eq,c,d)
    691 IVEC128_SELECT(16,16,8,eq,c,d)
    692 IVEC128_SELECT(16,s16,8,neq,c,d)
    693 IVEC128_SELECT(16,u16,8,neq,c,d)
    694 IVEC128_SELECT(16,16,8,neq,c,d)
    695 
    696 IVEC128_SELECT(32,s32,4,eq,c,d)
    697 IVEC128_SELECT(32,u32,4,eq,c,d)
    698 IVEC128_SELECT(32,32,4,eq,c,d)
    699 IVEC128_SELECT(32,s32,4,neq,c,d)
    700 IVEC128_SELECT(32,u32,4,neq,c,d)
    701 IVEC128_SELECT(32,32,4,neq,c,d)
    702 
    703 IVEC128_SELECT(s8,s8,16,gt,c,d)
    704 IVEC128_SELECT(s8,u8,16,gt,c,d)
    705 IVEC128_SELECT(s8,8,16,gt,c,d)
    706 IVEC128_SELECT(s8,s8,16,lt,c,d)
    707 IVEC128_SELECT(s8,u8,16,lt,c,d)
    708 IVEC128_SELECT(s8,8,16,lt,c,d)
    709 
    710 IVEC128_SELECT(s16,s16,8,gt,c,d)
    711 IVEC128_SELECT(s16,u16,8,gt,c,d)
    712 IVEC128_SELECT(s16,16,8,gt,c,d)
    713 IVEC128_SELECT(s16,s16,8,lt,c,d)
    714 IVEC128_SELECT(s16,u16,8,lt,c,d)
    715 IVEC128_SELECT(s16,16,8,lt,c,d)
    716 
    717 #undef IVEC128_SELECT
    718 
    719 class F64vec2
    720 {
    721 protected:
    722   __m128d vec;
    723 public:
    724 
    725   F64vec2() {}
    726 
    727   F64vec2(__m128d m) { vec = m;}
    728 
    729   F64vec2(double d1,double d0) { vec= _mm_set_pd(d1,d0); }
    730 
    731   EXPLICIT F64vec2(double d) { vec = _mm_set1_pd(d); }
    732 
    733   operator __m128d() const { return vec; }
    734 
    735   friend F64vec2 operator &(const F64vec2 &a,const F64vec2 &b) { return _mm_and_pd(a,b); }
    736   friend F64vec2 operator |(const F64vec2 &a,const F64vec2 &b) { return _mm_or_pd(a,b); }
    737   friend F64vec2 operator ^(const F64vec2 &a,const F64vec2 &b) { return _mm_xor_pd(a,b); }
    738 
    739   friend F64vec2 operator +(const F64vec2 &a,const F64vec2 &b) { return _mm_add_pd(a,b); }
    740   friend F64vec2 operator -(const F64vec2 &a,const F64vec2 &b) { return _mm_sub_pd(a,b); }
    741   friend F64vec2 operator *(const F64vec2 &a,const F64vec2 &b) { return _mm_mul_pd(a,b); }
    742   friend F64vec2 operator /(const F64vec2 &a,const F64vec2 &b) { return _mm_div_pd(a,b); }
    743 
    744   F64vec2& operator +=(F64vec2 &a) { return *this = _mm_add_pd(vec,a); }
    745   F64vec2& operator -=(F64vec2 &a) { return *this = _mm_sub_pd(vec,a); }
    746   F64vec2& operator *=(F64vec2 &a) { return *this = _mm_mul_pd(vec,a); }
    747   F64vec2& operator /=(F64vec2 &a) { return *this = _mm_div_pd(vec,a); }
    748   F64vec2& operator &=(F64vec2 &a) { return *this = _mm_and_pd(vec,a); }
    749   F64vec2& operator |=(F64vec2 &a) { return *this = _mm_or_pd(vec,a); }
    750   F64vec2& operator ^=(F64vec2 &a) { return *this = _mm_xor_pd(vec,a); }
    751 
    752   friend double add_horizontal(F64vec2 &a)
    753   {
    754     F64vec2 ftemp = _mm_add_sd(a,_mm_shuffle_pd(a,a,1));
    755     return ftemp[0];
    756   }
    757 
    758   friend F64vec2 andnot(const F64vec2 &a,const F64vec2 &b) { return _mm_andnot_pd(a,b); }
    759 
    760   friend F64vec2 sqrt(const F64vec2 &a) { return _mm_sqrt_pd(a); }
    761 
    762 #define F64vec2_COMP(op) friend F64vec2 cmp##op (const F64vec2 &a,const F64vec2 &b) { return _mm_cmp##op##_pd(a,b); }
    763   F64vec2_COMP(eq)
    764     F64vec2_COMP(lt)
    765     F64vec2_COMP(le)
    766     F64vec2_COMP(gt)
    767     F64vec2_COMP(ge)
    768     F64vec2_COMP(ngt)
    769     F64vec2_COMP(nge)
    770     F64vec2_COMP(neq)
    771     F64vec2_COMP(nlt)
    772     F64vec2_COMP(nle)
    773 #undef F64vec2_COMP
    774 
    775     friend F64vec2 simd_min(const F64vec2 &a,const F64vec2 &b) { return _mm_min_pd(a,b); }
    776   friend F64vec2 simd_max(const F64vec2 &a,const F64vec2 &b) { return _mm_max_pd(a,b); }
    777 
    778 #define F64vec2_COMI(op) friend int comi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_comi##op##_sd(a,b); }
    779   F64vec2_COMI(eq)
    780     F64vec2_COMI(lt)
    781     F64vec2_COMI(le)
    782     F64vec2_COMI(gt)
    783     F64vec2_COMI(ge)
    784     F64vec2_COMI(neq)
    785 #undef F64vec2_COMI
    786 
    787 #define F64vec2_UCOMI(op) friend int ucomi##op (const F64vec2 &a,const F64vec2 &b) { return _mm_ucomi##op##_sd(a,b); }
    788     F64vec2_UCOMI(eq)
    789     F64vec2_UCOMI(lt)
    790     F64vec2_UCOMI(le)
    791     F64vec2_UCOMI(gt)
    792     F64vec2_UCOMI(ge)
    793     F64vec2_UCOMI(neq)
    794 #undef F64vec2_UCOMI
    795 
    796 #if defined(_ENABLE_VEC_DEBUG)
    797 
    798   friend std::ostream & operator<<(std::ostream & os,const F64vec2 &a) {
    799     double *dp = (double*)&a;
    800     os << " [1]:" << *(dp+1)
    801       << " [0]:" << *dp;
    802     return os;
    803   }
    804 #endif
    805 
    806   const double &operator[](int i) const {
    807     assert((0 <= i) && (i <= 1));
    808     double *dp = (double*)&vec;
    809     return *(dp+i);
    810   }
    811 
    812   double &operator[](int i) {
    813     assert((0 <= i) && (i <= 1));
    814     double *dp = (double*)&vec;
    815     return *(dp+i);
    816   }
    817 };
    818 
    819 inline F64vec2 unpack_low(const F64vec2 &a,const F64vec2 &b) { return _mm_unpacklo_pd(a,b); }
    820 inline F64vec2 unpack_high(const F64vec2 &a,const F64vec2 &b) { return _mm_unpackhi_pd(a,b); }
    821 inline int move_mask(const F64vec2 &a) { return _mm_movemask_pd(a); }
    822 inline void loadu(F64vec2 &a,double *p) { a = _mm_loadu_pd(p); }
    823 inline void storeu(double *p,const F64vec2 &a) { _mm_storeu_pd(p,a); }
    824 inline void store_nta(double *p,F64vec2 &a) { _mm_stream_pd(p,a); }
    825 
    826 #define F64vec2_SELECT(op) inline F64vec2 select_##op (const F64vec2 &a,const F64vec2 &b,const F64vec2 &c,const F64vec2 &d) { F64vec2 mask = _mm_cmp##op##_pd(a,b); return((mask & c) | F64vec2((_mm_andnot_pd(mask,d)))); }
    827 F64vec2_SELECT(eq)
    828 F64vec2_SELECT(lt)
    829 F64vec2_SELECT(le)
    830 F64vec2_SELECT(gt)
    831 F64vec2_SELECT(ge)
    832 F64vec2_SELECT(neq)
    833 F64vec2_SELECT(nlt)
    834 F64vec2_SELECT(nle)
    835 #undef F64vec2_SELECT
    836 
    837 inline int F64vec2ToInt(const F64vec2 &a) { return _mm_cvttsd_si32(a); }
    838 inline F64vec2 F32vec4ToF64vec2(const F32vec4 &a) { return _mm_cvtps_pd(a); }
    839 inline F32vec4 F64vec2ToF32vec4(const F64vec2 &a) { return _mm_cvtpd_ps(a); }
    840 inline F64vec2 IntToF64vec2(const F64vec2 &a,int b) { return _mm_cvtsi32_sd(a,b); }
    841 
    842 #pragma pack(pop)
    843 
    844 #endif /* ifdef __SSE__ */
    845 
    846 #pragma pack(pop)
    847 #endif
    848 #endif
    849