/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ---- includes ----------------------------------------------------------- */

#include "b_BasicEm/Basic.h" /* to disable some warnings in VC++ */

#if ( defined( WIN64 ) || defined( HW_SSE2 ) )

#include "emmintrin.h"

/* disable warning "local variable 'x' used without having been initialized" */
#pragma warning( disable : 4700 )


/** Using a half register (64-bit) in SSE2 to calculate the dot product.
 *  This is an SSE2 reimplementation of bbs_dotProduct_intelMMX16 in Math.c.
 *  Dependencies: input vectors need to be 16-byte aligned
 *  Return value: int32 containing the result of the dot product
 */
int32 bbs_dotProduct_64SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM1, m_XMM2, m_XMM3, m_XMM4, m_XMM5, m_XMM6, m_XMM7, m_XMM8;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;

	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize accumulator registers to 0 */
	m_XMM4 = _mm_xor_si128( m_XMM4, m_XMM4 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );
	m_XMM7 = _mm_xor_si128( m_XMM7, m_XMM7 );

	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		/* process 16 int16 elements per iteration using four 64-bit loads per vector;
		   _mm_madd_epi16 multiplies 16-bit pairs and adds adjacent products into 32-bit lanes.
		   Note: &4[vec1L] is equivalent to vec1L + 4. */
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_loadl_epi64( (__m128i *)&0[vec1L] );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );

			m_XMM1 = _mm_loadl_epi64( (__m128i *)&0[vec2L] );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );

			m_XMM2 = _mm_loadl_epi64( (__m128i *)&4[vec1L] );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM1 );

			m_XMM3 = _mm_loadl_epi64( (__m128i *)&4[vec2L] );
			m_XMM4 = _mm_loadl_epi64( (__m128i *)&8[vec1L] );

			m_XMM2 = _mm_madd_epi16( m_XMM2, m_XMM3 );

			m_XMM5 = _mm_loadl_epi64( (__m128i *)&8[vec2L] );

			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );

			m_XMM6 = _mm_loadl_epi64( (__m128i *)&12[vec1L] );

			m_XMM4 = _mm_madd_epi16( m_XMM4, m_XMM5 );

			m_XMM8 = _mm_loadl_epi64( (__m128i *)&12[vec2L] );
			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM8 );

			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM2 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators */
		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );

		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );

		m_XMM0 = _mm_loadl_epi64( (__m128i *)&m_XMM7 );

		/* fold the upper 32 bits of the low quadword onto the lower 32 bits */
		m_XMM0 = _mm_srli_epi64( m_XMM0, 32 );

		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );

		resultL = _mm_cvtsi128_si32( m_XMM7 );
	}

	/* a switch statement with fall-through produces faster code than a loop
	   for the remaining ( sizeA % 16 ) elements */
	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}

/* ------------------------------------------------------------------------- */
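
/* The SSE2 kernels in this file all compute the same quantity as the plain C
   reference below: the sum of the element-wise products, with each product
   widened to int32 before accumulation. This sketch is illustrative only and
   is not part of the original library; it assumes the int16/int32/uint32
   typedefs provided by b_BasicEm/Basic.h, and the function name is hypothetical. */
static int32 bbs_dotProduct_referenceC( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	int32 resultL = 0;
	uint32 iL;
	for( iL = 0; iL < sizeA; iL++ )
	{
		/* the ( int32 ) cast makes the 32-bit widening of the product explicit */
		resultL += ( int32 )vec1A[ iL ] * vec2A[ iL ];
	}
	return resultL;
}

/* ------------------------------------------------------------------------- */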

/** Using the full register (128-bit) in SSE2 to calculate the dot product.
 *  Dependencies: input vectors need to be 16-byte aligned (required by _mm_load_si128)
 *  Return value: int32 containing the result of the dot product
 */
int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;

	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize accumulator registers to 0 */
	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );

	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		/* process 16 int16 elements per iteration using two aligned 128-bit loads per vector */
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_load_si128( (__m128i *)&0[vec1L] );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

			m_XMM2 = _mm_load_si128( (__m128i *)&0[vec2L] );

			m_XMM6 = _mm_load_si128( (__m128i *)&8[vec1L] );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );

			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );

			m_XMM3 = _mm_load_si128( (__m128i *)&8[vec2L] );

			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators */
		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

		m_XMM0 = _mm_load_si128( (__m128i *)&m_XMM5 );

		resultL = _mm_cvtsi128_si32( m_XMM0 );	/* 1st 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 2nd 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 3rd 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 4th 32 bits */
	}

	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}

/* ------------------------------------------------------------------------- */
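
/* A minimal usage sketch for bbs_dotProduct_128SSE2, illustrative only and not
   part of the original library. Because bbs_dotProduct_128SSE2 uses _mm_load_si128,
   the buffers must start on a 16-byte boundary; placing the int16 data in a union
   with an __m128i member is one portable way to guarantee that. The helper type and
   function names below are hypothetical. */
typedef union
{
	__m128i alignDummyE;	/* forces 16-byte alignment of the union */
	int16 dataE[ 32 ];		/* 32 elements = two full 16-element blocks */
} bbs_AlignedVec16Example;

static int32 bbs_dotProduct_128SSE2_example( void )
{
	bbs_AlignedVec16Example vec1L;
	bbs_AlignedVec16Example vec2L;
	uint32 iL;

	for( iL = 0; iL < 32; iL++ )
	{
		vec1L.dataE[ iL ] = ( int16 )( iL + 1 );	/* 1, 2, ..., 32 */
		vec2L.dataE[ iL ] = ( int16 )2;
	}

	/* expected result: 2 * ( 1 + 2 + ... + 32 ) = 2 * 528 = 1056 */
	return bbs_dotProduct_128SSE2( vec1L.dataE, vec2L.dataE, 32 );
}

/* ------------------------------------------------------------------------- */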

/** Using the full register (128-bit) in SSE2 to calculate the dot product (unaligned version).
 *  Dependencies: memory does not need to be 16-byte aligned
 *  Return value: int32 containing the result of the dot product
 */
int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;
	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize accumulator registers to 0 */
	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );

	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		/* process 16 int16 elements per iteration using two unaligned 128-bit loads per vector */
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_loadu_si128( (__m128i *)&0[vec1L] );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

			m_XMM2 = _mm_loadu_si128( (__m128i *)&0[vec2L] );

			m_XMM6 = _mm_loadu_si128( (__m128i *)&8[vec1L] );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );

			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );

			m_XMM3 = _mm_loadu_si128( (__m128i *)&8[vec2L] );

			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators */
		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

		m_XMM0 = _mm_loadu_si128( (__m128i *)&m_XMM5 );

		resultL = _mm_cvtsi128_si32( m_XMM0 );	/* 1st 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 2nd 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 3rd 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 4th 32 bits */
	}

	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}

/* ------------------------------------------------------------------------- */
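
/* A minimal usage sketch for bbs_dotProduct_u128SSE2, illustrative only and not
   part of the original library. Because it uses _mm_loadu_si128, the vectors may
   start at any int16 offset, e.g. a sub-window sliding over a larger buffer, where
   the aligned variants cannot be used. The function name below is hypothetical. */
static int32 bbs_dotProduct_u128SSE2_example( const int16* bufferA, uint32 bufSizeA,
											  const int16* patternA, uint32 patSizeA,
											  uint32 offsetA )
{
	/* correlate patternA with the sub-vector of bufferA starting at offsetA;
	   bufferA + offsetA is in general not 16-byte aligned */
	if( offsetA + patSizeA > bufSizeA ) return 0;
	return bbs_dotProduct_u128SSE2( bufferA + offsetA, patternA, patSizeA );
}

/* ------------------------------------------------------------------------- */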

#endif /* defined( WIN64 ) || defined( HW_SSE2 ) */