/* The gcc-provided loongson intrinsic functions are far too broken to be of
 * any use, otherwise I'd use them.
 *
 * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
 *   close enough that they could have implemented the _mm_*-style intrinsic
 *   interface and had a ton of optimized code available to them. Instead they
 *   implemented something much, much worse.
 *
 * - pshuf takes a dead first argument, causing extra instructions to be
 *   generated.
 *
 * - There are no 64-bit shift or logical intrinsics, which means you have
 *   to implement them with inline assembly, but this is a nightmare because
 *   gcc doesn't understand that the integer vector datatypes are actually in
 *   floating-point registers, so you end up with braindead code like
 *
 *	punpcklwd	$f9,$f9,$f5
 *	    dmtc1	v0,$f8
 *	punpcklwd	$f19,$f19,$f5
 *	    dmfc1	t9,$f9
 *	    dmtc1	v0,$f9
 *	    dmtc1	t9,$f20
 *	    dmfc1	s0,$f19
 *	punpcklbh	$f20,$f20,$f2
 *
 *   where values just get copied back and forth between the integer and
 *   floating-point registers ad nauseam.
 *
 * Instead of trying to work around the problems with these intrinsics, I
 * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
 * assembly.
 */

#include <stdint.h>

/* vectors are stored in 64-bit floating-point registers */
typedef double __m64;
/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
typedef float  __m32;

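/* The _mm_* helpers below follow the semantics of the corresponding x86 MMX
 * intrinsics, so the code in pixman-mmx.c can call them unchanged.  All of
 * them keep their operands in floating-point registers (hence the "f" asm
 * constraints), which is where the Loongson vector instructions expect them.
 * An illustrative (hypothetical) use, widening the four bytes of an
 * a8r8g8b8 pixel into 16-bit lanes:
 *
 *	uint32_t pixel = 0xff102030;
 *	__m64 wide = _mm_unpacklo_pi8_f (*(__m32 *)&pixel, _mm_setzero_si64 ());
 */

/* Return a 64-bit vector of all zeros. */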
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
	return 0.0;
}

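/* Add the four 16-bit lanes of the two vectors (wrapping on overflow). */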
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

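/* Add the two 32-bit lanes of the two vectors (wrapping on overflow). */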
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

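/* Add the four unsigned 16-bit lanes with unsigned saturation. */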
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddush %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

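/* Add the eight unsigned 8-bit lanes with unsigned saturation. */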
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("paddusb %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

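/* Bitwise AND of the two 64-bit vectors. */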
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("and %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

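/* Compare the 32-bit lanes for equality; each result lane is all ones when
 * the corresponding lanes are equal and all zeros otherwise. */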
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pcmpeqw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

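/* No-op: there is no MMX-style state to reset with emms here, but the
 * function is kept so existing _mm_empty () call sites compile. */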
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}

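/* Multiply the signed 16-bit lanes and horizontally add each pair of
 * adjacent products, giving two 32-bit results. */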
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pmaddhw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

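/* High 16 bits of each unsigned 16 x 16-bit product. */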
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pmulhuh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

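/* Low 16 bits of each 16 x 16-bit product. */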
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("pmullh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

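/* Bitwise OR of the two 64-bit vectors. */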
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("or %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

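/* Pack the signed 16-bit lanes of both operands into unsigned 8-bit lanes
 * with unsigned saturation. */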
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("packushb %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

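/* Pack the signed 32-bit lanes of both operands into signed 16-bit lanes
 * with signed saturation. */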
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("packsswh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

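/* _MM_SHUFFLE builds the 2-bit-per-lane selector used by pshufh below.
 * _mm_set_pi16 builds a vector from four 16-bit values (__w3 is the most
 * significant lane): compile-time constants are folded into a single 64-bit
 * immediate, four equal run-time values are splatted with one pshufh, and
 * everything else is assembled in an integer register and moved over. */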
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
{
	if (__builtin_constant_p (__w3) &&
	    __builtin_constant_p (__w2) &&
	    __builtin_constant_p (__w1) &&
	    __builtin_constant_p (__w0))
	{
		uint64_t val = ((uint64_t)__w3 << 48)
			     | ((uint64_t)__w2 << 32)
			     | ((uint64_t)__w1 << 16)
			     | ((uint64_t)__w0 <<  0);
		return *(__m64 *)&val;
	}
	else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
	{
		/* TODO: handle other cases */
		uint64_t val = __w3;
		uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
		__m64 ret;
		asm("pshufh %0, %1, %2\n\t"
		    : "=f" (ret)
		    : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
		);
		return ret;
	}
	uint64_t val = ((uint64_t)__w3 << 48)
		     | ((uint64_t)__w2 << 32)
		     | ((uint64_t)__w1 << 16)
		     | ((uint64_t)__w0 <<  0);
	return *(__m64 *)&val;
}

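/* As above, but from two 32-bit values (__i1 is the high half).  Equal
 * halves are duplicated with pshufh, whose selector (1, 0, 1, 0) copies the
 * low two 16-bit lanes into the high two. */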
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (unsigned __i1, unsigned __i0)
{
	if (__builtin_constant_p (__i1) &&
	    __builtin_constant_p (__i0))
	{
		uint64_t val = ((uint64_t)__i1 << 32)
			     | ((uint64_t)__i0 <<  0);
		return *(__m64 *)&val;
	}
	else if (__i1 == __i0)
	{
		uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
		__m64 ret;
		asm("pshufh %0, %1, %2\n\t"
		    : "=f" (ret)
		    : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
		);
		return ret;
	}
	uint64_t val = ((uint64_t)__i1 << 32)
		     | ((uint64_t)__i0 <<  0);
	return *(__m64 *)&val;
}
#undef _MM_SHUFFLE

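/* Shuffle the four 16-bit lanes of __m according to the 2-bit-per-lane
 * selector in __n.  Unlike the x86 intrinsic, the selector is passed in a
 * (floating-point) register rather than as an immediate. */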
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __m, int64_t __n)
{
	__m64 ret;
	asm("pshufh %0, %1, %2\n\t"
	    : "=f" (ret)
	    : "f" (__m), "f" (*(__m64 *)&__n)
	);
	return ret;
}

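/* The shift helpers below also take their shift count in a floating-point
 * register: psllh/psrlh shift each 16-bit lane, psrlw shifts each 32-bit
 * lane, and dsll/dsrl shift the whole 64-bit vector. */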
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("psllh  %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("dsll  %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("psrlh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("psrlw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int64_t __count)
{
	__m64 ret;
	asm("dsrl  %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__count)
	);
	return ret;
}

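/* Subtract the 16-bit lanes of __m2 from those of __m1 (wrapping). */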
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("psubh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

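/* The unpack helpers interleave lanes from the two operands: the *hi
 * variants take lanes from the high halves, the *lo variants from the low
 * halves, with __m1 supplying the even result lanes and __m2 the odd ones. */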
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpckhbh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpckhhw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpcklbh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32 datatype which
 * allows load8888 to use 32-bit loads */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpcklbh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}
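
/* For example (illustrative only; the real helper lives in pixman-mmx.c), a
 * load8888-style expansion of a 32-bit pixel into 16-bit lanes might look
 * like this, needing only a single 32-bit load:
 *
 *	static __m64
 *	load8888 (const uint32_t *v)
 *	{
 *		return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
 *	}
 */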

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("punpcklhw %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

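/* Bitwise XOR of the two 64-bit vectors. */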
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
	__m64 ret;
	asm("xor %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2)
	);
	return ret;
}

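/* Loongson-specific helper with no direct MMX equivalent: copy the 16-bit
 * lane of __m selected by __pos into the low 16 bits of the result. */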
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_extract_pi16 (__m64 __m, int64_t __pos)
{
	__m64 ret;
	asm("pextrh %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m), "f" (*(__m64 *)&__pos)
	);
	return ret;
}

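/* Loongson-specific helper: insert the low 16 bits of __m2 into lane __pos
 * of __m1.  __pos must be a compile-time constant because it is pasted
 * directly into the pinsrh mnemonic. */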
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
{
	__m64 ret;
	asm("pinsrh_%3 %0, %1, %2\n\t"
	   : "=f" (ret)
	   : "f" (__m1), "f" (__m2), "i" (__pos)
	);
	return ret;
}