Home | History | Annotate | Download | only in pixman
      1 /* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
      2 /*
      3  * Copyright  2000 SuSE, Inc.
      4  * Copyright  2007 Red Hat, Inc.
      5  *
      6  * Permission to use, copy, modify, distribute, and sell this software and its
      7  * documentation for any purpose is hereby granted without fee, provided that
      8  * the above copyright notice appear in all copies and that both that
      9  * copyright notice and this permission notice appear in supporting
     10  * documentation, and that the name of SuSE not be used in advertising or
     11  * publicity pertaining to distribution of the software without specific,
     12  * written prior permission.  SuSE makes no representations about the
     13  * suitability of this software for any purpose.  It is provided "as is"
     14  * without express or implied warranty.
     15  *
     16  * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
     17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
     18  * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
     20  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
     21  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     22  *
     23  * Author:  Keith Packard, SuSE, Inc.
     24  */
     25 
     26 #ifndef PIXMAN_FAST_PATH_H__
     27 #define PIXMAN_FAST_PATH_H__
     28 
     29 #include "pixman-private.h"
     30 
     31 #define PIXMAN_REPEAT_COVER -1
     32 
     33 /* Flags describing input parameters to fast path macro template.
     34  * Turning on some flag values may indicate that
     35  * "some property X is available so template can use this" or
     36  * "some property X should be handled by template".
     37  *
     38  * FLAG_HAVE_SOLID_MASK
     39  *  Input mask is solid so template should handle this.
     40  *
     41  * FLAG_HAVE_NON_SOLID_MASK
     42  *  Input mask is bits mask so template should handle this.
     43  *
     44  * FLAG_HAVE_SOLID_MASK and FLAG_HAVE_NON_SOLID_MASK are mutually
     45  * exclusive. (It's not allowed to turn both flags on)
     46  */
     47 #define FLAG_NONE				(0)
     48 #define FLAG_HAVE_SOLID_MASK			(1 <<   1)
     49 #define FLAG_HAVE_NON_SOLID_MASK		(1 <<   2)
     50 
     51 /* To avoid too short repeated scanline function calls, extend source
     52  * scanlines having width less than below constant value.
     53  */
     54 #define REPEAT_NORMAL_MIN_WIDTH			64
     55 
     56 static force_inline pixman_bool_t
     57 repeat (pixman_repeat_t repeat, int *c, int size)
     58 {
     59     if (repeat == PIXMAN_REPEAT_NONE)
     60     {
     61 	if (*c < 0 || *c >= size)
     62 	    return FALSE;
     63     }
     64     else if (repeat == PIXMAN_REPEAT_NORMAL)
     65     {
     66 	while (*c >= size)
     67 	    *c -= size;
     68 	while (*c < 0)
     69 	    *c += size;
     70     }
     71     else if (repeat == PIXMAN_REPEAT_PAD)
     72     {
     73 	*c = CLIP (*c, 0, size - 1);
     74     }
     75     else /* REFLECT */
     76     {
     77 	*c = MOD (*c, size * 2);
     78 	if (*c >= size)
     79 	    *c = size * 2 - *c - 1;
     80     }
     81     return TRUE;
     82 }
     83 
     84 static force_inline int
     85 pixman_fixed_to_bilinear_weight (pixman_fixed_t x)
     86 {
     87     return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
     88 	   ((1 << BILINEAR_INTERPOLATION_BITS) - 1);
     89 }
     90 
     91 #if BILINEAR_INTERPOLATION_BITS <= 4
     92 /* Inspired by Filter_32_opaque from Skia */
     93 static force_inline uint32_t
     94 bilinear_interpolation (uint32_t tl, uint32_t tr,
     95 			uint32_t bl, uint32_t br,
     96 			int distx, int disty)
     97 {
     98     int distxy, distxiy, distixy, distixiy;
     99     uint32_t lo, hi;
    100 
    101     distx <<= (4 - BILINEAR_INTERPOLATION_BITS);
    102     disty <<= (4 - BILINEAR_INTERPOLATION_BITS);
    103 
    104     distxy = distx * disty;
    105     distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */
    106     distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */
    107     distixiy =
    108 	16 * 16 - (disty << 4) -
    109 	(distx << 4) + distxy; /* (16 - distx) * (16 - disty) */
    110 
    111     lo = (tl & 0xff00ff) * distixiy;
    112     hi = ((tl >> 8) & 0xff00ff) * distixiy;
    113 
    114     lo += (tr & 0xff00ff) * distxiy;
    115     hi += ((tr >> 8) & 0xff00ff) * distxiy;
    116 
    117     lo += (bl & 0xff00ff) * distixy;
    118     hi += ((bl >> 8) & 0xff00ff) * distixy;
    119 
    120     lo += (br & 0xff00ff) * distxy;
    121     hi += ((br >> 8) & 0xff00ff) * distxy;
    122 
    123     return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff);
    124 }
    125 
    126 #else
    127 #if SIZEOF_LONG > 4
    128 
    129 static force_inline uint32_t
    130 bilinear_interpolation (uint32_t tl, uint32_t tr,
    131 			uint32_t bl, uint32_t br,
    132 			int distx, int disty)
    133 {
    134     uint64_t distxy, distxiy, distixy, distixiy;
    135     uint64_t tl64, tr64, bl64, br64;
    136     uint64_t f, r;
    137 
    138     distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
    139     disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
    140 
    141     distxy = distx * disty;
    142     distxiy = distx * (256 - disty);
    143     distixy = (256 - distx) * disty;
    144     distixiy = (256 - distx) * (256 - disty);
    145 
    146     /* Alpha and Blue */
    147     tl64 = tl & 0xff0000ff;
    148     tr64 = tr & 0xff0000ff;
    149     bl64 = bl & 0xff0000ff;
    150     br64 = br & 0xff0000ff;
    151 
    152     f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
    153     r = f & 0x0000ff0000ff0000ull;
    154 
    155     /* Red and Green */
    156     tl64 = tl;
    157     tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
    158 
    159     tr64 = tr;
    160     tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
    161 
    162     bl64 = bl;
    163     bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
    164 
    165     br64 = br;
    166     br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
    167 
    168     f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
    169     r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
    170 
    171     return (uint32_t)(r >> 16);
    172 }
    173 
    174 #else
    175 
    176 static force_inline uint32_t
    177 bilinear_interpolation (uint32_t tl, uint32_t tr,
    178 			uint32_t bl, uint32_t br,
    179 			int distx, int disty)
    180 {
    181     int distxy, distxiy, distixy, distixiy;
    182     uint32_t f, r;
    183 
    184     distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
    185     disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
    186 
    187     distxy = distx * disty;
    188     distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
    189     distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
    190     distixiy =
    191 	256 * 256 - (disty << 8) -
    192 	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
    193 
    194     /* Blue */
    195     r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
    196       + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
    197 
    198     /* Green */
    199     f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
    200       + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
    201     r |= f & 0xff000000;
    202 
    203     tl >>= 16;
    204     tr >>= 16;
    205     bl >>= 16;
    206     br >>= 16;
    207     r >>= 16;
    208 
    209     /* Red */
    210     f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
    211       + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
    212     r |= f & 0x00ff0000;
    213 
    214     /* Alpha */
    215     f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
    216       + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
    217     r |= f & 0xff000000;
    218 
    219     return r;
    220 }
    221 
    222 #endif
    223 #endif // BILINEAR_INTERPOLATION_BITS <= 4
    224 
    225 /*
    226  * For each scanline fetched from source image with PAD repeat:
    227  * - calculate how many pixels need to be padded on the left side
    228  * - calculate how many pixels need to be padded on the right side
    229  * - update width to only count pixels which are fetched from the image
    230  * All this information is returned via 'width', 'left_pad', 'right_pad'
    231  * arguments. The code is assuming that 'unit_x' is positive.
    232  *
    233  * Note: 64-bit math is used in order to avoid potential overflows, which
    234  *       is probably excessive in many cases. This particular function
    235  *       may need its own correctness test and performance tuning.
    236  */
    237 static force_inline void
    238 pad_repeat_get_scanline_bounds (int32_t         source_image_width,
    239 				pixman_fixed_t  vx,
    240 				pixman_fixed_t  unit_x,
    241 				int32_t *       width,
    242 				int32_t *       left_pad,
    243 				int32_t *       right_pad)
    244 {
    245     int64_t max_vx = (int64_t) source_image_width << 16;
    246     int64_t tmp;
    247     if (vx < 0)
    248     {
    249 	tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
    250 	if (tmp > *width)
    251 	{
    252 	    *left_pad = *width;
    253 	    *width = 0;
    254 	}
    255 	else
    256 	{
    257 	    *left_pad = (int32_t) tmp;
    258 	    *width -= (int32_t) tmp;
    259 	}
    260     }
    261     else
    262     {
    263 	*left_pad = 0;
    264     }
    265     tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
    266     if (tmp < 0)
    267     {
    268 	*right_pad = *width;
    269 	*width = 0;
    270     }
    271     else if (tmp >= *width)
    272     {
    273 	*right_pad = 0;
    274     }
    275     else
    276     {
    277 	*right_pad = *width - (int32_t) tmp;
    278 	*width = (int32_t) tmp;
    279     }
    280 }
    281 
    282 /* A macroified version of specialized nearest scalers for some
    283  * common 8888 and 565 formats. It supports SRC and OVER ops.
    284  *
    285  * There are two repeat versions, one that handles repeat normal,
    286  * and one without repeat handling that only works if the src region
    287  * used is completely covered by the pre-repeated source samples.
    288  *
    289  * The loops are unrolled to process two pixels per iteration for better
    290  * performance on most CPU architectures (superscalar processors
    291  * can issue several operations simultaneously, other processors can hide
    292  * instructions latencies by pipelining operations). Unrolling more
    293  * does not make much sense because the compiler will start running out
    294  * of spare registers soon.
    295  */
    296 
    297 #define GET_8888_ALPHA(s) ((s) >> 24)
    298  /* This is not actually used since we don't have an OVER with
    299     565 source, but it is needed to build. */
    300 #define GET_0565_ALPHA(s) 0xff
    301 #define GET_x888_ALPHA(s) 0xff
    302 
/*
 * FAST_NEAREST_SCANLINE defines a static force_inline function called
 * 'scanline_func_name' that writes 'w' destination pixels, picking one
 * source pixel per destination pixel without interpolation.
 *
 * Template arguments:
 *   SRC_FORMAT, DST_FORMAT - format suffixes pasted into the convert_*_to_*
 *                            helpers and GET_*_ALPHA accessors
 *   src_type_t, dst_type_t - C types of one source / destination pixel
 *   OP                     - SRC or OVER; anything else reaches abort()
 *   repeat_mode            - only NORMAL changes the generated code
 *
 * Arguments of the generated function:
 *   dst, src               - destination and source pixel pointers
 *   w                      - number of destination pixels to produce
 *   vx, unit_x             - fixed-point source position of the first pixel
 *                            and the step per destination pixel
 *   src_width_fixed        - value subtracted from vx when wrapping in
 *                            NORMAL repeat mode
 *   fully_transparent_src  - when TRUE and OP is OVER, nothing is written
 *
 * In NORMAL repeat mode the wrap test is 'vx >= 0', i.e. vx is expected to
 * be kept negative; the main loop in this header matches that by passing
 * 'src + width' as the source pointer and 'vx - src_width_fixed' as vx.
 *
 * For OVER, a source pixel with alpha 0xff is converted and stored directly,
 * a source pixel whose whole value is zero leaves the destination untouched,
 * and anything else is blended using the complemented source alpha.
 */
#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\
			      src_type_t, dst_type_t, OP, repeat_mode)				\
static force_inline void									\
scanline_func_name (dst_type_t       *dst,							\
		    const src_type_t *src,							\
		    int32_t           w,							\
		    pixman_fixed_t    vx,							\
		    pixman_fixed_t    unit_x,							\
		    pixman_fixed_t    src_width_fixed,						\
		    pixman_bool_t     fully_transparent_src)					\
{												\
	uint32_t   d;										\
	src_type_t s1, s2;									\
	uint8_t    a1, a2;									\
	int        x1, x2;									\
												\
	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)			\
	    return;										\
												\
	if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
	    abort();										\
												\
	/* Main loop: two destination pixels per iteration */					\
	while ((w -= 2) >= 0)									\
	{											\
	    x1 = pixman_fixed_to_int (vx);							\
	    vx += unit_x;									\
	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	    {											\
		/* This works because we know that unit_x is positive */			\
		while (vx >= 0)									\
		    vx -= src_width_fixed;							\
	    }											\
	    s1 = *(src + x1);									\
												\
	    x2 = pixman_fixed_to_int (vx);							\
	    vx += unit_x;									\
	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	    {											\
		/* This works because we know that unit_x is positive */			\
		while (vx >= 0)									\
		    vx -= src_width_fixed;							\
	    }											\
	    s2 = *(src + x2);									\
												\
	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
	    {											\
		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
												\
		if (a1 == 0xff)									\
		{										\
		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
		}										\
		else if (s1)									\
		{										\
		    d = convert_ ## DST_FORMAT ## _to_8888 (*dst);				\
		    s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1);				\
		    a1 ^= 0xff;									\
		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\
		}										\
		dst++;										\
												\
		if (a2 == 0xff)									\
		{										\
		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2);			\
		}										\
		else if (s2)									\
		{										\
		    d = convert_## DST_FORMAT ## _to_8888 (*dst);				\
		    s2 = convert_## SRC_FORMAT ## _to_8888 (s2);				\
		    a2 ^= 0xff;									\
		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\
		}										\
		dst++;										\
	    }											\
	    else /* PIXMAN_OP_SRC */								\
	    {											\
		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2);			\
	    }											\
	}											\
												\
	/* w is now -1 when one pixel is left over and -2 when none is */			\
	if (w & 1)										\
	{											\
	    x1 = pixman_fixed_to_int (vx);							\
	    s1 = *(src + x1);									\
												\
	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
	    {											\
		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
												\
		if (a1 == 0xff)									\
		{										\
		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
		}										\
		else if (s1)									\
		{										\
		    d = convert_## DST_FORMAT ## _to_8888 (*dst);				\
		    s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1);				\
		    a1 ^= 0xff;									\
		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\
		}										\
		dst++;										\
	    }											\
	    else /* PIXMAN_OP_SRC */								\
	    {											\
		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
	    }											\
	}											\
}
    416 
/*
 * FAST_NEAREST_MAINLOOP_INT defines the composite entry point
 * 'fast_composite_scaled_nearest<scale_func_name>', which walks the
 * destination rectangle line by line and calls 'scanline_func' for every
 * line (or for every part of a line, see below).
 *
 * Template arguments:
 *   scale_func_name  - suffix pasted onto the generated function name
 *   scanline_func    - called as
 *                      scanline_func (mask, dst, src, w, vx, unit_x,
 *                                     src_width_fixed, fully_transparent_src)
 *   src_type_t, mask_type_t, dst_type_t - pixel types of the three images
 *   repeat_mode      - NORMAL, PAD, NONE or COVER
 *   have_mask, mask_is_solid - whether a mask is used and whether it is a
 *                      single solid value rather than a line of pixels
 *
 * The source position of the first destination pixel is obtained by
 * transforming the pixel center with the source image transform; the
 * function returns without drawing when that transform fails. Only the
 * [0][0] and [1][1] matrix entries are read as per-pixel steps, so this
 * presumably relies on the transform being a pure scale (see
 * SCALED_NEAREST_FLAGS) - confirm with the fast path flags.
 *
 * For PAD and NONE the line is split by pad_repeat_get_scanline_bounds()
 * into left padding, pixels inside the image and right padding, and the
 * scanline function is called once per non-empty part. The source pointer
 * handed to the scanline function is always offset by the image width and
 * vx by -src_width_fixed, so that vx is negative inside the image.
 *
 * NOTE(review): the identifiers src_image, mask_image, dest_image, src_x,
 * src_y, mask_x, mask_y, dest_x, dest_y, width and height are not declared
 * here; presumably PIXMAN_COMPOSITE_ARGS introduces them. The locals max_vy,
 * left_pad, right_pad, mask_line and mask_stride are only assigned in the
 * modes that read them.
 */
#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
static void											\
fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,		\
						   pixman_composite_info_t *info)               \
{												\
    PIXMAN_COMPOSITE_ARGS (info);					                        \
    dst_type_t *dst_line;						                        \
    mask_type_t *mask_line;									\
    src_type_t *src_first_line;									\
    int       y;										\
    pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width);		\
    pixman_fixed_t max_vy;									\
    pixman_vector_t v;										\
    pixman_fixed_t vx, vy;									\
    pixman_fixed_t unit_x, unit_y;								\
    int32_t left_pad, right_pad;								\
												\
    src_type_t *src;										\
    dst_type_t *dst;										\
    mask_type_t solid_mask;									\
    const mask_type_t *mask = &solid_mask;							\
    int src_stride, mask_stride, dst_stride;							\
												\
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
    if (have_mask)										\
    {												\
	if (mask_is_solid)									\
	    solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
	else											\
	    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,			\
				   mask_stride, mask_line, 1);					\
    }												\
    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
     * transformed from destination space to source space */					\
    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
												\
    /* reference point is the center of the pixel */						\
    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
    v.vector[2] = pixman_fixed_1;								\
												\
    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
	return;											\
												\
    /* per-pixel steps: only the diagonal of the transform is used */				\
    unit_x = src_image->common.transform->matrix[0][0];						\
    unit_y = src_image->common.transform->matrix[1][1];						\
												\
    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
    v.vector[0] -= pixman_fixed_e;								\
    v.vector[1] -= pixman_fixed_e;								\
												\
    vx = v.vector[0];										\
    vy = v.vector[1];										\
												\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
    {												\
	max_vy = pixman_int_to_fixed (src_image->bits.height);					\
												\
	/* Clamp repeating positions inside the actual samples */				\
	repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);					\
	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
    }												\
												\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
    {												\
	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
					&width, &left_pad, &right_pad);				\
	/* skip the left padding so that vx addresses the first in-image pixel */		\
	vx += left_pad * unit_x;								\
    }												\
												\
    while (--height >= 0)									\
    {												\
	dst = dst_line;										\
	dst_line += dst_stride;									\
	if (have_mask && !mask_is_solid)							\
	{											\
	    mask = mask_line;									\
	    mask_line += mask_stride;								\
	}											\
												\
	y = pixman_fixed_to_int (vy);								\
	vy += unit_y;										\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
	{											\
	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
	    src = src_first_line + src_stride * y;						\
	    /* The padding calls use unit_x == 0 and vx == -pixman_fixed_e, so every		\
	     * pixel reads the element just before the pointer passed in (assuming		\
	     * pixman_fixed_to_int rounds -pixman_fixed_e down to -1): the first		\
	     * pixel of the line on the left, the last one on the right. */			\
	    if (left_pad > 0)									\
	    {											\
		scanline_func (mask, dst,							\
			       src + src_image->bits.width - src_image->bits.width + 1,		\
			       left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
	    }											\
	    if (width > 0)									\
	    {											\
		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
			       dst + left_pad, src + src_image->bits.width, width,		\
			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
	    }											\
	    if (right_pad > 0)									\
	    {											\
		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
			       dst + left_pad + width, src + src_image->bits.width,		\
			       right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
	    }											\
	}											\
	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
	{											\
	    /* Everything outside the image is fed from a single zero pixel, with		\
	     * fully_transparent_src set to TRUE for those calls. */				\
	    static const src_type_t zero[1] = { 0 };						\
	    if (y < 0 || y >= src_image->bits.height)						\
	    {											\
		scanline_func (mask, dst, zero + 1, left_pad + width + right_pad,		\
			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
		continue;									\
	    }											\
	    src = src_first_line + src_stride * y;						\
	    if (left_pad > 0)									\
	    {											\
		scanline_func (mask, dst, zero + 1, left_pad,					\
			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
	    }											\
	    if (width > 0)									\
	    {											\
		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
			       dst + left_pad, src + src_image->bits.width, width,		\
			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
	    }											\
	    if (right_pad > 0)									\
	    {											\
		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
			       dst + left_pad + width, zero + 1, right_pad,			\
			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
	    }											\
	}											\
	else											\
	{											\
	    /* NORMAL and COVER: the whole line is handled by a single call */			\
	    src = src_first_line + src_stride * y;						\
	    scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed,	\
			   unit_x, src_width_fixed, FALSE);					\
	}											\
    }												\
}
    562 
    563 /* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
    564 #define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
    565 				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
    566 	FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,	\
    567 				  dst_type_t, repeat_mode, have_mask, mask_is_solid)
    568 
    569 #define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t,	\
    570 			      repeat_mode)							\
    571     static force_inline void									\
    572     scanline_func##scale_func_name##_wrapper (							\
    573 		    const uint8_t    *mask,							\
    574 		    dst_type_t       *dst,							\
    575 		    const src_type_t *src,							\
    576 		    int32_t          w,								\
    577 		    pixman_fixed_t   vx,							\
    578 		    pixman_fixed_t   unit_x,							\
    579 		    pixman_fixed_t   max_vx,							\
    580 		    pixman_bool_t    fully_transparent_src)					\
    581     {												\
    582 	scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src);			\
    583     }												\
    584     FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper,	\
    585 			       src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE)
    586 
    587 #define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t,		\
    588 			      repeat_mode)							\
    589 	FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t,		\
    590 			      dst_type_t, repeat_mode)
    591 
    592 #define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,				\
    593 		     src_type_t, dst_type_t, OP, repeat_mode)				\
    594     FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
    595 			  SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t,		\
    596 			  OP, repeat_mode)						\
    597     FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP,			\
    598 			  scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
    599 			  src_type_t, dst_type_t, repeat_mode)
    600 
    601 
    602 #define SCALED_NEAREST_FLAGS						\
    603     (FAST_PATH_SCALE_TRANSFORM	|					\
    604      FAST_PATH_NO_ALPHA_MAP	|					\
    605      FAST_PATH_NEAREST_FILTER	|					\
    606      FAST_PATH_NO_ACCESSORS	|					\
    607      FAST_PATH_NARROW_FORMAT)
    608 
    609 #define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func)			\
    610     {   PIXMAN_OP_ ## op,						\
    611 	PIXMAN_ ## s,							\
    612 	(SCALED_NEAREST_FLAGS		|				\
    613 	 FAST_PATH_NORMAL_REPEAT	|				\
    614 	 FAST_PATH_X_UNIT_POSITIVE),					\
    615 	PIXMAN_null, 0,							\
    616 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    617 	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
    618     }
    619 
    620 #define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)			\
    621     {   PIXMAN_OP_ ## op,						\
    622 	PIXMAN_ ## s,							\
    623 	(SCALED_NEAREST_FLAGS		|				\
    624 	 FAST_PATH_PAD_REPEAT		|				\
    625 	 FAST_PATH_X_UNIT_POSITIVE),					\
    626 	PIXMAN_null, 0,							\
    627 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    628 	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
    629     }
    630 
    631 #define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func)			\
    632     {   PIXMAN_OP_ ## op,						\
    633 	PIXMAN_ ## s,							\
    634 	(SCALED_NEAREST_FLAGS		|				\
    635 	 FAST_PATH_NONE_REPEAT		|				\
    636 	 FAST_PATH_X_UNIT_POSITIVE),					\
    637 	PIXMAN_null, 0,							\
    638 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    639 	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
    640     }
    641 
    642 #define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)			\
    643     {   PIXMAN_OP_ ## op,						\
    644 	PIXMAN_ ## s,							\
    645 	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,    \
    646 	PIXMAN_null, 0,							\
    647 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    648 	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
    649     }
    650 
    651 #define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
    652     {   PIXMAN_OP_ ## op,						\
    653 	PIXMAN_ ## s,							\
    654 	(SCALED_NEAREST_FLAGS		|				\
    655 	 FAST_PATH_NORMAL_REPEAT	|				\
    656 	 FAST_PATH_X_UNIT_POSITIVE),					\
    657 	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
    658 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    659 	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
    660     }
    661 
    662 #define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
    663     {   PIXMAN_OP_ ## op,						\
    664 	PIXMAN_ ## s,							\
    665 	(SCALED_NEAREST_FLAGS		|				\
    666 	 FAST_PATH_PAD_REPEAT		|				\
    667 	 FAST_PATH_X_UNIT_POSITIVE),					\
    668 	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
    669 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    670 	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
    671     }
    672 
    673 #define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
    674     {   PIXMAN_OP_ ## op,						\
    675 	PIXMAN_ ## s,							\
    676 	(SCALED_NEAREST_FLAGS		|				\
    677 	 FAST_PATH_NONE_REPEAT		|				\
    678 	 FAST_PATH_X_UNIT_POSITIVE),					\
    679 	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
    680 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    681 	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
    682     }
    683 
    684 #define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
    685     {   PIXMAN_OP_ ## op,						\
    686 	PIXMAN_ ## s,							\
    687 	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,	\
    688 	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
    689 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    690 	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
    691     }
    692 
    693 #define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
    694     {   PIXMAN_OP_ ## op,						\
    695 	PIXMAN_ ## s,							\
    696 	(SCALED_NEAREST_FLAGS		|				\
    697 	 FAST_PATH_NORMAL_REPEAT	|				\
    698 	 FAST_PATH_X_UNIT_POSITIVE),					\
    699 	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
    700 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    701 	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
    702     }
    703 
    704 #define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
    705     {   PIXMAN_OP_ ## op,						\
    706 	PIXMAN_ ## s,							\
    707 	(SCALED_NEAREST_FLAGS		|				\
    708 	 FAST_PATH_PAD_REPEAT		|				\
    709 	 FAST_PATH_X_UNIT_POSITIVE),					\
    710 	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
    711 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    712 	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
    713     }
    714 
    715 #define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
    716     {   PIXMAN_OP_ ## op,						\
    717 	PIXMAN_ ## s,							\
    718 	(SCALED_NEAREST_FLAGS		|				\
    719 	 FAST_PATH_NONE_REPEAT		|				\
    720 	 FAST_PATH_X_UNIT_POSITIVE),					\
    721 	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
    722 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    723 	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
    724     }
    725 
    726 #define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
    727     {   PIXMAN_OP_ ## op,						\
    728 	PIXMAN_ ## s,							\
    729 	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,	\
    730 	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
    731 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
    732 	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
    733     }
    734 
    735 /* Prefer the use of 'cover' variant, because it is faster */
    736 #define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
    737     SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
    738     SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),			\
    739     SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\
    740     SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
    741 
    742 #define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)			\
    743     SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
    744     SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
    745     SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
    746 
    747 #define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func)		\
    748     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
    749     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
    750     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
    751 
    752 /*****************************************************************************/
    753 
    754 /*
    755  * Identify 5 zones in each scanline for bilinear scaling. Depending on
    756  * whether 2 pixels to be interpolated are fetched from the image itself,
    757  * from the padding area around it or from both image and padding area.
    758  */
    759 static force_inline void
    760 bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
    761 					 pixman_fixed_t  vx,
    762 					 pixman_fixed_t  unit_x,
    763 					 int32_t *       left_pad,
    764 					 int32_t *       left_tz,
    765 					 int32_t *       width,
    766 					 int32_t *       right_tz,
    767 					 int32_t *       right_pad)
    768 {
    769 	int width1 = *width, left_pad1, right_pad1;
    770 	int width2 = *width, left_pad2, right_pad2;
    771 
    772 	pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
    773 					&width1, &left_pad1, &right_pad1);
    774 	pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
    775 					unit_x, &width2, &left_pad2, &right_pad2);
    776 
    777 	*left_pad = left_pad2;
    778 	*left_tz = left_pad1 - left_pad2;
    779 	*right_tz = right_pad2 - right_pad1;
    780 	*right_pad = right_pad1;
    781 	*width -= *left_pad + *left_tz + *right_tz + *right_pad;
    782 }
    783 
    784 /*
    785  * Main loop template for single pass bilinear scaling. It needs to be
    786  * provided with 'scanline_func' which should do the compositing operation.
    787  * The needed function has the following prototype:
    788  *
    789  *	scanline_func (dst_type_t *       dst,
 *		       const mask_type_t * mask,
    791  *		       const src_type_t * src_top,
    792  *		       const src_type_t * src_bottom,
    793  *		       int32_t            width,
    794  *		       int                weight_top,
    795  *		       int                weight_bottom,
    796  *		       pixman_fixed_t     vx,
    797  *		       pixman_fixed_t     unit_x,
    798  *		       pixman_fixed_t     max_vx,
    799  *		       pixman_bool_t      zero_src)
    800  *
    801  * Where:
    802  *  dst                 - destination scanline buffer for storing results
    803  *  mask                - mask buffer (or single value for solid mask)
    804  *  src_top, src_bottom - two source scanlines
    805  *  width               - number of pixels to process
    806  *  weight_top          - weight of the top row for interpolation
    807  *  weight_bottom       - weight of the bottom row for interpolation
    808  *  vx                  - initial position for fetching the first pair of
    809  *                        pixels from the source buffer
    810  *  unit_x              - position increment needed to move to the next pair
    811  *                        of pixels
    812  *  max_vx              - image size as a fixed point value, can be used for
    813  *                        implementing NORMAL repeat (when it is supported)
    814  *  zero_src            - boolean hint variable, which is set to TRUE when
    815  *                        all source pixels are fetched from zero padding
    816  *                        zone for NONE repeat
    817  *
    818  * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to
    819  *       BILINEAR_INTERPOLATION_RANGE, but sometimes it may be less than that
    820  *       for NONE repeat when handling fuzzy antialiased top or bottom image
    821  *       edges. Also both top and bottom weight variables are guaranteed to
    822  *       have value, which is less than BILINEAR_INTERPOLATION_RANGE.
    823  *       For example, the weights can fit into unsigned byte or be used
    824  *       with 8-bit SIMD multiplication instructions for 8-bit interpolation
    825  *       precision.
    826  */
/*
 * Generates the function 'fast_composite_scaled_bilinear<scale_func_name>',
 * which walks the destination scanlines and calls 'scanline_func' for them.
 *
 * Macro parameters:
 *  scale_func_name - suffix pasted onto 'fast_composite_scaled_bilinear' to
 *                    form the name of the generated function
 *  scanline_func   - worker called for every run of pixels within a
 *                    destination scanline
 *  src_type_t, mask_type_t, dst_type_t
 *                  - element types used to address the source, mask and
 *                    destination scanlines
 *  repeat_mode     - suffix pasted onto 'PIXMAN_REPEAT_'; PAD, NONE and
 *                    NORMAL get dedicated handling, any other mode ends up
 *                    in a single 'scanline_func' call per scanline
 *  flags           - tested against FLAG_HAVE_SOLID_MASK and
 *                    FLAG_HAVE_NON_SOLID_MASK to decide how the mask is
 *                    obtained
 */
#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
				  dst_type_t, repeat_mode, flags)				\
static void											\
fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
						   pixman_composite_info_t *info)		\
{												\
    PIXMAN_COMPOSITE_ARGS (info);								\
    dst_type_t *dst_line;									\
    mask_type_t *mask_line;									\
    src_type_t *src_first_line;									\
    int       y1, y2;										\
    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
    pixman_vector_t v;										\
    pixman_fixed_t vx, vy;									\
    pixman_fixed_t unit_x, unit_y;								\
    int32_t left_pad, left_tz, right_tz, right_pad;						\
												\
    dst_type_t *dst;										\
    mask_type_t solid_mask;									\
    const mask_type_t *mask = &solid_mask;							\
    int src_stride, mask_stride, dst_stride;							\
												\
    int src_width;										\
    pixman_fixed_t src_width_fixed;								\
    int max_x;											\
    pixman_bool_t need_src_extension;								\
												\
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
    if (flags & FLAG_HAVE_SOLID_MASK)								\
    {												\
	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
	mask_stride = 0;									\
    }												\
    else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
    {												\
	PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,				\
			       mask_stride, mask_line, 1);					\
    }												\
												\
    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
     * transformed from destination space to source space */					\
    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
												\
    /* reference point is the center of the pixel */						\
    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
    v.vector[2] = pixman_fixed_1;								\
												\
    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
	return;											\
												\
    unit_x = src_image->common.transform->matrix[0][0];						\
    unit_y = src_image->common.transform->matrix[1][1];						\
												\
    v.vector[0] -= pixman_fixed_1 / 2;								\
    v.vector[1] -= pixman_fixed_1 / 2;								\
												\
    vy = v.vector[1];										\
												\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
    {												\
	bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,	\
					&left_pad, &left_tz, &width, &right_tz, &right_pad);	\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
	{											\
	    /* PAD repeat does not need special handling for 'transition zones' and */		\
	    /* they can be combined with 'padding zones' safely */				\
	    left_pad += left_tz;								\
	    right_pad += right_tz;								\
	    left_tz = right_tz = 0;								\
	}											\
	/* source position of the first pixel that follows the left padding zone */		\
	v.vector[0] += left_pad * unit_x;							\
    }												\
												\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
    {												\
	vx = v.vector[0];									\
	repeat (PIXMAN_REPEAT_NORMAL, &vx, pixman_int_to_fixed(src_image->bits.width));		\
	max_x = pixman_fixed_to_int (vx + (width - 1) * (int64_t)unit_x) + 1;			\
												\
	/* sources narrower than REPEAT_NORMAL_MIN_WIDTH are replicated into a wider */		\
	/* temporary line for every scanline; src_width is a multiple of the image width */	\
	if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH)					\
	{											\
	    src_width = 0;									\
												\
	    while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x)			\
		src_width += src_image->bits.width;						\
												\
	    need_src_extension = TRUE;								\
	}											\
	else											\
	{											\
	    src_width = src_image->bits.width;							\
	    need_src_extension = FALSE;								\
	}											\
												\
	src_width_fixed = pixman_int_to_fixed (src_width);					\
    }												\
												\
    while (--height >= 0)									\
    {												\
	int weight1, weight2;									\
	dst = dst_line;										\
	dst_line += dst_stride;									\
	vx = v.vector[0];									\
	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
	{											\
	    mask = mask_line;									\
	    mask_line += mask_stride;								\
	}											\
												\
	y1 = pixman_fixed_to_int (vy);								\
	weight2 = pixman_fixed_to_bilinear_weight (vy);						\
	if (weight2)										\
	{											\
	    /* both weight1 and weight2 are smaller than BILINEAR_INTERPOLATION_RANGE */	\
	    y2 = y1 + 1;									\
	    weight1 = BILINEAR_INTERPOLATION_RANGE - weight2;					\
	}											\
	else											\
	{											\
	    /* set both top and bottom row to the same scanline and tweak weights */		\
	    y2 = y1;										\
	    weight1 = weight2 = BILINEAR_INTERPOLATION_RANGE / 2;				\
	}											\
	vy += unit_y;										\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
	{											\
	    src_type_t *src1, *src2;								\
	    src_type_t buf1[2];									\
	    src_type_t buf2[2];									\
	    repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);				\
	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
	    src1 = src_first_line + src_stride * y1;						\
	    src2 = src_first_line + src_stride * y2;						\
												\
	    if (left_pad > 0)									\
	    {											\
		buf1[0] = buf1[1] = src1[0];							\
		buf2[0] = buf2[1] = src2[0];							\
		scanline_func (dst, mask,							\
			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
		dst += left_pad;								\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += left_pad;								\
	    }											\
	    if (width > 0)									\
	    {											\
		scanline_func (dst, mask,							\
			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
		dst += width;									\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += width;								\
	    }											\
	    if (right_pad > 0)									\
	    {											\
		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
		scanline_func (dst, mask,							\
			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
	    }											\
	}											\
	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
	{											\
	    src_type_t *src1, *src2;								\
	    src_type_t buf1[2];									\
	    src_type_t buf2[2];									\
	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
	    if (y1 < 0)										\
	    {											\
		weight1 = 0;									\
		y1 = 0;										\
	    }											\
	    if (y1 >= src_image->bits.height)							\
	    {											\
		weight1 = 0;									\
		y1 = src_image->bits.height - 1;						\
	    }											\
	    if (y2 < 0)										\
	    {											\
		weight2 = 0;									\
		y2 = 0;										\
	    }											\
	    if (y2 >= src_image->bits.height)							\
	    {											\
		weight2 = 0;									\
		y2 = src_image->bits.height - 1;						\
	    }											\
	    src1 = src_first_line + src_stride * y1;						\
	    src2 = src_first_line + src_stride * y2;						\
												\
	    if (left_pad > 0)									\
	    {											\
		buf1[0] = buf1[1] = 0;								\
		buf2[0] = buf2[1] = 0;								\
		scanline_func (dst, mask,							\
			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
		dst += left_pad;								\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += left_pad;								\
	    }											\
	    if (left_tz > 0)									\
	    {											\
		buf1[0] = 0;									\
		buf1[1] = src1[0];								\
		buf2[0] = 0;									\
		buf2[1] = src2[0];								\
		scanline_func (dst, mask,							\
			       buf1, buf2, left_tz, weight1, weight2,				\
			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
		dst += left_tz;									\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += left_tz;								\
		vx += left_tz * unit_x;								\
	    }											\
	    if (width > 0)									\
	    {											\
		scanline_func (dst, mask,							\
			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
		dst += width;									\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += width;								\
		vx += width * unit_x;								\
	    }											\
	    if (right_tz > 0)									\
	    {											\
		buf1[0] = src1[src_image->bits.width - 1];					\
		buf1[1] = 0;									\
		buf2[0] = src2[src_image->bits.width - 1];					\
		buf2[1] = 0;									\
		scanline_func (dst, mask,							\
			       buf1, buf2, right_tz, weight1, weight2,				\
			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
		dst += right_tz;								\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += right_tz;								\
	    }											\
	    if (right_pad > 0)									\
	    {											\
		buf1[0] = buf1[1] = 0;								\
		buf2[0] = buf2[1] = 0;								\
		scanline_func (dst, mask,							\
			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
	    }											\
	}											\
	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	{											\
	    int32_t	    num_pixels;								\
	    int32_t	    width_remain;							\
	    src_type_t *    src_line_top;							\
	    src_type_t *    src_line_bottom;							\
	    src_type_t	    buf1[2];								\
	    src_type_t	    buf2[2];								\
	    src_type_t	    extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2];			\
	    src_type_t	    extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2];			\
	    int		    i, j;								\
												\
	    repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height);				\
	    repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height);				\
	    src_line_top = src_first_line + src_stride * y1;					\
	    src_line_bottom = src_first_line + src_stride * y2;					\
												\
	    if (need_src_extension)								\
	    {											\
		/* tile both source scanlines side by side until src_width pixels are filled */	\
		for (i=0; i<src_width;)								\
		{										\
		    for (j=0; j<src_image->bits.width; j++, i++)				\
		    {										\
			extended_src_line0[i] = src_line_top[j];				\
			extended_src_line1[i] = src_line_bottom[j];				\
		    }										\
		}										\
												\
		src_line_top = &extended_src_line0[0];						\
		src_line_bottom = &extended_src_line1[0];					\
	    }											\
												\
	    /* Top & Bottom wrap around buffer */						\
	    buf1[0] = src_line_top[src_width - 1];						\
	    buf1[1] = src_line_top[0];								\
	    buf2[0] = src_line_bottom[src_width - 1];						\
	    buf2[1] = src_line_bottom[0];							\
												\
	    width_remain = width;								\
												\
	    while (width_remain > 0)								\
	    {											\
		/* We use src_width_fixed because it can make vx in original source range */	\
		repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);				\
												\
		/* Wrap around part */								\
		if (pixman_fixed_to_int (vx) == src_width - 1)					\
		{										\
		    /* for positive unit_x							\
		     * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed		\
		     *										\
		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
		     * So we are safe from overflow.						\
		     */										\
		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\
												\
		    if (num_pixels > width_remain)						\
			num_pixels = width_remain;						\
												\
		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\
				   weight1, weight2, pixman_fixed_frac(vx),			\
				   unit_x, src_width_fixed, FALSE);				\
												\
		    width_remain -= num_pixels;							\
		    vx += num_pixels * unit_x;							\
		    dst += num_pixels;								\
												\
		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
			mask += num_pixels;							\
												\
		    repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);			\
		}										\
												\
		/* Normal scanline composite */							\
		if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0)		\
		{										\
		    /* for positive unit_x							\
		     * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1)	\
		     *										\
		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
		     * So we are safe from overflow here.					\
		     */										\
		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\
				  / unit_x) + 1;						\
												\
		    if (num_pixels > width_remain)						\
			num_pixels = width_remain;						\
												\
		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\
				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\
												\
		    width_remain -= num_pixels;							\
		    vx += num_pixels * unit_x;							\
		    dst += num_pixels;								\
												\
		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
		        mask += num_pixels;							\
		}										\
	    }											\
	}											\
	else											\
	{											\
	    /* any other repeat mode: the whole scanline is handled by a single call */		\
	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
			   src_first_line + src_stride * y2, width,				\
			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
	}											\
    }												\
}
   1180 
   1181 /* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
   1182 #define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
   1183 				  dst_type_t, repeat_mode, flags)				\
   1184 	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
   1185 				  dst_type_t, repeat_mode, flags)
   1186 
   1187 #define SCALED_BILINEAR_FLAGS						\
   1188     (FAST_PATH_SCALE_TRANSFORM	|					\
   1189      FAST_PATH_NO_ALPHA_MAP	|					\
   1190      FAST_PATH_BILINEAR_FILTER	|					\
   1191      FAST_PATH_NO_ACCESSORS	|					\
   1192      FAST_PATH_NARROW_FORMAT)
   1193 
   1194 #define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func)			\
   1195     {   PIXMAN_OP_ ## op,						\
   1196 	PIXMAN_ ## s,							\
   1197 	(SCALED_BILINEAR_FLAGS		|				\
   1198 	 FAST_PATH_PAD_REPEAT		|				\
   1199 	 FAST_PATH_X_UNIT_POSITIVE),					\
   1200 	PIXMAN_null, 0,							\
   1201 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1202 	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
   1203     }
   1204 
   1205 #define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func)			\
   1206     {   PIXMAN_OP_ ## op,						\
   1207 	PIXMAN_ ## s,							\
   1208 	(SCALED_BILINEAR_FLAGS		|				\
   1209 	 FAST_PATH_NONE_REPEAT		|				\
   1210 	 FAST_PATH_X_UNIT_POSITIVE),					\
   1211 	PIXMAN_null, 0,							\
   1212 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1213 	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
   1214     }
   1215 
   1216 #define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func)			\
   1217     {   PIXMAN_OP_ ## op,						\
   1218 	PIXMAN_ ## s,							\
   1219 	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,	\
   1220 	PIXMAN_null, 0,							\
   1221 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1222 	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
   1223     }
   1224 
   1225 #define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op,s,d,func)			\
   1226     {   PIXMAN_OP_ ## op,						\
   1227 	PIXMAN_ ## s,							\
   1228 	(SCALED_BILINEAR_FLAGS		|				\
   1229 	 FAST_PATH_NORMAL_REPEAT	|				\
   1230 	 FAST_PATH_X_UNIT_POSITIVE),					\
   1231 	PIXMAN_null, 0,							\
   1232 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1233 	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
   1234     }
   1235 
   1236 #define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
   1237     {   PIXMAN_OP_ ## op,						\
   1238 	PIXMAN_ ## s,							\
   1239 	(SCALED_BILINEAR_FLAGS		|				\
   1240 	 FAST_PATH_PAD_REPEAT		|				\
   1241 	 FAST_PATH_X_UNIT_POSITIVE),					\
   1242 	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
   1243 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1244 	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
   1245     }
   1246 
   1247 #define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
   1248     {   PIXMAN_OP_ ## op,						\
   1249 	PIXMAN_ ## s,							\
   1250 	(SCALED_BILINEAR_FLAGS		|				\
   1251 	 FAST_PATH_NONE_REPEAT		|				\
   1252 	 FAST_PATH_X_UNIT_POSITIVE),					\
   1253 	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
   1254 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1255 	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
   1256     }
   1257 
   1258 #define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
   1259     {   PIXMAN_OP_ ## op,						\
   1260 	PIXMAN_ ## s,							\
   1261 	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,	\
   1262 	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
   1263 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1264 	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
   1265     }
   1266 
   1267 #define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
   1268     {   PIXMAN_OP_ ## op,						\
   1269 	PIXMAN_ ## s,							\
   1270 	(SCALED_BILINEAR_FLAGS		|				\
   1271 	 FAST_PATH_NORMAL_REPEAT	|				\
   1272 	 FAST_PATH_X_UNIT_POSITIVE),					\
   1273 	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
   1274 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1275 	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
   1276     }
   1277 
   1278 #define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
   1279     {   PIXMAN_OP_ ## op,						\
   1280 	PIXMAN_ ## s,							\
   1281 	(SCALED_BILINEAR_FLAGS		|				\
   1282 	 FAST_PATH_PAD_REPEAT		|				\
   1283 	 FAST_PATH_X_UNIT_POSITIVE),					\
   1284 	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
   1285 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1286 	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
   1287     }
   1288 
   1289 #define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
   1290     {   PIXMAN_OP_ ## op,						\
   1291 	PIXMAN_ ## s,							\
   1292 	(SCALED_BILINEAR_FLAGS		|				\
   1293 	 FAST_PATH_NONE_REPEAT		|				\
   1294 	 FAST_PATH_X_UNIT_POSITIVE),					\
   1295 	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
   1296 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1297 	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
   1298     }
   1299 
   1300 #define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
   1301     {   PIXMAN_OP_ ## op,						\
   1302 	PIXMAN_ ## s,							\
   1303 	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,	\
   1304 	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
   1305 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1306 	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
   1307     }
   1308 
   1309 #define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)	\
   1310     {   PIXMAN_OP_ ## op,						\
   1311 	PIXMAN_ ## s,							\
   1312 	(SCALED_BILINEAR_FLAGS		|				\
   1313 	 FAST_PATH_NORMAL_REPEAT	|				\
   1314 	 FAST_PATH_X_UNIT_POSITIVE),					\
   1315 	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
   1316 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
   1317 	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
   1318     }
   1319 
   1320 /* Prefer the use of 'cover' variant, because it is faster */
   1321 #define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func)				\
   1322     SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func),			\
   1323     SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func),			\
   1324     SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func),			\
   1325     SIMPLE_BILINEAR_FAST_PATH_NORMAL (op,s,d,func)
   1326 
   1327 #define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func)			\
   1328     SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
   1329     SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
   1330     SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func),		\
   1331     SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
   1332 
   1333 #define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func)		\
   1334     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
   1335     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
   1336     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),		\
   1337     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func)
   1338 
   1339 #endif
   1340