Home | History | Annotate | Download | only in opts
      1 /*
      2  **
      3  ** Copyright 2009, The Android Open Source Project
      4  **
      5  ** Licensed under the Apache License, Version 2.0 (the "License");
      6  ** you may not use this file except in compliance with the License.
      7  ** You may obtain a copy of the License at
      8  **
      9  **     http://www.apache.org/licenses/LICENSE-2.0
     10  **
     11  ** Unless required by applicable law or agreed to in writing, software
     12  ** distributed under the License is distributed on an "AS IS" BASIS,
     13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  ** See the License for the specific language governing permissions and
     15  ** limitations under the License.
     16  */
     17 
     18 #include <emmintrin.h>
     19 #include "SkBitmapProcState_opts_SSE2.h"
     20 #include "SkUtils.h"
     21 
     22 void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
     23                                    const uint32_t* xy,
     24                                    int count, uint32_t* colors) {
     25     SkASSERT(count > 0 && colors != NULL);
     26     SkASSERT(s.fDoFilter);
     27     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
     28     SkASSERT(s.fAlphaScale == 256);
     29 
     30     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
     31     unsigned rb = s.fBitmap->rowBytes();
     32     uint32_t XY = *xy++;
     33     unsigned y0 = XY >> 14;
     34     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
     35     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
     36     unsigned subY = y0 & 0xF;
     37 
     38     // ( 0,  0,  0,  0,  0,  0,  0, 16)
     39     __m128i sixteen = _mm_cvtsi32_si128(16);
     40 
     41     // ( 0,  0,  0,  0, 16, 16, 16, 16)
     42     sixteen = _mm_shufflelo_epi16(sixteen, 0);
     43 
     44     // ( 0,  0,  0,  0,  0,  0,  0,  y)
     45     __m128i allY = _mm_cvtsi32_si128(subY);
     46 
     47     // ( 0,  0,  0,  0,  y,  y,  y,  y)
     48     allY = _mm_shufflelo_epi16(allY, 0);
     49 
     50     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
     51     __m128i negY = _mm_sub_epi16(sixteen, allY);
     52 
     53     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
     54     allY = _mm_unpacklo_epi64(allY, negY);
     55 
     56     // (16, 16, 16, 16, 16, 16, 16, 16 )
     57     sixteen = _mm_shuffle_epi32(sixteen, 0);
     58 
     59     // ( 0,  0,  0,  0,  0,  0,  0,  0)
     60     __m128i zero = _mm_setzero_si128();
     61     do {
     62         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
     63         unsigned x0 = XX >> 18;
     64         unsigned x1 = XX & 0x3FFF;
     65 
     66         // (0, 0, 0, 0, 0, 0, 0, x)
     67         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
     68 
     69         // (0, 0, 0, 0, x, x, x, x)
     70         allX = _mm_shufflelo_epi16(allX, 0);
     71 
     72         // (x, x, x, x, x, x, x, x)
     73         allX = _mm_shuffle_epi32(allX, 0);
     74 
     75         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
     76         __m128i negX = _mm_sub_epi16(sixteen, allX);
     77 
     78         // Load 4 samples (pixels).
     79         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
     80         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
     81         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
     82         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
     83 
     84         // (0, 0, a00, a10)
     85         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
     86 
     87         // Expand to 16 bits per component.
     88         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
     89 
     90         // ((a00 * (16-y)), (a10 * y)).
     91         a00a10 = _mm_mullo_epi16(a00a10, allY);
     92 
     93         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
     94         a00a10 = _mm_mullo_epi16(a00a10, negX);
     95 
     96         // (0, 0, a01, a10)
     97         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
     98 
     99         // Expand to 16 bits per component.
    100         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
    101 
    102         // (a01 * (16-y)), (a11 * y)
    103         a01a11 = _mm_mullo_epi16(a01a11, allY);
    104 
    105         // (a01 * (16-y) * x), (a11 * y * x)
    106         a01a11 = _mm_mullo_epi16(a01a11, allX);
    107 
    108         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
    109         __m128i sum = _mm_add_epi16(a00a10, a01a11);
    110 
    111         // (DC, a00*w00 + a01*w01)
    112         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
    113 
    114         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
    115         sum = _mm_add_epi16(sum, shifted);
    116 
    117         // Divide each 16 bit component by 256.
    118         sum = _mm_srli_epi16(sum, 8);
    119 
    120         // Pack lower 4 16 bit values of sum into lower 4 bytes.
    121         sum = _mm_packus_epi16(sum, zero);
    122 
    123         // Extract low int and store.
    124         *colors++ = _mm_cvtsi128_si32(sum);
    125     } while (--count > 0);
    126 }
    127 
    128 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
    129                                   const uint32_t* xy,
    130                                   int count, uint32_t* colors) {
    131     SkASSERT(count > 0 && colors != NULL);
    132     SkASSERT(s.fDoFilter);
    133     SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
    134     SkASSERT(s.fAlphaScale < 256);
    135 
    136     const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
    137     unsigned rb = s.fBitmap->rowBytes();
    138     uint32_t XY = *xy++;
    139     unsigned y0 = XY >> 14;
    140     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
    141     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
    142     unsigned subY = y0 & 0xF;
    143 
    144     // ( 0,  0,  0,  0,  0,  0,  0, 16)
    145     __m128i sixteen = _mm_cvtsi32_si128(16);
    146 
    147     // ( 0,  0,  0,  0, 16, 16, 16, 16)
    148     sixteen = _mm_shufflelo_epi16(sixteen, 0);
    149 
    150     // ( 0,  0,  0,  0,  0,  0,  0,  y)
    151     __m128i allY = _mm_cvtsi32_si128(subY);
    152 
    153     // ( 0,  0,  0,  0,  y,  y,  y,  y)
    154     allY = _mm_shufflelo_epi16(allY, 0);
    155 
    156     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
    157     __m128i negY = _mm_sub_epi16(sixteen, allY);
    158 
    159     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
    160     allY = _mm_unpacklo_epi64(allY, negY);
    161 
    162     // (16, 16, 16, 16, 16, 16, 16, 16 )
    163     sixteen = _mm_shuffle_epi32(sixteen, 0);
    164 
    165     // ( 0,  0,  0,  0,  0,  0,  0,  0)
    166     __m128i zero = _mm_setzero_si128();
    167 
    168     // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
    169     __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
    170 
    171     do {
    172         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
    173         unsigned x0 = XX >> 18;
    174         unsigned x1 = XX & 0x3FFF;
    175 
    176         // (0, 0, 0, 0, 0, 0, 0, x)
    177         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
    178 
    179         // (0, 0, 0, 0, x, x, x, x)
    180         allX = _mm_shufflelo_epi16(allX, 0);
    181 
    182         // (x, x, x, x, x, x, x, x)
    183         allX = _mm_shuffle_epi32(allX, 0);
    184 
    185         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
    186         __m128i negX = _mm_sub_epi16(sixteen, allX);
    187 
    188         // Load 4 samples (pixels).
    189         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
    190         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
    191         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
    192         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
    193 
    194         // (0, 0, a00, a10)
    195         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
    196 
    197         // Expand to 16 bits per component.
    198         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
    199 
    200         // ((a00 * (16-y)), (a10 * y)).
    201         a00a10 = _mm_mullo_epi16(a00a10, allY);
    202 
    203         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
    204         a00a10 = _mm_mullo_epi16(a00a10, negX);
    205 
    206         // (0, 0, a01, a10)
    207         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
    208 
    209         // Expand to 16 bits per component.
    210         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
    211 
    212         // (a01 * (16-y)), (a11 * y)
    213         a01a11 = _mm_mullo_epi16(a01a11, allY);
    214 
    215         // (a01 * (16-y) * x), (a11 * y * x)
    216         a01a11 = _mm_mullo_epi16(a01a11, allX);
    217 
    218         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
    219         __m128i sum = _mm_add_epi16(a00a10, a01a11);
    220 
    221         // (DC, a00*w00 + a01*w01)
    222         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
    223 
    224         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
    225         sum = _mm_add_epi16(sum, shifted);
    226 
    227         // Divide each 16 bit component by 256.
    228         sum = _mm_srli_epi16(sum, 8);
    229 
    230         // Multiply by alpha.
    231         sum = _mm_mullo_epi16(sum, alpha);
    232 
    233         // Divide each 16 bit component by 256.
    234         sum = _mm_srli_epi16(sum, 8);
    235 
    236         // Pack lower 4 16 bit values of sum into lower 4 bytes.
    237         sum = _mm_packus_epi16(sum, zero);
    238 
    239         // Extract low int and store.
    240         *colors++ = _mm_cvtsi128_si32(sum);
    241     } while (--count > 0);
    242 }
    243