Home | History | Annotate | Download | only in dsp
      1 // Copyright 2016 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // MSA version of rescaling functions
     11 //
     12 // Author: Prashant Patil (prashant.patil (at) imgtec.com)
     13 
     14 #include "src/dsp/dsp.h"
     15 
     16 #if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
     17 
     18 #include <assert.h>
     19 
     20 #include "src/utils/rescaler_utils.h"
     21 #include "src/dsp/msa_macro.h"
     22 
     23 #define ROUNDER (WEBP_RESCALER_ONE >> 1)
     24 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
     25 #define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
     26 
     27 #define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {  \
     28   v4u32 tmp0, tmp1, tmp2, tmp3;                                       \
     29   v16u8 t0, t1, t2, t3, t4, t5;                                       \
     30   v2u64 out0, out1, out2, out3;                                       \
     31   ILVRL_W2_UW(zero, in0, tmp0, tmp1);                                 \
     32   ILVRL_W2_UW(zero, in1, tmp2, tmp3);                                 \
     33   DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
     34   DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
     35   SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
     36   PCKEV_B2_UB(out1, out0, out3, out2, t0, t1);                        \
     37   ILVRL_W2_UW(zero, in2, tmp0, tmp1);                                 \
     38   ILVRL_W2_UW(zero, in3, tmp2, tmp3);                                 \
     39   DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
     40   DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
     41   SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
     42   PCKEV_B2_UB(out1, out0, out3, out2, t2, t3);                        \
     43   PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);                                \
     44   dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);                   \
     45 } while (0)
     46 
     47 #define CALC_MULT_FIX_4(in0, scale, shift, dst) do {  \
     48   v4u32 tmp0, tmp1;                                   \
     49   v16i8 t0, t1;                                       \
     50   v2u64 out0, out1;                                   \
     51   ILVRL_W2_UW(zero, in0, tmp0, tmp1);                 \
     52   DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);  \
     53   SRAR_D2_UD(out0, out1, shift);                      \
     54   t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);       \
     55   t1 = __msa_pckev_b(t0, t0);                         \
     56   t0 = __msa_pckev_b(t1, t1);                         \
     57   dst = __msa_copy_s_w((v4i32)t0, 0);                 \
     58 } while (0)
     59 
     60 #define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,  \
     61                           dst0, dst1, dst2, dst3) do {         \
     62   v4u32 tmp0, tmp1, tmp2, tmp3;                                \
     63   v2u64 out0, out1, out2, out3;                                \
     64   ILVRL_W2_UW(zero, in0, tmp0, tmp1);                          \
     65   ILVRL_W2_UW(zero, in1, tmp2, tmp3);                          \
     66   DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
     67   DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
     68   SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
     69   PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1);             \
     70   ILVRL_W2_UW(zero, in2, tmp0, tmp1);                          \
     71   ILVRL_W2_UW(zero, in3, tmp2, tmp3);                          \
     72   DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
     73   DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
     74   SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
     75   PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3);             \
     76 } while (0)
     77 
     78 #define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {    \
     79   v4u32 tmp0, tmp1;                                      \
     80   v2u64 out0, out1;                                      \
     81   ILVRL_W2_UW(zero, in0, tmp0, tmp1);                    \
     82   DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);     \
     83   SRAR_D2_UD(out0, out1, shift);                         \
     84   dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0);  \
     85 } while (0)
     86 
     87 #define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,  \
     88                           dst0, dst1) do {                         \
     89   v4u32 tmp0, tmp1, tmp2, tmp3;                                    \
     90   v2u64 out0, out1, out2, out3;                                    \
     91   ILVRL_W2_UW(in0, in2, tmp0, tmp1);                               \
     92   ILVRL_W2_UW(in1, in3, tmp2, tmp3);                               \
     93   DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                 \
     94   DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3);                 \
     95   SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
     96   DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);               \
     97   DOTP_UW2_UD(out2, out3, scale, scale, out2, out3);               \
     98   SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
     99   PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1);                 \
    100 } while (0)
    101 
    102 #define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {  \
    103   v4u32 tmp0, tmp1;                                               \
    104   v2u64 out0, out1;                                               \
    105   v16i8 t0, t1;                                                   \
    106   ILVRL_W2_UW(in0, in1, tmp0, tmp1);                              \
    107   DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                \
    108   SRAR_D2_UD(out0, out1, shift);                                  \
    109   DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);              \
    110   SRAR_D2_UD(out0, out1, shift);                                  \
    111   t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);                   \
    112   t1 = __msa_pckev_b(t0, t0);                                     \
    113   t0 = __msa_pckev_b(t1, t1);                                     \
    114   dst = __msa_copy_s_w((v4i32)t0, 0);                             \
    115 } while (0)
    116 
    117 static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
    118                                           int length,
    119                                           WebPRescaler* const wrk) {
    120   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
    121   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
    122   const v4i32 zero = { 0 };
    123 
    124   while (length >= 16) {
    125     v4u32 src0, src1, src2, src3;
    126     v16u8 out;
    127     LD_UW4(frow, 4, src0, src1, src2, src3);
    128     CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
    129     ST_UB(out, dst);
    130     length -= 16;
    131     frow   += 16;
    132     dst    += 16;
    133   }
    134   if (length > 0) {
    135     int x_out;
    136     if (length >= 12) {
    137       uint32_t val0_m, val1_m, val2_m;
    138       v4u32 src0, src1, src2;
    139       LD_UW3(frow, 4, src0, src1, src2);
    140       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
    141       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
    142       CALC_MULT_FIX_4(src2, scale, shift, val2_m);
    143       SW3(val0_m, val1_m, val2_m, dst, 4);
    144       length -= 12;
    145       frow   += 12;
    146       dst    += 12;
    147     } else if (length >= 8) {
    148       uint32_t val0_m, val1_m;
    149       v4u32 src0, src1;
    150       LD_UW2(frow, 4, src0, src1);
    151       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
    152       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
    153       SW2(val0_m, val1_m, dst, 4);
    154       length -= 8;
    155       frow   += 8;
    156       dst    += 8;
    157     } else if (length >= 4) {
    158       uint32_t val0_m;
    159       const v4u32 src0 = LD_UW(frow);
    160       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
    161       SW(val0_m, dst);
    162       length -= 4;
    163       frow   += 4;
    164       dst    += 4;
    165     }
    166     for (x_out = 0; x_out < length; ++x_out) {
    167       const uint32_t J = frow[x_out];
    168       const int v = (int)MULT_FIX(J, wrk->fy_scale);
    169       dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
    170     }
    171   }
    172 }
    173 
    174 static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
    175                                           uint8_t* dst, int length,
    176                                           WebPRescaler* const wrk) {
    177   const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
    178   const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
    179   const v4i32 B1 = __msa_fill_w(B);
    180   const v4i32 A1 = __msa_fill_w(A);
    181   const v4i32 AB = __msa_ilvr_w(A1, B1);
    182   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
    183   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
    184 
    185   while (length >= 16) {
    186     v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
    187     v16u8 t0, t1, t2, t3, t4, t5;
    188     LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
    189     LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
    190     CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
    191     CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
    192     PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
    193     t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
    194     ST_UB(t0, dst);
    195     frow   += 16;
    196     irow   += 16;
    197     dst    += 16;
    198     length -= 16;
    199   }
    200   if (length > 0) {
    201     int x_out;
    202     if (length >= 12) {
    203       uint32_t val0_m, val1_m, val2_m;
    204       v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
    205       LD_UW3(frow, 4, frow0, frow1, frow2);
    206       LD_UW3(irow, 4, irow0, irow1, irow2);
    207       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
    208       CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
    209       CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
    210       SW3(val0_m, val1_m, val2_m, dst, 4);
    211       frow   += 12;
    212       irow   += 12;
    213       dst    += 12;
    214       length -= 12;
    215     } else if (length >= 8) {
    216       uint32_t val0_m, val1_m;
    217       v4u32 frow0, frow1, irow0, irow1;
    218       LD_UW2(frow, 4, frow0, frow1);
    219       LD_UW2(irow, 4, irow0, irow1);
    220       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
    221       CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
    222       SW2(val0_m, val1_m, dst, 4);
    223       frow   += 4;
    224       irow   += 4;
    225       dst    += 4;
    226       length -= 4;
    227     } else if (length >= 4) {
    228       uint32_t val0_m;
    229       const v4u32 frow0 = LD_UW(frow + 0);
    230       const v4u32 irow0 = LD_UW(irow + 0);
    231       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
    232       SW(val0_m, dst);
    233       frow   += 4;
    234       irow   += 4;
    235       dst    += 4;
    236       length -= 4;
    237     }
    238     for (x_out = 0; x_out < length; ++x_out) {
    239       const uint64_t I = (uint64_t)A * frow[x_out]
    240                        + (uint64_t)B * irow[x_out];
    241       const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
    242       const int v = (int)MULT_FIX(J, wrk->fy_scale);
    243       dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
    244     }
    245   }
    246 }
    247 
    248 static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
    249   uint8_t* dst = wrk->dst;
    250   rescaler_t* irow = wrk->irow;
    251   const int x_out_max = wrk->dst_width * wrk->num_channels;
    252   const rescaler_t* frow = wrk->frow;
    253   assert(!WebPRescalerOutputDone(wrk));
    254   assert(wrk->y_accum <= 0);
    255   assert(wrk->y_expand);
    256   assert(wrk->y_sub != 0);
    257   if (wrk->y_accum == 0) {
    258     ExportRowExpand_0(frow, dst, x_out_max, wrk);
    259   } else {
    260     ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
    261   }
    262 }
    263 
    264 #if 0  // disabled for now. TODO(skal): make match the C-code
    265 static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
    266                                           uint8_t* dst, int length,
    267                                           const uint32_t yscale,
    268                                           WebPRescaler* const wrk) {
    269   const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
    270   const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
    271   const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
    272   const v4i32 zero = { 0 };
    273 
    274   while (length >= 16) {
    275     v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
    276     v16u8 out;
    277     LD_UW4(frow, 4, src0, src1, src2, src3);
    278     CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
    279                       frac0, frac1, frac2, frac3);
    280     LD_UW4(irow, 4, src0, src1, src2, src3);
    281     SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
    282          src0, src1, src2, src3);
    283     CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
    284     ST_UB(out, dst);
    285     ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
    286     frow   += 16;
    287     irow   += 16;
    288     dst    += 16;
    289     length -= 16;
    290   }
    291   if (length > 0) {
    292     int x_out;
    293     if (length >= 12) {
    294       uint32_t val0_m, val1_m, val2_m;
    295       v4u32 src0, src1, src2, frac0, frac1, frac2;
    296       LD_UW3(frow, 4, src0, src1, src2);
    297       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
    298       CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
    299       CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
    300       LD_UW3(irow, 4, src0, src1, src2);
    301       SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
    302       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
    303       CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
    304       CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
    305       SW3(val0_m, val1_m, val2_m, dst, 4);
    306       ST_UW3(frac0, frac1, frac2, irow, 4);
    307       frow   += 12;
    308       irow   += 12;
    309       dst    += 12;
    310       length -= 12;
    311     } else if (length >= 8) {
    312       uint32_t val0_m, val1_m;
    313       v4u32 src0, src1, frac0, frac1;
    314       LD_UW2(frow, 4, src0, src1);
    315       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
    316       CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
    317       LD_UW2(irow, 4, src0, src1);
    318       SUB2(src0, frac0, src1, frac1, src0, src1);
    319       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
    320       CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
    321       SW2(val0_m, val1_m, dst, 4);
    322       ST_UW2(frac0, frac1, irow, 4);
    323       frow   += 8;
    324       irow   += 8;
    325       dst    += 8;
    326       length -= 8;
    327     } else if (length >= 4) {
    328       uint32_t val0_m;
    329       v4u32 frac0;
    330       v4u32 src0 = LD_UW(frow);
    331       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
    332       src0 = LD_UW(irow);
    333       src0 = src0 - frac0;
    334       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
    335       SW(val0_m, dst);
    336       ST_UW(frac0, irow);
    337       frow   += 4;
    338       irow   += 4;
    339       dst    += 4;
    340       length -= 4;
    341     }
    342     for (x_out = 0; x_out < length; ++x_out) {
    343       const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
    344       const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
    345       dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
    346       irow[x_out] = frac;
    347     }
    348   }
    349 }
    350 
    351 static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
    352                                           int length,
    353                                           WebPRescaler* const wrk) {
    354   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
    355   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
    356   const v4i32 zero = { 0 };
    357 
    358   while (length >= 16) {
    359     v4u32 src0, src1, src2, src3;
    360     v16u8 dst0;
    361     LD_UW4(irow, 4, src0, src1, src2, src3);
    362     CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
    363     ST_UB(dst0, dst);
    364     ST_SW4(zero, zero, zero, zero, irow, 4);
    365     length -= 16;
    366     irow   += 16;
    367     dst    += 16;
    368   }
    369   if (length > 0) {
    370     int x_out;
    371     if (length >= 12) {
    372       uint32_t val0_m, val1_m, val2_m;
    373       v4u32 src0, src1, src2;
    374       LD_UW3(irow, 4, src0, src1, src2);
    375       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
    376       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
    377       CALC_MULT_FIX_4(src2, scale, shift, val2_m);
    378       SW3(val0_m, val1_m, val2_m, dst, 4);
    379       ST_SW3(zero, zero, zero, irow, 4);
    380       length -= 12;
    381       irow   += 12;
    382       dst    += 12;
    383     } else if (length >= 8) {
    384       uint32_t val0_m, val1_m;
    385       v4u32 src0, src1;
    386       LD_UW2(irow, 4, src0, src1);
    387       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
    388       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
    389       SW2(val0_m, val1_m, dst, 4);
    390       ST_SW2(zero, zero, irow, 4);
    391       length -= 8;
    392       irow   += 8;
    393       dst    += 8;
    394     } else if (length >= 4) {
    395       uint32_t val0_m;
    396       const v4u32 src0 = LD_UW(irow + 0);
    397       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
    398       SW(val0_m, dst);
    399       ST_SW(zero, irow);
    400       length -= 4;
    401       irow   += 4;
    402       dst    += 4;
    403     }
    404     for (x_out = 0; x_out < length; ++x_out) {
    405       const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
    406       dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
    407       irow[x_out] = 0;
    408     }
    409   }
    410 }
    411 
    412 static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
    413   uint8_t* dst = wrk->dst;
    414   rescaler_t* irow = wrk->irow;
    415   const int x_out_max = wrk->dst_width * wrk->num_channels;
    416   const rescaler_t* frow = wrk->frow;
    417   const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
    418   assert(!WebPRescalerOutputDone(wrk));
    419   assert(wrk->y_accum <= 0);
    420   assert(!wrk->y_expand);
    421   if (yscale) {
    422     ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
    423   } else {
    424     ExportRowShrink_1(irow, dst, x_out_max, wrk);
    425   }
    426 }
    427 #endif  // 0
    428 
    429 //------------------------------------------------------------------------------
    430 // Entry point
    431 
// Prototype for the init function defined right below.
extern void WebPRescalerDspInitMSA(void);

// Points WebPRescalerExportRowExpand at the MSA implementation from this
// file. The shrink hook is deliberately left alone: the matching
// implementation above is compiled out with '#if 0', so the assignment below
// stays commented out.
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
  WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
//  WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
}
    438 
#else     // !WEBP_USE_MSA || WEBP_REDUCE_SIZE

// Taken when the guard at the top of the file is false (no MSA, or
// WEBP_REDUCE_SIZE is defined). Presumably WEBP_DSP_INIT_STUB emits a
// placeholder WebPRescalerDspInitMSA -- confirm against its definition.
WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)

#endif    // WEBP_USE_MSA && !WEBP_REDUCE_SIZE
    444