/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, 2018, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 * Authors:  ZhuChen     <zhuchen@loongson.cn>
 *           CaiWanwei   <caiwanwei@loongson.cn>
 *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
 *
 * Based on the x86 SIMD extension for IJG JPEG library
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* CHROMA UPSAMPLING */

#include "jsimd_mmi.h"


enum const_index {
  index_PW_THREE,
  index_PW_SEVEN,
  index_PW_EIGHT,
};

static uint64_t const_value[] = {
  _uint64_set_pi16(3, 3, 3, 3),
  _uint64_set_pi16(7, 7, 7, 7),
  _uint64_set_pi16(8, 8, 8, 8),
};

#define PW_THREE  get_const_value(index_PW_THREE)
#define PW_SEVEN  get_const_value(index_PW_SEVEN)
#define PW_EIGHT  get_const_value(index_PW_EIGHT)


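/*
 * The h2v2 "fancy" upsampler doubles the chroma plane in both directions
 * with a triangular (3:1) filter.  PW_THREE supplies the 3:1 weight and
 * PW_EIGHT/PW_SEVEN the alternating rounding biases applied before the
 * final >> 4.  As a rough scalar sketch (illustrative variable names, not
 * taken from libjpeg-turbo's jdsample.c):
 *
 *   colsum[i]      = 3 * nearrow[i] + farrow[i];                vertical pass
 *   out[2 * i]     = (3 * colsum[i] + colsum[i - 1] + 8) >> 4;  even outputs
 *   out[2 * i + 1] = (3 * colsum[i] + colsum[i + 1] + 7) >> 4;  odd outputs
 *
 * PROCESS_ROW() below performs the horizontal pass: it reads eight 16-bit
 * column sums that were parked in the output row, forms the sixteen output
 * samples, and stores them back over the same location.
 */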
#define PROCESS_ROW(r) { \
  mm7 = _mm_load_si64((__m64 *)outptr##r);      /* mm7=IntrL=( 0 1 2 3) */ \
  mm3 = _mm_load_si64((__m64 *)outptr##r + 1);  /* mm3=IntrH=( 4 5 6 7) */ \
  \
  mm0 = mm7; \
  mm4 = mm3; \
  mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT);                   /* mm0=( 1 2 3 -) */ \
  mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( - - - 4) */ \
  mm5 = mm7; \
  mm6 = mm3; \
  mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm5=( 3 - - -) */ \
  mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT);                   /* mm6=( - 4 5 6) */ \
  \
  mm0 = _mm_or_si64(mm0, mm4);                /* mm0=( 1 2 3 4) */ \
  mm5 = _mm_or_si64(mm5, mm6);                /* mm5=( 3 4 5 6) */ \
  \
  mm1 = mm7; \
  mm2 = mm3; \
  mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT);     /* mm1=( - 0 1 2) */ \
  mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT);     /* mm2=( 5 6 7 -) */ \
  mm4 = mm3; \
  mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( 7 - - -) */ \
  \
  mm1 = _mm_or_si64(mm1, wk[r]);              /* mm1=(-1 0 1 2) */ \
  mm2 = _mm_or_si64(mm2, wk[r + 2]);          /* mm2=( 5 6 7 8) */ \
  \
  wk[r] = mm4; \
  \
  mm7 = _mm_mullo_pi16(mm7, PW_THREE); \
  mm3 = _mm_mullo_pi16(mm3, PW_THREE); \
  mm1 = _mm_add_pi16(mm1, PW_EIGHT); \
  mm5 = _mm_add_pi16(mm5, PW_EIGHT); \
  mm0 = _mm_add_pi16(mm0, PW_SEVEN); \
  mm2 = _mm_add_pi16(mm2, PW_SEVEN); \
  \
  mm1 = _mm_add_pi16(mm1, mm7); \
  mm5 = _mm_add_pi16(mm5, mm3); \
  mm1 = _mm_srli_pi16(mm1, 4);                /* mm1=OutrLE=( 0  2  4  6) */ \
  mm5 = _mm_srli_pi16(mm5, 4);                /* mm5=OutrHE=( 8 10 12 14) */ \
  mm0 = _mm_add_pi16(mm0, mm7); \
  mm2 = _mm_add_pi16(mm2, mm3); \
  mm0 = _mm_srli_pi16(mm0, 4);                /* mm0=OutrLO=( 1  3  5  7) */ \
  mm2 = _mm_srli_pi16(mm2, 4);                /* mm2=OutrHO=( 9 11 13 15) */ \
  \
  mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \
  mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \
  mm1 = _mm_or_si64(mm1, mm0);     /* mm1=OutrL=( 0  1  2  3  4  5  6  7) */ \
  mm5 = _mm_or_si64(mm5, mm2);     /* mm5=OutrH=( 8  9 10 11 12 13 14 15) */ \
  \
  _mm_store_si64((__m64 *)outptr##r, mm1); \
  _mm_store_si64((__m64 *)outptr##r + 1, mm5); \
}
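
/*
 * Across iterations of the column loop, wk[0]/wk[1] hold the previous
 * block's last column sum for the upper/lower output row (the left
 * neighbor of column 0), and wk[2]/wk[3] hold the next block's first
 * column sum (the right neighbor of column 7).  For the first and last
 * blocks the edge column is replicated instead.  PROCESS_ROW(0) and
 * PROCESS_ROW(1) then expand the upper and lower output rows of the
 * current block.
 */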

void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
                                   JDIMENSION downsampled_width,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol, tmp, tmp1;
  __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0;
  __m64 wk[4], mm_tmp;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

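    /* If the downsampled width is not a multiple of 8, duplicate the last
     * sample of each input row into the element just past the end, so that
     * the final partial block sees a valid (edge-replicated) right-hand
     * neighbor for its last output column. */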
    if (downsampled_width & 7) {
      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
      tmp1 =  downsampled_width * sizeof(JSAMPLE);
      asm("daddu  $8, %3, %6\r\n"
          "lb     $9, ($8)\r\n"
          "daddu  $8, %3, %7\r\n"
          "sb     $9, ($8)\r\n"
          "daddu  $8, %4, %6\r\n"
          "lb     $9, ($8)\r\n"
          "daddu  $8, %4, %7\r\n"
          "sb     $9, ($8)\r\n"
          "daddu  $8, %5, %6\r\n"
          "lb     $9, ($8)\r\n"
          "daddu  $8, %5, %7\r\n"
          "sb     $9, ($8)\r\n"
          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
          : "$8", "$9"
         );
    }

    /* process the first column block */
    mm0 = _mm_load_si64((__m64 *)inptr0);     /* mm0 = row[ 0][0] */
    mm1 = _mm_load_si64((__m64 *)inptr_1);    /* mm1 = row[-1][0] */
    mm2 = _mm_load_si64((__m64 *)inptr1);     /* mm2 = row[+1][0] */

    mm3 = _mm_xor_si64(mm3, mm3);             /* mm3 = (all 0's) */
    mm4 = mm0;
    mm0 = _mm_unpacklo_pi8(mm0, mm3);         /* mm0 = row[ 0][0]( 0 1 2 3) */
    mm4 = _mm_unpackhi_pi8(mm4, mm3);         /* mm4 = row[ 0][0]( 4 5 6 7) */
    mm5 = mm1;
    mm1 = _mm_unpacklo_pi8(mm1, mm3);         /* mm1 = row[-1][0]( 0 1 2 3) */
    mm5 = _mm_unpackhi_pi8(mm5, mm3);         /* mm5 = row[-1][0]( 4 5 6 7) */
    mm6 = mm2;
    mm2 = _mm_unpacklo_pi8(mm2, mm3);         /* mm2 = row[+1][0]( 0 1 2 3) */
    mm6 = _mm_unpackhi_pi8(mm6, mm3);         /* mm6 = row[+1][0]( 4 5 6 7) */

    mm0 = _mm_mullo_pi16(mm0, PW_THREE);
    mm4 = _mm_mullo_pi16(mm4, PW_THREE);

    mm7 = _mm_cmpeq_pi8(mm7, mm7);            /* mm7 = (all 1's) */
    mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* mm7 = {0xFFFF 0 0 0}, keeps column 0 */

    mm1 = _mm_add_pi16(mm1, mm0);             /* mm1=Int0L=( 0 1 2 3) */
    mm5 = _mm_add_pi16(mm5, mm4);             /* mm5=Int0H=( 4 5 6 7) */
    mm2 = _mm_add_pi16(mm2, mm0);             /* mm2=Int1L=( 0 1 2 3) */
    mm6 = _mm_add_pi16(mm6, mm4);             /* mm6=Int1H=( 4 5 6 7) */

    _mm_store_si64((__m64 *)outptr0, mm1);      /* temporarily save */
    _mm_store_si64((__m64 *)outptr0 + 1, mm5);  /* the intermediate data */
    _mm_store_si64((__m64 *)outptr1, mm2);
    _mm_store_si64((__m64 *)outptr1 + 1, mm6);

    mm1 = _mm_and_si64(mm1, mm7);             /* mm1=( 0 - - -) */
    mm2 = _mm_and_si64(mm2, mm7);             /* mm2=( 0 - - -) */

    wk[0] = mm1;
    wk[1] = mm2;

    for (incol = downsampled_width; incol > 0;
         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
         outptr0 += 16, outptr1 += 16) {

      if (incol > 8) {
        /* process the next column block */
        mm0 = _mm_load_si64((__m64 *)inptr0 + 1);   /* mm0 = row[ 0][1] */
        mm1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* mm1 = row[-1][1] */
        mm2 = _mm_load_si64((__m64 *)inptr1 + 1);   /* mm2 = row[+1][1] */

        mm3 = _mm_setzero_si64();             /* mm3 = (all 0's) */
        mm4 = mm0;
        mm0 = _mm_unpacklo_pi8(mm0, mm3);     /* mm0 = row[ 0][1]( 0 1 2 3) */
        mm4 = _mm_unpackhi_pi8(mm4, mm3);     /* mm4 = row[ 0][1]( 4 5 6 7) */
        mm5 = mm1;
        mm1 = _mm_unpacklo_pi8(mm1, mm3);     /* mm1 = row[-1][1]( 0 1 2 3) */
        mm5 = _mm_unpackhi_pi8(mm5, mm3);     /* mm5 = row[-1][1]( 4 5 6 7) */
        mm6 = mm2;
        mm2 = _mm_unpacklo_pi8(mm2, mm3);     /* mm2 = row[+1][1]( 0 1 2 3) */
        mm6 = _mm_unpackhi_pi8(mm6, mm3);     /* mm6 = row[+1][1]( 4 5 6 7) */

        mm0 = _mm_mullo_pi16(mm0, PW_THREE);
        mm4 = _mm_mullo_pi16(mm4, PW_THREE);

        mm1 = _mm_add_pi16(mm1, mm0);         /* mm1 = Int0L = ( 0 1 2 3) */
        mm5 = _mm_add_pi16(mm5, mm4);         /* mm5 = Int0H = ( 4 5 6 7) */
        mm2 = _mm_add_pi16(mm2, mm0);         /* mm2 = Int1L = ( 0 1 2 3) */
        mm6 = _mm_add_pi16(mm6, mm4);         /* mm6 = Int1H = ( 4 5 6 7) */

        _mm_store_si64((__m64 *)outptr0 + 2, mm1);  /* temporarily save */
        _mm_store_si64((__m64 *)outptr0 + 3, mm5);  /* the intermediate data */
        _mm_store_si64((__m64 *)outptr1 + 2, mm2);
        _mm_store_si64((__m64 *)outptr1 + 3, mm6);

        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm1=( - - - 0) */
        mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm2=( - - - 0) */

        wk[2] = mm1;
        wk[3] = mm2;
      } else {
        /* process the last column block */
        mm1 = _mm_cmpeq_pi8(mm1, mm1);
        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT);
        mm2 = mm1;

        mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1);
        mm1 = _mm_and_si64(mm1, mm_tmp);      /* mm1=( - - - 7) */
        mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1);
        mm2 = _mm_and_si64(mm2, mm_tmp);      /* mm2=( - - - 7) */

        wk[2] = mm1;
        wk[3] = mm2;
      }

      /* process the upper row */
      PROCESS_ROW(0)

      /* process the lower row */
      PROCESS_ROW(1)
    }
  }
}