/*
 * Copyright (C) 2010-2011 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"		/* Private declarations for DCT subsystem */

#ifdef ANDROID_INTELSSE2_IDCT
#include <emmintrin.h>

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif

#define BITS_INV_ACC 4
#define SHIFT_INV_ROW 12
#define SHIFT_INV_COL 5
/* Rounding constants in scalar form; the SSE2 code reads the M128_* vector
 * copies below. */
const short RND_INV_ROW = 2048;		/* 1 << (SHIFT_INV_ROW - 1) */
const short RND_INV_COL = 16;		/* 1 << (SHIFT_INV_COL - 1) */
const short RND_INV_CORR = 15;		/* RND_INV_COL - 1 */

static const short __attribute__ ((aligned(16))) M128_one_corr[8] = {1,1,1,1,1,1,1,1};		/* +1 correction used in the column pass */
static const short __attribute__ ((aligned(16))) M128_round_inv_row[8] = {2048,0,2048,0,2048,0,2048,0};	/* four 32-bit lanes of 2048 */
static const short __attribute__ ((aligned(16))) M128_round_inv_col[8] = {16,16,16,16,16,16,16,16};
static const short __attribute__ ((aligned(16))) M128_round_inv_corr[8] = {15,15,15,15,15,15,15,15};

static const short __attribute__ ((aligned(16))) M128_tg_1_16[8] = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036};
static const short __attribute__ ((aligned(16))) M128_tg_2_16[8] = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146};
static const short __attribute__ ((aligned(16))) M128_tg_3_16[8] = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746};
static const short __attribute__ ((aligned(16))) M128_cos_4_16[8] = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195};
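/*
 * The M128_tg_* and M128_cos_4_16 values above appear to be Q16 fixed-point
 * multipliers consumed by _mm_mulhi_epi16, which keeps only the high 16 bits
 * of the 32-bit product (an implicit divide by 65536).  Numerically:
 *
 *   tan(1*pi/16) * 65536       ~=  0.19891 * 65536 ~=  13036
 *   tan(2*pi/16) * 65536       ~=  0.41421 * 65536 ~=  27146
 *   (tan(3*pi/16) - 1) * 65536 ~= -0.33182 * 65536 ~= -21746
 *   (cos(pi/4)    - 1) * 65536 ~= -0.29289 * 65536 ~= -19195
 *
 * The last two are stored minus 1.0 so they fit in a signed 16-bit word; the
 * column pass adds the original operand back afterwards (the "coef
 * adjustment" adds in iDCT_8_COL).
 */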

static const short __attribute__ ((aligned(16))) jpeg_adjust[8] = {128, 128, 128, 128, 128, 128, 128, 128};	/* +128 level shift (CENTERJSAMPLE) */

// Table for rows 0,4
static const short __attribute__ ((aligned(16))) M128_tab_i_04[32] = {
16384, 21407, 16384, 8867,
16384, -8867, 16384, -21407,
16384, 8867, -16384, -21407,
-16384, 21407, 16384, -8867,
22725, 19266, 19266, -4520,
12873, -22725, 4520, -12873,
12873, 4520, -22725, -12873,
4520, 19266, 19266, -22725
};

// Table for rows 1,7
static const short __attribute__ ((aligned(16))) M128_tab_i_17[32] = {
22725, 29692, 22725, 12299,
22725, -12299, 22725, -29692,
22725, 12299, -22725, -29692,
-22725, 29692, 22725, -12299,
31521, 26722, 26722, -6270,
17855, -31521, 6270, -17855,
17855, 6270, -31521, -17855,
6270, 26722, 26722, -31521
};

// Table for rows 2,6
static const short __attribute__ ((aligned(16))) M128_tab_i_26[32] = {
21407, 27969, 21407, 11585,
21407, -11585, 21407, -27969,
21407, 11585, -21407, -27969,
-21407, 27969, 21407, -11585,
29692, 25172, 25172, -5906,
16819, -29692, 5906, -16819,
16819, 5906, -29692, -16819,
5906, 25172, 25172, -29692
};

// Table for rows 3,5
static const short __attribute__ ((aligned(16))) M128_tab_i_35[32] = {
19266, 25172, 19266, 10426,
19266, -10426, 19266, -25172,
19266, 10426, -19266, -25172,
-19266, 25172, 19266, -10426,
26722, 22654, 22654, -5315,
15137, -26722, 5315, -15137,
15137, 5315, -26722, -15137,
5315, 22654, 22654, -26722
};


/*
 * Perform dequantization and inverse DCT on one block of coefficients,
 * using SSE2 intrinsics.
 */
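/*
 * Outline of the routine below (derived from the code itself; the overall
 * scheme appears to follow Intel's AP-922/AP-945 style row/column IDCT):
 *
 *  1. The quantization table and coefficient block are copied into 16-byte
 *     aligned local buffers so that aligned SSE2 loads can be used.
 *  2. Row pass: rows are dequantized (pmullw) and transformed two at a time
 *     by iDCT_8_2ROWs(), which builds each row's outputs from pmaddwd dot
 *     products against the pre-arranged M128_tab_i_* tables, then rounds and
 *     shifts right by SHIFT_INV_ROW.
 *  3. Column pass: iDCT_8_COL() runs the 1-D IDCT on all eight columns in
 *     parallel with 16-bit saturating arithmetic, shifts right by
 *     SHIFT_INV_COL, adds the +128 level shift and packs to unsigned bytes.
 *  4. The 8x8 byte result is copied row by row into output_buf/output_col.
 */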

GLOBAL(void)
jpeg_idct_intelsse (j_decompress_ptr cinfo, jpeg_component_info * compptr,
		 JCOEFPTR coef_block,
		 JSAMPARRAY output_buf, JDIMENSION output_col)
{
  __m128i row0, tmp1, tmp2, tmp3, row2, tmp5, tmp6, tmp7;
  int ctr;
  JSAMPROW  outptrTemp;
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);	/* not used here; _mm_packus_epi16 handles the clamping */
  short __attribute__((aligned(16))) quantptrSSE[DCTSIZE2];
  short __attribute__((aligned(16))) workspaceSSE[DCTSIZE2];
  short __attribute__((aligned(16))) coef_blockSSE[DCTSIZE2];
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i* tg3, *tg1, *tg2, *cos4;
  __m128i tm765, tp765, tm465, tp465, tp03, tm03, tp12, tm12, tp65, tm65;
  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
  __m128i temp, temp2;
  short * wsptr;
  unsigned char * outptr;

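/*
 * iDCT_8_2ROWs(table1, table2) -- row pass for the two dequantized rows held
 * in row0 and row2 (using table1 and table2 respectively).  The shuffles below
 * regroup each row into the (x0,x2), (x4,x6), (x1,x3) and (x5,x7) pairs;
 * pmaddwd against the 32-entry table then accumulates an even part a[0..3]
 * and an odd part b[0..3], giving y[i] = a[i] + b[i] and y[7-i] = a[i] - b[i].
 * The 32-bit sums carry the 2048 rounding bias from M128_round_inv_row, are
 * shifted right by SHIFT_INV_ROW (12), and packed back to eight signed
 * 16-bit outputs per row.
 */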
#define iDCT_8_2ROWs(table1, table2)   \
    row0 = _mm_shufflelo_epi16(row0, 0xD8); /*x7, x6, x5, x4, x3, x1, x2, x0*/    \
    row2 = _mm_shufflelo_epi16(row2, 0xD8);   \
    tmp1 = _mm_shuffle_epi32(row0, 0);      /*x2, x0, x2, x0, x2, x0, x2, x0*/    \
    tmp5 = _mm_shuffle_epi32(row2, 0);        \
                                                                                  \
    tmp3 = _mm_shuffle_epi32(row0, 0x55);   /*x3, x1, x3, x1, x3, x1, x3, x1*/    \
    tmp7 = _mm_shuffle_epi32(row2, 0x55);     \
    row0 = _mm_shufflehi_epi16(row0, 0xD8); /*x7, x5, x6, x4, x3, x1, x2, x0*/    \
    row2 = _mm_shufflehi_epi16(row2, 0xD8);   \
						\
    tmp1 = _mm_madd_epi16(tmp1, * ( __m128i*)table1);      /*x2*w13+x0*w12, x2*w9+x0*w8, x2*w5+x0*w4, x2*w1+x0*w0*/   \
    tmp5 = _mm_madd_epi16(tmp5, * ( __m128i*)table2);       \
						\
    tmp2 =  _mm_shuffle_epi32(row0, 0xAA);  /*x6, x4, x6, x4, x6, x4, x6, x4*/    \
    tmp6 = _mm_shuffle_epi32(row2, 0xAA);     \
    row0 = _mm_shuffle_epi32(row0, 0xFF);   /*x7, x5, x7, x5, x7, x5, x7, x5*/    \
    row2 = _mm_shuffle_epi32(row2, 0xFF);     \
\
    tmp3 = _mm_madd_epi16(tmp3, * ( __m128i*)(table1+16)); /*x3*w29+x1*w28, x3*w25+x1*w24, x3*w21+x1*w20, x3*w17+x1*w16*/  \
    tmp7 = _mm_madd_epi16(tmp7, * ( __m128i*)(table2+16) ); \
    row0 = _mm_madd_epi16(row0, * ( __m128i*)(table1+24)); /*x7*w31+x5*w30, x7*w27+x5*w26, x7*w23+x5*w22, x7*w19+x5*w18*/  \
    row2 = _mm_madd_epi16(row2, * ( __m128i*)(table2+24) ); \
    tmp2 = _mm_madd_epi16(tmp2, * ( __m128i*)(table1+8) ); /*x6*w15+x4*w14, x6*w11+x4*w10, x6*w7+x4*w6, x6*w3+x4*w2*/  \
    tmp6 = _mm_madd_epi16(tmp6, * ( __m128i*)(table2+8) );  \
                                                             \
    tmp1 = _mm_add_epi32(tmp1, * ( __m128i*)M128_round_inv_row);       \
    tmp5 = _mm_add_epi32(tmp5, * ( __m128i*)M128_round_inv_row);      \
    row0 = _mm_add_epi32(row0, tmp3);    /*b3, b2, b1, b0*/  \
    row2 = _mm_add_epi32(row2, tmp7);                       \
    tmp1 = _mm_add_epi32(tmp1, tmp2);    /*a3, a2, a1, a0*/  \
    tmp5 = _mm_add_epi32(tmp5, tmp6);                       \
                                                             \
    tmp2 = tmp1;  \
    tmp6 = tmp5;  \
    tmp2 = _mm_sub_epi32(tmp2, row0); /*for row0. y4= a3-b3, y5=a2-b2, y6=a1-b1, y7=a0-b0 */   \
    tmp6 = _mm_sub_epi32(tmp6, row2);  \
    row0 = _mm_add_epi32(row0, tmp1); /*y3=a3+b3,y2=a2+b2,y1=a1+b1,y0=a0+b0*/   \
    row2 = _mm_add_epi32(row2, tmp5);  \
    tmp2 = _mm_srai_epi32(tmp2, SHIFT_INV_ROW);             \
    tmp6 = _mm_srai_epi32(tmp6, SHIFT_INV_ROW);  \
    row0 = _mm_srai_epi32(row0, SHIFT_INV_ROW);             \
    row2 = _mm_srai_epi32(row2, SHIFT_INV_ROW);  \
    tmp2 = _mm_shuffle_epi32(tmp2, 0x1B); /*y7, y6, y5, y4*/   \
    tmp6 = _mm_shuffle_epi32(tmp6, 0x1B);  \
    row0 = _mm_packs_epi32(row0, tmp2); /*row0 = y7,y6,y5,y4,y3,y2,y1,y0*/  \
    row2 = _mm_packs_epi32(row2, tmp6);  /*row2 = y7,...y0*/


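/*
 * iDCT_8_COL() -- column pass over all eight columns at once, using the
 * row-pass results: rows 0,1,2,3,4,6 are reloaded from wsptr, while rows 5
 * and 7 are still live in row0/row2.  The odd/even butterflies multiply by
 * the Q16 tangent/cosine constants via _mm_mulhi_epi16 with saturating
 * 16-bit adds; the M128_one_corr and M128_round_inv_col/corr terms bias the
 * intermediate sums before the final shift.  Each pair of result rows is
 * shifted right by SHIFT_INV_COL (5), level-shifted by +128 (jpeg_adjust)
 * and packed to unsigned bytes at outptr.
 */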
#define iDCT_8_COL()  \
    x3 = _mm_load_si128(( __m128i*)(wsptr+24));\
    x1 = _mm_load_si128(( __m128i*)(wsptr+8));\
    x5 = row0;\
    x7 = row2;\
\
    tg3 = ( __m128i*)(M128_tg_3_16);\
    tg1 = ( __m128i*)(M128_tg_1_16);\
    tg2 = ( __m128i*)(M128_tg_2_16);\
    cos4 =(__m128i*)(M128_cos_4_16);\
\
    temp = _mm_mulhi_epi16(x5, *tg3);  /*row5*tg3*/ \
    temp2 = _mm_mulhi_epi16(x3, *tg3);\
    temp = _mm_adds_epi16(temp, x5); /*coef adjustment*/ \
    temp2 = _mm_adds_epi16(temp2, x3);\
    tm765 = _mm_adds_epi16(temp, x3);\
    tm465 = _mm_subs_epi16(x5, temp2);\
\
    temp = _mm_mulhi_epi16(x7, *tg1);  /*row7*tg1*/ \
    temp2 = _mm_mulhi_epi16(x1, *tg1);\
    tp765 = _mm_adds_epi16(temp, x1);\
    tp465 = _mm_subs_epi16(temp2, x7);  /*row1*tg1 - row7*/ \
\
    t7 = _mm_adds_epi16(tp765, tm765);\
    t7 = _mm_adds_epi16(t7, *( __m128i*)M128_one_corr);\
    tp65 = _mm_subs_epi16(tp765, tm765);\
    t4 =  _mm_adds_epi16(tp465, tm465);\
    tm65 = _mm_subs_epi16(tp465, tm465);\
    tm65 = _mm_adds_epi16(tm65, *( __m128i*)M128_one_corr);\
\
    x0 = _mm_load_si128(( __m128i*)(wsptr));\
    x4 = _mm_load_si128(( __m128i*)(wsptr+32));\
    x2 = _mm_load_si128(( __m128i*)(wsptr+16));\
    x6 = _mm_load_si128(( __m128i*)(wsptr+48));\
\
    /*t6 = ( tp65 + tm65 ) * cos_4_16;*/ \
    temp = _mm_adds_epi16(tp65, tm65);\
    temp2 = _mm_subs_epi16(tp65, tm65);\
    t6 = _mm_mulhi_epi16(temp, *cos4);\
    t5 = _mm_mulhi_epi16(temp2, *cos4);\
    t6 = _mm_adds_epi16(t6, temp);\
    t6 = _mm_or_si128(t6, *( __m128i*)M128_one_corr);\
    t5 = _mm_adds_epi16(t5, temp2);\
    t5 = _mm_or_si128(t5, *( __m128i*)M128_one_corr);\
\
    tp03 = _mm_adds_epi16(x0, x4);\
    tp12 = _mm_subs_epi16(x0, x4);\
\
    temp = _mm_mulhi_epi16(x6, *tg2);\
    temp2 = _mm_mulhi_epi16(x2, *tg2);\
    tm03 = _mm_adds_epi16(temp, x2);\
    tm12 = _mm_subs_epi16(temp2, x6);\
\
    t0 = _mm_adds_epi16(tp03, tm03);\
    t0 = _mm_adds_epi16(t0, *( __m128i*)M128_round_inv_col);\
    t3 = _mm_subs_epi16(tp03, tm03);\
    t3 = _mm_adds_epi16(t3, *( __m128i*)M128_round_inv_corr);\
    t1 = _mm_adds_epi16(tp12, tm12);\
    t1 = _mm_adds_epi16(t1, *( __m128i*)M128_round_inv_col);\
    t2 = _mm_subs_epi16(tp12, tm12);\
    t2 = _mm_adds_epi16(t2, *( __m128i*)M128_round_inv_corr);\
\
    temp = _mm_adds_epi16(t0, t7);   /*y0*/ \
    temp2 = _mm_adds_epi16(t1, t6);  /*y1*/ \
    temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\
    temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\
    temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust); /*Add 128 for jpeg decoding*/ \
    temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\
\
    temp = _mm_packus_epi16(temp, temp2);\
    _mm_store_si128(( __m128i*)(outptr), temp);  /*store y0, y1*/ \
\
    temp = _mm_adds_epi16(t2, t5);\
    temp2 = _mm_adds_epi16(t3, t4);\
    temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\
    temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\
    temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\
    temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\
\
    temp = _mm_packus_epi16(temp, temp2);\
    _mm_store_si128(( __m128i*)(outptr+16), temp);  /*store y2, y3*/ \
\
    temp = _mm_subs_epi16(t3, t4);\
    temp2 = _mm_subs_epi16(t2, t5);\
    temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\
    temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\
    temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\
    temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\
\
    temp = _mm_packus_epi16(temp, temp2);\
    _mm_store_si128(( __m128i*)(outptr+32), temp);  /*store y4, y5*/ \
\
    temp = _mm_subs_epi16(t1, t6);\
    temp2 = _mm_subs_epi16(t0, t7);\
    temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\
    temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\
    temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\
    temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\
\
    temp = _mm_packus_epi16(temp, temp2);\
    _mm_store_si128(( __m128i*)(outptr+48), temp);  /*store y6, y7*/


  /* Copy the quantization table and coefficient block into 16-byte aligned
   * local buffers so the aligned SSE2 loads below are legal. */
  memcpy((char*)quantptrSSE, (char*)compptr->dct_table, sizeof(quantptrSSE));
  memcpy((char*)coef_blockSSE, (char*)coef_block, sizeof(coef_blockSSE));

  wsptr = (short *)workspaceSSE;
  outptr = (unsigned char*)workspaceSSE;	/* workspace is reused as the byte output buffer by the column pass */

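  /* Row pass: process the rows in the pairs (0,2), (4,6), (3,1), (5,7).  The
   * first three pairs are written back to the workspace; the final pair is
   * left in row0/row2, which iDCT_8_COL() picks up directly as rows 5 and 7. */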
  // row 0 and row 2
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*2));
  row0 = _mm_mullo_epi16( row0, *(__m128i const*)quantptrSSE );
  row2 = _mm_mullo_epi16( row2, *(__m128i const*)(quantptrSSE+8*2) );

  iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26);

  _mm_store_si128((__m128i*)(wsptr), row0);
  _mm_store_si128((__m128i*)(wsptr+8*2), row2);

  // row 4 and row 6
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*4));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*6));
  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+8*4) );
  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*6) );

  iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26);

  _mm_store_si128((__m128i*)(wsptr+32), row0);
  _mm_store_si128((__m128i*)(wsptr+48), row2);

  // row 3 and row 1
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*3));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*1));
  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+24) );
  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8) );

  iDCT_8_2ROWs(M128_tab_i_35, M128_tab_i_17);

  _mm_store_si128((__m128i*)(wsptr+24), row0);
  _mm_store_si128((__m128i*)(wsptr+8), row2);

  // row 5 and row 7
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*5));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*7));
  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+40) );
  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+56));

  iDCT_8_2ROWs( M128_tab_i_35, M128_tab_i_17);

  iDCT_8_COL();

  for(ctr = 0; ctr < DCTSIZE; ctr++)
  {
    outptrTemp = output_buf[ctr] + output_col;
    memcpy(outptrTemp, outptr, DCTSIZE);
    outptr += DCTSIZE;   /* advance pointer to next row */
  }

  return;
}
#endif /* ANDROID_INTELSSE2_IDCT */