/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, 2018, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 * Authors:  ZhuChen     <zhuchen@loongson.cn>
 *           CaiWanwei   <caiwanwei@loongson.cn>
 *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
 *
 * Based on the x86 SIMD extension for IJG JPEG library
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
29 */ 30 31 /* SLOW INTEGER INVERSE DCT */ 32 33 #include "jsimd_mmi.h" 34 35 36 #define CONST_BITS 13 37 #define PASS1_BITS 2 38 #define DESCALE_P1 (CONST_BITS - PASS1_BITS) 39 #define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) 40 #define CENTERJSAMPLE 128 41 42 #define FIX_0_298 ((short)2446) /* FIX(0.298631336) */ 43 #define FIX_0_390 ((short)3196) /* FIX(0.390180644) */ 44 #define FIX_0_899 ((short)7373) /* FIX(0.899976223) */ 45 #define FIX_0_541 ((short)4433) /* FIX(0.541196100) */ 46 #define FIX_0_765 ((short)6270) /* FIX(0.765366865) */ 47 #define FIX_1_175 ((short)9633) /* FIX(1.175875602) */ 48 #define FIX_1_501 ((short)12299) /* FIX(1.501321110) */ 49 #define FIX_1_847 ((short)15137) /* FIX(1.847759065) */ 50 #define FIX_1_961 ((short)16069) /* FIX(1.961570560) */ 51 #define FIX_2_053 ((short)16819) /* FIX(2.053119869) */ 52 #define FIX_2_562 ((short)20995) /* FIX(2.562915447) */ 53 #define FIX_3_072 ((short)25172) /* FIX(3.072711026) */ 54 55 enum const_index { 56 index_PW_F130_F054, 57 index_PW_F054_MF130, 58 index_PW_MF078_F117, 59 index_PW_F117_F078, 60 index_PW_MF060_MF089, 61 index_PW_MF089_F060, 62 index_PW_MF050_MF256, 63 index_PW_MF256_F050, 64 index_PD_DESCALE_P1, 65 index_PD_DESCALE_P2, 66 index_PB_CENTERJSAMP 67 }; 68 69 static uint64_t const_value[] = { 70 _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765), 71 FIX_0_541, (FIX_0_541 + FIX_0_765)), 72 _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541, 73 (FIX_0_541 - FIX_1_847), FIX_0_541), 74 _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961), 75 FIX_1_175, (FIX_1_175 - FIX_1_961)), 76 _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175, 77 (FIX_1_175 - FIX_0_390), FIX_1_175), 78 _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899), 79 -FIX_0_899, (FIX_0_298 - FIX_0_899)), 80 _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899, 81 (FIX_1_501 - FIX_0_899), -FIX_0_899), 82 _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562), 83 -FIX_2_562, (FIX_2_053 - FIX_2_562)), 84 
_uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562, 85 (FIX_3_072 - FIX_2_562), -FIX_2_562), 86 _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))), 87 _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))), 88 _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, 89 CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE) 90 }; 91 92 #define PW_F130_F054 get_const_value(index_PW_F130_F054) 93 #define PW_F054_MF130 get_const_value(index_PW_F054_MF130) 94 #define PW_MF078_F117 get_const_value(index_PW_MF078_F117) 95 #define PW_F117_F078 get_const_value(index_PW_F117_F078) 96 #define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089) 97 #define PW_MF089_F060 get_const_value(index_PW_MF089_F060) 98 #define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256) 99 #define PW_MF256_F050 get_const_value(index_PW_MF256_F050) 100 #define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1) 101 #define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2) 102 #define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP) 103 104 105 #define test_m32_zero(mm32) (!(*(uint32_t *)&mm32)) 106 #define test_m64_zero(mm64) (!(*(uint64_t *)&mm64)) 107 108 109 #define DO_IDCT_COMMON(PASS) { \ 110 __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \ 111 __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ 112 __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \ 113 __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \ 114 __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \ 115 \ 116 z3 = _mm_add_pi16(tmp0, tmp2); \ 117 z4 = _mm_add_pi16(tmp1, tmp3); \ 118 \ 119 /* (Original) \ 120 * z5 = (z3 + z4) * 1.175875602; \ 121 * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ 122 * z3 += z5; z4 += z5; \ 123 * \ 124 * (This implementation) \ 125 * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ 126 * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ 127 */ \ 128 \ 129 z34l = _mm_unpacklo_pi16(z3, z4); \ 
130 z34h = _mm_unpackhi_pi16(z3, z4); \ 131 z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \ 132 z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \ 133 z4l = _mm_madd_pi16(z34l, PW_F117_F078); \ 134 z4h = _mm_madd_pi16(z34h, PW_F117_F078); \ 135 \ 136 /* (Original) \ 137 * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ 138 * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ 139 * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ 140 * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ 141 * tmp0 += z1 + z3; tmp1 += z2 + z4; \ 142 * tmp2 += z2 + z3; tmp3 += z1 + z4; \ 143 * \ 144 * (This implementation) \ 145 * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ 146 * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ 147 * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ 148 * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ 149 * tmp0 += z3; tmp1 += z4; \ 150 * tmp2 += z3; tmp3 += z4; \ 151 */ \ 152 \ 153 tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \ 154 tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \ 155 \ 156 tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \ 157 tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \ 158 tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \ 159 tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \ 160 \ 161 tmp0l = _mm_add_pi32(tmp0l, z3l); \ 162 tmp0h = _mm_add_pi32(tmp0h, z3h); \ 163 tmp3l = _mm_add_pi32(tmp3l, z4l); \ 164 tmp3h = _mm_add_pi32(tmp3h, z4h); \ 165 \ 166 tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \ 167 tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \ 168 \ 169 tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \ 170 tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \ 171 tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \ 172 tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \ 173 \ 174 tmp1l = _mm_add_pi32(tmp1l, z4l); \ 175 tmp1h = _mm_add_pi32(tmp1h, z4h); \ 176 tmp2l = _mm_add_pi32(tmp2l, z3l); \ 177 tmp2h = _mm_add_pi32(tmp2h, z3h); \ 178 \ 179 /* Final output stage */ \ 180 \ 181 out0l = 
_mm_add_pi32(tmp10l, tmp3l); \ 182 out0h = _mm_add_pi32(tmp10h, tmp3h); \ 183 out7l = _mm_sub_pi32(tmp10l, tmp3l); \ 184 out7h = _mm_sub_pi32(tmp10h, tmp3h); \ 185 \ 186 out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \ 187 out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \ 188 out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \ 189 out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \ 190 \ 191 out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \ 192 out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \ 193 out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \ 194 out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \ 195 \ 196 out0 = _mm_packs_pi32(out0l, out0h); \ 197 out7 = _mm_packs_pi32(out7l, out7h); \ 198 \ 199 out1l = _mm_add_pi32(tmp11l, tmp2l); \ 200 out1h = _mm_add_pi32(tmp11h, tmp2h); \ 201 out6l = _mm_sub_pi32(tmp11l, tmp2l); \ 202 out6h = _mm_sub_pi32(tmp11h, tmp2h); \ 203 \ 204 out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \ 205 out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \ 206 out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \ 207 out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \ 208 \ 209 out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \ 210 out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \ 211 out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \ 212 out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \ 213 \ 214 out1 = _mm_packs_pi32(out1l, out1h); \ 215 out6 = _mm_packs_pi32(out6l, out6h); \ 216 \ 217 out2l = _mm_add_pi32(tmp12l, tmp1l); \ 218 out2h = _mm_add_pi32(tmp12h, tmp1h); \ 219 out5l = _mm_sub_pi32(tmp12l, tmp1l); \ 220 out5h = _mm_sub_pi32(tmp12h, tmp1h); \ 221 \ 222 out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \ 223 out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \ 224 out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \ 225 out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \ 226 \ 227 out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \ 228 out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \ 229 out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \ 230 out5h = 
_mm_srai_pi32(out5h, DESCALE_P##PASS); \ 231 \ 232 out2 = _mm_packs_pi32(out2l, out2h); \ 233 out5 = _mm_packs_pi32(out5l, out5h); \ 234 \ 235 out3l = _mm_add_pi32(tmp13l, tmp0l); \ 236 out3h = _mm_add_pi32(tmp13h, tmp0h); \ 237 \ 238 out4l = _mm_sub_pi32(tmp13l, tmp0l); \ 239 out4h = _mm_sub_pi32(tmp13h, tmp0h); \ 240 \ 241 out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \ 242 out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \ 243 out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \ 244 out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \ 245 \ 246 out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \ 247 out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \ 248 out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \ 249 out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \ 250 \ 251 out3 = _mm_packs_pi32(out3l, out3h); \ 252 out4 = _mm_packs_pi32(out4l, out4h); \ 253 } 254 255 #define DO_IDCT_PASS1(iter) { \ 256 __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \ 257 __m64 quant0l, quant1l, quant2l, quant3l; \ 258 __m64 quant4l, quant5l, quant6l, quant7l; \ 259 __m64 z23, z2, z3, z23l, z23h; \ 260 __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \ 261 __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \ 262 __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ 263 __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \ 264 __m32 col0a, col1a, mm0; \ 265 \ 266 col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \ 267 col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \ 268 mm0 = _mm_or_si32(col0a, col1a); \ 269 \ 270 if (test_m32_zero(mm0)) { \ 271 __m64 mm1, mm2; \ 272 \ 273 col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \ 274 col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \ 275 col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \ 276 col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \ 277 col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \ 278 col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); 
\ 279 col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \ 280 col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \ 281 \ 282 mm1 = _mm_or_si64(col1l, col3l); \ 283 mm2 = _mm_or_si64(col2l, col4l); \ 284 mm1 = _mm_or_si64(mm1, col5l); \ 285 mm2 = _mm_or_si64(mm2, col6l); \ 286 mm1 = _mm_or_si64(mm1, col7l); \ 287 mm1 = _mm_or_si64(mm1, mm2); \ 288 \ 289 if (test_m64_zero(mm1)) { \ 290 __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \ 291 \ 292 /* AC terms all zero */ \ 293 \ 294 quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ 295 \ 296 dcval = _mm_mullo_pi16(col0l, quant0l); \ 297 dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \ 298 \ 299 dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \ 300 dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \ 301 \ 302 row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \ 303 row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \ 304 row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \ 305 row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \ 306 \ 307 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \ 308 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \ 309 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \ 310 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \ 311 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \ 312 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \ 313 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \ 314 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \ 315 \ 316 goto nextcolumn##iter; \ 317 } \ 318 } \ 319 \ 320 /* Even part \ 321 * \ 322 * (Original) \ 323 * z1 = (z2 + z3) * 0.541196100; \ 324 * tmp2 = z1 + z3 * -1.847759065; \ 325 * tmp3 = z1 + z2 * 0.765366865; \ 326 * \ 327 * (This implementation) \ 328 * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ 329 * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 
0.541196100; \ 330 */ \ 331 \ 332 col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \ 333 col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \ 334 col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \ 335 col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \ 336 \ 337 quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ 338 quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \ 339 quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \ 340 quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \ 341 \ 342 z2 = _mm_mullo_pi16(col2l, quant2l); \ 343 z3 = _mm_mullo_pi16(col6l, quant6l); \ 344 \ 345 z23l = _mm_unpacklo_pi16(z2, z3); \ 346 z23h = _mm_unpackhi_pi16(z2, z3); \ 347 tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \ 348 tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \ 349 tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \ 350 tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \ 351 \ 352 z2 = _mm_mullo_pi16(col0l, quant0l); \ 353 z3 = _mm_mullo_pi16(col4l, quant4l); \ 354 \ 355 z23 = _mm_add_pi16(z2, z3); \ 356 tmp0l = _mm_loadlo_pi16_f(z23); \ 357 tmp0h = _mm_loadhi_pi16_f(z23); \ 358 tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \ 359 tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \ 360 \ 361 tmp10l = _mm_add_pi32(tmp0l, tmp3l); \ 362 tmp10h = _mm_add_pi32(tmp0h, tmp3h); \ 363 tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \ 364 tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \ 365 \ 366 z23 = _mm_sub_pi16(z2, z3); \ 367 tmp1l = _mm_loadlo_pi16_f(z23); \ 368 tmp1h = _mm_loadhi_pi16_f(z23); \ 369 tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \ 370 tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \ 371 \ 372 tmp11l = _mm_add_pi32(tmp1l, tmp2l); \ 373 tmp11h = _mm_add_pi32(tmp1h, tmp2h); \ 374 tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \ 375 tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \ 376 \ 377 /* Odd part */ \ 378 \ 379 col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \ 380 
col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \ 381 col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \ 382 col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \ 383 \ 384 quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \ 385 quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \ 386 quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \ 387 quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \ 388 \ 389 tmp0 = _mm_mullo_pi16(col7l, quant7l); \ 390 tmp1 = _mm_mullo_pi16(col5l, quant5l); \ 391 tmp2 = _mm_mullo_pi16(col3l, quant3l); \ 392 tmp3 = _mm_mullo_pi16(col1l, quant1l); \ 393 \ 394 DO_IDCT_COMMON(1) \ 395 \ 396 /* out0=(00 10 20 30), out1=(01 11 21 31) */ \ 397 /* out2=(02 12 22 32), out3=(03 13 23 33) */ \ 398 /* out4=(04 14 24 34), out5=(05 15 25 35) */ \ 399 /* out6=(06 16 26 36), out7=(07 17 27 37) */ \ 400 \ 401 /* Transpose coefficients */ \ 402 \ 403 row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \ 404 row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \ 405 row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \ 406 row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \ 407 \ 408 row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \ 409 row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \ 410 row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \ 411 row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \ 412 \ 413 row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \ 414 row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \ 415 row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \ 416 row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \ 417 \ 418 row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \ 419 row1h = _mm_unpackhi_pi32(row01c, row01d); /* 
row1h=(14 15 16 17) */ \ 420 row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \ 421 row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \ 422 \ 423 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \ 424 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \ 425 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \ 426 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \ 427 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \ 428 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \ 429 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \ 430 _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \ 431 } 432 433 #define DO_IDCT_PASS2(ctr) { \ 434 __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \ 435 __m64 z23, z23l, z23h; \ 436 __m64 col0123a, col0123b, col0123c, col0123d; \ 437 __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \ 438 __m64 col0, col1, col2, col3; \ 439 __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ 440 __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \ 441 \ 442 row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ 443 row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ 444 row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ 445 row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ 446 row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \ 447 row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \ 448 row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \ 449 row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \ 450 \ 451 /* Even part \ 452 * \ 453 * (Original) \ 454 * z1 = (z2 + z3) * 0.541196100; \ 455 * tmp2 = z1 + z3 * -1.847759065; \ 456 * tmp3 = z1 + z2 * 0.765366865; \ 457 * \ 458 * (This implementation) \ 459 * tmp2 = z2 * 0.541196100 + z3 * 
(0.541196100 - 1.847759065); \ 460 * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ 461 */ \ 462 \ 463 z23l = _mm_unpacklo_pi16(row2l, row6l); \ 464 z23h = _mm_unpackhi_pi16(row2l, row6l); \ 465 \ 466 tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \ 467 tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \ 468 tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \ 469 tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \ 470 \ 471 z23 = _mm_add_pi16(row0l, row4l); \ 472 tmp0l = _mm_loadlo_pi16_f(z23); \ 473 tmp0h = _mm_loadhi_pi16_f(z23); \ 474 tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \ 475 tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \ 476 \ 477 tmp10l = _mm_add_pi32(tmp0l, tmp3l); \ 478 tmp10h = _mm_add_pi32(tmp0h, tmp3h); \ 479 tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \ 480 tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \ 481 \ 482 z23 = _mm_sub_pi16(row0l, row4l); \ 483 tmp1l = _mm_loadlo_pi16_f(z23); \ 484 tmp1h = _mm_loadhi_pi16_f(z23); \ 485 tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \ 486 tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \ 487 \ 488 tmp11l = _mm_add_pi32(tmp1l, tmp2l); \ 489 tmp11h = _mm_add_pi32(tmp1h, tmp2h); \ 490 tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \ 491 tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \ 492 \ 493 /* Odd part */ \ 494 \ 495 tmp0 = row7l; \ 496 tmp1 = row5l; \ 497 tmp2 = row3l; \ 498 tmp3 = row1l; \ 499 \ 500 DO_IDCT_COMMON(2) \ 501 \ 502 /* out0=(00 01 02 03), out1=(10 11 12 13) */ \ 503 /* out2=(20 21 22 23), out3=(30 31 32 33) */ \ 504 /* out4=(40 41 42 43), out5=(50 51 52 53) */ \ 505 /* out6=(60 61 62 63), out7=(70 71 72 73) */ \ 506 \ 507 row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \ 508 row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \ 509 row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \ 510 row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \ 511 \ 512 row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \ 513 row17 = _mm_add_pi8(row17, 
PB_CENTERJSAMP); \ 514 row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \ 515 row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \ 516 \ 517 /* Transpose coefficients */ \ 518 \ 519 col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \ 520 col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \ 521 col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \ 522 col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \ 523 \ 524 col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \ 525 col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \ 526 col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \ 527 col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \ 528 \ 529 col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \ 530 col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \ 531 col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \ 532 col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \ 533 \ 534 _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \ 535 _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \ 536 _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \ 537 _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \ 538 } 539 540 void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block, 541 JSAMPARRAY output_buf, JDIMENSION output_col) 542 { 543 __m64 tmp0, tmp1, tmp2, tmp3; 544 __m64 out0, out1, out2, out3, out4, out5, out6, out7; 545 JCOEFPTR inptr; 546 ISLOW_MULT_TYPE *quantptr; 547 JCOEF *wsptr; 548 JCOEF workspace[DCTSIZE2]; /* buffers data between passes */ 549 550 /* Pass 1: process columns. 
*/ 551 552 inptr = coef_block; 553 quantptr = (ISLOW_MULT_TYPE *)dct_table; 554 wsptr = workspace; 555 556 DO_IDCT_PASS1(1) 557 nextcolumn1: 558 inptr += 4; 559 quantptr += 4; 560 wsptr += DCTSIZE * 4; 561 DO_IDCT_PASS1(2) 562 nextcolumn2: 563 564 /* Pass 2: process rows. */ 565 566 wsptr = workspace; 567 568 DO_IDCT_PASS2(0) 569 wsptr += 4; 570 DO_IDCT_PASS2(4) 571 } 572