1 /* 2 * Loongson MMI optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved. 5 * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. 6 * All Rights Reserved. 7 * Authors: ZhuChen <zhuchen (at) loongson.cn> 8 * CaiWanwei <caiwanwei (at) loongson.cn> 9 * SunZhangzhi <sunzhangzhi-cq (at) loongson.cn> 10 * 11 * Based on the x86 SIMD extension for IJG JPEG library 12 * Copyright (C) 1999-2006, MIYASAKA Masaru. 13 * 14 * This software is provided 'as-is', without any express or implied 15 * warranty. In no event will the authors be held liable for any damages 16 * arising from the use of this software. 17 * 18 * Permission is granted to anyone to use this software for any purpose, 19 * including commercial applications, and to alter it and redistribute it 20 * freely, subject to the following restrictions: 21 * 22 * 1. The origin of this software must not be misrepresented; you must not 23 * claim that you wrote the original software. If you use this software 24 * in a product, an acknowledgment in the product documentation would be 25 * appreciated but is not required. 26 * 2. Altered source versions must be plainly marked as such, and must not be 27 * misrepresented as being the original software. 28 * 3. This notice may not be removed or altered from any source distribution. 29 */ 30 31 /* CHROMA UPSAMPLING */ 32 33 #include "jsimd_mmi.h" 34 35 36 enum const_index { 37 index_PW_THREE, 38 index_PW_SEVEN, 39 index_PW_EIGHT, 40 }; 41 42 static uint64_t const_value[] = { 43 _uint64_set_pi16(3, 3, 3, 3), 44 _uint64_set_pi16(7, 7, 7, 7), 45 _uint64_set_pi16(8, 8, 8, 8), 46 }; 47 48 #define PW_THREE get_const_value(index_PW_THREE) 49 #define PW_SEVEN get_const_value(index_PW_SEVEN) 50 #define PW_EIGHT get_const_value(index_PW_EIGHT) 51 52 53 #define PROCESS_ROW(r) { \ 54 mm7 = _mm_load_si64((__m64 *)outptr##r); /* mm7=IntrL=( 0 1 2 3) */ \ 55 mm3 = _mm_load_si64((__m64 *)outptr##r + 1); /* mm3=IntrH=( 4 5 6 7) */ \ 56 \ 57 mm0 = mm7; \ 58 mm4 = mm3; \ 59 mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT); /* mm0=( 1 2 3 -) */ \ 60 mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( - - - 4) */ \ 61 mm5 = mm7; \ 62 mm6 = mm3; \ 63 mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm5=( 3 - - -) */ \ 64 mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT); /* mm6=( - 4 5 6) */ \ 65 \ 66 mm0 = _mm_or_si64(mm0, mm4); /* mm0=( 1 2 3 4) */ \ 67 mm5 = _mm_or_si64(mm5, mm6); /* mm5=( 3 4 5 6) */ \ 68 \ 69 mm1 = mm7; \ 70 mm2 = mm3; \ 71 mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT); /* mm1=( - 0 1 2) */ \ 72 mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT); /* mm2=( 5 6 7 -) */ \ 73 mm4 = mm3; \ 74 mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( 7 - - -) */ \ 75 \ 76 mm1 = _mm_or_si64(mm1, wk[r]); /* mm1=(-1 0 1 2) */ \ 77 mm2 = _mm_or_si64(mm2, wk[r + 2]); /* mm2=( 5 6 6 8) */ \ 78 \ 79 wk[r] = mm4; \ 80 \ 81 mm7 = _mm_mullo_pi16(mm7, PW_THREE); \ 82 mm3 = _mm_mullo_pi16(mm3, PW_THREE); \ 83 mm1 = _mm_add_pi16(mm1, PW_EIGHT); \ 84 mm5 = _mm_add_pi16(mm5, PW_EIGHT); \ 85 mm0 = _mm_add_pi16(mm0, PW_SEVEN); \ 86 mm2 = _mm_add_pi16(mm2, PW_SEVEN); \ 87 \ 88 mm1 = _mm_add_pi16(mm1, mm7); \ 89 mm5 = _mm_add_pi16(mm5, mm3); \ 90 mm1 = _mm_srli_pi16(mm1, 4); /* mm1=OutrLE=( 0 2 4 6) */ \ 91 mm5 = _mm_srli_pi16(mm5, 4); /* mm5=OutrHE=( 8 10 12 14) */ \ 92 mm0 = _mm_add_pi16(mm0, mm7); \ 93 mm2 = _mm_add_pi16(mm2, mm3); \ 94 mm0 = _mm_srli_pi16(mm0, 4); /* mm0=OutrLO=( 1 3 5 7) */ \ 95 mm2 = _mm_srli_pi16(mm2, 4); /* mm2=OutrHO=( 9 11 13 15) */ \ 96 \ 97 mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \ 98 mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \ 99 mm1 = _mm_or_si64(mm1, mm0); /* mm1=OutrL=( 0 1 2 3 4 5 6 7) */ \ 100 mm5 = _mm_or_si64(mm5, mm2); /* mm5=OutrH=( 8 9 10 11 12 13 14 15) */ \ 101 \ 102 _mm_store_si64((__m64 *)outptr##r, mm1); \ 103 _mm_store_si64((__m64 *)outptr##r + 1, mm5); \ 104 } 105 106 void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor, 107 JDIMENSION downsampled_width, 108 JSAMPARRAY input_data, 109 JSAMPARRAY *output_data_ptr) 110 { 111 JSAMPARRAY output_data = *output_data_ptr; 112 JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; 113 int inrow, outrow, incol, tmp, tmp1; 114 __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0; 115 __m64 wk[4], mm_tmp; 116 117 for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { 118 119 inptr_1 = input_data[inrow - 1]; 120 inptr0 = input_data[inrow]; 121 inptr1 = input_data[inrow + 1]; 122 outptr0 = output_data[outrow++]; 123 outptr1 = output_data[outrow++]; 124 125 if (downsampled_width & 7) { 126 tmp = (downsampled_width - 1) * sizeof(JSAMPLE); 127 tmp1 = downsampled_width * sizeof(JSAMPLE); 128 asm("daddu $8, %3, %6\r\n" 129 "lb $9, ($8)\r\n" 130 "daddu $8, %3, %7\r\n" 131 "sb $9, ($8)\r\n" 132 "daddu $8, %4, %6\r\n" 133 "lb $9, ($8)\r\n" 134 "daddu $8, %4, %7\r\n" 135 "sb $9, ($8)\r\n" 136 "daddu $8, %5, %6\r\n" 137 "lb $9, ($8)\r\n" 138 "daddu $8, %5, %7\r\n" 139 "sb $9, ($8)\r\n" 140 : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1) 141 : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1) 142 : "$8", "$9" 143 ); 144 } 145 146 /* process the first column block */ 147 mm0 = _mm_load_si64((__m64 *)inptr0); /* mm0 = row[ 0][0] */ 148 mm1 = _mm_load_si64((__m64 *)inptr_1); /* mm1 = row[-1][0] */ 149 mm2 = _mm_load_si64((__m64 *)inptr1); /* mm2 = row[ 1][0] */ 150 151 mm3 = _mm_xor_si64(mm3, mm3); /* mm3 = (all 0's) */ 152 mm4 = mm0; 153 mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][0]( 0 1 2 3) */ 154 mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][0]( 4 5 6 7) */ 155 mm5 = mm1; 156 mm1 = _mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][0]( 0 1 2 3) */ 157 mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][0]( 4 5 6 7) */ 158 mm6 = mm2; 159 mm2 = _mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][0]( 0 1 2 3) */ 160 mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][0]( 4 5 6 7) */ 161 162 mm0 = _mm_mullo_pi16(mm0, PW_THREE); 163 mm4 = _mm_mullo_pi16(mm4, PW_THREE); 164 165 mm7 = _mm_cmpeq_pi8(mm7, mm7); 166 mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT); 167 168 mm1 = _mm_add_pi16(mm1, mm0); /* mm1=Int0L=( 0 1 2 3) */ 169 mm5 = _mm_add_pi16(mm5, mm4); /* mm5=Int0H=( 4 5 6 7) */ 170 mm2 = _mm_add_pi16(mm2, mm0); /* mm2=Int1L=( 0 1 2 3) */ 171 mm6 = _mm_add_pi16(mm6, mm4); /* mm6=Int1H=( 4 5 6 7) */ 172 173 _mm_store_si64((__m64 *)outptr0, mm1); /* temporarily save */ 174 _mm_store_si64((__m64 *)outptr0 + 1, mm5); /* the intermediate data */ 175 _mm_store_si64((__m64 *)outptr1, mm2); 176 _mm_store_si64((__m64 *)outptr1 + 1, mm6); 177 178 mm1 = _mm_and_si64(mm1, mm7); /* mm1=( 0 - - -) */ 179 mm2 = _mm_and_si64(mm2, mm7); /* mm2=( 0 - - -) */ 180 181 wk[0] = mm1; 182 wk[1] = mm2; 183 184 for (incol = downsampled_width; incol > 0; 185 incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8, 186 outptr0 += 16, outptr1 += 16) { 187 188 if (incol > 8) { 189 /* process the next column block */ 190 mm0 = _mm_load_si64((__m64 *)inptr0 + 1); /* mm0 = row[ 0][1] */ 191 mm1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* mm1 = row[-1][1] */ 192 mm2 = _mm_load_si64((__m64 *)inptr1 + 1); /* mm2 = row[+1][1] */ 193 194 mm3 = _mm_setzero_si64(); /* mm3 = (all 0's) */ 195 mm4 = mm0; 196 mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][1]( 0 1 2 3) */ 197 mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][1]( 4 5 6 7) */ 198 mm5 = mm1; 199 mm1 = _mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][1]( 0 1 2 3) */ 200 mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][1]( 4 5 6 7) */ 201 mm6 = mm2; 202 mm2 = _mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][1]( 0 1 2 3) */ 203 mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][1]( 4 5 6 7) */ 204 205 mm0 = _mm_mullo_pi16(mm0, PW_THREE); 206 mm4 = _mm_mullo_pi16(mm4, PW_THREE); 207 208 mm1 = _mm_add_pi16(mm1, mm0); /* mm1 = Int0L = ( 0 1 2 3) */ 209 mm5 = _mm_add_pi16(mm5, mm4); /* mm5 = Int0H = ( 4 5 6 7) */ 210 mm2 = _mm_add_pi16(mm2, mm0); /* mm2 = Int1L = ( 0 1 2 3) */ 211 mm6 = _mm_add_pi16(mm6, mm4); /* mm6 = Int1H = ( 4 5 6 7) */ 212 213 _mm_store_si64((__m64 *)outptr0 + 2, mm1); /* temporarily save */ 214 _mm_store_si64((__m64 *)outptr0 + 3, mm5); /* the intermediate data */ 215 _mm_store_si64((__m64 *)outptr1 + 2, mm2); 216 _mm_store_si64((__m64 *)outptr1 + 3, mm6); 217 218 mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm1=( - - - 0) */ 219 mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm2=( - - - 0) */ 220 221 wk[2] = mm1; 222 wk[3] = mm2; 223 } else { 224 /* process the last column block */ 225 mm1 = _mm_cmpeq_pi8(mm1, mm1); 226 mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); 227 mm2 = mm1; 228 229 mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1); 230 mm1 = _mm_and_si64(mm1, mm_tmp); /* mm1=( - - - 7) */ 231 mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1); 232 mm2 = _mm_and_si64(mm2, mm_tmp); /* mm2=( - - - 7) */ 233 234 wk[2] = mm1; 235 wk[3] = mm2; 236 } 237 238 /* process the upper row */ 239 PROCESS_ROW(0) 240 241 /* process the lower row */ 242 PROCESS_ROW(1) 243 } 244 } 245 } 246