1 // Copyright 2015 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // MIPS version of lossless functions 11 // 12 // Author(s): Djordje Pesut (djordje.pesut (at) imgtec.com) 13 // Jovan Zelincevic (jovan.zelincevic (at) imgtec.com) 14 15 #include "src/dsp/dsp.h" 16 #include "src/dsp/lossless.h" 17 #include "src/dsp/lossless_common.h" 18 19 #if defined(WEBP_USE_MIPS32) 20 21 #include <assert.h> 22 #include <math.h> 23 #include <stdlib.h> 24 #include <string.h> 25 26 static float FastSLog2Slow_MIPS32(uint32_t v) { 27 assert(v >= LOG_LOOKUP_IDX_MAX); 28 if (v < APPROX_LOG_WITH_CORRECTION_MAX) { 29 uint32_t log_cnt, y, correction; 30 const int c24 = 24; 31 const float v_f = (float)v; 32 uint32_t temp; 33 34 // Xf = 256 = 2^8 35 // log_cnt is index of leading one in upper 24 bits 36 __asm__ volatile( 37 "clz %[log_cnt], %[v] \n\t" 38 "addiu %[y], $zero, 1 \n\t" 39 "subu %[log_cnt], %[c24], %[log_cnt] \n\t" 40 "sllv %[y], %[y], %[log_cnt] \n\t" 41 "srlv %[temp], %[v], %[log_cnt] \n\t" 42 : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y), 43 [temp]"=r"(temp) 44 : [c24]"r"(c24), [v]"r"(v) 45 ); 46 47 // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256 48 // Xf = floor(Xf) * (1 + (v % y) / v) 49 // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v) 50 // The correction factor: log(1 + d) ~ d; for very small d values, so 51 // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v 52 // LOG_2_RECIPROCAL ~ 23/16 53 54 // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1) 55 correction = (23 * (v & (y - 1))) >> 4; 56 return v_f * (kLog2Table[temp] + log_cnt) + correction; 57 } else { 58 return (float)(LOG_2_RECIPROCAL * v * log((double)v)); 59 } 60 } 61 62 static float FastLog2Slow_MIPS32(uint32_t v) { 63 assert(v >= LOG_LOOKUP_IDX_MAX); 64 if (v < APPROX_LOG_WITH_CORRECTION_MAX) { 65 uint32_t log_cnt, y; 66 const int c24 = 24; 67 double log_2; 68 uint32_t temp; 69 70 __asm__ volatile( 71 "clz %[log_cnt], %[v] \n\t" 72 "addiu %[y], $zero, 1 \n\t" 73 "subu %[log_cnt], %[c24], %[log_cnt] \n\t" 74 "sllv %[y], %[y], %[log_cnt] \n\t" 75 "srlv %[temp], %[v], %[log_cnt] \n\t" 76 : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y), 77 [temp]"=r"(temp) 78 : [c24]"r"(c24), [v]"r"(v) 79 ); 80 81 log_2 = kLog2Table[temp] + log_cnt; 82 if (v >= APPROX_LOG_MAX) { 83 // Since the division is still expensive, add this correction factor only 84 // for large values of 'v'. 85 86 const uint32_t correction = (23 * (v & (y - 1))) >> 4; 87 log_2 += (double)correction / v; 88 } 89 return (float)log_2; 90 } else { 91 return (float)(LOG_2_RECIPROCAL * log((double)v)); 92 } 93 } 94 95 // C version of this function: 96 // int i = 0; 97 // int64_t cost = 0; 98 // const uint32_t* pop = &population[4]; 99 // const uint32_t* LoopEnd = &population[length]; 100 // while (pop != LoopEnd) { 101 // ++i; 102 // cost += i * *pop; 103 // cost += i * *(pop + 1); 104 // pop += 2; 105 // } 106 // return (double)cost; 107 static double ExtraCost_MIPS32(const uint32_t* const population, int length) { 108 int i, temp0, temp1; 109 const uint32_t* pop = &population[4]; 110 const uint32_t* const LoopEnd = &population[length]; 111 112 __asm__ volatile( 113 "mult $zero, $zero \n\t" 114 "xor %[i], %[i], %[i] \n\t" 115 "beq %[pop], %[LoopEnd], 2f \n\t" 116 "1: \n\t" 117 "lw %[temp0], 0(%[pop]) \n\t" 118 "lw %[temp1], 4(%[pop]) \n\t" 119 "addiu %[i], %[i], 1 \n\t" 120 "addiu %[pop], %[pop], 8 \n\t" 121 "madd %[i], %[temp0] \n\t" 122 "madd %[i], %[temp1] \n\t" 123 "bne %[pop], %[LoopEnd], 1b \n\t" 124 "2: \n\t" 125 "mfhi %[temp0] \n\t" 126 "mflo %[temp1] \n\t" 127 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), 128 [i]"=&r"(i), [pop]"+r"(pop) 129 : [LoopEnd]"r"(LoopEnd) 130 : "memory", "hi", "lo" 131 ); 132 133 return (double)((int64_t)temp0 << 32 | temp1); 134 } 135 136 // C version of this function: 137 // int i = 0; 138 // int64_t cost = 0; 139 // const uint32_t* pX = &X[4]; 140 // const uint32_t* pY = &Y[4]; 141 // const uint32_t* LoopEnd = &X[length]; 142 // while (pX != LoopEnd) { 143 // const uint32_t xy0 = *pX + *pY; 144 // const uint32_t xy1 = *(pX + 1) + *(pY + 1); 145 // ++i; 146 // cost += i * xy0; 147 // cost += i * xy1; 148 // pX += 2; 149 // pY += 2; 150 // } 151 // return (double)cost; 152 static double ExtraCostCombined_MIPS32(const uint32_t* const X, 153 const uint32_t* const Y, int length) { 154 int i, temp0, temp1, temp2, temp3; 155 const uint32_t* pX = &X[4]; 156 const uint32_t* pY = &Y[4]; 157 const uint32_t* const LoopEnd = &X[length]; 158 159 __asm__ volatile( 160 "mult $zero, $zero \n\t" 161 "xor %[i], %[i], %[i] \n\t" 162 "beq %[pX], %[LoopEnd], 2f \n\t" 163 "1: \n\t" 164 "lw %[temp0], 0(%[pX]) \n\t" 165 "lw %[temp1], 0(%[pY]) \n\t" 166 "lw %[temp2], 4(%[pX]) \n\t" 167 "lw %[temp3], 4(%[pY]) \n\t" 168 "addiu %[i], %[i], 1 \n\t" 169 "addu %[temp0], %[temp0], %[temp1] \n\t" 170 "addu %[temp2], %[temp2], %[temp3] \n\t" 171 "addiu %[pX], %[pX], 8 \n\t" 172 "addiu %[pY], %[pY], 8 \n\t" 173 "madd %[i], %[temp0] \n\t" 174 "madd %[i], %[temp2] \n\t" 175 "bne %[pX], %[LoopEnd], 1b \n\t" 176 "2: \n\t" 177 "mfhi %[temp0] \n\t" 178 "mflo %[temp1] \n\t" 179 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), 180 [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), 181 [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY) 182 : [LoopEnd]"r"(LoopEnd) 183 : "memory", "hi", "lo" 184 ); 185 186 return (double)((int64_t)temp0 << 32 | temp1); 187 } 188 189 #define HUFFMAN_COST_PASS \ 190 __asm__ volatile( \ 191 "sll %[temp1], %[temp0], 3 \n\t" \ 192 "addiu %[temp3], %[streak], -3 \n\t" \ 193 "addu %[temp2], %[pstreaks], %[temp1] \n\t" \ 194 "blez %[temp3], 1f \n\t" \ 195 "srl %[temp1], %[temp1], 1 \n\t" \ 196 "addu %[temp3], %[pcnts], %[temp1] \n\t" \ 197 "lw %[temp0], 4(%[temp2]) \n\t" \ 198 "lw %[temp1], 0(%[temp3]) \n\t" \ 199 "addu %[temp0], %[temp0], %[streak] \n\t" \ 200 "addiu %[temp1], %[temp1], 1 \n\t" \ 201 "sw %[temp0], 4(%[temp2]) \n\t" \ 202 "sw %[temp1], 0(%[temp3]) \n\t" \ 203 "b 2f \n\t" \ 204 "1: \n\t" \ 205 "lw %[temp0], 0(%[temp2]) \n\t" \ 206 "addu %[temp0], %[temp0], %[streak] \n\t" \ 207 "sw %[temp0], 0(%[temp2]) \n\t" \ 208 "2: \n\t" \ 209 : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \ 210 [temp3]"=&r"(temp3), [temp0]"+r"(temp0) \ 211 : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \ 212 [streak]"r"(streak) \ 213 : "memory" \ 214 ); 215 216 // Returns the various RLE counts 217 static WEBP_INLINE void GetEntropyUnrefinedHelper( 218 uint32_t val, int i, uint32_t* const val_prev, int* const i_prev, 219 VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) { 220 int* const pstreaks = &stats->streaks[0][0]; 221 int* const pcnts = &stats->counts[0]; 222 int temp0, temp1, temp2, temp3; 223 const int streak = i - *i_prev; 224 225 // Gather info for the bit entropy. 226 if (*val_prev != 0) { 227 bit_entropy->sum += (*val_prev) * streak; 228 bit_entropy->nonzeros += streak; 229 bit_entropy->nonzero_code = *i_prev; 230 bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak; 231 if (bit_entropy->max_val < *val_prev) { 232 bit_entropy->max_val = *val_prev; 233 } 234 } 235 236 // Gather info for the Huffman cost. 237 temp0 = (*val_prev != 0); 238 HUFFMAN_COST_PASS 239 240 *val_prev = val; 241 *i_prev = i; 242 } 243 244 static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length, 245 VP8LBitEntropy* const bit_entropy, 246 VP8LStreaks* const stats) { 247 int i; 248 int i_prev = 0; 249 uint32_t x_prev = X[0]; 250 251 memset(stats, 0, sizeof(*stats)); 252 VP8LBitEntropyInit(bit_entropy); 253 254 for (i = 1; i < length; ++i) { 255 const uint32_t x = X[i]; 256 if (x != x_prev) { 257 GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats); 258 } 259 } 260 GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats); 261 262 bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum); 263 } 264 265 static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[], 266 const uint32_t Y[], 267 int length, 268 VP8LBitEntropy* const entropy, 269 VP8LStreaks* const stats) { 270 int i = 1; 271 int i_prev = 0; 272 uint32_t xy_prev = X[0] + Y[0]; 273 274 memset(stats, 0, sizeof(*stats)); 275 VP8LBitEntropyInit(entropy); 276 277 for (i = 1; i < length; ++i) { 278 const uint32_t xy = X[i] + Y[i]; 279 if (xy != xy_prev) { 280 GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats); 281 } 282 } 283 GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats); 284 285 entropy->entropy += VP8LFastSLog2(entropy->sum); 286 } 287 288 #define ASM_START \ 289 __asm__ volatile( \ 290 ".set push \n\t" \ 291 ".set at \n\t" \ 292 ".set macro \n\t" \ 293 "1: \n\t" 294 295 // P2 = P0 + P1 296 // A..D - offsets 297 // E - temp variable to tell macro 298 // if pointer should be incremented 299 // literal_ and successive histograms could be unaligned 300 // so we must use ulw and usw 301 #define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \ 302 "ulw %[temp0], " #A "(%[" #P0 "]) \n\t" \ 303 "ulw %[temp1], " #B "(%[" #P0 "]) \n\t" \ 304 "ulw %[temp2], " #C "(%[" #P0 "]) \n\t" \ 305 "ulw %[temp3], " #D "(%[" #P0 "]) \n\t" \ 306 "ulw %[temp4], " #A "(%[" #P1 "]) \n\t" \ 307 "ulw %[temp5], " #B "(%[" #P1 "]) \n\t" \ 308 "ulw %[temp6], " #C "(%[" #P1 "]) \n\t" \ 309 "ulw %[temp7], " #D "(%[" #P1 "]) \n\t" \ 310 "addu %[temp4], %[temp4], %[temp0] \n\t" \ 311 "addu %[temp5], %[temp5], %[temp1] \n\t" \ 312 "addu %[temp6], %[temp6], %[temp2] \n\t" \ 313 "addu %[temp7], %[temp7], %[temp3] \n\t" \ 314 "addiu %[" #P0 "], %[" #P0 "], 16 \n\t" \ 315 ".if " #E " == 1 \n\t" \ 316 "addiu %[" #P1 "], %[" #P1 "], 16 \n\t" \ 317 ".endif \n\t" \ 318 "usw %[temp4], " #A "(%[" #P2 "]) \n\t" \ 319 "usw %[temp5], " #B "(%[" #P2 "]) \n\t" \ 320 "usw %[temp6], " #C "(%[" #P2 "]) \n\t" \ 321 "usw %[temp7], " #D "(%[" #P2 "]) \n\t" \ 322 "addiu %[" #P2 "], %[" #P2 "], 16 \n\t" \ 323 "bne %[" #P0 "], %[LoopEnd], 1b \n\t" \ 324 ".set pop \n\t" \ 325 326 #define ASM_END_COMMON_0 \ 327 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \ 328 [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \ 329 [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \ 330 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \ 331 [pa]"+r"(pa), [pout]"+r"(pout) 332 333 #define ASM_END_COMMON_1 \ 334 : [LoopEnd]"r"(LoopEnd) \ 335 : "memory", "at" \ 336 ); 337 338 #define ASM_END_0 \ 339 ASM_END_COMMON_0 \ 340 , [pb]"+r"(pb) \ 341 ASM_END_COMMON_1 342 343 #define ASM_END_1 \ 344 ASM_END_COMMON_0 \ 345 ASM_END_COMMON_1 346 347 #define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE) do { \ 348 const uint32_t* pa = (const uint32_t*)(A); \ 349 const uint32_t* pb = (const uint32_t*)(B); \ 350 uint32_t* pout = (uint32_t*)(OUT); \ 351 const uint32_t* const LoopEnd = pa + (SIZE); \ 352 assert((SIZE) % 4 == 0); \ 353 ASM_START \ 354 ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) \ 355 ASM_END_0 \ 356 if ((EXTRA_SIZE) > 0) { \ 357 const int last = (EXTRA_SIZE); \ 358 int i; \ 359 for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \ 360 } \ 361 } while (0) 362 363 #define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE) do { \ 364 const uint32_t* pa = (const uint32_t*)(A); \ 365 uint32_t* pout = (uint32_t*)(OUT); \ 366 const uint32_t* const LoopEnd = pa + (SIZE); \ 367 assert((SIZE) % 4 == 0); \ 368 ASM_START \ 369 ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) \ 370 ASM_END_1 \ 371 if ((EXTRA_SIZE) > 0) { \ 372 const int last = (EXTRA_SIZE); \ 373 int i; \ 374 for (i = 0; i < last; ++i) pout[i] += pa[i]; \ 375 } \ 376 } while (0) 377 378 static void HistogramAdd_MIPS32(const VP8LHistogram* const a, 379 const VP8LHistogram* const b, 380 VP8LHistogram* const out) { 381 uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; 382 const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_) 383 - (NUM_LITERAL_CODES + NUM_LENGTH_CODES); 384 assert(a->palette_code_bits_ == b->palette_code_bits_); 385 386 if (b != out) { 387 ADD_VECTOR(a->literal_, b->literal_, out->literal_, 388 NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size); 389 ADD_VECTOR(a->distance_, b->distance_, out->distance_, 390 NUM_DISTANCE_CODES, 0); 391 ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0); 392 ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0); 393 ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0); 394 } else { 395 ADD_VECTOR_EQ(a->literal_, out->literal_, 396 NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size); 397 ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0); 398 ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0); 399 ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0); 400 ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0); 401 } 402 } 403 404 #undef ADD_VECTOR_EQ 405 #undef ADD_VECTOR 406 #undef ASM_END_1 407 #undef ASM_END_0 408 #undef ASM_END_COMMON_1 409 #undef ASM_END_COMMON_0 410 #undef ADD_TO_OUT 411 #undef ASM_START 412 413 //------------------------------------------------------------------------------ 414 // Entry point 415 416 extern void VP8LEncDspInitMIPS32(void); 417 418 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) { 419 VP8LFastSLog2Slow = FastSLog2Slow_MIPS32; 420 VP8LFastLog2Slow = FastLog2Slow_MIPS32; 421 VP8LExtraCost = ExtraCost_MIPS32; 422 VP8LExtraCostCombined = ExtraCostCombined_MIPS32; 423 VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32; 424 VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32; 425 VP8LHistogramAdd = HistogramAdd_MIPS32; 426 } 427 428 #else // !WEBP_USE_MIPS32 429 430 WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32) 431 432 #endif // WEBP_USE_MIPS32 433