1 /* 2 * The copyright in this software is being made available under the 2-clauses 3 * BSD License, included below. This software may be subject to other third 4 * party and contributor rights, including patent rights, and no such rights 5 * are granted under this license. 6 * 7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium 8 * Copyright (c) 2002-2014, Professor Benoit Macq 9 * Copyright (c) 2001-2003, David Janssens 10 * Copyright (c) 2002-2003, Yannick Verschueren 11 * Copyright (c) 2003-2007, Francois-Olivier Devaux 12 * Copyright (c) 2003-2014, Antonin Descampe 13 * Copyright (c) 2005, Herve Drolon, FreeImage Team 14 * Copyright (c) 2007, Jonathan Ballard <dzonatas (at) dzonux.net> 15 * Copyright (c) 2007, Callum Lerwick <seg (at) haxxed.com> 16 * Copyright (c) 2017, IntoPIX SA <support (at) intopix.com> 17 * All rights reserved. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' 29 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 32 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 35 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 36 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 37 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 38 * POSSIBILITY OF SUCH DAMAGE. 39 */ 40 41 #include <assert.h> 42 43 #define OPJ_SKIP_POISON 44 #include "opj_includes.h" 45 46 #ifdef __SSE__ 47 #include <xmmintrin.h> 48 #endif 49 #ifdef __SSE2__ 50 #include <emmintrin.h> 51 #endif 52 #ifdef __SSSE3__ 53 #include <tmmintrin.h> 54 #endif 55 #ifdef __AVX2__ 56 #include <immintrin.h> 57 #endif 58 59 #if defined(__GNUC__) 60 #pragma GCC poison malloc calloc realloc free 61 #endif 62 63 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */ 64 /*@{*/ 65 66 #ifdef __AVX2__ 67 /** Number of int32 values in a AVX2 register */ 68 #define VREG_INT_COUNT 8 69 #else 70 /** Number of int32 values in a SSE2 register */ 71 #define VREG_INT_COUNT 4 72 #endif 73 74 /** Number of columns that we can process in parallel in the vertical pass */ 75 #define PARALLEL_COLS_53 (2*VREG_INT_COUNT) 76 77 /** @name Local data structures */ 78 /*@{*/ 79 80 typedef struct dwt_local { 81 OPJ_INT32* mem; 82 OPJ_SIZE_T mem_count; 83 OPJ_INT32 dn; /* number of elements in high pass band */ 84 OPJ_INT32 sn; /* number of elements in low pass band */ 85 OPJ_INT32 cas; /* 0 = start on even coord, 1 = start on odd coord */ 86 } opj_dwt_t; 87 88 typedef union { 89 OPJ_FLOAT32 f[4]; 90 } opj_v4_t; 91 92 typedef struct v4dwt_local { 93 opj_v4_t* wavelet ; 94 OPJ_INT32 dn ; /* number of elements in high pass band */ 95 OPJ_INT32 sn ; /* number of elements in low pass band */ 96 OPJ_INT32 cas ; /* 0 = start on even coord, 1 = start on odd coord */ 97 OPJ_UINT32 win_l_x0; /* start coord in low pass band */ 98 OPJ_UINT32 win_l_x1; /* end coord in low pass band */ 99 OPJ_UINT32 win_h_x0; /* start coord in high pass band */ 100 OPJ_UINT32 win_h_x1; /* end coord in high pass band */ 101 } opj_v4dwt_t ; 102 103 static const OPJ_FLOAT32 opj_dwt_alpha = 1.586134342f; /* 12994 */ 104 static const OPJ_FLOAT32 opj_dwt_beta = 0.052980118f; /* 434 */ 105 static const OPJ_FLOAT32 opj_dwt_gamma = -0.882911075f; /* -7233 */ 106 static const OPJ_FLOAT32 opj_dwt_delta = -0.443506852f; /* -3633 */ 107 108 static const OPJ_FLOAT32 opj_K = 1.230174105f; /* 10078 */ 109 static const OPJ_FLOAT32 opj_c13318 = 1.625732422f; 110 111 /*@}*/ 112 113 /** 114 Virtual function type for wavelet transform in 1-D 115 */ 116 typedef void (*DWT1DFN)(const opj_dwt_t* v); 117 118 /** @name Local static functions */ 119 /*@{*/ 120 121 /** 122 Forward lazy transform (horizontal) 123 */ 124 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, 125 OPJ_INT32 sn, OPJ_INT32 cas); 126 /** 127 Forward lazy transform (vertical) 128 */ 129 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, 130 OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas); 131 /** 132 Forward 5-3 wavelet transform in 1-D 133 */ 134 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn, 135 OPJ_INT32 sn, OPJ_INT32 cas); 136 /** 137 Forward 9-7 wavelet transform in 1-D 138 */ 139 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_SIZE_T a_count, 140 OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas); 141 142 143 /** 144 Explicit calculation of the Quantization Stepsizes 145 */ 146 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, 147 opj_stepsize_t *bandno_stepsize); 148 /** 149 Inverse wavelet transform in 2-D. 150 */ 151 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, 152 const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i); 153 154 static OPJ_BOOL opj_dwt_decode_partial_tile( 155 opj_tcd_tilecomp_t* tilec, 156 OPJ_UINT32 numres); 157 158 static OPJ_BOOL opj_dwt_encode_procedure(const opj_tcd_tilecomp_t * tilec, 159 void(*p_function)(OPJ_INT32 *, OPJ_SIZE_T, OPJ_INT32, OPJ_INT32, OPJ_INT32)); 160 161 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, 162 OPJ_UINT32 i); 163 164 /* <summary> */ 165 /* Inverse 9-7 wavelet transform in 1-D. */ 166 /* </summary> */ 167 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt); 168 169 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, 170 OPJ_FLOAT32* OPJ_RESTRICT a, 171 OPJ_UINT32 width, 172 OPJ_UINT32 remaining_height); 173 174 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, 175 OPJ_FLOAT32* OPJ_RESTRICT a, 176 OPJ_UINT32 width, 177 OPJ_UINT32 nb_elts_read); 178 179 #ifdef __SSE__ 180 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, 181 OPJ_UINT32 start, 182 OPJ_UINT32 end, 183 const __m128 c); 184 185 static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, 186 OPJ_UINT32 start, 187 OPJ_UINT32 end, 188 OPJ_UINT32 m, __m128 c); 189 190 #else 191 static void opj_v4dwt_decode_step1(opj_v4_t* w, 192 OPJ_UINT32 start, 193 OPJ_UINT32 end, 194 const OPJ_FLOAT32 c); 195 196 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, 197 OPJ_UINT32 start, 198 OPJ_UINT32 end, 199 OPJ_UINT32 m, 200 OPJ_FLOAT32 c); 201 202 #endif 203 204 /*@}*/ 205 206 /*@}*/ 207 208 #define IDX_S(i) (i)*2 209 #define IDX_D(i) 1 + (i)* 2 210 #define UNDERFLOW_SN(i) ((i) >= sn&&sn>0) 211 #define UNDERFLOW_DN(i) ((i) >= dn&&dn>0) 212 #define OVERFLOW_S(i) (IDX_S(i) >= a_count) 213 #define OVERFLOW_D(i) (IDX_D(i) >= a_count) 214 215 #define OPJ_S(i) a[IDX_S(i)] 216 #define OPJ_D(i) a[IDX_D(i)] 217 #define OPJ_S_(i) ((i)<0 ? OPJ_S(0) : (UNDERFLOW_SN(i) ? OPJ_S(sn - 1) : OVERFLOW_S(i) ? OPJ_S(i - 1) : OPJ_S(i))) 218 #define OPJ_D_(i) ((i)<0 ? OPJ_D(0) : (UNDERFLOW_DN(i) ? OPJ_D(dn - 1) : OVERFLOW_D(i) ? OPJ_D(i - 1) : OPJ_D(i))) 219 /* new */ 220 #define OPJ_SS_(i) ((i)<0 ? OPJ_S(0) : (UNDERFLOW_DN(i) ? OPJ_S(dn - 1) : OVERFLOW_S(i) ? OPJ_S(i - 1) : OPJ_S(i))) 221 #define OPJ_DD_(i) ((i)<0 ? OPJ_D(0) : (UNDERFLOW_SN(i) ? OPJ_D(sn - 1) : OVERFLOW_D(i) ? OPJ_D(i - 1) : OPJ_D(i))) 222 223 /* <summary> */ 224 /* This table contains the norms of the 5-3 wavelets for different bands. */ 225 /* </summary> */ 226 /* FIXME! the array should really be extended up to 33 resolution levels */ 227 /* See https://github.com/uclouvain/openjpeg/issues/493 */ 228 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = { 229 {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3}, 230 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9}, 231 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9}, 232 {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93} 233 }; 234 235 /* <summary> */ 236 /* This table contains the norms of the 9-7 wavelets for different bands. */ 237 /* </summary> */ 238 /* FIXME! the array should really be extended up to 33 resolution levels */ 239 /* See https://github.com/uclouvain/openjpeg/issues/493 */ 240 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = { 241 {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9}, 242 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0}, 243 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0}, 244 {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2} 245 }; 246 247 /* 248 ========================================================== 249 local functions 250 ========================================================== 251 */ 252 253 /* <summary> */ 254 /* Forward lazy transform (horizontal). */ 255 /* </summary> */ 256 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, 257 OPJ_INT32 sn, OPJ_INT32 cas) 258 { 259 OPJ_INT32 i; 260 OPJ_INT32 * l_dest = b; 261 OPJ_INT32 * l_src = a + cas; 262 263 for (i = 0; i < sn; ++i) { 264 *l_dest++ = *l_src; 265 l_src += 2; 266 } 267 268 l_dest = b + sn; 269 l_src = a + 1 - cas; 270 271 for (i = 0; i < dn; ++i) { 272 *l_dest++ = *l_src; 273 l_src += 2; 274 } 275 } 276 277 /* <summary> */ 278 /* Forward lazy transform (vertical). */ 279 /* </summary> */ 280 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, 281 OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas) 282 { 283 OPJ_INT32 i = sn; 284 OPJ_INT32 * l_dest = b; 285 OPJ_INT32 * l_src = a + cas; 286 287 while (i--) { 288 *l_dest = *l_src; 289 l_dest += x; 290 l_src += 2; 291 } /* b[i*x]=a[2*i+cas]; */ 292 293 l_dest = b + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)x; 294 l_src = a + 1 - cas; 295 296 i = dn; 297 while (i--) { 298 *l_dest = *l_src; 299 l_dest += x; 300 l_src += 2; 301 } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/ 302 } 303 304 #ifdef STANDARD_SLOW_VERSION 305 /* <summary> */ 306 /* Inverse lazy transform (horizontal). */ 307 /* </summary> */ 308 static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a) 309 { 310 OPJ_INT32 *ai = a; 311 OPJ_INT32 *bi = h->mem + h->cas; 312 OPJ_INT32 i = h->sn; 313 while (i--) { 314 *bi = *(ai++); 315 bi += 2; 316 } 317 ai = a + h->sn; 318 bi = h->mem + 1 - h->cas; 319 i = h->dn ; 320 while (i--) { 321 *bi = *(ai++); 322 bi += 2; 323 } 324 } 325 326 /* <summary> */ 327 /* Inverse lazy transform (vertical). */ 328 /* </summary> */ 329 static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) 330 { 331 OPJ_INT32 *ai = a; 332 OPJ_INT32 *bi = v->mem + v->cas; 333 OPJ_INT32 i = v->sn; 334 while (i--) { 335 *bi = *ai; 336 bi += 2; 337 ai += x; 338 } 339 ai = a + (v->sn * (OPJ_SIZE_T)x); 340 bi = v->mem + 1 - v->cas; 341 i = v->dn ; 342 while (i--) { 343 *bi = *ai; 344 bi += 2; 345 ai += x; 346 } 347 } 348 349 #endif /* STANDARD_SLOW_VERSION */ 350 351 /* <summary> */ 352 /* Forward 5-3 wavelet transform in 1-D. */ 353 /* </summary> */ 354 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn, 355 OPJ_INT32 sn, OPJ_INT32 cas) 356 { 357 OPJ_INT32 i; 358 359 if (!cas) { 360 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ 361 for (i = 0; i < dn; i++) { 362 OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; 363 } 364 for (i = 0; i < sn; i++) { 365 OPJ_S(i) += (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 366 } 367 } 368 } else { 369 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ 370 OPJ_S(0) *= 2; 371 } else { 372 for (i = 0; i < dn; i++) { 373 OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; 374 } 375 for (i = 0; i < sn; i++) { 376 OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; 377 } 378 } 379 } 380 } 381 382 #ifdef STANDARD_SLOW_VERSION 383 /* <summary> */ 384 /* Inverse 5-3 wavelet transform in 1-D. */ 385 /* </summary> */ 386 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn, 387 OPJ_INT32 sn, OPJ_INT32 cas) 388 { 389 OPJ_INT32 i; 390 391 if (!cas) { 392 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ 393 for (i = 0; i < sn; i++) { 394 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 395 } 396 for (i = 0; i < dn; i++) { 397 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; 398 } 399 } 400 } else { 401 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ 402 OPJ_S(0) /= 2; 403 } else { 404 for (i = 0; i < sn; i++) { 405 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; 406 } 407 for (i = 0; i < dn; i++) { 408 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; 409 } 410 } 411 } 412 } 413 414 static void opj_dwt_decode_1(const opj_dwt_t *v) 415 { 416 opj_dwt_decode_1_(v->mem, v->mem_count, v->dn, v->sn, v->cas); 417 } 418 419 #endif /* STANDARD_SLOW_VERSION */ 420 421 #if !defined(STANDARD_SLOW_VERSION) 422 static void opj_idwt53_h_cas0(OPJ_INT32* tmp, 423 const OPJ_INT32 sn, 424 const OPJ_INT32 len, 425 OPJ_INT32* tiledp) 426 { 427 OPJ_INT32 i, j; 428 const OPJ_INT32* in_even = &tiledp[0]; 429 const OPJ_INT32* in_odd = &tiledp[sn]; 430 431 #ifdef TWO_PASS_VERSION 432 /* For documentation purpose: performs lifting in two iterations, */ 433 /* but without explicit interleaving */ 434 435 assert(len > 1); 436 437 /* Even */ 438 tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1); 439 for (i = 2, j = 0; i <= len - 2; i += 2, j++) { 440 tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2); 441 } 442 if (len & 1) { /* if len is odd */ 443 tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1); 444 } 445 446 /* Odd */ 447 for (i = 1, j = 0; i < len - 1; i += 2, j++) { 448 tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1); 449 } 450 if (!(len & 1)) { /* if len is even */ 451 tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2]; 452 } 453 #else 454 OPJ_INT32 d1c, d1n, s1n, s0c, s0n; 455 456 assert(len > 1); 457 458 /* Improved version of the TWO_PASS_VERSION: */ 459 /* Performs lifting in one single iteration. Saves memory */ 460 /* accesses and explicit interleaving. */ 461 s1n = in_even[0]; 462 d1n = in_odd[0]; 463 s0n = s1n - ((d1n + 1) >> 1); 464 465 for (i = 0, j = 1; i < (len - 3); i += 2, j++) { 466 d1c = d1n; 467 s0c = s0n; 468 469 s1n = in_even[j]; 470 d1n = in_odd[j]; 471 472 s0n = s1n - ((d1c + d1n + 2) >> 2); 473 474 tmp[i ] = s0c; 475 tmp[i + 1] = d1c + ((s0c + s0n) >> 1); 476 } 477 478 tmp[i] = s0n; 479 480 if (len & 1) { 481 tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1); 482 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1); 483 } else { 484 tmp[len - 1] = d1n + s0n; 485 } 486 #endif 487 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32)); 488 } 489 490 static void opj_idwt53_h_cas1(OPJ_INT32* tmp, 491 const OPJ_INT32 sn, 492 const OPJ_INT32 len, 493 OPJ_INT32* tiledp) 494 { 495 OPJ_INT32 i, j; 496 const OPJ_INT32* in_even = &tiledp[sn]; 497 const OPJ_INT32* in_odd = &tiledp[0]; 498 499 #ifdef TWO_PASS_VERSION 500 /* For documentation purpose: performs lifting in two iterations, */ 501 /* but without explicit interleaving */ 502 503 assert(len > 2); 504 505 /* Odd */ 506 for (i = 1, j = 0; i < len - 1; i += 2, j++) { 507 tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2); 508 } 509 if (!(len & 1)) { 510 tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1); 511 } 512 513 /* Even */ 514 tmp[0] = in_even[0] + tmp[1]; 515 for (i = 2, j = 1; i < len - 1; i += 2, j++) { 516 tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1); 517 } 518 if (len & 1) { 519 tmp[len - 1] = in_even[len / 2] + tmp[len - 2]; 520 } 521 #else 522 OPJ_INT32 s1, s2, dc, dn; 523 524 assert(len > 2); 525 526 /* Improved version of the TWO_PASS_VERSION: */ 527 /* Performs lifting in one single iteration. Saves memory */ 528 /* accesses and explicit interleaving. */ 529 530 s1 = in_even[1]; 531 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2); 532 tmp[0] = in_even[0] + dc; 533 534 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { 535 536 s2 = in_even[j + 1]; 537 538 dn = in_odd[j] - ((s1 + s2 + 2) >> 2); 539 tmp[i ] = dc; 540 tmp[i + 1] = s1 + ((dn + dc) >> 1); 541 542 dc = dn; 543 s1 = s2; 544 } 545 546 tmp[i] = dc; 547 548 if (!(len & 1)) { 549 dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1); 550 tmp[len - 2] = s1 + ((dn + dc) >> 1); 551 tmp[len - 1] = dn; 552 } else { 553 tmp[len - 1] = s1 + dc; 554 } 555 #endif 556 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32)); 557 } 558 559 560 #endif /* !defined(STANDARD_SLOW_VERSION) */ 561 562 /* <summary> */ 563 /* Inverse 5-3 wavelet transform in 1-D for one row. */ 564 /* </summary> */ 565 /* Performs interleave, inverse wavelet transform and copy back to buffer */ 566 static void opj_idwt53_h(const opj_dwt_t *dwt, 567 OPJ_INT32* tiledp) 568 { 569 #ifdef STANDARD_SLOW_VERSION 570 /* For documentation purpose */ 571 opj_dwt_interleave_h(dwt, tiledp); 572 opj_dwt_decode_1(dwt); 573 memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32)); 574 #else 575 const OPJ_INT32 sn = dwt->sn; 576 const OPJ_INT32 len = sn + dwt->dn; 577 if (dwt->cas == 0) { /* Left-most sample is on even coordinate */ 578 if (len > 1) { 579 opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp); 580 } else { 581 /* Unmodified value */ 582 } 583 } else { /* Left-most sample is on odd coordinate */ 584 if (len == 1) { 585 tiledp[0] /= 2; 586 } else if (len == 2) { 587 OPJ_INT32* out = dwt->mem; 588 const OPJ_INT32* in_even = &tiledp[sn]; 589 const OPJ_INT32* in_odd = &tiledp[0]; 590 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1); 591 out[0] = in_even[0] + out[1]; 592 memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32)); 593 } else if (len > 2) { 594 opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp); 595 } 596 } 597 #endif 598 } 599 600 #if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) 601 602 /* Conveniency macros to improve the readabilty of the formulas */ 603 #if __AVX2__ 604 #define VREG __m256i 605 #define LOAD_CST(x) _mm256_set1_epi32(x) 606 #define LOAD(x) _mm256_load_si256((const VREG*)(x)) 607 #define LOADU(x) _mm256_loadu_si256((const VREG*)(x)) 608 #define STORE(x,y) _mm256_store_si256((VREG*)(x),(y)) 609 #define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y)) 610 #define ADD(x,y) _mm256_add_epi32((x),(y)) 611 #define SUB(x,y) _mm256_sub_epi32((x),(y)) 612 #define SAR(x,y) _mm256_srai_epi32((x),(y)) 613 #else 614 #define VREG __m128i 615 #define LOAD_CST(x) _mm_set1_epi32(x) 616 #define LOAD(x) _mm_load_si128((const VREG*)(x)) 617 #define LOADU(x) _mm_loadu_si128((const VREG*)(x)) 618 #define STORE(x,y) _mm_store_si128((VREG*)(x),(y)) 619 #define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y)) 620 #define ADD(x,y) _mm_add_epi32((x),(y)) 621 #define SUB(x,y) _mm_sub_epi32((x),(y)) 622 #define SAR(x,y) _mm_srai_epi32((x),(y)) 623 #endif 624 #define ADD3(x,y,z) ADD(ADD(x,y),z) 625 626 static 627 void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col, 628 const OPJ_INT32* tmp, 629 OPJ_INT32 len, 630 OPJ_SIZE_T stride) 631 { 632 OPJ_INT32 i; 633 for (i = 0; i < len; ++i) { 634 /* A memcpy(&tiledp_col[i * stride + 0], 635 &tmp[PARALLEL_COLS_53 * i + 0], 636 PARALLEL_COLS_53 * sizeof(OPJ_INT32)) 637 would do but would be a tiny bit slower. 638 We can take here advantage of our knowledge of alignment */ 639 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + 0], 640 LOAD(&tmp[PARALLEL_COLS_53 * i + 0])); 641 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT], 642 LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT])); 643 } 644 } 645 646 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or 647 * 16 in AVX2, when top-most pixel is on even coordinate */ 648 static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2( 649 OPJ_INT32* tmp, 650 const OPJ_INT32 sn, 651 const OPJ_INT32 len, 652 OPJ_INT32* tiledp_col, 653 const OPJ_SIZE_T stride) 654 { 655 const OPJ_INT32* in_even = &tiledp_col[0]; 656 const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride]; 657 658 OPJ_INT32 i; 659 OPJ_SIZE_T j; 660 VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0; 661 VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1; 662 const VREG two = LOAD_CST(2); 663 664 assert(len > 1); 665 #if __AVX2__ 666 assert(PARALLEL_COLS_53 == 16); 667 assert(VREG_INT_COUNT == 8); 668 #else 669 assert(PARALLEL_COLS_53 == 8); 670 assert(VREG_INT_COUNT == 4); 671 #endif 672 673 /* Note: loads of input even/odd values must be done in a unaligned */ 674 /* fashion. But stores in tmp can be done with aligned store, since */ 675 /* the temporary buffer is properly aligned */ 676 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0); 677 678 s1n_0 = LOADU(in_even + 0); 679 s1n_1 = LOADU(in_even + VREG_INT_COUNT); 680 d1n_0 = LOADU(in_odd); 681 d1n_1 = LOADU(in_odd + VREG_INT_COUNT); 682 683 /* s0n = s1n - ((d1n + 1) >> 1); <==> */ 684 /* s0n = s1n - ((d1n + d1n + 2) >> 2); */ 685 s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2)); 686 s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2)); 687 688 for (i = 0, j = 1; i < (len - 3); i += 2, j++) { 689 d1c_0 = d1n_0; 690 s0c_0 = s0n_0; 691 d1c_1 = d1n_1; 692 s0c_1 = s0n_1; 693 694 s1n_0 = LOADU(in_even + j * stride); 695 s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT); 696 d1n_0 = LOADU(in_odd + j * stride); 697 d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT); 698 699 /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/ 700 s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2)); 701 s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2)); 702 703 STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0); 704 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1); 705 706 /* d1c + ((s0c + s0n) >> 1) */ 707 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0, 708 ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1))); 709 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT, 710 ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1))); 711 } 712 713 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0); 714 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1); 715 716 if (len & 1) { 717 VREG tmp_len_minus_1; 718 s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride); 719 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */ 720 tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2)); 721 STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1); 722 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */ 723 STORE(tmp + PARALLEL_COLS_53 * (len - 2), 724 ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1))); 725 726 s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT); 727 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */ 728 tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2)); 729 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, 730 tmp_len_minus_1); 731 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */ 732 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT, 733 ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1))); 734 735 736 } else { 737 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, 738 ADD(d1n_0, s0n_0)); 739 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, 740 ADD(d1n_1, s0n_1)); 741 } 742 743 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride); 744 } 745 746 747 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or 748 * 16 in AVX2, when top-most pixel is on odd coordinate */ 749 static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2( 750 OPJ_INT32* tmp, 751 const OPJ_INT32 sn, 752 const OPJ_INT32 len, 753 OPJ_INT32* tiledp_col, 754 const OPJ_SIZE_T stride) 755 { 756 OPJ_INT32 i; 757 OPJ_SIZE_T j; 758 759 VREG s1_0, s2_0, dc_0, dn_0; 760 VREG s1_1, s2_1, dc_1, dn_1; 761 const VREG two = LOAD_CST(2); 762 763 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride]; 764 const OPJ_INT32* in_odd = &tiledp_col[0]; 765 766 assert(len > 2); 767 #if __AVX2__ 768 assert(PARALLEL_COLS_53 == 16); 769 assert(VREG_INT_COUNT == 8); 770 #else 771 assert(PARALLEL_COLS_53 == 8); 772 assert(VREG_INT_COUNT == 4); 773 #endif 774 775 /* Note: loads of input even/odd values must be done in a unaligned */ 776 /* fashion. But stores in tmp can be done with aligned store, since */ 777 /* the temporary buffer is properly aligned */ 778 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0); 779 780 s1_0 = LOADU(in_even + stride); 781 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */ 782 dc_0 = SUB(LOADU(in_odd + 0), 783 SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2)); 784 STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0)); 785 786 s1_1 = LOADU(in_even + stride + VREG_INT_COUNT); 787 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */ 788 dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT), 789 SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2)); 790 STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT, 791 ADD(LOADU(in_even + VREG_INT_COUNT), dc_1)); 792 793 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { 794 795 s2_0 = LOADU(in_even + (j + 1) * stride); 796 s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT); 797 798 /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */ 799 dn_0 = SUB(LOADU(in_odd + j * stride), 800 SAR(ADD3(s1_0, s2_0, two), 2)); 801 dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT), 802 SAR(ADD3(s1_1, s2_1, two), 2)); 803 804 STORE(tmp + PARALLEL_COLS_53 * i, dc_0); 805 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1); 806 807 /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */ 808 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0, 809 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1))); 810 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT, 811 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1))); 812 813 dc_0 = dn_0; 814 s1_0 = s2_0; 815 dc_1 = dn_1; 816 s1_1 = s2_1; 817 } 818 STORE(tmp + PARALLEL_COLS_53 * i, dc_0); 819 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1); 820 821 if (!(len & 1)) { 822 /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */ 823 dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride), 824 SAR(ADD3(s1_0, s1_0, two), 2)); 825 dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT), 826 SAR(ADD3(s1_1, s1_1, two), 2)); 827 828 /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */ 829 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0, 830 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1))); 831 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT, 832 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1))); 833 834 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0); 835 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1); 836 } else { 837 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0)); 838 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, 839 ADD(s1_1, dc_1)); 840 } 841 842 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride); 843 } 844 845 #undef VREG 846 #undef LOAD_CST 847 #undef LOADU 848 #undef LOAD 849 #undef STORE 850 #undef STOREU 851 #undef ADD 852 #undef ADD3 853 #undef SUB 854 #undef SAR 855 856 #endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */ 857 858 #if !defined(STANDARD_SLOW_VERSION) 859 /** Vertical inverse 5x3 wavelet transform for one column, when top-most 860 * pixel is on even coordinate */ 861 static void opj_idwt3_v_cas0(OPJ_INT32* tmp, 862 const OPJ_INT32 sn, 863 const OPJ_INT32 len, 864 OPJ_INT32* tiledp_col, 865 const OPJ_SIZE_T stride) 866 { 867 OPJ_INT32 i, j; 868 OPJ_INT32 d1c, d1n, s1n, s0c, s0n; 869 870 assert(len > 1); 871 872 /* Performs lifting in one single iteration. Saves memory */ 873 /* accesses and explicit interleaving. */ 874 875 s1n = tiledp_col[0]; 876 d1n = tiledp_col[(OPJ_SIZE_T)sn * stride]; 877 s0n = s1n - ((d1n + 1) >> 1); 878 879 for (i = 0, j = 0; i < (len - 3); i += 2, j++) { 880 d1c = d1n; 881 s0c = s0n; 882 883 s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride]; 884 d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride]; 885 886 s0n = s1n - ((d1c + d1n + 2) >> 2); 887 888 tmp[i ] = s0c; 889 tmp[i + 1] = d1c + ((s0c + s0n) >> 1); 890 } 891 892 tmp[i] = s0n; 893 894 if (len & 1) { 895 tmp[len - 1] = 896 tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] - 897 ((d1n + 1) >> 1); 898 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1); 899 } else { 900 tmp[len - 1] = d1n + s0n; 901 } 902 903 for (i = 0; i < len; ++i) { 904 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i]; 905 } 906 } 907 908 909 /** Vertical inverse 5x3 wavelet transform for one column, when top-most 910 * pixel is on odd coordinate */ 911 static void opj_idwt3_v_cas1(OPJ_INT32* tmp, 912 const OPJ_INT32 sn, 913 const OPJ_INT32 len, 914 OPJ_INT32* tiledp_col, 915 const OPJ_SIZE_T stride) 916 { 917 OPJ_INT32 i, j; 918 OPJ_INT32 s1, s2, dc, dn; 919 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride]; 920 const OPJ_INT32* in_odd = &tiledp_col[0]; 921 922 assert(len > 2); 923 924 /* Performs lifting in one single iteration. Saves memory */ 925 /* accesses and explicit interleaving. */ 926 927 s1 = in_even[stride]; 928 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2); 929 tmp[0] = in_even[0] + dc; 930 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { 931 932 s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride]; 933 934 dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2); 935 tmp[i ] = dc; 936 tmp[i + 1] = s1 + ((dn + dc) >> 1); 937 938 dc = dn; 939 s1 = s2; 940 } 941 tmp[i] = dc; 942 if (!(len & 1)) { 943 dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1); 944 tmp[len - 2] = s1 + ((dn + dc) >> 1); 945 tmp[len - 1] = dn; 946 } else { 947 tmp[len - 1] = s1 + dc; 948 } 949 950 for (i = 0; i < len; ++i) { 951 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i]; 952 } 953 } 954 #endif /* !defined(STANDARD_SLOW_VERSION) */ 955 956 /* <summary> */ 957 /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */ 958 /* </summary> */ 959 /* Performs interleave, inverse wavelet transform and copy back to buffer */ 960 static void opj_idwt53_v(const opj_dwt_t *dwt, 961 OPJ_INT32* tiledp_col, 962 OPJ_SIZE_T stride, 963 OPJ_INT32 nb_cols) 964 { 965 #ifdef STANDARD_SLOW_VERSION 966 /* For documentation purpose */ 967 OPJ_INT32 k, c; 968 for (c = 0; c < nb_cols; c ++) { 969 opj_dwt_interleave_v(dwt, tiledp_col + c, stride); 970 opj_dwt_decode_1(dwt); 971 for (k = 0; k < dwt->sn + dwt->dn; ++k) { 972 tiledp_col[c + k * stride] = dwt->mem[k]; 973 } 974 } 975 #else 976 const OPJ_INT32 sn = dwt->sn; 977 const OPJ_INT32 len = sn + dwt->dn; 978 if (dwt->cas == 0) { 979 /* If len == 1, unmodified value */ 980 981 #if (defined(__SSE2__) || defined(__AVX2__)) 982 if (len > 1 && nb_cols == PARALLEL_COLS_53) { 983 /* Same as below general case, except that thanks to SSE2/AVX2 */ 984 /* we can efficently process 8/16 columns in parallel */ 985 opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride); 986 return; 987 } 988 #endif 989 if (len > 1) { 990 OPJ_INT32 c; 991 for (c = 0; c < nb_cols; c++, tiledp_col++) { 992 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride); 993 } 994 return; 995 } 996 } else { 997 if (len == 1) { 998 OPJ_INT32 c; 999 for (c = 0; c < nb_cols; c++, tiledp_col++) { 1000 tiledp_col[0] /= 2; 1001 } 1002 return; 1003 } 1004 1005 if (len == 2) { 1006 OPJ_INT32 c; 1007 OPJ_INT32* out = dwt->mem; 1008 for (c = 0; c < nb_cols; c++, tiledp_col++) { 1009 OPJ_INT32 i; 1010 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride]; 1011 const OPJ_INT32* in_odd = &tiledp_col[0]; 1012 1013 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1); 1014 out[0] = in_even[0] + out[1]; 1015 1016 for (i = 0; i < len; ++i) { 1017 tiledp_col[(OPJ_SIZE_T)i * stride] = out[i]; 1018 } 1019 } 1020 1021 return; 1022 } 1023 1024 #if (defined(__SSE2__) || defined(__AVX2__)) 1025 if (len > 2 && nb_cols == PARALLEL_COLS_53) { 1026 /* Same as below general case, except that thanks to SSE2/AVX2 */ 1027 /* we can efficently process 8/16 columns in parallel */ 1028 opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride); 1029 return; 1030 } 1031 #endif 1032 if (len > 2) { 1033 OPJ_INT32 c; 1034 for (c = 0; c < nb_cols; c++, tiledp_col++) { 1035 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride); 1036 } 1037 return; 1038 } 1039 } 1040 #endif 1041 } 1042 1043 1044 /* <summary> */ 1045 /* Forward 9-7 wavelet transform in 1-D. */ 1046 /* </summary> */ 1047 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_SIZE_T a_count, 1048 OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) 1049 { 1050 OPJ_INT32 i; 1051 if (!cas) { 1052 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ 1053 for (i = 0; i < dn; i++) { 1054 OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993); 1055 } 1056 for (i = 0; i < sn; i++) { 1057 OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434); 1058 } 1059 for (i = 0; i < dn; i++) { 1060 OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233); 1061 } 1062 for (i = 0; i < sn; i++) { 1063 OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633); 1064 } 1065 for (i = 0; i < dn; i++) { 1066 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */ 1067 } 1068 for (i = 0; i < sn; i++) { 1069 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */ 1070 } 1071 } 1072 } else { 1073 if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */ 1074 for (i = 0; i < dn; i++) { 1075 OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993); 1076 } 1077 for (i = 0; i < sn; i++) { 1078 OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434); 1079 } 1080 for (i = 0; i < dn; i++) { 1081 OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233); 1082 } 1083 for (i = 0; i < sn; i++) { 1084 OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633); 1085 } 1086 for (i = 0; i < dn; i++) { 1087 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */ 1088 } 1089 for (i = 0; i < sn; i++) { 1090 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */ 1091 } 1092 } 1093 } 1094 } 1095 1096 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, 1097 opj_stepsize_t *bandno_stepsize) 1098 { 1099 OPJ_INT32 p, n; 1100 p = opj_int_floorlog2(stepsize) - 13; 1101 n = 11 - opj_int_floorlog2(stepsize); 1102 bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff; 1103 bandno_stepsize->expn = numbps - p; 1104 } 1105 1106 /* 1107 ========================================================== 1108 DWT interface 1109 ========================================================== 1110 */ 1111 1112 1113 /* <summary> */ 1114 /* Forward 5-3 wavelet transform in 2-D. */ 1115 /* </summary> */ 1116 static INLINE OPJ_BOOL opj_dwt_encode_procedure(const opj_tcd_tilecomp_t * tilec, 1117 void(*p_function)(OPJ_INT32 *, OPJ_SIZE_T, OPJ_INT32, OPJ_INT32, OPJ_INT32)) 1118 { 1119 OPJ_INT32 i, j, k; 1120 OPJ_INT32 *a = 00; 1121 OPJ_INT32 *aj = 00; 1122 OPJ_INT32 *bj = 00; 1123 OPJ_INT32 w, l; 1124 1125 OPJ_INT32 rw; /* width of the resolution level computed */ 1126 OPJ_INT32 rh; /* height of the resolution level computed */ 1127 OPJ_SIZE_T l_data_count; 1128 OPJ_SIZE_T l_data_size; 1129 1130 opj_tcd_resolution_t * l_cur_res = 0; 1131 opj_tcd_resolution_t * l_last_res = 0; 1132 1133 w = tilec->x1 - tilec->x0; 1134 l = (OPJ_INT32)tilec->numresolutions - 1; 1135 a = tilec->data; 1136 1137 l_cur_res = tilec->resolutions + l; 1138 l_last_res = l_cur_res - 1; 1139 1140 l_data_count = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions); 1141 /* overflow check */ 1142 if (l_data_count > (SIZE_MAX / sizeof(OPJ_INT32))) { 1143 /* FIXME event manager error callback */ 1144 return OPJ_FALSE; 1145 } 1146 l_data_size = l_data_count * sizeof(OPJ_INT32); 1147 bj = (OPJ_INT32*)opj_malloc(l_data_size); 1148 /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */ 1149 /* in that case, so do not error out */ 1150 if (l_data_size != 0 && ! bj) { 1151 return OPJ_FALSE; 1152 } 1153 i = l; 1154 1155 while (i--) { 1156 OPJ_INT32 rw1; /* width of the resolution level once lower than computed one */ 1157 OPJ_INT32 rh1; /* height of the resolution level once lower than computed one */ 1158 OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */ 1159 OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */ 1160 OPJ_INT32 dn, sn; 1161 1162 rw = l_cur_res->x1 - l_cur_res->x0; 1163 rh = l_cur_res->y1 - l_cur_res->y0; 1164 rw1 = l_last_res->x1 - l_last_res->x0; 1165 rh1 = l_last_res->y1 - l_last_res->y0; 1166 1167 cas_row = l_cur_res->x0 & 1; 1168 cas_col = l_cur_res->y0 & 1; 1169 1170 sn = rh1; 1171 dn = rh - rh1; 1172 for (j = 0; j < rw; ++j) { 1173 aj = a + j; 1174 for (k = 0; k < rh; ++k) { 1175 bj[k] = aj[k * w]; 1176 } 1177 1178 (*p_function) (bj, l_data_count, dn, sn, cas_col); 1179 1180 opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col); 1181 } 1182 1183 sn = rw1; 1184 dn = rw - rw1; 1185 1186 for (j = 0; j < rh; j++) { 1187 aj = a + j * w; 1188 for (k = 0; k < rw; k++) { 1189 bj[k] = aj[k]; 1190 } 1191 (*p_function) (bj, l_data_count, dn, sn, cas_row); 1192 opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row); 1193 } 1194 1195 l_cur_res = l_last_res; 1196 1197 --l_last_res; 1198 } 1199 1200 opj_free(bj); 1201 return OPJ_TRUE; 1202 } 1203 1204 /* Forward 5-3 wavelet transform in 2-D. */ 1205 /* </summary> */ 1206 OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec) 1207 { 1208 return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1); 1209 } 1210 1211 /* <summary> */ 1212 /* Inverse 5-3 wavelet transform in 2-D. */ 1213 /* </summary> */ 1214 OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec, 1215 OPJ_UINT32 numres) 1216 { 1217 if (p_tcd->whole_tile_decoding) { 1218 return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres); 1219 } else { 1220 return opj_dwt_decode_partial_tile(tilec, numres); 1221 } 1222 } 1223 1224 1225 /* <summary> */ 1226 /* Get gain of 5-3 wavelet transform. */ 1227 /* </summary> */ 1228 OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient) 1229 { 1230 if (orient == 0) { 1231 return 0; 1232 } 1233 if (orient == 1 || orient == 2) { 1234 return 1; 1235 } 1236 return 2; 1237 } 1238 1239 /* <summary> */ 1240 /* Get norm of 5-3 wavelet. */ 1241 /* </summary> */ 1242 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient) 1243 { 1244 /* FIXME ! This is just a band-aid to avoid a buffer overflow */ 1245 /* but the array should really be extended up to 33 resolution levels */ 1246 /* See https://github.com/uclouvain/openjpeg/issues/493 */ 1247 if (orient == 0 && level >= 10) { 1248 level = 9; 1249 } else if (orient > 0 && level >= 9) { 1250 level = 8; 1251 } 1252 return opj_dwt_norms[orient][level]; 1253 } 1254 1255 /* <summary> */ 1256 /* Forward 9-7 wavelet transform in 2-D. */ 1257 /* </summary> */ 1258 OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec) 1259 { 1260 return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real); 1261 } 1262 1263 /* <summary> */ 1264 /* Get gain of 9-7 wavelet transform. */ 1265 /* </summary> */ 1266 OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient) 1267 { 1268 (void)orient; 1269 return 0; 1270 } 1271 1272 /* <summary> */ 1273 /* Get norm of 9-7 wavelet. */ 1274 /* </summary> */ 1275 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient) 1276 { 1277 /* FIXME ! This is just a band-aid to avoid a buffer overflow */ 1278 /* but the array should really be extended up to 33 resolution levels */ 1279 /* See https://github.com/uclouvain/openjpeg/issues/493 */ 1280 if (orient == 0 && level >= 10) { 1281 level = 9; 1282 } else if (orient > 0 && level >= 9) { 1283 level = 8; 1284 } 1285 return opj_dwt_norms_real[orient][level]; 1286 } 1287 1288 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec) 1289 { 1290 OPJ_UINT32 numbands, bandno; 1291 numbands = 3 * tccp->numresolutions - 2; 1292 for (bandno = 0; bandno < numbands; bandno++) { 1293 OPJ_FLOAT64 stepsize; 1294 OPJ_UINT32 resno, level, orient, gain; 1295 1296 resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1); 1297 orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1); 1298 level = tccp->numresolutions - 1 - resno; 1299 gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) || 1300 (orient == 2)) ? 1 : 2)); 1301 if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) { 1302 stepsize = 1.0; 1303 } else { 1304 OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level]; 1305 stepsize = (1 << (gain)) / norm; 1306 } 1307 opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0), 1308 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]); 1309 } 1310 } 1311 1312 /* <summary> */ 1313 /* Determine maximum computed resolution level for inverse wavelet transform */ 1314 /* </summary> */ 1315 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, 1316 OPJ_UINT32 i) 1317 { 1318 OPJ_UINT32 mr = 0; 1319 OPJ_UINT32 w; 1320 while (--i) { 1321 ++r; 1322 if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) { 1323 mr = w ; 1324 } 1325 if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) { 1326 mr = w ; 1327 } 1328 } 1329 return mr ; 1330 } 1331 1332 typedef struct { 1333 opj_dwt_t h; 1334 OPJ_UINT32 rw; 1335 OPJ_UINT32 w; 1336 OPJ_INT32 * OPJ_RESTRICT tiledp; 1337 OPJ_UINT32 min_j; 1338 OPJ_UINT32 max_j; 1339 } opj_dwd_decode_h_job_t; 1340 1341 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls) 1342 { 1343 OPJ_UINT32 j; 1344 opj_dwd_decode_h_job_t* job; 1345 (void)tls; 1346 1347 job = (opj_dwd_decode_h_job_t*)user_data; 1348 for (j = job->min_j; j < job->max_j; j++) { 1349 opj_idwt53_h(&job->h, &job->tiledp[j * job->w]); 1350 } 1351 1352 opj_aligned_free(job->h.mem); 1353 opj_free(job); 1354 } 1355 1356 typedef struct { 1357 opj_dwt_t v; 1358 OPJ_UINT32 rh; 1359 OPJ_UINT32 w; 1360 OPJ_INT32 * OPJ_RESTRICT tiledp; 1361 OPJ_UINT32 min_j; 1362 OPJ_UINT32 max_j; 1363 } opj_dwd_decode_v_job_t; 1364 1365 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls) 1366 { 1367 OPJ_UINT32 j; 1368 opj_dwd_decode_v_job_t* job; 1369 (void)tls; 1370 1371 job = (opj_dwd_decode_v_job_t*)user_data; 1372 for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j; 1373 j += PARALLEL_COLS_53) { 1374 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w, 1375 PARALLEL_COLS_53); 1376 } 1377 if (j < job->max_j) 1378 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w, 1379 (OPJ_INT32)(job->max_j - j)); 1380 1381 opj_aligned_free(job->v.mem); 1382 opj_free(job); 1383 } 1384 1385 1386 /* <summary> */ 1387 /* Inverse wavelet transform in 2-D. */ 1388 /* </summary> */ 1389 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, 1390 const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) 1391 { 1392 opj_dwt_t h; 1393 opj_dwt_t v; 1394 1395 opj_tcd_resolution_t* tr = tilec->resolutions; 1396 1397 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - 1398 tr->x0); /* width of the resolution level computed */ 1399 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - 1400 tr->y0); /* height of the resolution level computed */ 1401 1402 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions - 1403 1].x1 - 1404 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0); 1405 OPJ_SIZE_T h_mem_size; 1406 int num_threads; 1407 1408 if (numres == 1U) { 1409 return OPJ_TRUE; 1410 } 1411 num_threads = opj_thread_pool_get_thread_count(tp); 1412 h.mem_count = opj_dwt_max_resolution(tr, numres); 1413 /* overflow check */ 1414 if (h.mem_count > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) { 1415 /* FIXME event manager error callback */ 1416 return OPJ_FALSE; 1417 } 1418 /* We need PARALLEL_COLS_53 times the height of the array, */ 1419 /* since for the vertical pass */ 1420 /* we process PARALLEL_COLS_53 columns at a time */ 1421 h_mem_size = h.mem_count * PARALLEL_COLS_53 * sizeof(OPJ_INT32); 1422 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); 1423 if (! h.mem) { 1424 /* FIXME event manager error callback */ 1425 return OPJ_FALSE; 1426 } 1427 1428 v.mem_count = h.mem_count; 1429 v.mem = h.mem; 1430 1431 while (--numres) { 1432 OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data; 1433 OPJ_UINT32 j; 1434 1435 ++tr; 1436 h.sn = (OPJ_INT32)rw; 1437 v.sn = (OPJ_INT32)rh; 1438 1439 rw = (OPJ_UINT32)(tr->x1 - tr->x0); 1440 rh = (OPJ_UINT32)(tr->y1 - tr->y0); 1441 1442 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); 1443 h.cas = tr->x0 % 2; 1444 1445 if (num_threads <= 1 || rh <= 1) { 1446 for (j = 0; j < rh; ++j) { 1447 opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]); 1448 } 1449 } else { 1450 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; 1451 OPJ_UINT32 step_j; 1452 1453 if (rh < num_jobs) { 1454 num_jobs = rh; 1455 } 1456 step_j = (rh / num_jobs); 1457 1458 for (j = 0; j < num_jobs; j++) { 1459 opj_dwd_decode_h_job_t* job; 1460 1461 job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t)); 1462 if (!job) { 1463 /* It would be nice to fallback to single thread case, but */ 1464 /* unfortunately some jobs may be launched and have modified */ 1465 /* tiledp, so it is not practical to recover from that error */ 1466 /* FIXME event manager error callback */ 1467 opj_thread_pool_wait_completion(tp, 0); 1468 opj_aligned_free(h.mem); 1469 return OPJ_FALSE; 1470 } 1471 job->h = h; 1472 job->rw = rw; 1473 job->w = w; 1474 job->tiledp = tiledp; 1475 job->min_j = j * step_j; 1476 job->max_j = (j + 1U) * step_j; /* this can overflow */ 1477 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ 1478 job->max_j = rh; 1479 } 1480 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); 1481 if (!job->h.mem) { 1482 /* FIXME event manager error callback */ 1483 opj_thread_pool_wait_completion(tp, 0); 1484 opj_free(job); 1485 opj_aligned_free(h.mem); 1486 return OPJ_FALSE; 1487 } 1488 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job); 1489 } 1490 opj_thread_pool_wait_completion(tp, 0); 1491 } 1492 1493 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); 1494 v.cas = tr->y0 % 2; 1495 1496 if (num_threads <= 1 || rw <= 1) { 1497 for (j = 0; j + PARALLEL_COLS_53 <= rw; 1498 j += PARALLEL_COLS_53) { 1499 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53); 1500 } 1501 if (j < rw) { 1502 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j)); 1503 } 1504 } else { 1505 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; 1506 OPJ_UINT32 step_j; 1507 1508 if (rw < num_jobs) { 1509 num_jobs = rw; 1510 } 1511 step_j = (rw / num_jobs); 1512 1513 for (j = 0; j < num_jobs; j++) { 1514 opj_dwd_decode_v_job_t* job; 1515 1516 job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t)); 1517 if (!job) { 1518 /* It would be nice to fallback to single thread case, but */ 1519 /* unfortunately some jobs may be launched and have modified */ 1520 /* tiledp, so it is not practical to recover from that error */ 1521 /* FIXME event manager error callback */ 1522 opj_thread_pool_wait_completion(tp, 0); 1523 opj_aligned_free(v.mem); 1524 return OPJ_FALSE; 1525 } 1526 job->v = v; 1527 job->rh = rh; 1528 job->w = w; 1529 job->tiledp = tiledp; 1530 job->min_j = j * step_j; 1531 job->max_j = (j + 1U) * step_j; /* this can overflow */ 1532 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ 1533 job->max_j = rw; 1534 } 1535 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); 1536 if (!job->v.mem) { 1537 /* FIXME event manager error callback */ 1538 opj_thread_pool_wait_completion(tp, 0); 1539 opj_free(job); 1540 opj_aligned_free(v.mem); 1541 return OPJ_FALSE; 1542 } 1543 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job); 1544 } 1545 opj_thread_pool_wait_completion(tp, 0); 1546 } 1547 } 1548 opj_aligned_free(h.mem); 1549 return OPJ_TRUE; 1550 } 1551 1552 static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest, 1553 OPJ_INT32 cas, 1554 opj_sparse_array_int32_t* sa, 1555 OPJ_UINT32 sa_line, 1556 OPJ_UINT32 sn, 1557 OPJ_UINT32 win_l_x0, 1558 OPJ_UINT32 win_l_x1, 1559 OPJ_UINT32 win_h_x0, 1560 OPJ_UINT32 win_h_x1) 1561 { 1562 OPJ_BOOL ret; 1563 ret = opj_sparse_array_int32_read(sa, 1564 win_l_x0, sa_line, 1565 win_l_x1, sa_line + 1, 1566 dest + cas + 2 * win_l_x0, 1567 2, 0, OPJ_TRUE); 1568 assert(ret); 1569 ret = opj_sparse_array_int32_read(sa, 1570 sn + win_h_x0, sa_line, 1571 sn + win_h_x1, sa_line + 1, 1572 dest + 1 - cas + 2 * win_h_x0, 1573 2, 0, OPJ_TRUE); 1574 assert(ret); 1575 OPJ_UNUSED(ret); 1576 } 1577 1578 1579 static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest, 1580 OPJ_INT32 cas, 1581 opj_sparse_array_int32_t* sa, 1582 OPJ_UINT32 sa_col, 1583 OPJ_UINT32 nb_cols, 1584 OPJ_UINT32 sn, 1585 OPJ_UINT32 win_l_y0, 1586 OPJ_UINT32 win_l_y1, 1587 OPJ_UINT32 win_h_y0, 1588 OPJ_UINT32 win_h_y1) 1589 { 1590 OPJ_BOOL ret; 1591 ret = opj_sparse_array_int32_read(sa, 1592 sa_col, win_l_y0, 1593 sa_col + nb_cols, win_l_y1, 1594 dest + cas * 4 + 2 * 4 * win_l_y0, 1595 1, 2 * 4, OPJ_TRUE); 1596 assert(ret); 1597 ret = opj_sparse_array_int32_read(sa, 1598 sa_col, sn + win_h_y0, 1599 sa_col + nb_cols, sn + win_h_y1, 1600 dest + (1 - cas) * 4 + 2 * 4 * win_h_y0, 1601 1, 2 * 4, OPJ_TRUE); 1602 assert(ret); 1603 OPJ_UNUSED(ret); 1604 } 1605 1606 static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_SIZE_T a_count, 1607 OPJ_INT32 dn, OPJ_INT32 sn, 1608 OPJ_INT32 cas, 1609 OPJ_INT32 win_l_x0, 1610 OPJ_INT32 win_l_x1, 1611 OPJ_INT32 win_h_x0, 1612 OPJ_INT32 win_h_x1) 1613 { 1614 OPJ_INT32 i; 1615 1616 if (!cas) { 1617 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ 1618 1619 /* Naive version is : 1620 for (i = win_l_x0; i < i_max; i++) { 1621 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 1622 } 1623 for (i = win_h_x0; i < win_h_x1; i++) { 1624 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; 1625 } 1626 but the compiler doesn't manage to unroll it to avoid bound 1627 checking in OPJ_S_ and OPJ_D_ macros 1628 */ 1629 1630 i = win_l_x0; 1631 if (i < win_l_x1) { 1632 OPJ_INT32 i_max; 1633 1634 /* Left-most case */ 1635 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 1636 i ++; 1637 1638 i_max = win_l_x1; 1639 if (i_max > dn) { 1640 i_max = dn; 1641 } 1642 for (; i < i_max; i++) { 1643 /* No bound checking */ 1644 OPJ_S(i) -= (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2; 1645 } 1646 for (; i < win_l_x1; i++) { 1647 /* Right-most case */ 1648 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 1649 } 1650 } 1651 1652 i = win_h_x0; 1653 if (i < win_h_x1) { 1654 OPJ_INT32 i_max = win_h_x1; 1655 if (i_max >= sn) { 1656 i_max = sn - 1; 1657 } 1658 for (; i < i_max; i++) { 1659 /* No bound checking */ 1660 OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1)) >> 1; 1661 } 1662 for (; i < win_h_x1; i++) { 1663 /* Right-most case */ 1664 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; 1665 } 1666 } 1667 } 1668 } else { 1669 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ 1670 OPJ_S(0) /= 2; 1671 } else { 1672 for (i = win_l_x0; i < win_l_x1; i++) { 1673 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; 1674 } 1675 for (i = win_h_x0; i < win_h_x1; i++) { 1676 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; 1677 } 1678 } 1679 } 1680 } 1681 1682 #define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off] 1683 #define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off] 1684 #define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off))) 1685 #define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off))) 1686 #define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off))) 1687 #define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off))) 1688 1689 static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a, 1690 OPJ_UINT32 nb_cols, 1691 OPJ_INT32 dn, OPJ_INT32 sn, 1692 OPJ_INT32 cas, 1693 OPJ_INT32 win_l_x0, 1694 OPJ_INT32 win_l_x1, 1695 OPJ_INT32 win_h_x0, 1696 OPJ_INT32 win_h_x1) 1697 { 1698 OPJ_INT32 i; 1699 OPJ_UINT32 off; 1700 1701 (void)nb_cols; 1702 1703 if (!cas) { 1704 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ 1705 1706 /* Naive version is : 1707 for (i = win_l_x0; i < i_max; i++) { 1708 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 1709 } 1710 for (i = win_h_x0; i < win_h_x1; i++) { 1711 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; 1712 } 1713 but the compiler doesn't manage to unroll it to avoid bound 1714 checking in OPJ_S_ and OPJ_D_ macros 1715 */ 1716 1717 i = win_l_x0; 1718 if (i < win_l_x1) { 1719 OPJ_INT32 i_max; 1720 1721 /* Left-most case */ 1722 for (off = 0; off < 4; off++) { 1723 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2; 1724 } 1725 i ++; 1726 1727 i_max = win_l_x1; 1728 if (i_max > dn) { 1729 i_max = dn; 1730 } 1731 1732 #ifdef __SSE2__ 1733 if (i + 1 < i_max) { 1734 const __m128i two = _mm_set1_epi32(2); 1735 __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8)); 1736 for (; i + 1 < i_max; i += 2) { 1737 /* No bound checking */ 1738 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8)); 1739 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8)); 1740 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8)); 1741 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8)); 1742 S = _mm_sub_epi32(S, 1743 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2)); 1744 S1 = _mm_sub_epi32(S1, 1745 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2)); 1746 _mm_store_si128((__m128i*)(a + i * 8), S); 1747 _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1); 1748 Dm1 = D1; 1749 } 1750 } 1751 #endif 1752 1753 for (; i < i_max; i++) { 1754 /* No bound checking */ 1755 for (off = 0; off < 4; off++) { 1756 OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2; 1757 } 1758 } 1759 for (; i < win_l_x1; i++) { 1760 /* Right-most case */ 1761 for (off = 0; off < 4; off++) { 1762 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2; 1763 } 1764 } 1765 } 1766 1767 i = win_h_x0; 1768 if (i < win_h_x1) { 1769 OPJ_INT32 i_max = win_h_x1; 1770 if (i_max >= sn) { 1771 i_max = sn - 1; 1772 } 1773 1774 #ifdef __SSE2__ 1775 if (i + 1 < i_max) { 1776 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8)); 1777 for (; i + 1 < i_max; i += 2) { 1778 /* No bound checking */ 1779 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8)); 1780 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8)); 1781 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8)); 1782 __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8)); 1783 D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1)); 1784 D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1)); 1785 _mm_store_si128((__m128i*)(a + 4 + i * 8), D); 1786 _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1); 1787 S = S2; 1788 } 1789 } 1790 #endif 1791 1792 for (; i < i_max; i++) { 1793 /* No bound checking */ 1794 for (off = 0; off < 4; off++) { 1795 OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1; 1796 } 1797 } 1798 for (; i < win_h_x1; i++) { 1799 /* Right-most case */ 1800 for (off = 0; off < 4; off++) { 1801 OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1; 1802 } 1803 } 1804 } 1805 } 1806 } else { 1807 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ 1808 for (off = 0; off < 4; off++) { 1809 OPJ_S_off(0, off) /= 2; 1810 } 1811 } else { 1812 for (i = win_l_x0; i < win_l_x1; i++) { 1813 for (off = 0; off < 4; off++) { 1814 OPJ_D_off(i, off) -= (OPJ_SS__off(i, off) + OPJ_SS__off(i + 1, off) + 2) >> 2; 1815 } 1816 } 1817 for (i = win_h_x0; i < win_h_x1; i++) { 1818 for (off = 0; off < 4; off++) { 1819 OPJ_S_off(i, off) += (OPJ_DD__off(i, off) + OPJ_DD__off(i - 1, off)) >> 1; 1820 } 1821 } 1822 } 1823 } 1824 } 1825 1826 static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec, 1827 OPJ_UINT32 resno, 1828 OPJ_UINT32 bandno, 1829 OPJ_UINT32 tcx0, 1830 OPJ_UINT32 tcy0, 1831 OPJ_UINT32 tcx1, 1832 OPJ_UINT32 tcy1, 1833 OPJ_UINT32* tbx0, 1834 OPJ_UINT32* tby0, 1835 OPJ_UINT32* tbx1, 1836 OPJ_UINT32* tby1) 1837 { 1838 /* Compute number of decomposition for this band. See table F-1 */ 1839 OPJ_UINT32 nb = (resno == 0) ? 1840 tilec->numresolutions - 1 : 1841 tilec->numresolutions - resno; 1842 /* Map above tile-based coordinates to sub-band-based coordinates per */ 1843 /* equation B-15 of the standard */ 1844 OPJ_UINT32 x0b = bandno & 1; 1845 OPJ_UINT32 y0b = bandno >> 1; 1846 if (tbx0) { 1847 *tbx0 = (nb == 0) ? tcx0 : 1848 (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 : 1849 opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb); 1850 } 1851 if (tby0) { 1852 *tby0 = (nb == 0) ? tcy0 : 1853 (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 : 1854 opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb); 1855 } 1856 if (tbx1) { 1857 *tbx1 = (nb == 0) ? tcx1 : 1858 (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 : 1859 opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb); 1860 } 1861 if (tby1) { 1862 *tby1 = (nb == 0) ? tcy1 : 1863 (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 : 1864 opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb); 1865 } 1866 } 1867 1868 static void opj_dwt_segment_grow(OPJ_UINT32 filter_width, 1869 OPJ_UINT32 max_size, 1870 OPJ_UINT32* start, 1871 OPJ_UINT32* end) 1872 { 1873 *start = opj_uint_subs(*start, filter_width); 1874 *end = opj_uint_adds(*end, filter_width); 1875 *end = opj_uint_min(*end, max_size); 1876 } 1877 1878 1879 static opj_sparse_array_int32_t* opj_dwt_init_sparse_array( 1880 opj_tcd_tilecomp_t* tilec, 1881 OPJ_UINT32 numres) 1882 { 1883 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]); 1884 OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0); 1885 OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0); 1886 OPJ_UINT32 resno, bandno, precno, cblkno; 1887 opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create( 1888 w, h, opj_uint_min(w, 64), opj_uint_min(h, 64)); 1889 if (sa == NULL) { 1890 return NULL; 1891 } 1892 1893 for (resno = 0; resno < numres; ++resno) { 1894 opj_tcd_resolution_t* res = &tilec->resolutions[resno]; 1895 1896 for (bandno = 0; bandno < res->numbands; ++bandno) { 1897 opj_tcd_band_t* band = &res->bands[bandno]; 1898 1899 for (precno = 0; precno < res->pw * res->ph; ++precno) { 1900 opj_tcd_precinct_t* precinct = &band->precincts[precno]; 1901 for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) { 1902 opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno]; 1903 if (cblk->decoded_data != NULL) { 1904 OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0); 1905 OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0); 1906 OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0); 1907 OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0); 1908 1909 if (band->bandno & 1) { 1910 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; 1911 x += (OPJ_UINT32)(pres->x1 - pres->x0); 1912 } 1913 if (band->bandno & 2) { 1914 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; 1915 y += (OPJ_UINT32)(pres->y1 - pres->y0); 1916 } 1917 1918 if (!opj_sparse_array_int32_write(sa, x, y, 1919 x + cblk_w, y + cblk_h, 1920 cblk->decoded_data, 1921 1, cblk_w, OPJ_TRUE)) { 1922 opj_sparse_array_int32_free(sa); 1923 return NULL; 1924 } 1925 } 1926 } 1927 } 1928 } 1929 } 1930 1931 return sa; 1932 } 1933 1934 1935 static OPJ_BOOL opj_dwt_decode_partial_tile( 1936 opj_tcd_tilecomp_t* tilec, 1937 OPJ_UINT32 numres) 1938 { 1939 opj_sparse_array_int32_t* sa; 1940 opj_dwt_t h; 1941 opj_dwt_t v; 1942 OPJ_UINT32 resno; 1943 /* This value matches the maximum left/right extension given in tables */ 1944 /* F.2 and F.3 of the standard. */ 1945 const OPJ_UINT32 filter_width = 2U; 1946 1947 opj_tcd_resolution_t* tr = tilec->resolutions; 1948 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]); 1949 1950 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - 1951 tr->x0); /* width of the resolution level computed */ 1952 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - 1953 tr->y0); /* height of the resolution level computed */ 1954 1955 OPJ_SIZE_T h_mem_size; 1956 1957 /* Compute the intersection of the area of interest, expressed in tile coordinates */ 1958 /* with the tile coordinates */ 1959 OPJ_UINT32 win_tcx0 = tilec->win_x0; 1960 OPJ_UINT32 win_tcy0 = tilec->win_y0; 1961 OPJ_UINT32 win_tcx1 = tilec->win_x1; 1962 OPJ_UINT32 win_tcy1 = tilec->win_y1; 1963 1964 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) { 1965 return OPJ_TRUE; 1966 } 1967 1968 sa = opj_dwt_init_sparse_array(tilec, numres); 1969 if (sa == NULL) { 1970 return OPJ_FALSE; 1971 } 1972 1973 if (numres == 1U) { 1974 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, 1975 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, 1976 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, 1977 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, 1978 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, 1979 tilec->data_win, 1980 1, tr_max->win_x1 - tr_max->win_x0, 1981 OPJ_TRUE); 1982 assert(ret); 1983 OPJ_UNUSED(ret); 1984 opj_sparse_array_int32_free(sa); 1985 return OPJ_TRUE; 1986 } 1987 h.mem_count = opj_dwt_max_resolution(tr, numres); 1988 /* overflow check */ 1989 /* in vertical pass, we process 4 columns at a time */ 1990 if (h.mem_count > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) { 1991 /* FIXME event manager error callback */ 1992 opj_sparse_array_int32_free(sa); 1993 return OPJ_FALSE; 1994 } 1995 1996 h_mem_size = h.mem_count * 4 * sizeof(OPJ_INT32); 1997 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); 1998 if (! h.mem) { 1999 /* FIXME event manager error callback */ 2000 opj_sparse_array_int32_free(sa); 2001 return OPJ_FALSE; 2002 } 2003 2004 v.mem_count = h.mem_count; 2005 v.mem = h.mem; 2006 2007 for (resno = 1; resno < numres; resno ++) { 2008 OPJ_UINT32 i, j; 2009 /* Window of interest subband-based coordinates */ 2010 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1; 2011 OPJ_UINT32 win_hl_x0, win_hl_x1; 2012 OPJ_UINT32 win_lh_y0, win_lh_y1; 2013 /* Window of interest tile-resolution-based coordinates */ 2014 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1; 2015 /* Tile-resolution subband-based coordinates */ 2016 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0; 2017 2018 ++tr; 2019 2020 h.sn = (OPJ_INT32)rw; 2021 v.sn = (OPJ_INT32)rh; 2022 2023 rw = (OPJ_UINT32)(tr->x1 - tr->x0); 2024 rh = (OPJ_UINT32)(tr->y1 - tr->y0); 2025 2026 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); 2027 h.cas = tr->x0 % 2; 2028 2029 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); 2030 v.cas = tr->y0 % 2; 2031 2032 /* Get the subband coordinates for the window of interest */ 2033 /* LL band */ 2034 opj_dwt_get_band_coordinates(tilec, resno, 0, 2035 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2036 &win_ll_x0, &win_ll_y0, 2037 &win_ll_x1, &win_ll_y1); 2038 2039 /* HL band */ 2040 opj_dwt_get_band_coordinates(tilec, resno, 1, 2041 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2042 &win_hl_x0, NULL, &win_hl_x1, NULL); 2043 2044 /* LH band */ 2045 opj_dwt_get_band_coordinates(tilec, resno, 2, 2046 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2047 NULL, &win_lh_y0, NULL, &win_lh_y1); 2048 2049 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */ 2050 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0; 2051 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0; 2052 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0; 2053 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0; 2054 2055 /* Substract the origin of the bands for this tile, to the subwindow */ 2056 /* of interest band coordinates, so as to get them relative to the */ 2057 /* tile */ 2058 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0); 2059 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0); 2060 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0); 2061 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0); 2062 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0); 2063 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0); 2064 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0); 2065 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0); 2066 2067 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1); 2068 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1); 2069 2070 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1); 2071 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1); 2072 2073 /* Compute the tile-resolution-based coordinates for the window of interest */ 2074 if (h.cas == 0) { 2075 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1); 2076 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw); 2077 } else { 2078 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1); 2079 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw); 2080 } 2081 2082 if (v.cas == 0) { 2083 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1); 2084 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh); 2085 } else { 2086 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1); 2087 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh); 2088 } 2089 2090 for (j = 0; j < rh; ++j) { 2091 if ((j >= win_ll_y0 && j < win_ll_y1) || 2092 (j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) { 2093 2094 /* Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: */ 2095 /* signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' */ 2096 /* on opj_decompress -i ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 */ 2097 /* This is less extreme than memsetting the whole buffer to 0 */ 2098 /* although we could potentially do better with better handling of edge conditions */ 2099 if (win_tr_x1 >= 1 && win_tr_x1 < rw) { 2100 h.mem[win_tr_x1 - 1] = 0; 2101 } 2102 if (win_tr_x1 < rw) { 2103 h.mem[win_tr_x1] = 0; 2104 } 2105 2106 opj_dwt_interleave_partial_h(h.mem, 2107 h.cas, 2108 sa, 2109 j, 2110 (OPJ_UINT32)h.sn, 2111 win_ll_x0, 2112 win_ll_x1, 2113 win_hl_x0, 2114 win_hl_x1); 2115 opj_dwt_decode_partial_1(h.mem, h.mem_count, h.dn, h.sn, h.cas, 2116 (OPJ_INT32)win_ll_x0, 2117 (OPJ_INT32)win_ll_x1, 2118 (OPJ_INT32)win_hl_x0, 2119 (OPJ_INT32)win_hl_x1); 2120 if (!opj_sparse_array_int32_write(sa, 2121 win_tr_x0, j, 2122 win_tr_x1, j + 1, 2123 h.mem + win_tr_x0, 2124 1, 0, OPJ_TRUE)) { 2125 /* FIXME event manager error callback */ 2126 opj_sparse_array_int32_free(sa); 2127 opj_aligned_free(h.mem); 2128 return OPJ_FALSE; 2129 } 2130 } 2131 } 2132 2133 for (i = win_tr_x0; i < win_tr_x1;) { 2134 OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i); 2135 opj_dwt_interleave_partial_v(v.mem, 2136 v.cas, 2137 sa, 2138 i, 2139 nb_cols, 2140 (OPJ_UINT32)v.sn, 2141 win_ll_y0, 2142 win_ll_y1, 2143 win_lh_y0, 2144 win_lh_y1); 2145 opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas, 2146 (OPJ_INT32)win_ll_y0, 2147 (OPJ_INT32)win_ll_y1, 2148 (OPJ_INT32)win_lh_y0, 2149 (OPJ_INT32)win_lh_y1); 2150 if (!opj_sparse_array_int32_write(sa, 2151 i, win_tr_y0, 2152 i + nb_cols, win_tr_y1, 2153 v.mem + 4 * win_tr_y0, 2154 1, 4, OPJ_TRUE)) { 2155 /* FIXME event manager error callback */ 2156 opj_sparse_array_int32_free(sa); 2157 opj_aligned_free(h.mem); 2158 return OPJ_FALSE; 2159 } 2160 2161 i += nb_cols; 2162 } 2163 } 2164 opj_aligned_free(h.mem); 2165 2166 { 2167 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, 2168 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, 2169 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, 2170 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, 2171 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, 2172 tilec->data_win, 2173 1, tr_max->win_x1 - tr_max->win_x0, 2174 OPJ_TRUE); 2175 assert(ret); 2176 OPJ_UNUSED(ret); 2177 } 2178 opj_sparse_array_int32_free(sa); 2179 return OPJ_TRUE; 2180 } 2181 2182 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, 2183 OPJ_FLOAT32* OPJ_RESTRICT a, 2184 OPJ_UINT32 width, 2185 OPJ_UINT32 remaining_height) 2186 { 2187 OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas); 2188 OPJ_UINT32 i, k; 2189 OPJ_UINT32 x0 = dwt->win_l_x0; 2190 OPJ_UINT32 x1 = dwt->win_l_x1; 2191 2192 for (k = 0; k < 2; ++k) { 2193 if (remaining_height >= 4 && ((OPJ_SIZE_T) a & 0x0f) == 0 && 2194 ((OPJ_SIZE_T) bi & 0x0f) == 0 && (width & 0x0f) == 0) { 2195 /* Fast code path */ 2196 for (i = x0; i < x1; ++i) { 2197 OPJ_UINT32 j = i; 2198 bi[i * 8 ] = a[j]; 2199 j += width; 2200 bi[i * 8 + 1] = a[j]; 2201 j += width; 2202 bi[i * 8 + 2] = a[j]; 2203 j += width; 2204 bi[i * 8 + 3] = a[j]; 2205 } 2206 } else { 2207 /* Slow code path */ 2208 for (i = x0; i < x1; ++i) { 2209 OPJ_UINT32 j = i; 2210 bi[i * 8 ] = a[j]; 2211 j += width; 2212 if (remaining_height == 1) { 2213 continue; 2214 } 2215 bi[i * 8 + 1] = a[j]; 2216 j += width; 2217 if (remaining_height == 2) { 2218 continue; 2219 } 2220 bi[i * 8 + 2] = a[j]; 2221 j += width; 2222 if (remaining_height == 3) { 2223 continue; 2224 } 2225 bi[i * 8 + 3] = a[j]; /* This one*/ 2226 } 2227 } 2228 2229 bi = (OPJ_FLOAT32*)(dwt->wavelet + 1 - dwt->cas); 2230 a += dwt->sn; 2231 x0 = dwt->win_h_x0; 2232 x1 = dwt->win_h_x1; 2233 } 2234 } 2235 2236 static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt, 2237 opj_sparse_array_int32_t* sa, 2238 OPJ_UINT32 sa_line, 2239 OPJ_UINT32 remaining_height) 2240 { 2241 OPJ_UINT32 i; 2242 for (i = 0; i < remaining_height; i++) { 2243 OPJ_BOOL ret; 2244 ret = opj_sparse_array_int32_read(sa, 2245 dwt->win_l_x0, sa_line + i, 2246 dwt->win_l_x1, sa_line + i + 1, 2247 /* Nasty cast from float* to int32* */ 2248 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i, 2249 8, 0, OPJ_TRUE); 2250 assert(ret); 2251 ret = opj_sparse_array_int32_read(sa, 2252 (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i, 2253 (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1, 2254 /* Nasty cast from float* to int32* */ 2255 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i, 2256 8, 0, OPJ_TRUE); 2257 assert(ret); 2258 OPJ_UNUSED(ret); 2259 } 2260 } 2261 2262 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, 2263 OPJ_FLOAT32* OPJ_RESTRICT a, 2264 OPJ_UINT32 width, 2265 OPJ_UINT32 nb_elts_read) 2266 { 2267 opj_v4_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas; 2268 OPJ_UINT32 i; 2269 2270 for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) { 2271 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width], 2272 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32)); 2273 } 2274 2275 a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width; 2276 bi = dwt->wavelet + 1 - dwt->cas; 2277 2278 for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) { 2279 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width], 2280 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32)); 2281 } 2282 } 2283 2284 static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt, 2285 opj_sparse_array_int32_t* sa, 2286 OPJ_UINT32 sa_col, 2287 OPJ_UINT32 nb_elts_read) 2288 { 2289 OPJ_BOOL ret; 2290 ret = opj_sparse_array_int32_read(sa, 2291 sa_col, dwt->win_l_x0, 2292 sa_col + nb_elts_read, dwt->win_l_x1, 2293 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0), 2294 1, 8, OPJ_TRUE); 2295 assert(ret); 2296 ret = opj_sparse_array_int32_read(sa, 2297 sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, 2298 sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, 2299 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0), 2300 1, 8, OPJ_TRUE); 2301 assert(ret); 2302 OPJ_UNUSED(ret); 2303 } 2304 2305 #ifdef __SSE__ 2306 2307 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, 2308 OPJ_UINT32 start, 2309 OPJ_UINT32 end, 2310 const __m128 c) 2311 { 2312 __m128* OPJ_RESTRICT vw = (__m128*) w; 2313 OPJ_UINT32 i; 2314 /* 4x unrolled loop */ 2315 vw += 2 * start; 2316 for (i = start; i + 3 < end; i += 4, vw += 8) { 2317 __m128 xmm0 = _mm_mul_ps(vw[0], c); 2318 __m128 xmm2 = _mm_mul_ps(vw[2], c); 2319 __m128 xmm4 = _mm_mul_ps(vw[4], c); 2320 __m128 xmm6 = _mm_mul_ps(vw[6], c); 2321 vw[0] = xmm0; 2322 vw[2] = xmm2; 2323 vw[4] = xmm4; 2324 vw[6] = xmm6; 2325 } 2326 for (; i < end; ++i, vw += 2) { 2327 vw[0] = _mm_mul_ps(vw[0], c); 2328 } 2329 } 2330 2331 static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, 2332 OPJ_UINT32 start, 2333 OPJ_UINT32 end, 2334 OPJ_UINT32 m, 2335 __m128 c) 2336 { 2337 __m128* OPJ_RESTRICT vl = (__m128*) l; 2338 __m128* OPJ_RESTRICT vw = (__m128*) w; 2339 OPJ_UINT32 i; 2340 OPJ_UINT32 imax = opj_uint_min(end, m); 2341 __m128 tmp1, tmp2, tmp3; 2342 if (start == 0) { 2343 tmp1 = vl[0]; 2344 } else { 2345 vw += start * 2; 2346 tmp1 = vw[-3]; 2347 } 2348 2349 i = start; 2350 2351 /* 4x loop unrolling */ 2352 for (; i + 3 < imax; i += 4) { 2353 __m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; 2354 tmp2 = vw[-1]; 2355 tmp3 = vw[ 0]; 2356 tmp4 = vw[ 1]; 2357 tmp5 = vw[ 2]; 2358 tmp6 = vw[ 3]; 2359 tmp7 = vw[ 4]; 2360 tmp8 = vw[ 5]; 2361 tmp9 = vw[ 6]; 2362 vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); 2363 vw[ 1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c)); 2364 vw[ 3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c)); 2365 vw[ 5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c)); 2366 tmp1 = tmp9; 2367 vw += 8; 2368 } 2369 2370 for (; i < imax; ++i) { 2371 tmp2 = vw[-1]; 2372 tmp3 = vw[ 0]; 2373 vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); 2374 tmp1 = tmp3; 2375 vw += 2; 2376 } 2377 if (m < end) { 2378 assert(m + 1 == end); 2379 c = _mm_add_ps(c, c); 2380 c = _mm_mul_ps(c, vw[-2]); 2381 vw[-1] = _mm_add_ps(vw[-1], c); 2382 } 2383 } 2384 2385 #else 2386 2387 static void opj_v4dwt_decode_step1(opj_v4_t* w, 2388 OPJ_UINT32 start, 2389 OPJ_UINT32 end, 2390 const OPJ_FLOAT32 c) 2391 { 2392 OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w; 2393 OPJ_UINT32 i; 2394 for (i = start; i < end; ++i) { 2395 OPJ_FLOAT32 tmp1 = fw[i * 8 ]; 2396 OPJ_FLOAT32 tmp2 = fw[i * 8 + 1]; 2397 OPJ_FLOAT32 tmp3 = fw[i * 8 + 2]; 2398 OPJ_FLOAT32 tmp4 = fw[i * 8 + 3]; 2399 fw[i * 8 ] = tmp1 * c; 2400 fw[i * 8 + 1] = tmp2 * c; 2401 fw[i * 8 + 2] = tmp3 * c; 2402 fw[i * 8 + 3] = tmp4 * c; 2403 } 2404 } 2405 2406 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, 2407 OPJ_UINT32 start, 2408 OPJ_UINT32 end, 2409 OPJ_UINT32 m, 2410 OPJ_FLOAT32 c) 2411 { 2412 OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l; 2413 OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w; 2414 OPJ_UINT32 i; 2415 OPJ_UINT32 imax = opj_uint_min(end, m); 2416 if (start > 0) { 2417 fw += 8 * start; 2418 fl = fw - 8; 2419 } 2420 for (i = start; i < imax; ++i) { 2421 OPJ_FLOAT32 tmp1_1 = fl[0]; 2422 OPJ_FLOAT32 tmp1_2 = fl[1]; 2423 OPJ_FLOAT32 tmp1_3 = fl[2]; 2424 OPJ_FLOAT32 tmp1_4 = fl[3]; 2425 OPJ_FLOAT32 tmp2_1 = fw[-4]; 2426 OPJ_FLOAT32 tmp2_2 = fw[-3]; 2427 OPJ_FLOAT32 tmp2_3 = fw[-2]; 2428 OPJ_FLOAT32 tmp2_4 = fw[-1]; 2429 OPJ_FLOAT32 tmp3_1 = fw[0]; 2430 OPJ_FLOAT32 tmp3_2 = fw[1]; 2431 OPJ_FLOAT32 tmp3_3 = fw[2]; 2432 OPJ_FLOAT32 tmp3_4 = fw[3]; 2433 fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c); 2434 fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c); 2435 fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c); 2436 fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c); 2437 fl = fw; 2438 fw += 8; 2439 } 2440 if (m < end) { 2441 assert(m + 1 == end); 2442 c += c; 2443 fw[-4] = fw[-4] + fl[0] * c; 2444 fw[-3] = fw[-3] + fl[1] * c; 2445 fw[-2] = fw[-2] + fl[2] * c; 2446 fw[-1] = fw[-1] + fl[3] * c; 2447 } 2448 } 2449 2450 #endif 2451 2452 /* <summary> */ 2453 /* Inverse 9-7 wavelet transform in 1-D. */ 2454 /* </summary> */ 2455 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) 2456 { 2457 OPJ_INT32 a, b; 2458 if (dwt->cas == 0) { 2459 if (!((dwt->dn > 0) || (dwt->sn > 1))) { 2460 return; 2461 } 2462 a = 0; 2463 b = 1; 2464 } else { 2465 if (!((dwt->sn > 0) || (dwt->dn > 1))) { 2466 return; 2467 } 2468 a = 1; 2469 b = 0; 2470 } 2471 #ifdef __SSE__ 2472 opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, 2473 _mm_set1_ps(opj_K)); 2474 opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, 2475 _mm_set1_ps(opj_c13318)); 2476 opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, 2477 dwt->win_l_x0, dwt->win_l_x1, 2478 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), 2479 _mm_set1_ps(opj_dwt_delta)); 2480 opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, 2481 dwt->win_h_x0, dwt->win_h_x1, 2482 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), 2483 _mm_set1_ps(opj_dwt_gamma)); 2484 opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, 2485 dwt->win_l_x0, dwt->win_l_x1, 2486 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), 2487 _mm_set1_ps(opj_dwt_beta)); 2488 opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, 2489 dwt->win_h_x0, dwt->win_h_x1, 2490 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), 2491 _mm_set1_ps(opj_dwt_alpha)); 2492 #else 2493 opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, 2494 opj_K); 2495 opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, 2496 opj_c13318); 2497 opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, 2498 dwt->win_l_x0, dwt->win_l_x1, 2499 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), 2500 opj_dwt_delta); 2501 opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, 2502 dwt->win_h_x0, dwt->win_h_x1, 2503 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), 2504 opj_dwt_gamma); 2505 opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, 2506 dwt->win_l_x0, dwt->win_l_x1, 2507 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), 2508 opj_dwt_beta); 2509 opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, 2510 dwt->win_h_x0, dwt->win_h_x1, 2511 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), 2512 opj_dwt_alpha); 2513 #endif 2514 } 2515 2516 2517 /* <summary> */ 2518 /* Inverse 9-7 wavelet transform in 2-D. */ 2519 /* </summary> */ 2520 static 2521 OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, 2522 OPJ_UINT32 numres) 2523 { 2524 opj_v4dwt_t h; 2525 opj_v4dwt_t v; 2526 2527 opj_tcd_resolution_t* res = tilec->resolutions; 2528 2529 OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 - 2530 res->x0); /* width of the resolution level computed */ 2531 OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 - 2532 res->y0); /* height of the resolution level computed */ 2533 2534 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions - 2535 1].x1 - 2536 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0); 2537 2538 OPJ_SIZE_T l_data_size; 2539 2540 l_data_size = opj_dwt_max_resolution(res, numres); 2541 /* overflow check */ 2542 if (l_data_size > (SIZE_MAX - 5U)) { 2543 /* FIXME event manager error callback */ 2544 return OPJ_FALSE; 2545 } 2546 l_data_size += 5U; 2547 /* overflow check */ 2548 if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { 2549 /* FIXME event manager error callback */ 2550 return OPJ_FALSE; 2551 } 2552 h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); 2553 if (!h.wavelet) { 2554 /* FIXME event manager error callback */ 2555 return OPJ_FALSE; 2556 } 2557 v.wavelet = h.wavelet; 2558 2559 while (--numres) { 2560 OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data; 2561 OPJ_UINT32 j; 2562 2563 h.sn = (OPJ_INT32)rw; 2564 v.sn = (OPJ_INT32)rh; 2565 2566 ++res; 2567 2568 rw = (OPJ_UINT32)(res->x1 - 2569 res->x0); /* width of the resolution level computed */ 2570 rh = (OPJ_UINT32)(res->y1 - 2571 res->y0); /* height of the resolution level computed */ 2572 2573 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); 2574 h.cas = res->x0 % 2; 2575 2576 h.win_l_x0 = 0; 2577 h.win_l_x1 = (OPJ_UINT32)h.sn; 2578 h.win_h_x0 = 0; 2579 h.win_h_x1 = (OPJ_UINT32)h.dn; 2580 for (j = 0; j + 3 < rh; j += 4) { 2581 OPJ_UINT32 k; 2582 opj_v4dwt_interleave_h(&h, aj, w, rh - j); 2583 opj_v4dwt_decode(&h); 2584 2585 for (k = 0; k < rw; k++) { 2586 aj[k ] = h.wavelet[k].f[0]; 2587 aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; 2588 aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; 2589 aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3]; 2590 } 2591 2592 aj += w * 4; 2593 } 2594 2595 if (j < rh) { 2596 OPJ_UINT32 k; 2597 opj_v4dwt_interleave_h(&h, aj, w, rh - j); 2598 opj_v4dwt_decode(&h); 2599 for (k = 0; k < rw; k++) { 2600 switch (rh - j) { 2601 case 3: 2602 aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; 2603 /* FALLTHRU */ 2604 case 2: 2605 aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; 2606 /* FALLTHRU */ 2607 case 1: 2608 aj[k] = h.wavelet[k].f[0]; 2609 } 2610 } 2611 } 2612 2613 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); 2614 v.cas = res->y0 % 2; 2615 v.win_l_x0 = 0; 2616 v.win_l_x1 = (OPJ_UINT32)v.sn; 2617 v.win_h_x0 = 0; 2618 v.win_h_x1 = (OPJ_UINT32)v.dn; 2619 2620 aj = (OPJ_FLOAT32*) tilec->data; 2621 for (j = rw; j > 3; j -= 4) { 2622 OPJ_UINT32 k; 2623 2624 opj_v4dwt_interleave_v(&v, aj, w, 4); 2625 opj_v4dwt_decode(&v); 2626 2627 for (k = 0; k < rh; ++k) { 2628 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32)); 2629 } 2630 aj += 4; 2631 } 2632 2633 if (rw & 0x03) { 2634 OPJ_UINT32 k; 2635 2636 j = rw & 0x03; 2637 2638 opj_v4dwt_interleave_v(&v, aj, w, j); 2639 opj_v4dwt_decode(&v); 2640 2641 for (k = 0; k < rh; ++k) { 2642 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], 2643 (OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32)); 2644 } 2645 } 2646 } 2647 2648 opj_aligned_free(h.wavelet); 2649 return OPJ_TRUE; 2650 } 2651 2652 static 2653 OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, 2654 OPJ_UINT32 numres) 2655 { 2656 opj_sparse_array_int32_t* sa; 2657 opj_v4dwt_t h; 2658 opj_v4dwt_t v; 2659 OPJ_UINT32 resno; 2660 /* This value matches the maximum left/right extension given in tables */ 2661 /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */ 2662 /* we currently use 3. */ 2663 const OPJ_UINT32 filter_width = 4U; 2664 2665 opj_tcd_resolution_t* tr = tilec->resolutions; 2666 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]); 2667 2668 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - 2669 tr->x0); /* width of the resolution level computed */ 2670 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - 2671 tr->y0); /* height of the resolution level computed */ 2672 2673 OPJ_SIZE_T l_data_size; 2674 2675 /* Compute the intersection of the area of interest, expressed in tile coordinates */ 2676 /* with the tile coordinates */ 2677 OPJ_UINT32 win_tcx0 = tilec->win_x0; 2678 OPJ_UINT32 win_tcy0 = tilec->win_y0; 2679 OPJ_UINT32 win_tcx1 = tilec->win_x1; 2680 OPJ_UINT32 win_tcy1 = tilec->win_y1; 2681 2682 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) { 2683 return OPJ_TRUE; 2684 } 2685 2686 sa = opj_dwt_init_sparse_array(tilec, numres); 2687 if (sa == NULL) { 2688 return OPJ_FALSE; 2689 } 2690 2691 if (numres == 1U) { 2692 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, 2693 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, 2694 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, 2695 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, 2696 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, 2697 tilec->data_win, 2698 1, tr_max->win_x1 - tr_max->win_x0, 2699 OPJ_TRUE); 2700 assert(ret); 2701 OPJ_UNUSED(ret); 2702 opj_sparse_array_int32_free(sa); 2703 return OPJ_TRUE; 2704 } 2705 2706 l_data_size = opj_dwt_max_resolution(tr, numres); 2707 /* overflow check */ 2708 if (l_data_size > (SIZE_MAX - 5U)) { 2709 /* FIXME event manager error callback */ 2710 return OPJ_FALSE; 2711 } 2712 l_data_size += 5U; 2713 /* overflow check */ 2714 if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { 2715 /* FIXME event manager error callback */ 2716 return OPJ_FALSE; 2717 } 2718 h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); 2719 if (!h.wavelet) { 2720 /* FIXME event manager error callback */ 2721 return OPJ_FALSE; 2722 } 2723 v.wavelet = h.wavelet; 2724 2725 for (resno = 1; resno < numres; resno ++) { 2726 OPJ_UINT32 j; 2727 /* Window of interest subband-based coordinates */ 2728 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1; 2729 OPJ_UINT32 win_hl_x0, win_hl_x1; 2730 OPJ_UINT32 win_lh_y0, win_lh_y1; 2731 /* Window of interest tile-resolution-based coordinates */ 2732 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1; 2733 /* Tile-resolution subband-based coordinates */ 2734 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0; 2735 2736 ++tr; 2737 2738 h.sn = (OPJ_INT32)rw; 2739 v.sn = (OPJ_INT32)rh; 2740 2741 rw = (OPJ_UINT32)(tr->x1 - tr->x0); 2742 rh = (OPJ_UINT32)(tr->y1 - tr->y0); 2743 2744 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); 2745 h.cas = tr->x0 % 2; 2746 2747 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); 2748 v.cas = tr->y0 % 2; 2749 2750 /* Get the subband coordinates for the window of interest */ 2751 /* LL band */ 2752 opj_dwt_get_band_coordinates(tilec, resno, 0, 2753 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2754 &win_ll_x0, &win_ll_y0, 2755 &win_ll_x1, &win_ll_y1); 2756 2757 /* HL band */ 2758 opj_dwt_get_band_coordinates(tilec, resno, 1, 2759 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2760 &win_hl_x0, NULL, &win_hl_x1, NULL); 2761 2762 /* LH band */ 2763 opj_dwt_get_band_coordinates(tilec, resno, 2, 2764 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2765 NULL, &win_lh_y0, NULL, &win_lh_y1); 2766 2767 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */ 2768 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0; 2769 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0; 2770 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0; 2771 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0; 2772 2773 /* Substract the origin of the bands for this tile, to the subwindow */ 2774 /* of interest band coordinates, so as to get them relative to the */ 2775 /* tile */ 2776 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0); 2777 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0); 2778 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0); 2779 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0); 2780 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0); 2781 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0); 2782 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0); 2783 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0); 2784 2785 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1); 2786 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1); 2787 2788 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1); 2789 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1); 2790 2791 /* Compute the tile-resolution-based coordinates for the window of interest */ 2792 if (h.cas == 0) { 2793 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1); 2794 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw); 2795 } else { 2796 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1); 2797 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw); 2798 } 2799 2800 if (v.cas == 0) { 2801 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1); 2802 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh); 2803 } else { 2804 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1); 2805 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh); 2806 } 2807 2808 h.win_l_x0 = win_ll_x0; 2809 h.win_l_x1 = win_ll_x1; 2810 h.win_h_x0 = win_hl_x0; 2811 h.win_h_x1 = win_hl_x1; 2812 for (j = 0; j + 3 < rh; j += 4) { 2813 if ((j + 3 >= win_ll_y0 && j < win_ll_y1) || 2814 (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && 2815 j < win_lh_y1 + (OPJ_UINT32)v.sn)) { 2816 opj_v4dwt_interleave_partial_h(&h, sa, j, opj_uint_min(4U, rh - j)); 2817 opj_v4dwt_decode(&h); 2818 if (!opj_sparse_array_int32_write(sa, 2819 win_tr_x0, j, 2820 win_tr_x1, j + 4, 2821 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], 2822 4, 1, OPJ_TRUE)) { 2823 /* FIXME event manager error callback */ 2824 opj_sparse_array_int32_free(sa); 2825 opj_aligned_free(h.wavelet); 2826 return OPJ_FALSE; 2827 } 2828 } 2829 } 2830 2831 if (j < rh && 2832 ((j + 3 >= win_ll_y0 && j < win_ll_y1) || 2833 (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && 2834 j < win_lh_y1 + (OPJ_UINT32)v.sn))) { 2835 opj_v4dwt_interleave_partial_h(&h, sa, j, rh - j); 2836 opj_v4dwt_decode(&h); 2837 if (!opj_sparse_array_int32_write(sa, 2838 win_tr_x0, j, 2839 win_tr_x1, rh, 2840 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], 2841 4, 1, OPJ_TRUE)) { 2842 /* FIXME event manager error callback */ 2843 opj_sparse_array_int32_free(sa); 2844 opj_aligned_free(h.wavelet); 2845 return OPJ_FALSE; 2846 } 2847 } 2848 2849 v.win_l_x0 = win_ll_y0; 2850 v.win_l_x1 = win_ll_y1; 2851 v.win_h_x0 = win_lh_y0; 2852 v.win_h_x1 = win_lh_y1; 2853 for (j = win_tr_x0; j < win_tr_x1; j += 4) { 2854 OPJ_UINT32 nb_elts = opj_uint_min(4U, win_tr_x1 - j); 2855 2856 opj_v4dwt_interleave_partial_v(&v, sa, j, nb_elts); 2857 opj_v4dwt_decode(&v); 2858 2859 if (!opj_sparse_array_int32_write(sa, 2860 j, win_tr_y0, 2861 j + nb_elts, win_tr_y1, 2862 (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0], 2863 1, 4, OPJ_TRUE)) { 2864 /* FIXME event manager error callback */ 2865 opj_sparse_array_int32_free(sa); 2866 opj_aligned_free(h.wavelet); 2867 return OPJ_FALSE; 2868 } 2869 } 2870 } 2871 2872 { 2873 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, 2874 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, 2875 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, 2876 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, 2877 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, 2878 tilec->data_win, 2879 1, tr_max->win_x1 - tr_max->win_x0, 2880 OPJ_TRUE); 2881 assert(ret); 2882 OPJ_UNUSED(ret); 2883 } 2884 opj_sparse_array_int32_free(sa); 2885 2886 opj_aligned_free(h.wavelet); 2887 return OPJ_TRUE; 2888 } 2889 2890 2891 OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd, 2892 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, 2893 OPJ_UINT32 numres) 2894 { 2895 if (p_tcd->whole_tile_decoding) { 2896 return opj_dwt_decode_tile_97(tilec, numres); 2897 } else { 2898 return opj_dwt_decode_partial_97(tilec, numres); 2899 } 2900 } 2901