1 2 /* filter_neon_intrinsics.c - NEON optimised filter functions 3 * 4 * Copyright (c) 2014 Glenn Randers-Pehrson 5 * Written by James Yu <james.yu at linaro.org>, October 2013. 6 * Based on filter_neon.S, written by Mans Rullgard, 2011. 7 * 8 * Last changed in libpng 1.6.16 [December 22, 2014] 9 * 10 * This code is released under the libpng license. 11 * For conditions of distribution and use, see the disclaimer 12 * and license in png.h 13 */ 14 15 #include "../pngpriv.h" 16 17 #ifdef PNG_READ_SUPPORTED 18 19 /* This code requires -mfpu=neon on the command line: */ 20 #if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ 21 22 #include <arm_neon.h> 23 24 /* libpng row pointers are not necessarily aligned to any particular boundary, 25 * however this code will only work with appropriate alignment. arm/arm_init.c 26 * checks for this (and will not compile unless it is done). This code uses 27 * variants of png_aligncast to avoid compiler warnings. 28 */ 29 #define png_ptr(type,pointer) png_aligncast(type *,pointer) 30 #define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer) 31 32 /* The following relies on a variable 'temp_pointer' being declared with type 33 * 'type'. This is written this way just to hide the GCC strict aliasing 34 * warning; note that the code is safe because there never is an alias between 35 * the input and output pointers. 36 */ 37 #define png_ldr(type,pointer)\ 38 (temp_pointer = png_ptr(type,pointer), *temp_pointer) 39 40 #if PNG_ARM_NEON_OPT > 0 41 42 void 43 png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row, 44 png_const_bytep prev_row) 45 { 46 png_bytep rp = row; 47 png_bytep rp_stop = row + row_info->rowbytes; 48 png_const_bytep pp = prev_row; 49 50 for (; rp < rp_stop; rp += 16, pp += 16) 51 { 52 uint8x16_t qrp, qpp; 53 54 qrp = vld1q_u8(rp); 55 qpp = vld1q_u8(pp); 56 qrp = vaddq_u8(qrp, qpp); 57 vst1q_u8(rp, qrp); 58 } 59 } 60 61 void 62 png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row, 63 png_const_bytep prev_row) 64 { 65 png_bytep rp = row; 66 png_bytep rp_stop = row + row_info->rowbytes; 67 68 uint8x16_t vtmp = vld1q_u8(rp); 69 uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp); 70 uint8x8x2_t vrp = *vrpt; 71 72 uint8x8x4_t vdest; 73 vdest.val[3] = vdup_n_u8(0); 74 75 for (; rp < rp_stop;) 76 { 77 uint8x8_t vtmp1, vtmp2; 78 uint32x2_t *temp_pointer; 79 80 vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 81 vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); 82 vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6); 83 vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); 84 85 vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 86 vdest.val[2] = vadd_u8(vdest.val[1], vtmp2); 87 vdest.val[3] = vadd_u8(vdest.val[2], vtmp1); 88 89 vtmp = vld1q_u8(rp + 12); 90 vrpt = png_ptr(uint8x8x2_t, &vtmp); 91 vrp = *vrpt; 92 93 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 94 rp += 3; 95 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 96 rp += 3; 97 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); 98 rp += 3; 99 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); 100 rp += 3; 101 } 102 103 PNG_UNUSED(prev_row) 104 } 105 106 void 107 png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row, 108 png_const_bytep prev_row) 109 { 110 png_bytep rp = row; 111 png_bytep rp_stop = row + row_info->rowbytes; 112 113 uint8x8x4_t vdest; 114 vdest.val[3] = vdup_n_u8(0); 115 116 for (; rp < rp_stop; rp += 16) 117 { 118 uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp)); 119 uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp); 120 uint8x8x4_t vrp = *vrpt; 121 uint32x2x4_t *temp_pointer; 122 123 vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); 124 vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]); 125 vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]); 126 vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]); 127 vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0); 128 } 129 130 PNG_UNUSED(prev_row) 131 } 132 133 void 134 png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row, 135 png_const_bytep prev_row) 136 { 137 png_bytep rp = row; 138 png_const_bytep pp = prev_row; 139 png_bytep rp_stop = row + row_info->rowbytes; 140 141 uint8x16_t vtmp; 142 uint8x8x2_t *vrpt; 143 uint8x8x2_t vrp; 144 uint8x8x4_t vdest; 145 vdest.val[3] = vdup_n_u8(0); 146 147 vtmp = vld1q_u8(rp); 148 vrpt = png_ptr(uint8x8x2_t,&vtmp); 149 vrp = *vrpt; 150 151 for (; rp < rp_stop; pp += 12) 152 { 153 uint8x8_t vtmp1, vtmp2, vtmp3; 154 155 uint8x8x2_t *vppt; 156 uint8x8x2_t vpp; 157 158 uint32x2_t *temp_pointer; 159 160 vtmp = vld1q_u8(pp); 161 vppt = png_ptr(uint8x8x2_t,&vtmp); 162 vpp = *vppt; 163 164 vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 165 vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); 166 vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 167 168 vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); 169 vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6); 170 vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 171 vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 172 173 vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6); 174 vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 175 176 vtmp = vld1q_u8(rp + 12); 177 vrpt = png_ptr(uint8x8x2_t,&vtmp); 178 vrp = *vrpt; 179 180 vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2); 181 vdest.val[2] = vadd_u8(vdest.val[2], vtmp3); 182 183 vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 184 185 vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2); 186 vdest.val[3] = vadd_u8(vdest.val[3], vtmp1); 187 188 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 189 rp += 3; 190 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 191 rp += 3; 192 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); 193 rp += 3; 194 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); 195 rp += 3; 196 } 197 } 198 199 void 200 png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row, 201 png_const_bytep prev_row) 202 { 203 png_bytep rp = row; 204 png_bytep rp_stop = row + row_info->rowbytes; 205 png_const_bytep pp = prev_row; 206 207 uint8x8x4_t vdest; 208 vdest.val[3] = vdup_n_u8(0); 209 210 for (; rp < rp_stop; rp += 16, pp += 16) 211 { 212 uint32x2x4_t vtmp; 213 uint8x8x4_t *vrpt, *vppt; 214 uint8x8x4_t vrp, vpp; 215 uint32x2x4_t *temp_pointer; 216 217 vtmp = vld4_u32(png_ptr(uint32_t,rp)); 218 vrpt = png_ptr(uint8x8x4_t,&vtmp); 219 vrp = *vrpt; 220 vtmp = vld4_u32(png_ptrc(uint32_t,pp)); 221 vppt = png_ptr(uint8x8x4_t,&vtmp); 222 vpp = *vppt; 223 224 vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]); 225 vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 226 vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]); 227 vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 228 vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]); 229 vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]); 230 vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]); 231 vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]); 232 233 vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0); 234 } 235 } 236 237 static uint8x8_t 238 paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c) 239 { 240 uint8x8_t d, e; 241 uint16x8_t p1, pa, pb, pc; 242 243 p1 = vaddl_u8(a, b); /* a + b */ 244 pc = vaddl_u8(c, c); /* c * 2 */ 245 pa = vabdl_u8(b, c); /* pa */ 246 pb = vabdl_u8(a, c); /* pb */ 247 pc = vabdq_u16(p1, pc); /* pc */ 248 249 p1 = vcleq_u16(pa, pb); /* pa <= pb */ 250 pa = vcleq_u16(pa, pc); /* pa <= pc */ 251 pb = vcleq_u16(pb, pc); /* pb <= pc */ 252 253 p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */ 254 255 d = vmovn_u16(pb); 256 e = vmovn_u16(p1); 257 258 d = vbsl_u8(d, b, c); 259 e = vbsl_u8(e, a, d); 260 261 return e; 262 } 263 264 void 265 png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row, 266 png_const_bytep prev_row) 267 { 268 png_bytep rp = row; 269 png_const_bytep pp = prev_row; 270 png_bytep rp_stop = row + row_info->rowbytes; 271 272 uint8x16_t vtmp; 273 uint8x8x2_t *vrpt; 274 uint8x8x2_t vrp; 275 uint8x8_t vlast = vdup_n_u8(0); 276 uint8x8x4_t vdest; 277 vdest.val[3] = vdup_n_u8(0); 278 279 vtmp = vld1q_u8(rp); 280 vrpt = png_ptr(uint8x8x2_t,&vtmp); 281 vrp = *vrpt; 282 283 for (; rp < rp_stop; pp += 12) 284 { 285 uint8x8x2_t *vppt; 286 uint8x8x2_t vpp; 287 uint8x8_t vtmp1, vtmp2, vtmp3; 288 uint32x2_t *temp_pointer; 289 290 vtmp = vld1q_u8(pp); 291 vppt = png_ptr(uint8x8x2_t,&vtmp); 292 vpp = *vppt; 293 294 vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); 295 vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 296 297 vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); 298 vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); 299 vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 300 vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 301 302 vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6); 303 vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6); 304 vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2); 305 vdest.val[2] = vadd_u8(vdest.val[2], vtmp1); 306 307 vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 308 vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 309 310 vtmp = vld1q_u8(rp + 12); 311 vrpt = png_ptr(uint8x8x2_t,&vtmp); 312 vrp = *vrpt; 313 314 vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3); 315 vdest.val[3] = vadd_u8(vdest.val[3], vtmp1); 316 317 vlast = vtmp2; 318 319 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); 320 rp += 3; 321 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); 322 rp += 3; 323 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); 324 rp += 3; 325 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); 326 rp += 3; 327 } 328 } 329 330 void 331 png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row, 332 png_const_bytep prev_row) 333 { 334 png_bytep rp = row; 335 png_bytep rp_stop = row + row_info->rowbytes; 336 png_const_bytep pp = prev_row; 337 338 uint8x8_t vlast = vdup_n_u8(0); 339 uint8x8x4_t vdest; 340 vdest.val[3] = vdup_n_u8(0); 341 342 for (; rp < rp_stop; rp += 16, pp += 16) 343 { 344 uint32x2x4_t vtmp; 345 uint8x8x4_t *vrpt, *vppt; 346 uint8x8x4_t vrp, vpp; 347 uint32x2x4_t *temp_pointer; 348 349 vtmp = vld4_u32(png_ptr(uint32_t,rp)); 350 vrpt = png_ptr(uint8x8x4_t,&vtmp); 351 vrp = *vrpt; 352 vtmp = vld4_u32(png_ptrc(uint32_t,pp)); 353 vppt = png_ptr(uint8x8x4_t,&vtmp); 354 vpp = *vppt; 355 356 vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); 357 vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 358 vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]); 359 vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 360 vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]); 361 vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]); 362 vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]); 363 vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]); 364 365 vlast = vpp.val[3]; 366 367 vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0); 368 } 369 } 370 371 #endif /* PNG_ARM_NEON_OPT > 0 */ 372 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */ 373 #endif /* READ */ 374