;
; jdcolext.asm - colorspace conversion (AVX2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
;                            JDIMENSION input_row, JSAMPARRAY output_buf,
;                            int num_rows)
;
; Frame / ABI, as established by the prologue below:
;   * Arguments are read from the caller's stack through the out_width() ..
;     num_rows() macros, relative to the frame base kept in eax during setup
;     (first argument at base + 8).
;   * ebx, esi, edi and ebp are saved and restored.  ecx and edx are used
;     without being preserved.
;   * ebp is rounded down to a SIZEOF_YMMWORD boundary so that the wk()
;     scratch slots can be accessed with aligned moves (vmovdqa).  The frame
;     base is stored at [ebp] and reloaded by "pop esp" in the epilogue.
;
; Register roles inside .columnloop:
;   esi = inptr0 (Y row)      ebx = inptr1 (Cb row)     edx = inptr2 (Cr row)
;   edi = outptr              ecx = columns still to be written in this row
;   eax = value loaded from gotptr, used as the base for GOTOFF() references
;
; Every iteration loads one full YMMWORD from each of the three input rows
; before ecx is examined; only the store side is trimmed to the remaining
; column count.
; NOTE(review): this presumes the input row buffers may be read up to one
; YMMWORD past the last column -- confirm against the buffer allocator.
;
; PW_*, PD_ONEHALF, SCALEBITS, ymmA..ymmH / xmmA and RGB_PIXELSIZE are not
; defined in this file; they are expected to come from the including file or
; from jcolsamp.inc.
;

%define out_width(b)   (b) + 8          ; JDIMENSION out_width
%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
%define input_row(b)   (b) + 16         ; JDIMENSION input_row
%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
%define num_rows(b)    (b) + 24         ; int num_rows

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
                                        ; ymmword wk[WK_NUM]
%define WK_NUM        2
%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)

EXTN(jsimd_ycc_rgb_convert_avx2):
    push        ebp
    mov         eax, esp                     ; eax = frame base: [eax] = saved ebp,
                                             ; arguments start at eax + 8
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [esp], eax                   ; remember frame base for the epilogue
    mov         ebp, esp                     ; ebp = aligned ebp
    lea         esp, [wk(0)]                 ; reserve wk[WK_NUM] below ebp
    pushpic     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address
    movpic      POINTER [gotptr], ebx   ; save GOT address

    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
    test        ecx, ecx
    jz          near .return            ; nothing to do for zero-width output

    push        ecx                     ; ecx is reused for input_row below

    mov         edi, JSAMPIMAGE [input_buf(eax)]
    mov         ecx, JDIMENSION [input_row(eax)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = &input_buf[0][input_row]
    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; ebx = &input_buf[1][input_row]
    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]  ; edx = &input_buf[2][input_row]

    pop         ecx                     ; ecx = num_cols again

    mov         edi, JSAMPARRAY [output_buf(eax)]
    mov         eax, INT [num_rows(eax)]
    test        eax, eax
    jle         near .return            ; signed test: num_rows <= 0 means no work
    alignx      16, 7
.rowloop:
    ; Save the per-row state; .nextrow pops it back in reverse order.
    push        eax
    push        edi
    push        edx
    push        ebx
    push        esi
    push        ecx                     ; col

    mov         esi, JSAMPROW [esi]     ; inptr0
    mov         ebx, JSAMPROW [ebx]     ; inptr1
    mov         edx, JSAMPROW [edx]     ; inptr2
    mov         edi, JSAMPROW [edi]     ; outptr
    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
    alignx      16, 7
.columnloop:

    vmovdqu     ymm5, YMMWORD [ebx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
    vmovdqu     ymm1, YMMWORD [edx]     ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)

    vpcmpeqw    ymm0, ymm0, ymm0
    vpcmpeqw    ymm7, ymm7, ymm7
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; ymm0={0xFF 0x00 0xFF 0x00 ..}
    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    vpand       ymm4, ymm0, ymm5        ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
    vpand       ymm0, ymm0, ymm1        ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1=Cr(13579BDFHJLNPRTV)=CrO

    ; Add 0xFF80 (-128 as a signed word) to every chroma sample.
    vpaddw      ymm2, ymm4, ymm7        ; ymm2=CbE-128
    vpaddw      ymm3, ymm5, ymm7        ; ymm3=CbO-128
    vpaddw      ymm6, ymm0, ymm7        ; ymm6=CrE-128
    vpaddw      ymm7, ymm1, ymm7        ; ymm7=CrO-128

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    vpaddw      ymm4, ymm2, ymm2        ; ymm4=2*CbE
    vpaddw      ymm5, ymm3, ymm3        ; ymm5=2*CbO
    vpaddw      ymm0, ymm6, ymm6        ; ymm0=2*CrE
    vpaddw      ymm1, ymm7, ymm7        ; ymm1=2*CrO

    vpmulhw     ymm4, ymm4, [GOTOFF(eax,PW_MF0228)]  ; ymm4=(2*CbE * -FIX(0.22800))
    vpmulhw     ymm5, ymm5, [GOTOFF(eax,PW_MF0228)]  ; ymm5=(2*CbO * -FIX(0.22800))
    vpmulhw     ymm0, ymm0, [GOTOFF(eax,PW_F0402)]   ; ymm0=(2*CrE * FIX(0.40200))
    vpmulhw     ymm1, ymm1, [GOTOFF(eax,PW_F0402)]   ; ymm1=(2*CrO * FIX(0.40200))

    ; Add PW_ONE before the arithmetic shift right by 1.
    vpaddw      ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
    vpaddw      ymm5, ymm5, [GOTOFF(eax,PW_ONE)]
    vpsraw      ymm4, ymm4, 1           ; ymm4=(CbE * -FIX(0.22800))
    vpsraw      ymm5, ymm5, 1           ; ymm5=(CbO * -FIX(0.22800))
    vpaddw      ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
    vpaddw      ymm1, ymm1, [GOTOFF(eax,PW_ONE)]
    vpsraw      ymm0, ymm0, 1           ; ymm0=(CrE * FIX(0.40200))
    vpsraw      ymm1, ymm1, 1           ; ymm1=(CrO * FIX(0.40200))

    vpaddw      ymm4, ymm4, ymm2
    vpaddw      ymm5, ymm5, ymm3
    vpaddw      ymm4, ymm4, ymm2        ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
    vpaddw      ymm5, ymm5, ymm3        ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
    vpaddw      ymm0, ymm0, ymm6        ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
    vpaddw      ymm1, ymm1, ymm7        ; ymm1=(CrO * FIX(1.40200))=(R-Y)O

    ; Park (B-Y) in the aligned scratch slots; ymm4/ymm5 are reused below.
    vmovdqa     YMMWORD [wk(0)], ymm4   ; wk(0)=(B-Y)E
    vmovdqa     YMMWORD [wk(1)], ymm5   ; wk(1)=(B-Y)O

    ; Interleave Cb and Cr words so that one vpmaddwd per dword evaluates
    ; Cb * -FIX(0.344) + Cr * FIX(0.285).
    vpunpckhwd  ymm4, ymm2, ymm6
    vpunpcklwd  ymm2, ymm2, ymm6
    vpmaddwd    ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)]
    vpunpckhwd  ymm5, ymm3, ymm7
    vpunpcklwd  ymm3, ymm3, ymm7
    vpmaddwd    ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)]
    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]

    vpaddd      ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
    vpaddd      ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)]
    vpsrad      ymm2, ymm2, SCALEBITS
    vpsrad      ymm4, ymm4, SCALEBITS
    vpaddd      ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)]
    vpaddd      ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
    vpsrad      ymm3, ymm3, SCALEBITS
    vpsrad      ymm5, ymm5, SCALEBITS

    vpackssdw   ymm2, ymm2, ymm4        ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
    vpackssdw   ymm3, ymm3, ymm5        ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
    vpsubw      ymm2, ymm2, ymm6        ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
    vpsubw      ymm3, ymm3, ymm7        ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O

    vmovdqu     ymm5, YMMWORD [esi]     ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)

    vpcmpeqw    ymm4, ymm4, ymm4
    vpsrlw      ymm4, ymm4, BYTE_BIT    ; ymm4={0xFF 0x00 0xFF 0x00 ..}
    vpand       ymm4, ymm4, ymm5        ; ymm4=Y(02468ACEGIKMOQSU)=YE
    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5=Y(13579BDFHJLNPRTV)=YO

    vpaddw      ymm0, ymm0, ymm4        ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
    vpaddw      ymm1, ymm1, ymm5        ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
    vpackuswb   ymm0, ymm0, ymm0        ; ymm0=R(02468ACE********GIKMOQSU********)
    vpackuswb   ymm1, ymm1, ymm1        ; ymm1=R(13579BDF********HJLNPRTV********)

    vpaddw      ymm2, ymm2, ymm4        ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
    vpaddw      ymm3, ymm3, ymm5        ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
    vpackuswb   ymm2, ymm2, ymm2        ; ymm2=G(02468ACE********GIKMOQSU********)
    vpackuswb   ymm3, ymm3, ymm3        ; ymm3=G(13579BDF********HJLNPRTV********)

    vpaddw      ymm4, ymm4, YMMWORD [wk(0)]  ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
    vpaddw      ymm5, ymm5, YMMWORD [wk(1)]  ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
    vpackuswb   ymm4, ymm4, ymm4        ; ymm4=B(02468ACE********GIKMOQSU********)
    vpackuswb   ymm5, ymm5, ymm5        ; ymm5=B(13579BDF********HJLNPRTV********)

%if RGB_PIXELSIZE == 3  ; ---------------

    ; In the lane diagrams below the first character is the byte index within
    ; a pixel and the second is the column index (0..V).

    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)

    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)

    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)

    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)

    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)

    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)

    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)

    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)

    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

    cmp         ecx, byte SIZEOF_YMMWORD
    jb          short .column_st64      ; fewer columns left than one full group

    ; Non-temporal stores need an aligned destination; otherwise use vmovdqu.
    test        edi, SIZEOF_YMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
.out0:
    add         edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
    sub         ecx, byte SIZEOF_YMMWORD
    jz          near .nextrow

    add         esi, byte SIZEOF_YMMWORD  ; inptr0
    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
    add         edx, byte SIZEOF_YMMWORD  ; inptr2
    jmp         near .columnloop
    alignx      16, 7

.column_st64:
    ; Partial group: from here on ecx counts output BYTES, not columns.
    ; ymmA always holds the next bytes to store; later registers are moved
    ; into it as earlier ones are written out.
    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
    cmp         ecx, byte 2*SIZEOF_YMMWORD
    jb          short .column_st32
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
    vmovdqa     ymmA, ymmF
    sub         ecx, byte 2*SIZEOF_YMMWORD
    jmp         short .column_st31
.column_st32:
    cmp         ecx, byte SIZEOF_YMMWORD
    jb          short .column_st31
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
    add         edi, byte SIZEOF_YMMWORD    ; outptr
    vmovdqa     ymmA, ymmD
    sub         ecx, byte SIZEOF_YMMWORD
    ; fall through to .column_st31
.column_st31:
    cmp         ecx, byte SIZEOF_XMMWORD
    jb          short .column_st15
    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    add         edi, byte SIZEOF_XMMWORD    ; outptr
    vperm2i128  ymmA, ymmA, ymmA, 1         ; bring the upper 128 bits down into xmmA
    sub         ecx, byte SIZEOF_XMMWORD
.column_st15:
    ; Store the lower 8 bytes of xmmA to the output when it has enough
    ; space.
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st7
    vmovq       XMM_MMWORD [edi], xmmA
    add         edi, byte SIZEOF_MMWORD
    sub         ecx, byte SIZEOF_MMWORD
    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
.column_st7:
    ; Store the lower 4 bytes of xmmA to the output when it has enough
    ; space.
    cmp         ecx, byte SIZEOF_DWORD
    jb          short .column_st3
    vmovd       XMM_DWORD [edi], xmmA
    add         edi, byte SIZEOF_DWORD
    sub         ecx, byte SIZEOF_DWORD
    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
.column_st3:
    ; Store the lower 2 bytes of eax to the output when it has enough
    ; space.
    ; eax is overwritten here; .rowloop reloads it from gotptr for the next
    ; row.
    vmovd       eax, xmmA
    cmp         ecx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         WORD [edi], ax
    add         edi, byte SIZEOF_WORD
    sub         ecx, byte SIZEOF_WORD
    shr         eax, 16
.column_st1:
    ; Store the lower 1 byte of eax to the output when it has enough
    ; space.
    test        ecx, ecx
    jz          short .nextrow
    mov         BYTE [edi], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

%ifdef RGBX_FILLER_0XFF
    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
%else
    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
%endif
    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)

    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)

    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)

    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)

    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    cmp         ecx, byte SIZEOF_YMMWORD
    jb          short .column_st64      ; fewer columns left than one full group

    ; Non-temporal stores need an aligned destination; otherwise use vmovdqu.
    test        edi, SIZEOF_YMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
    vmovntdq    YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
.out0:
    add         edi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
    sub         ecx, byte SIZEOF_YMMWORD
    jz          near .nextrow

    add         esi, byte SIZEOF_YMMWORD  ; inptr0
    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
    add         edx, byte SIZEOF_YMMWORD  ; inptr2
    jmp         near .columnloop
    alignx      16, 7

.column_st64:
    ; Partial group: ecx still counts columns (pixels) here.  ymmA always
    ; holds the next pixels to store; later registers are moved into it as
    ; earlier ones are written out.
    cmp         ecx, byte SIZEOF_YMMWORD/2
    jb          short .column_st32
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
    vmovdqa     ymmA, ymmC
    vmovdqa     ymmD, ymmH
    sub         ecx, byte SIZEOF_YMMWORD/2
.column_st32:
    cmp         ecx, byte SIZEOF_YMMWORD/4
    jb          short .column_st16
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
    add         edi, byte SIZEOF_YMMWORD    ; outptr
    vmovdqa     ymmA, ymmD
    sub         ecx, byte SIZEOF_YMMWORD/4
.column_st16:
    cmp         ecx, byte SIZEOF_YMMWORD/8
    jb          short .column_st15
    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    vperm2i128  ymmA, ymmA, ymmA, 1         ; bring the upper 128 bits down into xmmA
    add         edi, byte SIZEOF_XMMWORD    ; outptr
    sub         ecx, byte SIZEOF_YMMWORD/8
.column_st15:
    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
    ; space.
    cmp         ecx, byte SIZEOF_YMMWORD/16
    jb          short .column_st7
    vmovq       XMM_MMWORD [edi], xmmA
    add         edi, byte SIZEOF_YMMWORD/16*4
    sub         ecx, byte SIZEOF_YMMWORD/16
    vpsrldq     xmmA, xmmA, SIZEOF_YMMWORD/16*4
.column_st7:
    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
    ; space.
    test        ecx, ecx
    jz          short .nextrow
    vmovd       XMM_DWORD [edi], xmmA

%endif  ; RGB_PIXELSIZE ; ---------------

    alignx      16, 7

.nextrow:
    ; Restore the per-row state pushed at .rowloop, then step every
    ; row-pointer array to its next entry.
    pop         ecx
    pop         esi
    pop         ebx
    pop         edx
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    add         ebx, byte SIZEOF_JSAMPROW
    add         edx, byte SIZEOF_JSAMPROW
    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
    dec         eax                        ; num_rows
    jg          near .rowloop

    sfence                              ; flush the write buffer

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- frame base saved at [ebp]
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32