;
; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2009, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

; NOTE(review): jcolsamp.inc supplies the xmmA..xmmH register aliases, the
; RGB_PIXELSIZE/EXTN/collect_args macros and the PW_*/PD_* constant tables
; used below -- their exact definitions are not visible in this file.
%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                             JDIMENSION output_row, int num_rows);
;

; Argument registers after collect_args:
; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows

; 16-byte-aligned scratch slots below the aligned frame pointer.
%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM  8

        align   16

        global  EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE

EXTN(jsimd_rgb_ycc_convert_sse2):
        push    rbp
        mov     rax,rsp                         ; rax = original rbp
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax                       ; remember original rsp for epilogue
        mov     rbp,rsp                         ; rbp = aligned rbp
        lea     rsp, [wk(0)]                    ; reserve wk[0..WK_NUM-1] scratch
        collect_args
        push    rbx

        mov     rcx, r10                        ; rcx = img_width (columns)
        test    rcx,rcx
        jz      near .return                    ; zero-width image: nothing to do

        push    rcx

        ; Compute the three component-row base pointers:
        ; outptrN = output_buf[N] + output_row (Y, Cb, Cr planes).
        mov     rsi, r12
        mov     rcx, r13
        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]
        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]
        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]

        pop     rcx

        mov     rsi, r11                        ; rsi = input_buf
        mov     eax, r14d                       ; rax = num_rows
        test    rax,rax
        jle     near .return                    ; num_rows <= 0: nothing to do

.rowloop:
        ; Save the row-array cursors; the registers are reused below as the
        ; per-row sample pointers.
        push    rdx
        push    rbx
        push    rdi
        push    rsi
        push    rcx                             ; col

        mov     rsi, JSAMPROW [rsi]             ; inptr
        mov     rdi, JSAMPROW [rdi]             ; outptr0
        mov     rbx, JSAMPROW [rbx]             ; outptr1
        mov     rdx, JSAMPROW [rdx]             ; outptr2

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop                ; full 16-pixel group available

%if RGB_PIXELSIZE == 3 ; ---------------

        ; Partial-group load (fewer than 16 pixels left): gather the
        ; remaining col*3 bytes into xmmA/xmmF/xmmB piece by piece,
        ; testing one bit of the byte count at a time.
.column_ld1:
        push    rax
        push    rdx
        lea     rcx,[rcx+rcx*2]                 ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_BYTE
        movzx   rax, BYTE [rsi+rcx]
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_WORD
        movzx   rdx, WORD [rsi+rcx]
        shl     rax, WORD_BIT
        or      rax,rdx
.column_ld4:
        movd    xmmA,eax
        pop     rdx
        pop     rax
        test    cl, SIZEOF_DWORD
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_DWORD
        movd    xmmF, XMM_DWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_DWORD
        por     xmmA,xmmF
.column_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .column_ld16
        sub     rcx, byte SIZEOF_MMWORD
        movq    xmmB, XMM_MMWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmB
.column_ld16:
        test    cl, SIZEOF_XMMWORD
        jz      short .column_ld32
        movdqa  xmmF,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .rgb_ycc_cnv
.column_ld32:
        test    cl, 2*SIZEOF_XMMWORD
        mov     rcx, SIZEOF_XMMWORD
        jz      short .rgb_ycc_cnv
        movdqa  xmmB,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv

.columnloop:
        ; Full group: load 48 bytes = 16 interleaved RGB pixels.
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; De-interleave RGBRGB... into separate R/G/B even/odd word planes.
        ; (Digit pairs below are component/pixel indices: e.g. 21 = component
        ; 2 of pixel 1.)
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        movdqa  xmmG,xmmA
        pslldq  xmmA,8  ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq  xmmG,8  ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq  xmmF,8          ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa  xmmD,xmmA
        pslldq  xmmA,8  ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq  xmmD,8  ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq  xmmG,8          ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa  xmmE,xmmA
        pslldq  xmmA,8  ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq  xmmE,8  ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq  xmmD,8          ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        pxor    xmmH,xmmH       ; zero-extend bytes to words below

        movdqa  xmmC,xmmA
        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa  xmmB,xmmE
        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa  xmmF,xmmD
        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; Partial-group load for 4-byte pixels (RGBX/XRGB etc.): gather the
        ; remaining pixels into xmmA/xmmE/xmmF/xmmH, testing one bit of the
        ; pixel count at a time.
.column_ld1:
        test    cl, SIZEOF_XMMWORD/16
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_XMMWORD/16
        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
        test    cl, SIZEOF_XMMWORD/8
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_XMMWORD/8
        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmE
.column_ld4:
        test    cl, SIZEOF_XMMWORD/4
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_XMMWORD/4
        movdqa  xmmE,xmmA
        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
        test    cl, SIZEOF_XMMWORD/2
        mov     rcx, SIZEOF_XMMWORD
        jz      short .rgb_ycc_cnv
        movdqa  xmmF,xmmA
        movdqa  xmmH,xmmE
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv

.columnloop:
        ; Full group: load 64 bytes = 16 interleaved 4-byte pixels.
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; De-interleave the four components into even/odd word planes.
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        movdqa  xmmD,xmmA
        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa  xmmC,xmmF
        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa  xmmB,xmmA
        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa  xmmG,xmmD
        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa  xmmE,xmmA
        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa  xmmH,xmmB
        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        pxor    xmmF,xmmF       ; zero-extend bytes to words below

        movdqa  xmmC,xmmA
        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa  xmmD,xmmB
        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa  xmmG,xmmE
        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)

        ; xmmF was the zero register; recover the last two planes from xmmH
        ; by expanding to words in the high byte and shifting back down.
        punpcklbw xmmF,xmmH
        punpckhbw xmmH,xmmH
        psrlw   xmmF,BYTE_BIT   ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw   xmmH,BYTE_BIT   ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
        ; (The physical xmm0..xmm5 assignments of the xmmA..xmmH aliases
        ; come from jcolsamp.inc and depend on the RGB component order.)

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ; (The 0.587*G term is split so each coefficient fits pmaddwd's
        ; signed 16-bit range; the *0.5 terms are done with shifts.)

        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=RE
        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=RO
        movdqa  XMMWORD [wk(2)], xmm4   ; wk(2)=BE
        movdqa  XMMWORD [wk(3)], xmm5   ; wk(3)=BO

        movdqa  xmm6,xmm1
        punpcklwd xmm1,xmm3
        punpckhwd xmm6,xmm3
        movdqa  xmm7,xmm1
        movdqa  xmm4,xmm6
        pmaddwd xmm1,[rel PW_F0299_F0337]       ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd xmm6,[rel PW_F0299_F0337]       ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
        pmaddwd xmm7,[rel PW_MF016_MF033]       ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
        pmaddwd xmm4,[rel PW_MF016_MF033]       ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

        movdqa  XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
        movdqa  XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

        ; punpck with zero in the low byte = B << 16; psrld 1 then gives
        ; B*0.5 at the same fixed-point scale as the pmaddwd products.
        pxor    xmm1,xmm1
        pxor    xmm6,xmm6
        punpcklwd xmm1,xmm5             ; xmm1=BOL
        punpckhwd xmm6,xmm5             ; xmm6=BOH
        psrld   xmm1,1                  ; xmm1=BOL*FIX(0.500)
        psrld   xmm6,1                  ; xmm6=BOH*FIX(0.500)

        movdqa  xmm5,[rel PD_ONEHALFM1_CJ]      ; xmm5=[PD_ONEHALFM1_CJ]

        paddd   xmm7,xmm1
        paddd   xmm4,xmm6
        paddd   xmm7,xmm5
        paddd   xmm4,xmm5
        psrld   xmm7,SCALEBITS          ; xmm7=CbOL
        psrld   xmm4,SCALEBITS          ; xmm4=CbOH
        packssdw xmm7,xmm4              ; xmm7=CbO

        movdqa  xmm1, XMMWORD [wk(2)]   ; xmm1=BE

        movdqa  xmm6,xmm0
        punpcklwd xmm0,xmm2
        punpckhwd xmm6,xmm2
        movdqa  xmm5,xmm0
        movdqa  xmm4,xmm6
        pmaddwd xmm0,[rel PW_F0299_F0337]       ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd xmm6,[rel PW_F0299_F0337]       ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
        pmaddwd xmm5,[rel PW_MF016_MF033]       ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
        pmaddwd xmm4,[rel PW_MF016_MF033]       ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

        movdqa  XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa  XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

        pxor    xmm0,xmm0
        pxor    xmm6,xmm6
        punpcklwd xmm0,xmm1             ; xmm0=BEL
        punpckhwd xmm6,xmm1             ; xmm6=BEH
        psrld   xmm0,1                  ; xmm0=BEL*FIX(0.500)
        psrld   xmm6,1                  ; xmm6=BEH*FIX(0.500)

        movdqa  xmm1,[rel PD_ONEHALFM1_CJ]      ; xmm1=[PD_ONEHALFM1_CJ]

        paddd   xmm5,xmm0
        paddd   xmm4,xmm6
        paddd   xmm5,xmm1
        paddd   xmm4,xmm1
        psrld   xmm5,SCALEBITS          ; xmm5=CbEL
        psrld   xmm4,SCALEBITS          ; xmm4=CbEH
        packssdw xmm5,xmm4              ; xmm5=CbE

        ; Re-interleave even/odd samples: odd bytes go to the high byte of
        ; each word, then OR with the even bytes in the low byte.
        psllw   xmm7,BYTE_BIT
        por     xmm5,xmm7               ; xmm5=Cb
        movdqa  XMMWORD [rbx], xmm5     ; Save Cb

        movdqa  xmm0, XMMWORD [wk(3)]   ; xmm0=BO
        movdqa  xmm6, XMMWORD [wk(2)]   ; xmm6=BE
        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=RO

        movdqa  xmm4,xmm0
        punpcklwd xmm0,xmm3
        punpckhwd xmm4,xmm3
        movdqa  xmm7,xmm0
        movdqa  xmm5,xmm4
        pmaddwd xmm0,[rel PW_F0114_F0250]       ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd xmm4,[rel PW_F0114_F0250]       ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
        pmaddwd xmm7,[rel PW_MF008_MF041]       ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
        pmaddwd xmm5,[rel PW_MF008_MF041]       ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

        movdqa  xmm3,[rel PD_ONEHALF]           ; xmm3=[PD_ONEHALF]

        paddd   xmm0, XMMWORD [wk(4)]
        paddd   xmm4, XMMWORD [wk(5)]
        paddd   xmm0,xmm3
        paddd   xmm4,xmm3
        psrld   xmm0,SCALEBITS          ; xmm0=YOL
        psrld   xmm4,SCALEBITS          ; xmm4=YOH
        packssdw xmm0,xmm4              ; xmm0=YO

        pxor    xmm3,xmm3
        pxor    xmm4,xmm4
        punpcklwd xmm3,xmm1             ; xmm3=ROL
        punpckhwd xmm4,xmm1             ; xmm4=ROH
        psrld   xmm3,1                  ; xmm3=ROL*FIX(0.500)
        psrld   xmm4,1                  ; xmm4=ROH*FIX(0.500)

        movdqa  xmm1,[rel PD_ONEHALFM1_CJ]      ; xmm1=[PD_ONEHALFM1_CJ]

        paddd   xmm7,xmm3
        paddd   xmm5,xmm4
        paddd   xmm7,xmm1
        paddd   xmm5,xmm1
        psrld   xmm7,SCALEBITS          ; xmm7=CrOL
        psrld   xmm5,SCALEBITS          ; xmm5=CrOH
        packssdw xmm7,xmm5              ; xmm7=CrO

        movdqa  xmm3, XMMWORD [wk(0)]   ; xmm3=RE

        movdqa  xmm4,xmm6
        punpcklwd xmm6,xmm2
        punpckhwd xmm4,xmm2
        movdqa  xmm1,xmm6
        movdqa  xmm5,xmm4
        pmaddwd xmm6,[rel PW_F0114_F0250]       ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd xmm4,[rel PW_F0114_F0250]       ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
        pmaddwd xmm1,[rel PW_MF008_MF041]       ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
        pmaddwd xmm5,[rel PW_MF008_MF041]       ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

        movdqa  xmm2,[rel PD_ONEHALF]           ; xmm2=[PD_ONEHALF]

        paddd   xmm6, XMMWORD [wk(6)]
        paddd   xmm4, XMMWORD [wk(7)]
        paddd   xmm6,xmm2
        paddd   xmm4,xmm2
        psrld   xmm6,SCALEBITS          ; xmm6=YEL
        psrld   xmm4,SCALEBITS          ; xmm4=YEH
        packssdw xmm6,xmm4              ; xmm6=YE

        psllw   xmm0,BYTE_BIT
        por     xmm6,xmm0               ; xmm6=Y
        movdqa  XMMWORD [rdi], xmm6     ; Save Y

        pxor    xmm2,xmm2
        pxor    xmm4,xmm4
        punpcklwd xmm2,xmm3             ; xmm2=REL
        punpckhwd xmm4,xmm3             ; xmm4=REH
        psrld   xmm2,1                  ; xmm2=REL*FIX(0.500)
        psrld   xmm4,1                  ; xmm4=REH*FIX(0.500)

        movdqa  xmm0,[rel PD_ONEHALFM1_CJ]      ; xmm0=[PD_ONEHALFM1_CJ]

        paddd   xmm1,xmm2
        paddd   xmm5,xmm4
        paddd   xmm1,xmm0
        paddd   xmm5,xmm0
        psrld   xmm1,SCALEBITS          ; xmm1=CrEL
        psrld   xmm5,SCALEBITS          ; xmm5=CrEH
        packssdw xmm1,xmm5              ; xmm1=CrE

        psllw   xmm7,BYTE_BIT
        por     xmm1,xmm7               ; xmm1=Cr
        movdqa  XMMWORD [rdx], xmm1     ; Save Cr

        ; Advance within the row; loop while a full 16-pixel group remains,
        ; then take the partial-group path for any leftover columns.
        sub     rcx, byte SIZEOF_XMMWORD
        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
        add     rbx, byte SIZEOF_XMMWORD                ; outptr1
        add     rdx, byte SIZEOF_XMMWORD                ; outptr2
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx,rcx
        jnz     near .column_ld1

        ; Row done: restore the row-array cursors and step to the next row.
        pop     rcx                     ; col
        pop     rsi
        pop     rdi
        pop     rbx
        pop     rdx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
        add     rdi, byte SIZEOF_JSAMPROW
        add     rbx, byte SIZEOF_JSAMPROW
        add     rdx, byte SIZEOF_JSAMPROW
        dec     rax                             ; num_rows
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp,rbp                 ; rsp <- aligned rbp
        pop     rsp                     ; rsp <- original rbp
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16