/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Syntax: GNU as, AT&T operand order (source, destination), run through
   the C preprocessor.  Target: 32-bit x86 with SSE2.  */

/* Every helper below is only defined when the including file has not
   already supplied its own version.  */

/* L(name) produces an assembler-local label (.L prefix).  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n) aligns the location counter to 2^n bytes.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore (reg)
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY opens a global, 16-byte aligned function and its CFI region;
   END closes the CFI region and records the symbol size.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function; 	\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* Unwind bookkeeping for one 4-byte push / pop of REG.  */
#define CFI_PUSH(REG)						\
  cfi_adjust_cfa_offset (4);					\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)						\
  cfi_adjust_cfa_offset (-4);					\
  cfi_restore (REG)

/* Push / pop REG and keep the unwind information in step.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Stack offsets of the arguments, relative to ESP after ENTRANCE.
   PARMS is the offset of the first argument.  SETRTNVAL reloads DEST
   into EAX for the memset flavour and expands to nothing for the bzero
   flavour.  */
#ifdef USE_AS_BZERO
# define DEST		PARMS
# define LEN		DEST+4
# define SETRTNVAL
#else
# define DEST		PARMS
# define CHR		DEST+4
# define LEN		CHR+4
# define SETRTNVAL	movl DEST(%esp), %eax
#endif

#ifdef SHARED
/* Position-independent build: EBX is saved on entry because it is used
   to compute addresses.  RETURN is used for every exit except the last
   one in the file; it re-establishes the "EBX pushed" CFI state for the
   code that follows it.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define PARMS		8		/* Preserve EBX.  */
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  ECX is the table index; it is also
   added to EDX before the jump.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* We first load PC into EBX.  */				\
    call	__i686.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
    add		$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    add		(%ebx,%ecx,4), %ebx;				\
    add		%ecx, %edx;					\
    /* We loaded the jump table and adjusted EDX.  Go.  */	\
    jmp		*%ebx

/* Thunk: copy the caller's return address (the address of the
   instruction after the call) into EBX.  */
	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
	.globl	__i686.get_pc_thunk.bx
	.hidden	__i686.get_pc_thunk.bx
	ALIGN (4)
	.type	__i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define PARMS		4
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  ECX is the table index; it is also added to EDX
   before the jump.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    add		%ecx, %edx;					\
    jmp		*TABLE(,%ecx,4)
#endif

/* sse2_memset5_atom: fill LEN bytes starting at DEST.

   Arguments are read from the stack:
     memset flavour:                DEST, CHR, LEN
     bzero flavour (USE_AS_BZERO):  DEST, LEN   (the fill byte is zero)

   Return value: the memset flavour reloads DEST into EAX before every
   return (SETRTNVAL); the bzero flavour sets no return value.

   Register roles:
     EAX   fill byte replicated into all four bytes
     ECX   number of bytes still to be stored
     EDX   store cursor; every BRANCH_TO_JMPTBL_ENTRY adds ECX to it, so
	   the tail code addresses backwards from the end of the buffer
     XMM0  fill byte replicated into all sixteen bytes (LEN >= 32 only)
     EBX   scratch for PIC address computation and for the shared cache
	   size; it is saved before use and restored before returning

   Clobbers EAX, ECX, EDX, XMM0 and the flags.  */
	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (sse2_memset5_atom)
	ENTRANCE

	movl	LEN(%esp), %ecx
#ifdef USE_AS_BZERO
	xor	%eax, %eax
#else
	movzbl	CHR(%esp), %eax
	movb	%al, %ah
	/* Fill the whole EAX with pattern.  */
	movl	%eax, %edx
	shl	$16, %eax
	or	%edx, %eax
#endif
	movl	DEST(%esp), %edx
	cmp	$32, %ecx
	jae	L(32bytesormore)

/* ECX < 32: dispatch on the exact length.  */
L(write_less32bytes):
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))


	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less_32bytes):
	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
	.popsection

/* Tails for fewer than 32 bytes.  EDX points one past the last byte to
   store.  Each label stores one dword and falls through to the label
   for four bytes fewer; the last label of a chain stores the final
   0-3 bytes.  */
	ALIGN (4)
L(write_28bytes):
	movl	%eax, -28(%edx)
L(write_24bytes):
	movl	%eax, -24(%edx)
L(write_20bytes):
	movl	%eax, -20(%edx)
L(write_16bytes):
	movl	%eax, -16(%edx)
L(write_12bytes):
	movl	%eax, -12(%edx)
L(write_8bytes):
	movl	%eax, -8(%edx)
L(write_4bytes):
	movl	%eax, -4(%edx)
L(write_0bytes):
	SETRTNVAL
	RETURN

	ALIGN (4)
L(write_29bytes):
	movl	%eax, -29(%edx)
L(write_25bytes):
	movl	%eax, -25(%edx)
L(write_21bytes):
	movl	%eax, -21(%edx)
L(write_17bytes):
	movl	%eax, -17(%edx)
L(write_13bytes):
	movl	%eax, -13(%edx)
L(write_9bytes):
	movl	%eax, -9(%edx)
L(write_5bytes):
	movl	%eax, -5(%edx)
L(write_1bytes):
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(write_30bytes):
	movl	%eax, -30(%edx)
L(write_26bytes):
	movl	%eax, -26(%edx)
L(write_22bytes):
	movl	%eax, -22(%edx)
L(write_18bytes):
	movl	%eax, -18(%edx)
L(write_14bytes):
	movl	%eax, -14(%edx)
L(write_10bytes):
	movl	%eax, -10(%edx)
L(write_6bytes):
	movl	%eax, -6(%edx)
L(write_2bytes):
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(write_31bytes):
	movl	%eax, -31(%edx)
L(write_27bytes):
	movl	%eax, -27(%edx)
L(write_23bytes):
	movl	%eax, -23(%edx)
L(write_19bytes):
	movl	%eax, -19(%edx)
L(write_15bytes):
	movl	%eax, -15(%edx)
L(write_11bytes):
	movl	%eax, -11(%edx)
L(write_7bytes):
	movl	%eax, -7(%edx)
L(write_3bytes):
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
/* ECX >= 32.  No alignment of EDX is assumed yet.  */
L(32bytesormore):
	/* Fill xmm0 with the pattern.  */
#ifdef USE_AS_BZERO
	pxor	%xmm0, %xmm0
#else
	movd	%eax, %xmm0
	punpcklbw %xmm0, %xmm0
	pshufd	$0, %xmm0, %xmm0
#endif
	testl	$0xf, %edx
	jz	L(aligned_16)
/* ECX >= 32 and EDX is not 16 byte aligned.  Store one unaligned
   16-byte head, then round EDX up to the next 16-byte boundary and
   shrink ECX by the number of bytes skipped.  */
L(not_aligned_16):
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	and	$-16, %edx
	add	$16, %edx
	/* EAX = old EDX - new EDX, i.e. minus the bytes skipped.  */
	sub	%edx, %eax
	add	%eax, %ecx
	/* EAX was used as scratch; reload the dword pattern.  */
	movd	%xmm0, %eax

	ALIGN (4)
/* EDX is 16 byte aligned from here on.  */
L(aligned_16):
	cmp	$128, %ecx
	jae	L(128bytesormore)

L(aligned_16_less128bytes):
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	ALIGN (4)
/* ECX >= 128.  Load the shared cache size into EBX and pick a store
   strategy.  EBX is pushed here when
   "defined SHARED_CACHE_SIZE || !defined SHARED"; every matching pop
   below is guarded by exactly that condition so the stack stays
   balanced for any combination of the configuration macros.  */
L(128bytesormore):
#ifdef SHARED_CACHE_SIZE
	PUSH (%ebx)
	mov	$SHARED_CACHE_SIZE, %ebx
#else
# ifdef SHARED
	/* EBX was already saved by ENTRANCE.  */
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
# else
	PUSH (%ebx)
	mov	__x86_shared_cache_size, %ebx
# endif
#endif
	cmp	%ebx, %ecx
	jae	L(128bytesormore_nt_start)

	/* ECX is below the shared cache size.  Undo the push made above
	   (same condition as the push), then compare against the data
	   cache size.  */
#if defined SHARED_CACHE_SIZE || !defined SHARED
	POP (%ebx)
#endif
#ifdef DATA_CACHE_SIZE
	cmp	$DATA_CACHE_SIZE, %ecx
#else
# ifdef SHARED
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size, %ecx
# endif
#endif

	jae	L(128bytes_L2_normal)
	/* ECX is below the data cache size: plain aligned stores, two
	   128-byte groups per iteration.  ECX is biased by -128 so that the
	   borrow from each "sub" signals that fewer than 128 bytes will
	   remain after the group being stored.  movdqa and lea leave the
	   flags untouched, so the branch after each group still sees the
	   result of that "sub".  */
	subl	$128, %ecx
L(128bytesormore_normal):
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jb	L(128bytesless_normal)


	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	/* Remove the -128 bias: ECX = bytes left, in 0..127.  */
	lea	128(%ecx), %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	ALIGN (4)
/* ECX is at least the data cache size but below the shared cache size:
   aligned stores with prefetches ahead of the cursor, 128 bytes per
   iteration.  */
L(128bytes_L2_normal):
	prefetcht0	0x380(%edx)
	prefetcht0	0x3c0(%edx)
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$128, %edx
	cmp	$128, %ecx
	jae	L(128bytes_L2_normal)

L(128bytesless_L2_normal):
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	/* The code below is entered by the jump taken before the pop on
	   the fall-through path, so the extra EBX push is still on the
	   stack here.  Put the unwind state back to "pushed".  */
#if defined SHARED_CACHE_SIZE || !defined SHARED
	CFI_PUSH (%ebx)
#endif
/* ECX is at least the shared cache size (held in EBX).  Store the first
   EBX bytes with ordinary stores and the rest with non-temporal stores.
   NOTE(review): the first loop retires EBX in 128-byte steps, so this
   appears to assume the shared cache size is a multiple of 128 and at
   least 128 -- confirm against whoever defines it.  */
L(128bytesormore_nt_start):
	/* ECX = bytes left once the first EBX bytes have been stored.  */
	sub	%ebx, %ecx
	ALIGN (4)
L(128bytesormore_shared_cache_loop):
	prefetcht0	0x3c0(%edx)
	prefetcht0	0x380(%edx)
	sub	$0x80, %ebx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ebx
	jae	L(128bytesormore_shared_cache_loop)
	cmp	$0x80, %ecx
	jb	L(shared_cache_loop_end)
	ALIGN (4)
L(128bytesormore_nt):
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm0, 0x10(%edx)
	movntdq	%xmm0, 0x20(%edx)
	movntdq	%xmm0, 0x30(%edx)
	movntdq	%xmm0, 0x40(%edx)
	movntdq	%xmm0, 0x50(%edx)
	movntdq	%xmm0, 0x60(%edx)
	movntdq	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ecx
	jae	L(128bytesormore_nt)
	/* Fence after the non-temporal stores.  */
	sfence
L(shared_cache_loop_end):
	/* Undo the push made at L(128bytesormore); same condition as the
	   push.  */
#if defined SHARED_CACHE_SIZE || !defined SHARED
	POP (%ebx)
#endif
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))


	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
	.popsection

/* Tails for 0..127 remaining bytes when the cursor was 16 byte aligned
   before ECX was added to it.  EDX points one past the last byte to
   store, so EDX minus the label's byte count is 16 byte aligned and
   movdqa is valid.  Each label stores 16 bytes and falls through to the
   label for 16 bytes fewer; the last label of a chain stores the final
   0-15 bytes from XMM0 / EAX.  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_113bytes):
	movdqa	%xmm0, -113(%edx)
L(aligned_16_97bytes):
	movdqa	%xmm0, -97(%edx)
L(aligned_16_81bytes):
	movdqa	%xmm0, -81(%edx)
L(aligned_16_65bytes):
	movdqa	%xmm0, -65(%edx)
L(aligned_16_49bytes):
	movdqa	%xmm0, -49(%edx)
L(aligned_16_33bytes):
	movdqa	%xmm0, -33(%edx)
L(aligned_16_17bytes):
	movdqa	%xmm0, -17(%edx)
L(aligned_16_1bytes):
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_114bytes):
	movdqa	%xmm0, -114(%edx)
L(aligned_16_98bytes):
	movdqa	%xmm0, -98(%edx)
L(aligned_16_82bytes):
	movdqa	%xmm0, -82(%edx)
L(aligned_16_66bytes):
	movdqa	%xmm0, -66(%edx)
L(aligned_16_50bytes):
	movdqa	%xmm0, -50(%edx)
L(aligned_16_34bytes):
	movdqa	%xmm0, -34(%edx)
L(aligned_16_18bytes):
	movdqa	%xmm0, -18(%edx)
L(aligned_16_2bytes):
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_115bytes):
	movdqa	%xmm0, -115(%edx)
L(aligned_16_99bytes):
	movdqa	%xmm0, -99(%edx)
L(aligned_16_83bytes):
	movdqa	%xmm0, -83(%edx)
L(aligned_16_67bytes):
	movdqa	%xmm0, -67(%edx)
L(aligned_16_51bytes):
	movdqa	%xmm0, -51(%edx)
L(aligned_16_35bytes):
	movdqa	%xmm0, -35(%edx)
L(aligned_16_19bytes):
	movdqa	%xmm0, -19(%edx)
L(aligned_16_3bytes):
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_117bytes):
	movdqa	%xmm0, -117(%edx)
L(aligned_16_101bytes):
	movdqa	%xmm0, -101(%edx)
L(aligned_16_85bytes):
	movdqa	%xmm0, -85(%edx)
L(aligned_16_69bytes):
	movdqa	%xmm0, -69(%edx)
L(aligned_16_53bytes):
	movdqa	%xmm0, -53(%edx)
L(aligned_16_37bytes):
	movdqa	%xmm0, -37(%edx)
L(aligned_16_21bytes):
	movdqa	%xmm0, -21(%edx)
L(aligned_16_5bytes):
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_118bytes):
	movdqa	%xmm0, -118(%edx)
L(aligned_16_102bytes):
	movdqa	%xmm0, -102(%edx)
L(aligned_16_86bytes):
	movdqa	%xmm0, -86(%edx)
L(aligned_16_70bytes):
	movdqa	%xmm0, -70(%edx)
L(aligned_16_54bytes):
	movdqa	%xmm0, -54(%edx)
L(aligned_16_38bytes):
	movdqa	%xmm0, -38(%edx)
L(aligned_16_22bytes):
	movdqa	%xmm0, -22(%edx)
L(aligned_16_6bytes):
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_119bytes):
	movdqa	%xmm0, -119(%edx)
L(aligned_16_103bytes):
	movdqa	%xmm0, -103(%edx)
L(aligned_16_87bytes):
	movdqa	%xmm0, -87(%edx)
L(aligned_16_71bytes):
	movdqa	%xmm0, -71(%edx)
L(aligned_16_55bytes):
	movdqa	%xmm0, -55(%edx)
L(aligned_16_39bytes):
	movdqa	%xmm0, -39(%edx)
L(aligned_16_23bytes):
	movdqa	%xmm0, -23(%edx)
L(aligned_16_7bytes):
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_121bytes):
	movdqa	%xmm0, -121(%edx)
L(aligned_16_105bytes):
	movdqa	%xmm0, -105(%edx)
L(aligned_16_89bytes):
	movdqa	%xmm0, -89(%edx)
L(aligned_16_73bytes):
	movdqa	%xmm0, -73(%edx)
L(aligned_16_57bytes):
	movdqa	%xmm0, -57(%edx)
L(aligned_16_41bytes):
	movdqa	%xmm0, -41(%edx)
L(aligned_16_25bytes):
	movdqa	%xmm0, -25(%edx)
L(aligned_16_9bytes):
	movq	%xmm0, -9(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_122bytes):
	movdqa	%xmm0, -122(%edx)
L(aligned_16_106bytes):
	movdqa	%xmm0, -106(%edx)
L(aligned_16_90bytes):
	movdqa	%xmm0, -90(%edx)
L(aligned_16_74bytes):
	movdqa	%xmm0, -74(%edx)
L(aligned_16_58bytes):
	movdqa	%xmm0, -58(%edx)
L(aligned_16_42bytes):
	movdqa	%xmm0, -42(%edx)
L(aligned_16_26bytes):
	movdqa	%xmm0, -26(%edx)
L(aligned_16_10bytes):
	movq	%xmm0, -10(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_123bytes):
	movdqa	%xmm0, -123(%edx)
L(aligned_16_107bytes):
	movdqa	%xmm0, -107(%edx)
L(aligned_16_91bytes):
	movdqa	%xmm0, -91(%edx)
L(aligned_16_75bytes):
	movdqa	%xmm0, -75(%edx)
L(aligned_16_59bytes):
	movdqa	%xmm0, -59(%edx)
L(aligned_16_43bytes):
	movdqa	%xmm0, -43(%edx)
L(aligned_16_27bytes):
	movdqa	%xmm0, -27(%edx)
L(aligned_16_11bytes):
	movq	%xmm0, -11(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_125bytes):
	movdqa	%xmm0, -125(%edx)
L(aligned_16_109bytes):
	movdqa	%xmm0, -109(%edx)
L(aligned_16_93bytes):
	movdqa	%xmm0, -93(%edx)
L(aligned_16_77bytes):
	movdqa	%xmm0, -77(%edx)
L(aligned_16_61bytes):
	movdqa	%xmm0, -61(%edx)
L(aligned_16_45bytes):
	movdqa	%xmm0, -45(%edx)
L(aligned_16_29bytes):
	movdqa	%xmm0, -29(%edx)
L(aligned_16_13bytes):
	movq	%xmm0, -13(%edx)
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_126bytes):
	movdqa	%xmm0, -126(%edx)
L(aligned_16_110bytes):
	movdqa	%xmm0, -110(%edx)
L(aligned_16_94bytes):
	movdqa	%xmm0, -94(%edx)
L(aligned_16_78bytes):
	movdqa	%xmm0, -78(%edx)
L(aligned_16_62bytes):
	movdqa	%xmm0, -62(%edx)
L(aligned_16_46bytes):
	movdqa	%xmm0, -46(%edx)
L(aligned_16_30bytes):
	movdqa	%xmm0, -30(%edx)
L(aligned_16_14bytes):
	movq	%xmm0, -14(%edx)
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	ALIGN (4)
L(aligned_16_127bytes):
	movdqa	%xmm0, -127(%edx)
L(aligned_16_111bytes):
	movdqa	%xmm0, -111(%edx)
L(aligned_16_95bytes):
	movdqa	%xmm0, -95(%edx)
L(aligned_16_79bytes):
	movdqa	%xmm0, -79(%edx)
L(aligned_16_63bytes):
	movdqa	%xmm0, -63(%edx)
L(aligned_16_47bytes):
	movdqa	%xmm0, -47(%edx)
L(aligned_16_31bytes):
	movdqa	%xmm0, -31(%edx)
L(aligned_16_15bytes):
	movq	%xmm0, -15(%edx)
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	/* Last exit in the file: no CFI state to re-establish afterwards.  */
	RETURN_END

END (sse2_memset5_atom)