/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 * SHARED_CACHE_SIZE is expected to come from cache.h.
 */
#include "cache.h"

#ifndef MEMSET
# define MEMSET		android_memset32
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* One jump-table entry: the offset of label I relative to table base B.  */
#define JMPTBL(I, B)	I - B

/* Branch to an entry in a jump table.  TABLE is a jump table with
   relative offsets.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.
   Clobbers %r11 and INDEX.  No bounds check is done here: the caller
   must guarantee that INDEX selects an existing table entry.  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
	lea    TABLE(%rip), %r11;						\
	movslq (%r11, INDEX, SCALE), INDEX;				\
	lea    (%r11, INDEX), INDEX;					\
	jmp    *INDEX

/*
 * MEMSET (android_memset32 unless overridden): fill memory with a
 * 32-bit pattern.
 *
 * In:    %rdi = destination address
 *        %esi = 32-bit pattern
 *        %rdx = length in bytes; the low two bits are discarded by the
 *               initial shift, so only whole dwords are written
 * Out:   nothing (%rax is never written)
 * Clobb: %rcx, %rdx, %rsi, %rdi, %r11, %xmm0, flags
 *
 * NOTE(review): presumably matches a C prototype along the lines of
 * void android_memset32(uint32_t *dst, uint32_t value, size_t size);
 * confirm against the declaring header.
 *
 * All length comparisons use unsigned condition codes.  The byte count
 * rebuilt at L(aligned4bytes) can have bit 63 set; a signed compare
 * would treat it as "less than 128" and index L(table_16_128bytes) out
 * of bounds before an indirect jump.
 */
	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (MEMSET)	// Address in rdi
	shr    $2, %rdx			// Count in rdx (bytes -> dwords)
	movl   %esi, %ecx		// Pattern in ecx

	cmp    $16, %rdx
	jae    L(16dbwordsormore)

	/* Fewer than 16 dwords: point rdi one past the end and jump into
	   the store ladder so that exactly rdx stores are executed.  */
L(write_less16dbwords):
	lea    (%rdi, %rdx, 4), %rdi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords), %rdx, 4)

	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less16dbwords):
	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
	.popsection

	/* Fall-through ladder: entering at write_Ndbwords stores the last
	   N dwords before rdi.  */
	ALIGN (4)
L(write_15dbwords):
	movl   %ecx, -60(%rdi)
L(write_14dbwords):
	movl   %ecx, -56(%rdi)
L(write_13dbwords):
	movl   %ecx, -52(%rdi)
L(write_12dbwords):
	movl   %ecx, -48(%rdi)
L(write_11dbwords):
	movl   %ecx, -44(%rdi)
L(write_10dbwords):
	movl   %ecx, -40(%rdi)
L(write_9dbwords):
	movl   %ecx, -36(%rdi)
L(write_8dbwords):
	movl   %ecx, -32(%rdi)
L(write_7dbwords):
	movl   %ecx, -28(%rdi)
L(write_6dbwords):
	movl   %ecx, -24(%rdi)
L(write_5dbwords):
	movl   %ecx, -20(%rdi)
L(write_4dbwords):
	movl   %ecx, -16(%rdi)
L(write_3dbwords):
	movl   %ecx, -12(%rdi)
L(write_2dbwords):
	movl   %ecx, -8(%rdi)
L(write_1dbwords):
	movl   %ecx, -4(%rdi)
L(write_0dbwords):
	ret

	ALIGN (4)
L(16dbwordsormore):
	test   $3, %edi
	jz     L(aligned4bytes)
	/* Destination is not 4-byte aligned.  Store the first and the last
	   dword with the unrotated pattern, drop one dword from the count,
	   then step rdi byte by byte up to the next 4-byte boundary while
	   rotating the pattern right by 8 bits per byte skipped, so the
	   aligned stores that follow continue the same byte sequence.  */
	mov    %ecx, (%rdi)
	mov    %ecx, -4(%rdi, %rdx, 4)
	sub    $1, %rdx
	rol    $24, %ecx		// same as ror $8
	add    $1, %rdi
	test   $3, %edi
	jz     L(aligned4bytes)
	ror    $8, %ecx
	add    $1, %rdi
	test   $3, %edi
	jz     L(aligned4bytes)
	ror    $8, %ecx
	add    $1, %rdi
L(aligned4bytes):
	shl    $2, %rdx			// dwords -> bytes

	/* Fill xmm0 with the pattern.  */
	movd   %ecx, %xmm0
	pshufd $0, %xmm0, %xmm0

	testl  $0xf, %edi
	jz     L(aligned_16)
	/* RDI is not 16 byte aligned.  RDX >= 60 here (at least 15 dwords
	   remain), so one unaligned 16-byte store stays in bounds.  Then
	   round rdi up to the next 16-byte boundary and shrink rdx by the
	   number of bytes skipped.  */
	movdqu %xmm0, (%rdi)
	mov    %rdi, %rsi
	and    $-16, %rdi
	add    $16, %rdi
	sub    %rdi, %rsi		// rsi = old rdi - new rdi (negative)
	add    %rsi, %rdx

	ALIGN (4)
L(aligned_16):
	/* rdi is 16-byte aligned, rdx = bytes remaining.  */
	cmp    $128, %rdx
	jae    L(128bytesormore)

L(aligned_16_less128bytes):
	add    %rdx, %rdi		// rdi = one past the end
	shr    $2, %rdx			// table index = remaining dwords (0..31)
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

	ALIGN (4)
L(128bytesormore):
	/* Lengths above SHARED_CACHE_SIZE use non-temporal stores.  */
	cmp    $SHARED_CACHE_SIZE, %rdx
	ja     L(128bytesormore_nt)

	/* Regular-store loop, unrolled four times; each step writes 128
	   bytes and leaves as soon as fewer than 128 bytes remain.  */
L(128bytesormore_normal):
	sub    $128, %rdx
	movdqa %xmm0, (%rdi)
	movdqa %xmm0, 0x10(%rdi)
	movdqa %xmm0, 0x20(%rdi)
	movdqa %xmm0, 0x30(%rdi)
	movdqa %xmm0, 0x40(%rdi)
	movdqa %xmm0, 0x50(%rdi)
	movdqa %xmm0, 0x60(%rdi)
	movdqa %xmm0, 0x70(%rdi)
	lea    128(%rdi), %rdi
	cmp    $128, %rdx
	jb     L(128bytesless_normal)

	sub    $128, %rdx
	movdqa %xmm0, (%rdi)
	movdqa %xmm0, 0x10(%rdi)
	movdqa %xmm0, 0x20(%rdi)
	movdqa %xmm0, 0x30(%rdi)
	movdqa %xmm0, 0x40(%rdi)
	movdqa %xmm0, 0x50(%rdi)
	movdqa %xmm0, 0x60(%rdi)
	movdqa %xmm0, 0x70(%rdi)
	lea    128(%rdi), %rdi
	cmp    $128, %rdx
	jb     L(128bytesless_normal)

	sub    $128, %rdx
	movdqa %xmm0, (%rdi)
	movdqa %xmm0, 0x10(%rdi)
	movdqa %xmm0, 0x20(%rdi)
	movdqa %xmm0, 0x30(%rdi)
	movdqa %xmm0, 0x40(%rdi)
	movdqa %xmm0, 0x50(%rdi)
	movdqa %xmm0, 0x60(%rdi)
	movdqa %xmm0, 0x70(%rdi)
	lea    128(%rdi), %rdi
	cmp    $128, %rdx
	jb     L(128bytesless_normal)

	sub    $128, %rdx
	movdqa %xmm0, (%rdi)
	movdqa %xmm0, 0x10(%rdi)
	movdqa %xmm0, 0x20(%rdi)
	movdqa %xmm0, 0x30(%rdi)
	movdqa %xmm0, 0x40(%rdi)
	movdqa %xmm0, 0x50(%rdi)
	movdqa %xmm0, 0x60(%rdi)
	movdqa %xmm0, 0x70(%rdi)
	lea    128(%rdi), %rdi
	cmp    $128, %rdx
	jae    L(128bytesormore_normal)

L(128bytesless_normal):
	add    %rdx, %rdi		// rdi = one past the end
	shr    $2, %rdx			// table index = remaining dwords (0..31)
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

	/* Non-temporal loop: 128 bytes per iteration with movntdq.  */
	ALIGN (4)
L(128bytesormore_nt):
	sub    $128, %rdx
	movntdq %xmm0, (%rdi)
	movntdq %xmm0, 0x10(%rdi)
	movntdq %xmm0, 0x20(%rdi)
	movntdq %xmm0, 0x30(%rdi)
	movntdq %xmm0, 0x40(%rdi)
	movntdq %xmm0, 0x50(%rdi)
	movntdq %xmm0, 0x60(%rdi)
	movntdq %xmm0, 0x70(%rdi)
	lea    128(%rdi), %rdi
	cmp    $128, %rdx
	jae    L(128bytesormore_nt)

	sfence				// order the non-temporal stores before the tail stores
	add    %rdx, %rdi		// rdi = one past the end
	shr    $2, %rdx			// table index = remaining dwords (0..31)
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

	/* Tail dispatch: entry k handles a remainder of 4*k bytes.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.popsection

	/* Four tail ladders, one per remainder modulo 16.  Each uses
	   aligned 16-byte stores counted back from the end pointer and
	   finishes any 4-, 8- or 12-byte leftover with narrower stores.  */

	/* Remainder is a multiple of 16.  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%rdi)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%rdi)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%rdi)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%rdi)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%rdi)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%rdi)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%rdi)
L(aligned_16_0bytes):
	ret

	/* Remainder is 16*k + 4: final dword stored from ecx.  */
	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%rdi)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%rdi)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%rdi)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%rdi)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%rdi)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%rdi)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%rdi)
L(aligned_16_4bytes):
	movl	%ecx, -4(%rdi)
	ret

	/* Remainder is 16*k + 8: final 8 bytes stored from xmm0.  */
	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%rdi)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%rdi)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%rdi)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%rdi)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%rdi)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%rdi)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%rdi)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%rdi)
	ret

	/* Remainder is 16*k + 12: 8 bytes from xmm0, then 4 from ecx.  */
	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%rdi)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%rdi)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%rdi)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%rdi)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%rdi)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%rdi)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%rdi)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%rdi)
	movl	%ecx, -4(%rdi)
	ret

END (MEMSET)