/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Contributed by: Intel Corporation
 */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), preprocessed
 * by the C preprocessor.  Target: x86-64 with SSE2.
 */

#include "cache.h"

/* L(label): spell a file-local label with the .L prefix. */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n): align the location counter to 2^n bytes. */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

/* ENTRY(name): open a global, 16-byte aligned function with CFI. */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END(name): close the CFI region and record the symbol size. */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* JMPTBL(I, B): jump-table entry holding the offset of target I from
   table base B.  */
#define JMPTBL(I, B)	I - B

/* Branch to an entry in a jump table.  TABLE is a jump table with
   relative offsets.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.
   Clobbers %r11 (table base) and INDEX (becomes the target address).  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
	lea	TABLE(%rip), %r11;			\
	movslq	(%r11, INDEX, SCALE), INDEX;		\
	lea	(%r11, INDEX), INDEX;			\
	jmp	*INDEX

/*
 * android_memset32 -- fill a buffer with a repeated 32-bit pattern.
 *
 * In:   %rdi = destination address (any alignment is handled)
 *       %esi = 32-bit pattern
 *       %rdx = size in bytes; it is shifted right by 2 on entry, so only
 *              whole 32-bit words are written
 * Out:  %rax is never written by this routine.
 * Clobbers: %rcx, %rdx, %rdi, %r11, %xmm0, flags; %rsi on the path that
 *           realigns the destination to 16 bytes.
 * NOTE(review): presumably matches the C prototype
 *   void android_memset32(uint32_t*, uint32_t, size_t) -- confirm
 *   against the declaring header.
 *
 * Strategy:
 *   - fewer than 16 dwords: jump into a fall-through ladder of 4-byte
 *     stores addressed backwards from the end of the buffer;
 *   - otherwise: align the destination to 4 and then 16 bytes, fill in
 *     128-byte chunks (non-temporal stores when the remaining size
 *     exceeds SHARED_CACHE_SIZE), then finish the last 0..124 bytes via
 *     a second jump table.
 *
 * The remaining byte count in %rdx is an unsigned quantity, so every
 * comparison on it uses the unsigned condition codes (ja/jae/jb).
 */
	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (android_memset32)	// Address in rdi
	shr	$2, %rdx		// Count in rdx (bytes -> dwords)
	movl	%esi, %ecx		// Pattern in ecx

	cmp	$16, %rdx
	jae	L(16dbwordsormore)

L(write_less16dbwords):
	/* rdi = one past the last dword; the ladder stores at negative
	   offsets from it.  rdx (0..15) selects the ladder entry.  */
	lea	(%rdi, %rdx, 4), %rdi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords), %rdx, 4)

	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less16dbwords):
	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
	.popsection

	/* Fall-through ladder: entering at write_Ndbwords performs exactly
	   N dword stores, ending at the last dword of the buffer.  */
	ALIGN (4)
L(write_15dbwords):
	movl	%ecx, -60(%rdi)
L(write_14dbwords):
	movl	%ecx, -56(%rdi)
L(write_13dbwords):
	movl	%ecx, -52(%rdi)
L(write_12dbwords):
	movl	%ecx, -48(%rdi)
L(write_11dbwords):
	movl	%ecx, -44(%rdi)
L(write_10dbwords):
	movl	%ecx, -40(%rdi)
L(write_9dbwords):
	movl	%ecx, -36(%rdi)
L(write_8dbwords):
	movl	%ecx, -32(%rdi)
L(write_7dbwords):
	movl	%ecx, -28(%rdi)
L(write_6dbwords):
	movl	%ecx, -24(%rdi)
L(write_5dbwords):
	movl	%ecx, -20(%rdi)
L(write_4dbwords):
	movl	%ecx, -16(%rdi)
L(write_3dbwords):
	movl	%ecx, -12(%rdi)
L(write_2dbwords):
	movl	%ecx, -8(%rdi)
L(write_1dbwords):
	movl	%ecx, -4(%rdi)
L(write_0dbwords):
	ret

	ALIGN (4)
L(16dbwordsormore):
	test	$3, %edi
	jz	L(aligned4bytes)
	/* Destination is not 4-byte aligned.  Store the first and the last
	   dword unaligned, drop one dword from the count, then step rdi
	   forward a byte at a time to the next 4-byte boundary, rotating
	   the pattern right by 8 bits per byte so the aligned stores keep
	   producing the same byte sequence in memory.  */
	mov	%ecx, (%rdi)
	mov	%ecx, -4(%rdi, %rdx, 4)
	sub	$1, %rdx
	rol	$24, %ecx		// same rotation as ror $8
	add	$1, %rdi
	test	$3, %edi
	jz	L(aligned4bytes)
	ror	$8, %ecx
	add	$1, %rdi
	test	$3, %edi
	jz	L(aligned4bytes)
	ror	$8, %ecx
	add	$1, %rdi
L(aligned4bytes):
	shl	$2, %rdx		// dwords -> bytes

	/* Fill xmm0 with the pattern.  */
	movd	%ecx, %xmm0
	pshufd	$0, %xmm0, %xmm0

	testl	$0xf, %edi
	jz	L(aligned_16)
	/* RDI is not 16 byte aligned.  At least 60 bytes remain here
	   (16 dwords, minus at most the one dropped above), so one
	   unaligned 16-byte store stays inside the buffer.  Then round rdi
	   up to the next 16-byte boundary and shrink rdx by the number of
	   bytes skipped (rsi = old rdi - new rdi, which is negative).  */
	movdqu	%xmm0, (%rdi)
	mov	%rdi, %rsi
	and	$-16, %rdi
	add	$16, %rdi
	sub	%rdi, %rsi
	add	%rsi, %rdx

	/* From here on: rdi is 16-byte aligned, rdx = bytes remaining
	   (a multiple of 4).  */
	ALIGN (4)
L(aligned_16):
	cmp	$128, %rdx
	jae	L(128bytesormore)	// unsigned: rdx is a byte count

L(aligned_16_less128bytes):
	/* rdi = end of buffer, rdx = remaining dwords (0..31).  */
	add	%rdx, %rdi
	shr	$2, %rdx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

	ALIGN (4)
L(128bytesormore):
	/* Large fills use non-temporal stores.  */
	cmp	$SHARED_CACHE_SIZE, %rdx
	ja	L(128bytesormore_nt)	// unsigned: rdx is a byte count

	/* Main loop, unrolled four times: each step writes 128 bytes with
	   aligned stores and leaves as soon as fewer than 128 remain.  */
L(128bytesormore_normal):
	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jb	L(128bytesless_normal)

	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jb	L(128bytesless_normal)

	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jb	L(128bytesless_normal)

	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	/* rdi = end of buffer, rdx = remaining dwords (0..31).  */
	add	%rdx, %rdi
	shr	$2, %rdx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

	/* Non-temporal variant of the 128-byte loop.  */
	ALIGN (4)
L(128bytesormore_nt):
	sub	$128, %rdx
	movntdq	%xmm0, (%rdi)
	movntdq	%xmm0, 0x10(%rdi)
	movntdq	%xmm0, 0x20(%rdi)
	movntdq	%xmm0, 0x30(%rdi)
	movntdq	%xmm0, 0x40(%rdi)
	movntdq	%xmm0, 0x50(%rdi)
	movntdq	%xmm0, 0x60(%rdi)
	movntdq	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jae	L(128bytesormore_nt)

	/* Order the non-temporal stores before the ordinary tail stores.  */
	sfence
	add	%rdx, %rdi
	shr	$2, %rdx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

	/* Tail dispatch: entry i handles 4*i remaining bytes.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.popsection

	/* Tail ladders.  rdi points one past the end of the buffer and the
	   start of the tail is 16-byte aligned, so in each ladder the
	   movdqa targets (rdi - remaining, stepping by 16) are aligned.
	   The four ladders differ in the final 0, 4, 8 or 12 bytes.  */

	/* Remaining size is a multiple of 16.  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%rdi)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%rdi)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%rdi)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%rdi)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%rdi)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%rdi)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%rdi)
L(aligned_16_0bytes):
	ret

	/* Remaining size is 16*k + 4: finish with one dword store.  */
	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%rdi)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%rdi)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%rdi)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%rdi)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%rdi)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%rdi)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%rdi)
L(aligned_16_4bytes):
	movl	%ecx, -4(%rdi)
	ret

	/* Remaining size is 16*k + 8: finish with one 8-byte store.  */
	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%rdi)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%rdi)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%rdi)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%rdi)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%rdi)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%rdi)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%rdi)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%rdi)
	ret

	/* Remaining size is 16*k + 12: finish with 8-byte + 4-byte stores.  */
	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%rdi)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%rdi)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%rdi)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%rdi)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%rdi)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%rdi)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%rdi)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%rdi)
	movl	%ecx, -4(%rdi)
	ret

END (android_memset32)