1 /* Copyright 2013 Google Inc. All Rights Reserved. 2 * 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 17 /* 18 * This "tool" can be used to brute force the XOR bitmask that a memory 19 * controller uses to interleave addresses onto its two channels. To use it, 20 * you need to have a bunch of addresses that are known to go to only one 21 * of the memory channels... easiest way to get these is to run stressapptest on 22 * a machine while holding a soldering iron close to the chips of one channel. 23 * Generate about a thousand failures and extract their physical addresses 24 * from the output. Write them to findmask.inc in a way that forms a valid 25 * definition for the addrs array. Make and run on a big machine. 26 * 27 * The program iterates over all possible bitmasks within the first NUM_BITS, 28 * parallelizing execution over NUM_THREADS. Every integer is masked 29 * onto all supplied addresses, counting the amount of times this results in 30 * an odd or even amount of bits. If all but NOISE addresses fall on one side, 31 * it will print that mask to stdout. Note that the script will always "find" 32 * the mask 0x0, and may also report masks such as 0x100000000 depending on 33 * your test machines memory size... you will need to use your own judgement to 34 * interpret the results. 35 * 36 * As the program might run for a long time, you can send SIGUSR1 to it to 37 * output the last mask that was processed and get a rough idea of the 38 * current progress. 39 */ 40 41 #include <inttypes.h> 42 #include <pthread.h> 43 #include <signal.h> 44 #include <stdint.h> 45 #include <stdlib.h> 46 #include <stdio.h> 47 48 #define NOISE 20 49 #define NUM_BITS 32 50 #define NUM_THREADS 128 // keep this a power of two 51 52 static uint64_t addrs[] = { 53 #include "findmask.inc" 54 }; 55 static uint64_t lastmask; 56 57 __attribute__((optimize(3, "unroll-loops"))) 58 void* thread_func(void* arg) { 59 register uint64_t mask; 60 register uintptr_t num = (uintptr_t)arg; 61 62 for (mask = num; mask < (1ULL << (NUM_BITS + 1)); mask += NUM_THREADS) { 63 register const uint64_t* cur; 64 register int a = 0; 65 register int b = 0; 66 67 for (cur = addrs; (char*)cur < (char*)addrs + sizeof(addrs); cur++) { 68 #ifdef __x86_64__ 69 register uint64_t addr asm("rdx") = *cur & mask; 70 register uint32_t tmp asm("ebx"); 71 72 // Behold: the dark bit counting magic! 73 asm ( 74 // Fold high and low 32 bits onto each other 75 "MOVl %%edx, %%ebx\n\t" 76 "SHRq $32, %%rdx\n\t" 77 "XORl %%ebx, %%edx\n\t" 78 // Fold high and low 16 bits onto each other 79 "MOVl %%edx, %%ebx\n\t" 80 "SHRl $16, %%edx\n\t" 81 "XORw %%bx, %%dx\n\t" 82 // Fold high and low 8 bits onto each other 83 "XORb %%dh, %%dl\n\t" 84 // Invoke ancient 8086 parity flag (only counts lowest byte) 85 "SETnp %%bl\n\t" 86 "SETp %%dl\n\t" 87 // Stupid SET instruction can only affect the lowest byte... 88 "ANDl $1, %%ebx\n\t" 89 "ANDl $1, %%edx\n\t" 90 // Increment either 'a' or 'b' without needing another branch 91 "ADDl %%ebx, %2\n\t" 92 "ADDl %%edx, %1\n\t" 93 : "=b" (tmp), "+r"(a), "+r"(b) : "d"(addr) : "cc"); 94 95 #else // generic processor 96 register uint64_t addr = *cur & mask; 97 register uint32_t low = (uint32_t)addr; 98 register uint32_t high = (uint32_t)(addr >> 32); 99 100 // Takes about twice as long as the version above... take that GCC! 101 __builtin_parity(low) ^ __builtin_parity(high) ? a++ : b++; 102 #endif 103 104 // Early abort: probably still the most valuable optimization in here 105 if (a >= NOISE && b >= NOISE) break; 106 } 107 108 if (a < NOISE) b = a; 109 if (b < NOISE) { 110 printf("Found mask with just %d deviations: 0x%" PRIx64 "\n", b, mask); 111 fflush(stdout); 112 } 113 114 // I'm a little paranoid about performance: don't write to memory too often 115 if (!(mask & 0x7ff)) lastmask = mask; 116 } 117 118 return 0; 119 } 120 121 void signal_handler(int signum) { 122 printf("Received signal... currently evaluating mask 0x%" PRIx64 "!\n", 123 lastmask); 124 fflush(stdout); 125 } 126 127 int main(int argc, char** argv) { 128 uintptr_t i; 129 pthread_t threads[NUM_THREADS]; 130 131 signal(SIGUSR1, signal_handler); 132 133 for (i = 0; i < NUM_THREADS; i++) 134 pthread_create(&threads[i], 0, thread_func, (void*)i); 135 136 for (i = 0; i < NUM_THREADS; i++) 137 pthread_join(threads[i], 0); 138 139 return 0; 140 } 141