Home | History | Annotate | Download | only in src
      1 /* Copyright 2013 Google Inc. All Rights Reserved.
      2  *
      3  * Licensed under the Apache License, Version 2.0 (the "License");
      4  * you may not use this file except in compliance with the License.
      5  * You may obtain a copy of the License at
      6  *
      7  *      http://www.apache.org/licenses/LICENSE-2.0
      8  *
      9  * Unless required by applicable law or agreed to in writing, software
     10  * distributed under the License is distributed on an "AS IS" BASIS,
     11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12  * See the License for the specific language governing permissions and
     13  * limitations under the License.
     14  */
     15 
     16 
     17 /*
     18  * This "tool" can be used to brute force the XOR bitmask that a memory
     19  * controller uses to interleave addresses onto its two channels. To use it,
     20  * you need to have a bunch of addresses that are known to go to only one
     21  * of the memory channels... easiest way to get these is to run stressapptest on
     22  * a machine while holding a soldering iron close to the chips of one channel.
     23  * Generate about a thousand failures and extract their physical addresses
     24  * from the output. Write them to findmask.inc in a way that forms a valid
     25  * definition for the addrs array. Make and run on a big machine.
     26  *
     27  * The program iterates over all possible bitmasks within the first NUM_BITS,
     28  * parallelizing execution over NUM_THREADS. Every integer is masked
     29  * onto all supplied addresses, counting the amount of times this results in
     30  * an odd or even amount of bits. If all but NOISE addresses fall on one side,
     31  * it will print that mask to stdout. Note that the script will always "find"
     32  * the mask 0x0, and may also report masks such as 0x100000000 depending on
     33  * your test machines memory size... you will need to use your own judgement to
     34  * interpret the results.
     35  *
     36  * As the program might run for a long time, you can send SIGUSR1 to it to
     37  * output the last mask that was processed and get a rough idea of the
     38  * current progress.
     39  */
     40 
     41 #include <inttypes.h>
     42 #include <pthread.h>
     43 #include <signal.h>
     44 #include <stdint.h>
     45 #include <stdlib.h>
     46 #include <stdio.h>
     47 
     48 #define NOISE 20
     49 #define NUM_BITS 32
     50 #define NUM_THREADS 128  // keep this a power of two
     51 
     52 static uint64_t addrs[] = {
     53 #include "findmask.inc"
     54 };
     55 static uint64_t lastmask;
     56 
     57 __attribute__((optimize(3, "unroll-loops")))
     58 void* thread_func(void* arg) {
     59   register uint64_t mask;
     60   register uintptr_t num = (uintptr_t)arg;
     61 
     62   for (mask = num; mask < (1ULL << (NUM_BITS + 1)); mask += NUM_THREADS) {
     63     register const uint64_t* cur;
     64     register int a = 0;
     65     register int b = 0;
     66 
     67     for (cur = addrs; (char*)cur < (char*)addrs + sizeof(addrs); cur++) {
     68 #ifdef __x86_64__
     69       register uint64_t addr asm("rdx") = *cur & mask;
     70       register uint32_t tmp asm("ebx");
     71 
     72       // Behold: the dark bit counting magic!
     73       asm (
     74         // Fold high and low 32 bits onto each other
     75         "MOVl %%edx, %%ebx\n\t"
     76         "SHRq $32, %%rdx\n\t"
     77         "XORl %%ebx, %%edx\n\t"
     78         // Fold high and low 16 bits onto each other
     79         "MOVl %%edx, %%ebx\n\t"
     80         "SHRl $16, %%edx\n\t"
     81         "XORw %%bx, %%dx\n\t"
     82         // Fold high and low 8 bits onto each other
     83         "XORb %%dh, %%dl\n\t"
     84         // Invoke ancient 8086 parity flag (only counts lowest byte)
     85         "SETnp %%bl\n\t"
     86         "SETp %%dl\n\t"
     87         // Stupid SET instruction can only affect the lowest byte...
     88         "ANDl $1, %%ebx\n\t"
     89         "ANDl $1, %%edx\n\t"
     90         // Increment either 'a' or 'b' without needing another branch
     91         "ADDl %%ebx, %2\n\t"
     92         "ADDl %%edx, %1\n\t"
     93         : "=b" (tmp), "+r"(a), "+r"(b) : "d"(addr) : "cc");
     94 
     95 #else  // generic processor
     96       register uint64_t addr = *cur & mask;
     97       register uint32_t low = (uint32_t)addr;
     98       register uint32_t high = (uint32_t)(addr >> 32);
     99 
    100       // Takes about twice as long as the version above... take that GCC!
    101       __builtin_parity(low) ^ __builtin_parity(high) ? a++ : b++;
    102 #endif
    103 
    104       // Early abort: probably still the most valuable optimization in here
    105       if (a >= NOISE && b >= NOISE) break;
    106     }
    107 
    108     if (a < NOISE) b = a;
    109     if (b < NOISE) {
    110       printf("Found mask with just %d deviations: 0x%" PRIx64 "\n", b, mask);
    111       fflush(stdout);
    112     }
    113 
    114     // I'm a little paranoid about performance: don't write to memory too often
    115     if (!(mask & 0x7ff)) lastmask = mask;
    116   }
    117 
    118   return 0;
    119 }
    120 
    121 void signal_handler(int signum) {
    122   printf("Received signal... currently evaluating mask 0x%" PRIx64 "!\n",
    123          lastmask);
    124   fflush(stdout);
    125 }
    126 
    127 int main(int argc, char** argv) {
    128   uintptr_t i;
    129   pthread_t threads[NUM_THREADS];
    130 
    131   signal(SIGUSR1, signal_handler);
    132 
    133   for (i = 0; i < NUM_THREADS; i++)
    134     pthread_create(&threads[i], 0, thread_func, (void*)i);
    135 
    136   for (i = 0; i < NUM_THREADS; i++)
    137     pthread_join(threads[i], 0);
    138 
    139   return 0;
    140 }
    141