Home | History | Annotate | Download | only in src
      1 // Copyright 2008 Google Inc. All Rights Reserved.
      2 
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 
      7 //      http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "adler32memcpy.h"
     16 
     17 // We are using (a modified form of) adler-32 checksum algorithm instead
     18 // of CRC since adler-32 is faster than CRC.
     19 // (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/)
     20 // This form of adler is bit modified, instead of treating the data in
     21 // units of bytes, 32-bit data is taken as a unit and two 64-bit
     22 // checksums are done (we could have one checksum but two checksums
     23 // make the code run faster).
     24 
     25 // Adler-32 implementation:
     26 //   Data is treated as 1-byte numbers and,
     27 //   there are two 16-bit numbers a and b
     28 //   Initialize a with 1 and b with 0.
     29 //   for each data unit 'd'
     30 //      a += d
     31 //      b += a
     32 //   checksum = a<<16 + b
     33 //   This sum should never overflow.
     34 //
     35 // Adler-64+64 implementation:
     36 //   (applied in this code)
     37 //   Data is treated as 32-bit numbers and whole data is separated into two
     38 //   streams, and hence the two checksums a1, a2, b1 and b2.
     39 //   Initialize a1 and a2 with 1, b1 and b2 with 0
     40 //   add first dataunit to a1
     41 //   add a1 to b1
     42 //   add second dataunit to a1
     43 //   add a1 to b1
     44 //   add third dataunit to a2
     45 //   add a2 to b2
     46 //   add fourth dataunit to a2
     47 //   add a2 to b2
     48 //   ...
     49 //   repeat the sequence back for next 4 dataunits
     50 //
     51 //   variable A = XMM6 and variable B = XMM7.
     52 //   (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7)
     53 
     54 // Assumptions
     55 // 1. size_in_bytes is a multiple of 16.
     56 // 2. srcmem and dstmem are 16 byte aligned.
     57 // 3. size_in_bytes is less than 2^19 bytes.
     58 
     59 // Assumption 3 ensures that there is no overflow when numbers are being
     60 // added (we can remove this assumption by doing modulus with a prime
     61 // number when it is just about to overflow but that would be a very costly
     62 // exercise)
     63 
     64 // Returns true if the checksums are equal.
     65 bool AdlerChecksum::Equals(const AdlerChecksum &other) const {
     66   return ( (a1_ == other.a1_) && (a2_ == other.a2_) &&
     67            (b1_ == other.b1_) && (b2_ == other.b2_) );
     68 }
     69 
     70 // Returns string representation of the Adler checksum.
     71 string AdlerChecksum::ToHexString() const {
     72   char buffer[128];
     73   snprintf(buffer, sizeof(buffer), "%llx%llx%llx%llx", a1_, a2_, b1_, b2_);
     74   return string(buffer);
     75 }
     76 
     77 // Sets components of the Adler checksum.
     78 void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) {
     79   a1_ = a1;
     80   a2_ = a2;
     81   b1_ = b1;
     82   b2_ = b2;
     83 }
     84 
     85 // Calculates Adler checksum for supplied data.
     86 bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes,
     87                             AdlerChecksum *checksum) {
     88   // Use this data wrapper to access memory with 64bit read/write.
     89   datacast_t data;
     90   unsigned int count = size_in_bytes / sizeof(data);
     91 
     92   if (count > (1U) << 19) {
     93     // Size is too large, must be strictly less than 512 KB.
     94     return false;
     95   }
     96 
     97   uint64 a1 = 1;
     98   uint64 a2 = 1;
     99   uint64 b1 = 0;
    100   uint64 b2 = 0;
    101 
    102   unsigned int i = 0;
    103   while (i < count) {
    104     // Process 64 bits at a time.
    105     data.l64 = data64[i];
    106     a1 = a1 + data.l32.l;
    107     b1 = b1 + a1;
    108     a1 = a1 + data.l32.h;
    109     b1 = b1 + a1;
    110     i++;
    111 
    112     data.l64 = data64[i];
    113     a2 = a2 + data.l32.l;
    114     b2 = b2 + a2;
    115     a2 = a2 + data.l32.h;
    116     b2 = b2 + a2;
    117     i++;
    118   }
    119   checksum->Set(a1, a2, b1, b2);
    120   return true;
    121 }
    122 
    123 // C implementation of Adler memory copy.
    124 bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64,
    125                   unsigned int size_in_bytes, AdlerChecksum *checksum) {
    126   // Use this data wrapper to access memory with 64bit read/write.
    127   datacast_t data;
    128   unsigned int count = size_in_bytes / sizeof(data);
    129 
    130   if (count > ((1U) << 19)) {
    131     // Size is too large, must be strictly less than 512 KB.
    132     return false;
    133   }
    134 
    135   uint64 a1 = 1;
    136   uint64 a2 = 1;
    137   uint64 b1 = 0;
    138   uint64 b2 = 0;
    139 
    140   unsigned int i = 0;
    141   while (i < count) {
    142     // Process 64 bits at a time.
    143     data.l64 = srcmem64[i];
    144     a1 = a1 + data.l32.l;
    145     b1 = b1 + a1;
    146     a1 = a1 + data.l32.h;
    147     b1 = b1 + a1;
    148     dstmem64[i] = data.l64;
    149     i++;
    150 
    151     data.l64 = srcmem64[i];
    152     a2 = a2 + data.l32.l;
    153     b2 = b2 + a2;
    154     a2 = a2 + data.l32.h;
    155     b2 = b2 + a2;
    156     dstmem64[i] = data.l64;
    157     i++;
    158   }
    159   checksum->Set(a1, a2, b1, b2);
    160   return true;
    161 }
    162 
    163 // C implementation of Adler memory copy with some float point ops,
    164 // attempting to warm up the CPU.
    165 bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64,
    166                       unsigned int size_in_bytes, AdlerChecksum *checksum) {
    167   // Use this data wrapper to access memory with 64bit read/write.
    168   datacast_t data;
    169   unsigned int count = size_in_bytes / sizeof(data);
    170 
    171   if (count > ((1U) << 19)) {
    172     // Size is too large, must be strictly less than 512 KB.
    173     return false;
    174   }
    175 
    176   uint64 a1 = 1;
    177   uint64 a2 = 1;
    178   uint64 b1 = 0;
    179   uint64 b2 = 0;
    180 
    181   double a = 2.0 * static_cast<double>(srcmem64[0]);
    182   double b = 5.0 * static_cast<double>(srcmem64[0]);
    183   double c = 7.0 * static_cast<double>(srcmem64[0]);
    184   double d = 9.0 * static_cast<double>(srcmem64[0]);
    185 
    186   unsigned int i = 0;
    187   while (i < count) {
    188     // Process 64 bits at a time.
    189     data.l64 = srcmem64[i];
    190     a1 = a1 + data.l32.l;
    191     b1 = b1 + a1;
    192     a1 = a1 + data.l32.h;
    193     b1 = b1 + a1;
    194     dstmem64[i] = data.l64;
    195     i++;
    196 
    197     // Warm cpu up.
    198     a = a * b;
    199     b = b + c;
    200 
    201     data.l64 = srcmem64[i];
    202     a2 = a2 + data.l32.l;
    203     b2 = b2 + a2;
    204     a2 = a2 + data.l32.h;
    205     b2 = b2 + a2;
    206     dstmem64[i] = data.l64;
    207     i++;
    208 
    209     // Warm cpu up.
    210     c = c * d;
    211     d = d + d;
    212   }
    213 
    214   // Warm cpu up.
    215   d = a + b + c + d;
    216   if (d == 1.0) {
    217     // Reference the result so that it can't be discarded by the compiler.
    218     printf("Log: This will probably never happen.\n");
    219   }
    220 
    221   checksum->Set(a1, a2, b1, b2);
    222   return true;
    223 }
    224 
    225 // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy.
    226 bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
    227                     unsigned int size_in_bytes, AdlerChecksum *checksum) {
    228 // Use assembly implementation where supported.
    229 #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
    230 
    231 // Pull a bit of tricky preprocessing to make the inline asm both
    232 // 32 bit and 64 bit.
    233 #ifdef STRESSAPPTEST_CPU_I686  // Instead of coding both, x86...
    234 #define rAX "%%eax"
    235 #define rCX "%%ecx"
    236 #define rDX "%%edx"
    237 #define rBX "%%ebx"
    238 #define rSP "%%esp"
    239 #define rBP "%%ebp"
    240 #define rSI "%%esi"
    241 #define rDI "%%edi"
    242 #endif
    243 
    244 #ifdef STRESSAPPTEST_CPU_X86_64  // ...and x64, we use rXX macros.
    245 #define rAX "%%rax"
    246 #define rCX "%%rcx"
    247 #define rDX "%%rdx"
    248 #define rBX "%%rbx"
    249 #define rSP "%%rsp"
    250 #define rBP "%%rbp"
    251 #define rSI "%%rsi"
    252 #define rDI "%%rdi"
    253 #endif
    254 
    255   // Elements 0 to 3 are used for holding checksum terms a1, a2,
    256   // b1, b2 respectively. These elements are filled by asm code.
    257   // Elements 4 and 5 are used by asm code to for ANDing MMX data and removing
    258   // 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are
    259   // setting word index 0 and word index 2 to zero).
    260   // Element 6 and 7 are used for setting a1 and a2 to 1.
    261   volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
    262       {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
    263 
    264   if ((size_in_bytes >> 19) > 0) {
    265     // Size is too large. Must be less than 2^19 bytes = 512 KB.
    266     return false;
    267   }
    268 
    269   // Number of 32-bit words which are not added to a1/a2 in the main loop.
    270   uint32 remaining_words = (size_in_bytes % 48) / 4;
    271 
    272   // Since we are moving 48 bytes at a time number of iterations = total size/48
    273   // is value of counter.
    274   uint32 num_of_48_byte_units = size_in_bytes / 48;
    275 
    276   asm volatile (
    277       // Source address is in ESI (extended source index)
    278       // destination is in EDI (extended destination index)
    279       // and counter is already in ECX (extended counter
    280       // index).
    281       "cmp  $0, " rCX ";"   // Compare counter to zero.
    282       "jz END;"
    283 
    284       // XMM6 is initialized with 1 and XMM7 with 0.
    285       "prefetchnta  0(" rSI ");"
    286       "prefetchnta 64(" rSI ");"
    287       "movdqu   48(" rAX "), %%xmm6;"
    288       "xorps      %%xmm7, %%xmm7;"
    289 
    290       // Start of the loop which copies 48 bytes from source to dst each time.
    291       "TOP:\n"
    292 
    293       // Make 6 moves each of 16 bytes from srcmem to XMM registers.
    294       // We are using 2 words out of 4 words in each XMM register,
    295       // word index 0 and word index 2
    296       "movdqa   0(" rSI "), %%xmm0;"
    297       "movdqu   4(" rSI "), %%xmm1;"  // Be careful to use unaligned move here.
    298       "movdqa  16(" rSI "), %%xmm2;"
    299       "movdqu  20(" rSI "), %%xmm3;"
    300       "movdqa  32(" rSI "), %%xmm4;"
    301       "movdqu  36(" rSI "), %%xmm5;"
    302 
    303       // Move 3 * 16 bytes from XMM registers to dstmem.
    304       // Note: this copy must be performed before pinsrw instructions since
    305       // they will modify the XMM registers.
    306       "movntdq %%xmm0,  0(" rDI ");"
    307       "movntdq %%xmm2, 16(" rDI ");"
    308       "movntdq %%xmm4, 32(" rDI ");"
    309 
    310       // Sets the word[1] and word[3] of XMM0 to XMM5 to zero.
    311       "andps 32(" rAX "), %%xmm0;"
    312       "andps 32(" rAX "), %%xmm1;"
    313       "andps 32(" rAX "), %%xmm2;"
    314       "andps 32(" rAX "), %%xmm3;"
    315       "andps 32(" rAX "), %%xmm4;"
    316       "andps 32(" rAX "), %%xmm5;"
    317 
    318       // Add XMM0 to XMM6 and then add XMM6 to XMM7.
    319       // Repeat this for XMM1, ..., XMM5.
    320       // Overflow(for XMM7) can occur only if there are more
    321       // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
    322       // if size_in_bytes > 2^19 than overflow occurs.
    323       "paddq %%xmm0, %%xmm6;"
    324       "paddq %%xmm6, %%xmm7;"
    325       "paddq %%xmm1, %%xmm6;"
    326       "paddq %%xmm6, %%xmm7;"
    327       "paddq %%xmm2, %%xmm6;"
    328       "paddq %%xmm6, %%xmm7;"
    329       "paddq %%xmm3, %%xmm6;"
    330       "paddq %%xmm6, %%xmm7;"
    331       "paddq %%xmm4, %%xmm6;"
    332       "paddq %%xmm6, %%xmm7;"
    333       "paddq %%xmm5, %%xmm6;"
    334       "paddq %%xmm6, %%xmm7;"
    335 
    336       // Increment ESI and EDI by 48 bytes and decrement counter by 1.
    337       "add $48, " rSI ";"
    338       "add $48, " rDI ";"
    339       "prefetchnta  0(" rSI ");"
    340       "prefetchnta 64(" rSI ");"
    341       "dec " rCX ";"
    342       "jnz TOP;"
    343 
    344       // Now only remaining_words 32-bit words are left.
    345       // make a loop, add first two words to a1 and next two to a2 (just like
    346       // above loop, the only extra thing we are doing is rechecking
    347       // rDX (=remaining_words) everytime we add a number to a1/a2.
    348       "REM_IS_STILL_NOT_ZERO:\n"
    349       // Unless remaining_words becomes less than 4 words(16 bytes)
    350       // there is not much issue and remaining_words will always
    351       // be a multiple of four by assumption.
    352       "cmp $4, " rDX ";"
    353       // In case for some weird reasons if remaining_words becomes
    354       // less than 4 but not zero then also break the code and go off to END.
    355       "jl END;"
    356       // Otherwise just go on and copy data in chunks of 4-words at a time till
    357       // whole data (<48 bytes) is copied.
    358       "movdqa  0(" rSI "), %%xmm0;"    // Copy next 4-words to XMM0 and to XMM1.
    359 
    360       "movdqa  0(" rSI "), %%xmm5;"    // Accomplish movdqu 4(%rSI) without
    361       "pshufd $0x39, %%xmm5, %%xmm1;"  // indexing off memory boundary.
    362 
    363       "movntdq %%xmm0,  0(" rDI ");"   // Copy 4-words to destination.
    364       "andps  32(" rAX "), %%xmm0;"
    365       "andps  32(" rAX "), %%xmm1;"
    366       "paddq     %%xmm0, %%xmm6;"
    367       "paddq     %%xmm6, %%xmm7;"
    368       "paddq     %%xmm1, %%xmm6;"
    369       "paddq     %%xmm6, %%xmm7;"
    370       "add $16, " rSI ";"
    371       "add $16, " rDI ";"
    372       "sub $4, " rDX ";"
    373       // Decrement %rDX by 4 since %rDX is number of 32-bit
    374       // words left after considering all 48-byte units.
    375       "jmp REM_IS_STILL_NOT_ZERO;"
    376 
    377       "END:\n"
    378       // Report checksum values A and B (both right now are two concatenated
    379       // 64 bit numbers and have to be converted to 64 bit numbers)
    380       // seems like Adler128 (since size of each part is 4 byte rather than
    381       // 1 byte).
    382       "movdqa %%xmm6,   0(" rAX ");"
    383       "movdqa %%xmm7,  16(" rAX ");"
    384       "sfence;"
    385 
    386       // No output registers.
    387       :
    388       // Input registers.
    389       : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr),
    390         "c" (num_of_48_byte_units), "d" (remaining_words)
    391   );  // asm.
    392 
    393   if (checksum != NULL) {
    394     checksum->Set(checksum_arr[0], checksum_arr[1],
    395                   checksum_arr[2], checksum_arr[3]);
    396   }
    397 
    398   // Everything went fine, so return true (this does not mean
    399   // that there is no problem with memory this just mean that data was copied
    400   // from src to dst and checksum was calculated successfully).
    401   return true;
    402 #else
    403   // Fall back to C implementation for anything else.
    404   return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
    405 #endif
    406 }
    407