Home | History | Annotate | Download | only in life_simd
      1 // Copyright 2014 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <assert.h>
      6 #include <math.h>
      7 #include <stdint.h>
      8 #include <stdio.h>
      9 #include <stdlib.h>
     10 #include <string.h>
     11 #include <sys/time.h>
     12 #include <unistd.h>
     13 
     14 #include <ppapi/c/ppb_input_event.h>
     15 #include <ppapi/cpp/fullscreen.h>
     16 #include <ppapi/cpp/input_event.h>
     17 #include <ppapi/cpp/var.h>
     18 #include <ppapi/cpp/var_array.h>
     19 #include <ppapi/cpp/var_array_buffer.h>
     20 #include <ppapi/cpp/var_dictionary.h>
     21 
     22 #include "ppapi_simple/ps.h"
     23 #include "ppapi_simple/ps_context_2d.h"
     24 #include "ppapi_simple/ps_event.h"
     25 #include "ppapi_simple/ps_instance.h"
     26 #include "ppapi_simple/ps_interface.h"
     27 #include "ppapi_simple/ps_main.h"
     28 #include "sdk_util/macros.h"
     29 #include "sdk_util/thread_pool.h"
     30 
     31 using namespace sdk_util;  // For sdk_util::ThreadPool
     32 
     33 namespace {
     34 
     35 #define INLINE inline __attribute__((always_inline))
     36 
     37 // BGRA helper macro, for constructing a pixel for a BGRA buffer.
     38 #define MakeBGRA(b, g, r, a)  \
     39   (((a) << 24) | ((r) << 16) | ((g) << 8) | (b))
     40 
     41 const int kFramesToBenchmark = 100;
     42 const int kCellAlignment = 0x10;
     43 
     44 // 128 bit vector types
     45 typedef uint8_t u8x16_t __attribute__ ((vector_size (16)));
     46 
     47 // Helper function to broadcast x across 16 element vector.
     48 INLINE u8x16_t broadcast(uint8_t x) {
     49   u8x16_t r = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
     50   return r;
     51 }
     52 
     53 // Convert a count value into a live (green) or dead color value.
     54 const uint32_t kNeighborColors[] = {
     55     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     56     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     57     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     58     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     59     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     60     MakeBGRA(0x00, 0xFF, 0x00, 0xFF),
     61     MakeBGRA(0x00, 0xFF, 0x00, 0xFF),
     62     MakeBGRA(0x00, 0xFF, 0x00, 0xFF),
     63     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     64     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     65     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     66     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     67     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     68     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     69     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     70     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     71     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     72     MakeBGRA(0x00, 0x00, 0x00, 0xFF),
     73 };
     74 
     75 // These represent the new health value of a cell based on its neighboring
     76 // values.  The health is binary: either alive or dead.
     77 const uint8_t kIsAlive[] = {
     78       0, 0, 0, 0, 0, 1, 1, 1, 0,
     79       0, 0, 0, 0, 0, 0, 0, 0, 0
     80 };
     81 
     82 // Timer helper for benchmarking.  Returns seconds elapsed since program start,
     83 // as a double.
     84 timeval start_tv;
     85 int start_tv_retv = gettimeofday(&start_tv, NULL);
     86 
     87 inline double getseconds() {
     88   const double usec_to_sec = 0.000001;
     89   timeval tv;
     90   if ((0 == start_tv_retv) && (0 == gettimeofday(&tv, NULL)))
     91     return (tv.tv_sec - start_tv.tv_sec) + tv.tv_usec * usec_to_sec;
     92   return 0.0;
     93 }
     94 } // namespace
     95 
     96 
     97 class Life {
     98  public:
     99   Life();
    100   virtual ~Life();
    101   // Runs a tick of the simulations, update 2D output.
    102   void Update();
    103   // Handle event from user, or message from JS.
    104   void HandleEvent(PSEvent* ps_event);
    105  private:
    106   void UpdateContext();
    107   void DrawCell(int32_t x, int32_t y);
    108   void ProcessTouchEvent(const pp::TouchInputEvent& touches);
    109   void PostUpdateMessage(const char* message, double value);
    110   void StartBenchmark();
    111   void EndBenchmark();
    112   void Stir();
    113   void wSimulate(int y);
    114   static void wSimulateEntry(int y, void* data);
    115   void Simulate();
    116 
    117   bool simd_;
    118   bool multithread_;
    119   bool benchmarking_;
    120   int benchmark_frame_counter_;
    121   double bench_start_time_;
    122   double bench_end_time_;
    123   uint8_t* cell_in_;
    124   uint8_t* cell_out_;
    125   int32_t cell_stride_;
    126   int32_t width_;
    127   int32_t height_;
    128   PSContext2D_t* ps_context_;
    129   ThreadPool* workers_;
    130 };
    131 
    132 Life::Life() :
    133     simd_(true),
    134     multithread_(true),
    135     benchmarking_(false),
    136     benchmark_frame_counter_(0),
    137     bench_start_time_(0.0),
    138     bench_end_time_(0.0),
    139     cell_in_(NULL),
    140     cell_out_(NULL),
    141     cell_stride_(0),
    142     width_(0),
    143     height_(0) {
    144   ps_context_ = PSContext2DAllocate(PP_IMAGEDATAFORMAT_BGRA_PREMUL);
    145   // Query system for number of processors via sysconf()
    146   int num_threads = sysconf(_SC_NPROCESSORS_ONLN);
    147   if (num_threads < 2)
    148     num_threads = 2;
    149   workers_ = new ThreadPool(num_threads);
    150   PSEventSetFilter(PSE_ALL);
    151 }
    152 
    153 Life::~Life() {
    154   delete workers_;
    155   PSContext2DFree(ps_context_);
    156 }
    157 
    158 void Life::UpdateContext() {
    159   cell_stride_ = (ps_context_->width + kCellAlignment - 1) &
    160       ~(kCellAlignment - 1);
    161   size_t size = cell_stride_ * ps_context_->height;
    162 
    163   if (ps_context_->width != width_ || ps_context_->height != height_) {
    164     free(cell_in_);
    165     free(cell_out_);
    166 
    167     // Create a new context
    168     void* in_buffer = NULL;
    169     void* out_buffer = NULL;
    170     // alloc buffers aligned on 16 bytes
    171     posix_memalign(&in_buffer, kCellAlignment, size);
    172     posix_memalign(&out_buffer, kCellAlignment, size);
    173     cell_in_ = (uint8_t*) in_buffer;
    174     cell_out_ = (uint8_t*) out_buffer;
    175 
    176     memset(cell_out_, 0, size);
    177     for (size_t index = 0; index < size; index++) {
    178       cell_in_[index] = rand() & 1;
    179     }
    180     width_ = ps_context_->width;
    181     height_ = ps_context_->height;
    182   }
    183 }
    184 
    185 void Life::DrawCell(int32_t x, int32_t y) {
    186   if (!cell_in_) return;
    187   if (x > 0 && x < ps_context_->width - 1 &&
    188       y > 0 && y < ps_context_->height - 1) {
    189     cell_in_[x - 1 + y * cell_stride_] = 1;
    190     cell_in_[x + 1 + y * cell_stride_] = 1;
    191     cell_in_[x + (y - 1) * cell_stride_] = 1;
    192     cell_in_[x + (y + 1) * cell_stride_] = 1;
    193   }
    194 }
    195 
    196 void Life::ProcessTouchEvent(const pp::TouchInputEvent& touches) {
    197   uint32_t count = touches.GetTouchCount(PP_TOUCHLIST_TYPE_TOUCHES);
    198   uint32_t i, j;
    199   for (i = 0; i < count; i++) {
    200     pp::TouchPoint touch =
    201         touches.GetTouchByIndex(PP_TOUCHLIST_TYPE_TOUCHES, i);
    202     int radius = (int)(touch.radii().x());
    203     int x = (int)(touch.position().x());
    204     int y = (int)(touch.position().y());
    205     // num = 1/100th the area of touch point
    206     uint32_t num = (uint32_t)(M_PI * radius * radius / 100.0f);
    207     for (j = 0; j < num; j++) {
    208       int dx = rand() % (radius * 2) - radius;
    209       int dy = rand() % (radius * 2) - radius;
    210       // only plot random cells within the touch area
    211       if (dx * dx + dy * dy <= radius * radius)
    212         DrawCell(x + dx, y + dy);
    213     }
    214   }
    215 }
    216 
    217 void Life::PostUpdateMessage(const char* message_name, double value) {
    218   pp::VarDictionary message;
    219   message.Set("message", message_name);
    220   message.Set("value", value);
    221   PSInterfaceMessaging()->PostMessage(PSGetInstanceId(), message.pp_var());
    222 }
    223 
    224 void Life::StartBenchmark() {
    225   printf("Running benchmark... (SIMD: %s, multi-threading: %s, size: %dx%d)\n",
    226       simd_ ? "enabled" : "disabled",
    227       multithread_ ? "enabled" : "disabled",
    228       ps_context_->width,
    229       ps_context_->height);
    230   benchmarking_ = true;
    231   bench_start_time_ = getseconds();
    232   benchmark_frame_counter_ = kFramesToBenchmark;
    233 }
    234 
    235 void Life::EndBenchmark() {
    236   double total_time;
    237   bench_end_time_ = getseconds();
    238   benchmarking_ = false;
    239   total_time = bench_end_time_ - bench_start_time_;
    240   printf("Finished - benchmark took %f seconds\n", total_time);
    241   // Send benchmark result to JS.
    242   PostUpdateMessage("benchmark_result", total_time);
    243 }
    244 
    245 void Life::HandleEvent(PSEvent* ps_event) {
    246   // Give the 2D context a chance to process the event.
    247   if (0 != PSContext2DHandleEvent(ps_context_, ps_event)) {
    248     UpdateContext();
    249     return;
    250   }
    251 
    252   switch(ps_event->type) {
    253 
    254     case PSE_INSTANCE_HANDLEINPUT: {
    255       pp::InputEvent event(ps_event->as_resource);
    256 
    257       switch(event.GetType()) {
    258         case PP_INPUTEVENT_TYPE_MOUSEDOWN:
    259         case PP_INPUTEVENT_TYPE_MOUSEMOVE: {
    260           pp::MouseInputEvent mouse = pp::MouseInputEvent(event);
    261           // If the button is down, draw
    262           if (mouse.GetModifiers() & PP_INPUTEVENT_MODIFIER_LEFTBUTTONDOWN) {
    263             PP_Point location = mouse.GetPosition();
    264             DrawCell(location.x, location.y);
    265           }
    266           break;
    267         }
    268 
    269         case PP_INPUTEVENT_TYPE_TOUCHSTART:
    270         case PP_INPUTEVENT_TYPE_TOUCHMOVE: {
    271           pp::TouchInputEvent touches = pp::TouchInputEvent(event);
    272           ProcessTouchEvent(touches);
    273           break;
    274         }
    275 
    276         case PP_INPUTEVENT_TYPE_KEYDOWN: {
    277           pp::Fullscreen fullscreen(PSInstance::GetInstance());
    278           bool isFullscreen = fullscreen.IsFullscreen();
    279           fullscreen.SetFullscreen(!isFullscreen);
    280           break;
    281         }
    282 
    283         default:
    284           break;
    285       }
    286       break;  // case PSE_INSTANCE_HANDLEINPUT
    287     }
    288 
    289     case PSE_INSTANCE_HANDLEMESSAGE: {
    290       // Convert Pepper Simple message to PPAPI C++ vars
    291       pp::Var var(ps_event->as_var);
    292       if (var.is_dictionary()) {
    293         pp::VarDictionary dictionary(var);
    294         std::string message = dictionary.Get("message").AsString();
    295         if (message == "run_benchmark" && !benchmarking_) {
    296           StartBenchmark();
    297         } else if (message == "set_simd") {
    298           simd_ = dictionary.Get("value").AsBool();
    299         } else if (message == "set_threading") {
    300           multithread_ = dictionary.Get("value").AsBool();
    301         }
    302       }
    303       break;  // case PSE_INSTANCE_HANDLEMESSAGE
    304     }
    305 
    306     default:
    307       break;
    308   }
    309 }
    310 
    311 void Life::Stir() {
    312   int32_t width = ps_context_->width;
    313   int32_t height = ps_context_->height;
    314   int32_t stride = cell_stride_;
    315   int32_t i;
    316   if (cell_in_ == NULL || cell_out_ == NULL)
    317     return;
    318 
    319   for (i = 0; i < width; ++i) {
    320     cell_in_[i] = rand() & 1;
    321     cell_in_[i + (height - 1) * stride] = rand() & 1;
    322   }
    323   for (i = 0; i < height; ++i) {
    324     cell_in_[i * stride] = rand() & 1;
    325     cell_in_[i * stride + (width - 1)] = rand() & 1;
    326   }
    327 }
    328 
    329 void Life::wSimulate(int y) {
    330   // Don't run simulation on top and bottom borders
    331   if (y < 1 || y >= ps_context_->height - 1)
    332     return;
    333 
    334   // Do neighbor summation; apply rules, output pixel color. Note that a 1 cell
    335   // wide perimeter is excluded from the simulation update; only cells from
    336   // x = 1 to x < width - 1 and y = 1 to y < height - 1 are updated.
    337   uint8_t *src0 = (cell_in_ + (y - 1) * cell_stride_);
    338   uint8_t *src1 = src0 + cell_stride_;
    339   uint8_t *src2 = src1 + cell_stride_;
    340   uint8_t *dst = (cell_out_ + y * cell_stride_) + 1;
    341   uint32_t *pixels = static_cast<uint32_t *>(ps_context_->data);
    342   uint32_t *pixel_line = // static_cast<uint32_t*>
    343       (pixels + y * ps_context_->stride / sizeof(uint32_t));
    344   int32_t x = 1;
    345 
    346   if (simd_) {
    347     const u8x16_t kOne = broadcast(1);
    348     const u8x16_t kFour = broadcast(4);
    349     const u8x16_t kEight = broadcast(8);
    350     const u8x16_t kZero255 = {0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    351 
    352     // Prime the src
    353     u8x16_t src00 = *reinterpret_cast<u8x16_t*>(&src0[0]);
    354     u8x16_t src01 = *reinterpret_cast<u8x16_t*>(&src0[16]);
    355     u8x16_t src10 = *reinterpret_cast<u8x16_t*>(&src1[0]);
    356     u8x16_t src11 = *reinterpret_cast<u8x16_t*>(&src1[16]);
    357     u8x16_t src20 = *reinterpret_cast<u8x16_t*>(&src2[0]);
    358     u8x16_t src21 = *reinterpret_cast<u8x16_t*>(&src2[16]);
    359 
    360     // This inner loop is SIMD - each loop iteration will process 16 cells.
    361     for (; (x + 15) < (ps_context_->width - 1); x += 16) {
    362 
    363       // Construct jittered source temps, using __builtin_shufflevector(..) to
    364       // extract a shifted 16 element vector from the 32 element concatenation
    365       // of two source vectors.
    366       u8x16_t src0j0 = src00;
    367       u8x16_t src0j1 = __builtin_shufflevector(src00, src01,
    368           1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    369       u8x16_t src0j2 = __builtin_shufflevector(src00, src01,
    370           2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
    371       u8x16_t src1j0 = src10;
    372       u8x16_t src1j1 = __builtin_shufflevector(src10, src11,
    373           1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    374       u8x16_t src1j2 = __builtin_shufflevector(src10, src11,
    375           2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
    376       u8x16_t src2j0 = src20;
    377       u8x16_t src2j1 = __builtin_shufflevector(src20, src21,
    378           1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    379       u8x16_t src2j2 = __builtin_shufflevector(src20, src21,
    380           2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
    381 
    382       // Sum the jittered sources to construct neighbor count.
    383       u8x16_t count = src0j0 + src0j1 +  src0j2 +
    384                       src1j0 +        +  src1j2 +
    385                       src2j0 + src2j1 +  src2j2;
    386       // Add the center cell.
    387       count = count + count + src1j1;
    388       // If count > 4 and < 8, center cell will be alive in the next frame.
    389       u8x16_t alive1 = count > kFour;
    390       u8x16_t alive2 = count < kEight;
    391       // Intersect the two comparisons from above.
    392       u8x16_t alive = alive1 & alive2;
    393 
    394       // At this point, alive[x] will be one of two values:
    395       //   0x00 for a dead cell
    396       //   0xFF for an alive cell.
    397       //
    398       // Next, convert alive cells to green pixel color.
    399       // Use __builtin_shufflevector(..) to construct output pixels from
    400       // concantination of alive vector and kZero255 const vector.
    401       //   Indices 0..15 select the 16 cells from alive vector.
    402       //   Index 16 is zero constant from kZero255 constant vector.
    403       //   Index 17 is 255 constant from kZero255 constant vector.
    404       //   Output pixel color values are in BGRABGRABGRABGRA order.
    405       // Since each pixel needs 4 bytes of color information, 16 cells will
    406       // need to expand to 4 seperate 16 byte pixel splats.
    407       u8x16_t pixel0_3 = __builtin_shufflevector(alive, kZero255,
    408         16, 0, 16, 17, 16, 1, 16, 17, 16, 2, 16, 17, 16, 3, 16, 17);
    409       u8x16_t pixel4_7 = __builtin_shufflevector(alive, kZero255,
    410         16, 4, 16, 17, 16, 5, 16, 17, 16, 6, 16, 17, 16, 7, 16, 17);
    411       u8x16_t pixel8_11 = __builtin_shufflevector(alive, kZero255,
    412         16, 8, 16, 17, 16, 9, 16, 17, 16, 10, 16, 17, 16, 11, 16, 17);
    413       u8x16_t pixel12_15 = __builtin_shufflevector(alive, kZero255,
    414         16, 12, 16, 17, 16, 13, 16, 17, 16, 14, 16, 17, 16, 15, 16, 17);
    415 
    416       // Write 16 pixels to output pixel buffer.
    417       *reinterpret_cast<u8x16_t*>(pixel_line + 0) = pixel0_3;
    418       *reinterpret_cast<u8x16_t*>(pixel_line + 4) = pixel4_7;
    419       *reinterpret_cast<u8x16_t*>(pixel_line + 8) = pixel8_11;
    420       *reinterpret_cast<u8x16_t*>(pixel_line + 12) = pixel12_15;
    421 
    422       // Convert alive mask to 1 or 0 and store in destination cell array.
    423       *reinterpret_cast<u8x16_t*>(dst) = alive & kOne;
    424 
    425       // Increment pointers.
    426       pixel_line += 16;
    427       dst += 16;
    428       src0 += 16;
    429       src1 += 16;
    430       src2 += 16;
    431 
    432       // Shift source over by 16 cells and read the next 16 cells.
    433       src00 = src01;
    434       src01 = *reinterpret_cast<u8x16_t*>(&src0[16]);
    435       src10 = src11;
    436       src11 = *reinterpret_cast<u8x16_t*>(&src1[16]);
    437       src20 = src21;
    438       src21 = *reinterpret_cast<u8x16_t*>(&src2[16]);
    439     }
    440   }
    441 
    442   // The SIMD loop above does 16 cells at a time.  The loop below is the
    443   // regular version which processes one cell at a time.  It is used to
    444   // finish the remainder of the scanline not handled by the SIMD loop.
    445   for (; x < (ps_context_->width - 1); ++x) {
    446     // Sum the jittered sources to construct neighbor count.
    447     int count = src0[0] + src0[1] + src0[2] +
    448                 src1[0] +         + src1[2] +
    449                 src2[0] + src2[1] + src2[2];
    450     // Add the center cell.
    451     count = count + count + src1[1];
    452     // Use table lookup indexed by count to determine pixel & alive state.
    453     uint32_t color = kNeighborColors[count];
    454     *pixel_line++ = color;
    455     *dst++ = kIsAlive[count];
    456     ++src0;
    457     ++src1;
    458     ++src2;
    459   }
    460 }
    461 
    462 // Static entry point for worker thread.
    463 void Life::wSimulateEntry(int slice, void* thiz) {
    464   static_cast<Life*>(thiz)->wSimulate(slice);
    465 }
    466 
    467 void Life::Simulate() {
    468   // Stir up the edges to prevent the simulation from reaching steady state.
    469   Stir();
    470 
    471   if (multithread_) {
    472     // If multi-threading enabled, dispatch tasks to pool of worker threads.
    473     workers_->Dispatch(ps_context_->height, wSimulateEntry, this);
    474   } else {
    475     // Else manually simulate each line on this thread.
    476     for (int y = 0; y < ps_context_->height; y++) {
    477       wSimulateEntry(y, this);
    478     }
    479   }
    480   std::swap(cell_in_, cell_out_);
    481 }
    482 
    483 void Life::Update() {
    484 
    485   PSContext2DGetBuffer(ps_context_);
    486   if (NULL == ps_context_->data)
    487     return;
    488 
    489   // If we somehow have not allocated these pointers yet, skip this frame.
    490   if (!cell_in_ || !cell_out_) return;
    491 
    492   // Simulate one (or more if benchmarking) frames
    493   do {
    494     Simulate();
    495     if (!benchmarking_)
    496       break;
    497     --benchmark_frame_counter_;
    498   } while(benchmark_frame_counter_ > 0);
    499   if (benchmarking_)
    500     EndBenchmark();
    501 
    502  PSContext2DSwapBuffer(ps_context_);
    503 }
    504 
    505 // Starting point for the module.  We do not use main since it would
    506 // collide with main in libppapi_cpp.
    507 int example_main(int argc, char* argv[]) {
    508   Life life;
    509   while (true) {
    510     PSEvent* ps_event;
    511     // Consume all available events
    512     while ((ps_event = PSEventTryAcquire()) != NULL) {
    513       life.HandleEvent(ps_event);
    514       PSEventRelease(ps_event);
    515     }
    516     // Do simulation, render and present.
    517     life.Update();
    518   }
    519   return 0;
    520 }
    521 
    522 // Register the function to call once the Instance Object is initialized.
    523 // see: pappi_simple/ps_main.h
    524 PPAPI_SIMPLE_REGISTER_MAIN(example_main);
    525