/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "path.h"
#include "block_pool_cl.h"
#include "path_builder_cl_12.h"
#include "kernel_cl_12.h"

//
//
//

#if 0

//
// SIMD AVX2
//

#define SKC_PATHS_COPY_WORDS_PER_ELEM          8
#define SKC_PATHS_COPY_SUBGROUP_SIZE           1
#define SKC_PATHS_COPY_KERNEL_ATTRIBUTES

typedef skc_uint8  skc_paths_copy_elem;
typedef skc_uint8  skc_pb_idx_v;

#define SKC_PATHS_COPY_ELEM_EXPAND()           SKC_EXPAND_8()

#define SKC_IS_NOT_PATH_HEAD(sg,I)             ((sg) + I >= SKC_PATH_HEAD_WORDS)

#endif

//
//
//

#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK      (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
#define SKC_PATHS_COPY_ELEMS_PER_BLOCK         (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK      (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_THREAD        (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)

// FIXME -- use SUBGROUP terminology everywhere
#define SKC_PATHS_COPY_SUBGROUP_WORDS          (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)
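
//
// Worked example (an assumed configuration, not a required one): with
// SKC_DEVICE_BLOCK_WORDS = 256, SKC_PATHS_COPY_ELEM_WORDS = 1 and
// SKC_PATHS_COPY_SUBGROUP_SIZE = 16, the constants above evaluate to:
//
//   SKC_PATHS_COPY_ELEMS_PER_BLOCK  = 256
//   SKC_PATHS_COPY_ELEMS_PER_THREAD = 16
//   SKC_PATHS_COPY_SUBGROUP_WORDS   = 16
//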

//
//
//

#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER                              \
  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))

#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER                           \
  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))

// #define SKC_PATHS_COPY_HEAD_ELEMS    ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)
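
//
// Continuing the assumed example (SKC_PATH_HEAD_WORDS = 7 is also only
// an assumption): 16 * ((7 / 1) / 16) = 0 elems land before the header
// and 16 * ((7 + 15) / 16) = 16 elems include it, i.e. the entire
// header is consumed by the first strided pass of skc_copy_head().
//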

//
//
//

//
// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
//

#define SKC_CMD_PATHS_COPY_ONE_BITS              (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)

#define SKC_CMD_PATHS_COPY_ONE_MASK              SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_ONE                   (1u << SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_GET_TAG(ti)           SKC_TAGGED_BLOCK_ID_GET_TAG(ti)

#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti)       ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b)  (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))
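
//
// For illustration only (both widths are device/config dependent --
// these are assumed values): with SKC_TAGGED_BLOCK_ID_BITS_TAG = 5 and
// SKC_DEVICE_SUBBLOCK_WORDS_LOG2 = 4, a command word decomposes as
//
//   [ rolling : 23 | subblock : 4 | tag : 5 ]
//
// so GET_ROLLING(ti) is (ti >> 9) and UPDATE_ROLLING(ti,b) keeps the
// low 9 tag+subblock bits and ORs in the block id at bit 5 -- which
// assumes a freshly allocated block id has zero subblock bits.
//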

//
//
//

skc_uint
skc_sub_group_local_id()
{
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
  return get_sub_group_local_id();
#else
  return 0;
#endif
}

//
// convert an atomic read counter offset to a block id
//

skc_block_id_t
skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
                 skc_uint                        const bp_idx_mask,
                 skc_uint                        const bp_reads,
                 skc_uint                        const bp_off)
{
  skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;

  return bp_ids[bp_idx];
}
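
//
// e.g. (assumed pow2 ring size, values for illustration): with a
// 65536-entry bp_ids ring, bp_idx_mask is 0xFFFF and the running sum
// (bp_reads + bp_off) simply wraps modulo the ring size:
//
//   bp_reads = 0xFFFE, bp_off = 3  ->  bp_idx = 0x0001
//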

//
//
//

void
skc_copy_segs(__global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx)
{
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
    }

#if 0
  //
  // NOTE THIS IS PRINTING 8 ROWS
  //
  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
         (skc_uint)get_global_id(0),pb_elems_idx,
         as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
         (skc_uint)get_global_id(0),pb_elems_idx,
         as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
#endif
}

//
//
//

void
skc_copy_node(__global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_block_id_t      const * const bp_ids,
              skc_uint                             const bp_reads,
              skc_uint                             const bp_idx_mask,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx,
              skc_uint                             const pb_rolling)
{
  //
  // remap block id tags from the host-side rolling counter to a
  // device-side block pool id
  //
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      // load block_id_tag words
      skc_paths_copy_elem elem   = (pb_elems + pb_elems_idx)[ii];

      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // only convert if original elem is not invalid

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
        skc_block_id_t const b = bp_ids[bp_idx C];              \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
      }

      // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem back
      (bp_elems+bp_elems_idx)[ii] = elem;
    }
}
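
//
// Note on the expansion above: SKC_PATHS_COPY_ELEM_EXPAND()
// instantiates SKC_EXPAND_X once per vector component.  For the
// disabled 8-wide SIMD configuration at the top of this file
// (SKC_EXPAND_8 -- an assumed expansion pattern) C would take the
// component suffixes .s0 .. .s7, so the guarded convert-and-update
// runs independently on each lane of the skc_uint8 element.
//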

//
//
//

void
skc_host_map_update(__global skc_uint * const host_map,
                    skc_uint            const block,
                    skc_paths_copy_elem const elem)
{
  //
  // write first elem to map -- FIXME -- this is a little nasty
  // because it relies on the host handle always being the first
  // word in the path header.
  //
  // OTOH, this is not unreasonable.  The alternative is to have a
  // separate kernel initializing the map.
  //
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
  if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
#endif
    {
#if SKC_PATHS_COPY_ELEM_WORDS == 1
      host_map[elem] = block;
#if 0
      printf("[%u] = %u\n",elem,block);
#endif
#else
      host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
#endif
    }
}

//
//
//

void
skc_copy_head(__global skc_uint                  * const host_map,
              skc_uint                             const block,
              __global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_block_id_t      const * const bp_ids,
              skc_uint                             const bp_reads,
              skc_uint                             const bp_idx_mask,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx,
              skc_uint                             const pb_rolling)
{
  //
  // if there are more path header words than there are
  // threads-per-block then we can just copy the initial header words
  //
#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];

      (bp_elems+bp_elems_idx)[ii] = elem;

      if (ii == 0) {
        skc_host_map_update(host_map,block,elem);
      }
    }
#endif

  //
  // this is similar to copy node but the first H words of the path
  // header are not modified and simply copied
  //
  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];

#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
      if (ii == 0) {
        skc_host_map_update(host_map,block,elem);
      }
#endif
      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      // FIXME -- MIX MIX MIX MIX / SELECT

      // only convert if original elem is not invalid
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
        skc_block_id_t const b = bp_ids[bp_idx C];                      \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);           \
      }

      // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem back
      (bp_elems+bp_elems_idx)[ii] = elem;
    }

  //
  // the remaining words are treated like a node
  //
  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      // load block_id_tag words
      skc_paths_copy_elem elem   = (pb_elems+pb_elems_idx)[ii];

      // calculate ahead of time
      skc_pb_idx_v  const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      // only convert if original elem is not invalid
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
        skc_block_id_t const b = bp_ids[bp_idx C];              \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
      }

      // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem
      (bp_elems+bp_elems_idx)[ii] = elem;
    }
}

//
// FIXME -- pack some of these constant integer args in a vec or struct
//

__kernel
SKC_PATHS_COPY_KERNEL_ATTRIBS
void
skc_kernel_paths_copy
(__global skc_uint                        * const host_map,

 __global skc_block_id_t            const * const bp_ids,
 __global skc_paths_copy_elem             * const bp_elems,
 skc_uint                                   const bp_idx_mask, // pow2 modulo mask for block pool ring

 __global skc_uint                  const * const bp_alloc,    // block pool ring base
 skc_uint                                   const bp_alloc_idx,// which subbuf

 __global union skc_tagged_block_id const * const pb_cmds,
 __global skc_paths_copy_elem       const * const pb_elems,

 skc_uint                                   const pb_size,     // # of commands/blocks in buffer
 skc_uint                                   const pb_rolling,  // shifted rolling counter base

 skc_uint                                   const pb_prev_from,
 skc_uint                                   const pb_prev_span,
 skc_uint                                   const pb_curr_from)
{
  //
  // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
  //
  // - HEAD
  // - NODE
  // - SEGS
  //
  // THESE ARE SUBGROUP ORIENTED KERNELS
  //
  // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
  //

  //
  // It's likely that peak bandwidth is achievable with a single
  // workgroup.
  //
  // So let's keep the grids modestly sized and for simplicity and
  // portability, let's assume that a single workgroup can perform all
  // steps in the copy.
  //
  // Launch as large a workgroup as possible.
  //
  // 1. ATOMICALLY ALLOCATE BLOCKS FROM POOL
  // 2. CONVERT COMMANDS INTO BLOCK OFFSETS
  // 3. FOR EACH COMMAND:
  //      - HEAD: SAVE HEAD ID TO MAP. CONVERT AND COPY H INDICES.
  //      - NODE: CONVERT AND COPY B INDICES
  //      - SEGS: BULK COPY
  //
  // B : number of words in block -- always pow2
  // W : intelligently/arbitrarily chosen factor of B -- always pow2
  //
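
  //
  // Illustrative block layouts (word counts are device-configured;
  // this sketch only restates what the copy routines above assume):
  //
  //   HEAD: [ header : SKC_PATH_HEAD_WORDS ][ tagged block ids ... ]
  //   NODE: [ tagged block ids ...                                 ]
  //   SEGS: [ raw segment words ...                                ]
  //
  // Tagged block ids equal to SKC_TAGGED_BLOCK_ID_INVALID are copied
  // through unconverted.
  //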

  //
  // There are several approaches to processing the commands:
  //
  // 1. B threads are responsible for one block. All threads broadcast
  //    load a single command word. Workgroup size must be a factor of
  //    B.
  //
  // 2. W threads process an entire block. W will typically be the
  //    device's subgroup/warp/wave width. W threads broadcast load a
  //    single command word.
  //
  // 3. W threads process W blocks. W threads load W command words and
  //    process W blocks.
  //
  // Clearly (1) has low I/O intensity but will achieve high
  // parallelism by activating the most possible threads. The downside
  // of this kind of approach is that the kernel will occupy even a
  // large GPU with low intensity work and reduce opportunities for
  // concurrent kernel execution (of other kernels).
  //
  // See Vasily Volkov's CUDA presentation describing these tradeoffs.
  //
  // Note that there are many other approaches.  For example, similar
  // to (1) but each thread loads a pow2 vector of block data.
  //

  // load the copied atomic read "base" from gmem
  skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
  // will always be less than 2^32
  skc_uint const gid      = get_global_id(0);
  // every subgroup/simd that will work on the block loads the same command
  skc_uint const sg_idx   = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
  // path builder data can be spread across two spans
  skc_uint       pb_idx   = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);

  // no need to make this branchless
  if (pb_idx >= pb_size)
    pb_idx -= pb_size;

  // broadcast load the command
  union skc_tagged_block_id const pb_cmd       = pb_cmds[pb_idx];

  // what do we want to do with this block?
  skc_cmd_paths_copy_tag    const tag          = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);

  // compute offset from rolling base to get index into block pool ring allocation
  skc_uint                  const bp_off       = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);

  // convert the pb_cmd's offset counter to a block id
  skc_block_id_t            const block        = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);

#if 0
  if (get_sub_group_local_id() == 0) {
    printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
    printf("< %8u >\n",block);
  }
#endif

  // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
  skc_uint                 const tid          = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;

  // calculate bp_elems (to) / pb_elems (from)
  skc_uint                 const bp_elems_idx = block  * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
  skc_uint                 const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK    + tid;

  if      (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
    {
#if 0
      if (tid == 0)
        printf("%3u, segs\n",bp_off);
#endif
      skc_copy_segs(bp_elems,
                    bp_elems_idx,
                    pb_elems,
                    pb_elems_idx);
    }
  else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
    {
#if 0
      if (tid == 0)
        printf("%3u, NODE\n",bp_off);
#endif
      skc_copy_node(bp_elems, // to
                    bp_elems_idx,
                    bp_ids,
                    bp_reads,
                    bp_idx_mask,
                    pb_elems, // from
                    pb_elems_idx,
                    pb_rolling);
    }
  else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD)
    {
#if 0
      if (tid == 0)
        printf("%3u, HEAD\n",bp_off);
#endif
      skc_copy_head(host_map,
                    block,
                    bp_elems, // to
                    bp_elems_idx,
                    bp_ids,
                    bp_reads,
                    bp_idx_mask,
                    pb_elems, // from
                    pb_elems_idx,
                    pb_rolling);
    }
}

//
//
//

__kernel
SKC_PATHS_ALLOC_KERNEL_ATTRIBS
void
skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
                       __global skc_uint          * const bp_alloc,
                       skc_uint                     const bp_alloc_idx,
                       skc_uint                     const pb_cmd_count)
{
  //
  // allocate blocks in block pool
  //
  skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);

  // store in slot
  bp_alloc[bp_alloc_idx] = reads;

#if 0
  printf("pc: %8u + %u\n",reads,pb_cmd_count);
#endif
}
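
//
// A minimal host-side sketch of the alloc/copy pairing (illustrative
// only -- the queue, kernel and buffer handles below are assumptions,
// not the actual skc host code): paths_alloc reserves the ring span
// with a single work-item, then paths_copy launches one subgroup per
// queued command.
//
#if 0
cl_uint const pb_cmd_count = span_prev + span_curr; // commands queued this flush

clSetKernelArg(k_paths_alloc,0,sizeof(cl_mem), &bp_atomics);
clSetKernelArg(k_paths_alloc,1,sizeof(cl_mem), &bp_alloc);
clSetKernelArg(k_paths_alloc,2,sizeof(cl_uint),&bp_alloc_idx);
clSetKernelArg(k_paths_alloc,3,sizeof(cl_uint),&pb_cmd_count);

size_t const one = 1;
clEnqueueNDRangeKernel(cq,k_paths_alloc,1,NULL,&one,&one,0,NULL,NULL);

// ... bind the skc_kernel_paths_copy args the same way, then:
size_t const global = (size_t)pb_cmd_count * SKC_PATHS_COPY_SUBGROUP_SIZE;
clEnqueueNDRangeKernel(cq,k_paths_copy,1,NULL,&global,NULL,0,NULL,NULL);
#endif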

//
//
//
