//
// Copyright 2012 Francisco Jerez
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
     22 
     23 #include "core/kernel.hpp"
     24 #include "core/resource.hpp"
     25 #include "util/factor.hpp"
     26 #include "util/u_math.h"
     27 #include "pipe/p_context.h"
     28 
     29 using namespace clover;
     30 
     31 kernel::kernel(clover::program &prog, const std::string &name,
     32                const std::vector<module::argument> &margs) :
     33    program(prog), _name(name), exec(*this),
     34    program_ref(prog._kernel_ref_counter) {
     35    for (auto &marg : margs) {
     36       if (marg.semantic == module::argument::general)
     37          _args.emplace_back(argument::create(marg));
     38    }
     39 }
     40 
     41 template<typename V>
     42 static inline std::vector<uint>
     43 pad_vector(command_queue &q, const V &v, uint x) {
     44    std::vector<uint> w { v.begin(), v.end() };
     45    w.resize(q.device().max_block_size().size(), x);
     46    return w;
     47 }
     48 
     49 void
     50 kernel::launch(command_queue &q,
     51                const std::vector<size_t> &grid_offset,
     52                const std::vector<size_t> &grid_size,
     53                const std::vector<size_t> &block_size) {
     54    const auto m = program().build(q.device()).binary;
     55    const auto reduced_grid_size =
     56       map(divides(), grid_size, block_size);
     57    void *st = exec.bind(&q, grid_offset);
     58    struct pipe_grid_info info = {};
     59 
     60    // The handles are created during exec_context::bind(), so we need make
     61    // sure to call exec_context::bind() before retrieving them.
     62    std::vector<uint32_t *> g_handles = map([&](size_t h) {
     63          return (uint32_t *)&exec.input[h];
     64       }, exec.g_handles);
     65 
     66    q.pipe->bind_compute_state(q.pipe, st);
     67    q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
     68                                0, exec.samplers.size(),
     69                                exec.samplers.data());
     70 
     71    q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
     72                              exec.sviews.size(), exec.sviews.data());
     73    q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
     74                                  exec.resources.data());
     75    q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
     76                               exec.g_buffers.data(), g_handles.data());
     77 
     78    // Fill information for the launch_grid() call.
     79    info.work_dim = grid_size.size();
     80    copy(pad_vector(q, block_size, 1), info.block);
     81    copy(pad_vector(q, reduced_grid_size, 1), info.grid);
     82    info.pc = find(name_equals(_name), m.syms).offset;
     83    info.input = exec.input.data();
     84 
     85    q.pipe->launch_grid(q.pipe, &info);
     86 
     87    q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
     88    q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
     89    q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
     90                              exec.sviews.size(), NULL);
     91    q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
     92                                exec.samplers.size(), NULL);
     93 
     94    q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
     95    exec.unbind();
     96 }
     97 
     98 size_t
     99 kernel::mem_local() const {
    100    size_t sz = 0;
    101 
    102    for (auto &arg : args()) {
    103       if (dynamic_cast<local_argument *>(&arg))
    104          sz += arg.storage();
    105    }
    106 
    107    return sz;
    108 }
    109 
    110 size_t
    111 kernel::mem_private() const {
    112    return 0;
    113 }
    114 
    115 const std::string &
    116 kernel::name() const {
    117    return _name;
    118 }
    119 
    120 std::vector<size_t>
    121 kernel::optimal_block_size(const command_queue &q,
    122                            const std::vector<size_t> &grid_size) const {
    123    return factor::find_grid_optimal_factor<size_t>(
    124       q.device().max_threads_per_block(), q.device().max_block_size(),
    125       grid_size);
    126 }
    127 
    128 std::vector<size_t>
    129 kernel::required_block_size() const {
    130    return { 0, 0, 0 };
    131 }
    132 
    133 kernel::argument_range
    134 kernel::args() {
    135    return map(derefs(), _args);
    136 }
    137 
    138 kernel::const_argument_range
    139 kernel::args() const {
    140    return map(derefs(), _args);
    141 }
    142 
    143 const module &
    144 kernel::module(const command_queue &q) const {
    145    return program().build(q.device()).binary;
    146 }
    147 
    148 kernel::exec_context::exec_context(kernel &kern) :
    149    kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
    150 }
    151 
    152 kernel::exec_context::~exec_context() {
    153    if (st)
    154       q->pipe->delete_compute_state(q->pipe, st);
    155 }
    156 
    157 void *
    158 kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
    159                            const std::vector<size_t> &grid_offset) {
    160    std::swap(q, _q);
    161 
    162    // Bind kernel arguments.
    163    auto &m = kern.program().build(q->device()).binary;
    164    auto margs = find(name_equals(kern.name()), m.syms).args;
    165    auto msec = find(type_equals(module::section::text_executable), m.secs);
    166    auto explicit_arg = kern._args.begin();
    167 
    168    for (auto &marg : margs) {
    169       switch (marg.semantic) {
    170       case module::argument::general:
    171          (*(explicit_arg++))->bind(*this, marg);
    172          break;
    173 
    174       case module::argument::grid_dimension: {
    175          const cl_uint dimension = grid_offset.size();
    176          auto arg = argument::create(marg);
    177 
    178          arg->set(sizeof(dimension), &dimension);
    179          arg->bind(*this, marg);
    180          break;
    181       }
    182       case module::argument::grid_offset: {
    183          for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
    184             auto arg = argument::create(marg);
    185 
    186             arg->set(sizeof(x), &x);
    187             arg->bind(*this, marg);
    188          }
    189          break;
    190       }
    191       case module::argument::image_size: {
    192          auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
    193          std::vector<cl_uint> image_size{
    194                static_cast<cl_uint>(img->width()),
    195                static_cast<cl_uint>(img->height()),
    196                static_cast<cl_uint>(img->depth())};
    197          for (auto x : image_size) {
    198             auto arg = argument::create(marg);
    199 
    200             arg->set(sizeof(x), &x);
    201             arg->bind(*this, marg);
    202          }
    203          break;
    204       }
    205       case module::argument::image_format: {
    206          auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
    207          cl_image_format fmt = img->format();
    208          std::vector<cl_uint> image_format{
    209                static_cast<cl_uint>(fmt.image_channel_data_type),
    210                static_cast<cl_uint>(fmt.image_channel_order)};
    211          for (auto x : image_format) {
    212             auto arg = argument::create(marg);
    213 
    214             arg->set(sizeof(x), &x);
    215             arg->bind(*this, marg);
    216          }
    217          break;
    218       }
    219       }
    220    }
    221 
    222    // Create a new compute state if anything changed.
    223    if (!st || q != _q ||
    224        cs.req_local_mem != mem_local ||
    225        cs.req_input_mem != input.size()) {
    226       if (st)
    227          _q->pipe->delete_compute_state(_q->pipe, st);
    228 
    229       cs.ir_type = q->device().ir_format();
    230       cs.prog = &(msec.data[0]);
    231       cs.req_local_mem = mem_local;
    232       cs.req_input_mem = input.size();
    233       st = q->pipe->create_compute_state(q->pipe, &cs);
    234    }
    235 
    236    return st;
    237 }
    238 
    239 void
    240 kernel::exec_context::unbind() {
    241    for (auto &arg : kern.args())
    242       arg.unbind(*this);
    243 
    244    input.clear();
    245    samplers.clear();
    246    sviews.clear();
    247    resources.clear();
    248    g_buffers.clear();
    249    g_handles.clear();
    250    mem_local = 0;
    251 }
    252 
    253 namespace {
    254    template<typename T>
    255    std::vector<uint8_t>
    256    bytes(const T& x) {
    257       return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
    258    }
    259 
    260    ///
    261    /// Transform buffer \a v from the native byte order into the byte
    262    /// order specified by \a e.
    263    ///
    264    template<typename T>
    265    void
    266    byteswap(T &v, pipe_endian e) {
    267       if (PIPE_ENDIAN_NATIVE != e)
    268          std::reverse(v.begin(), v.end());
    269    }
    270 
    271    ///
    272    /// Pad buffer \a v to the next multiple of \a n.
    273    ///
    274    template<typename T>
    275    void
    276    align(T &v, size_t n) {
    277       v.resize(util_align_npot(v.size(), n));
    278    }
    279 
    280    bool
    281    msb(const std::vector<uint8_t> &s) {
    282       if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
    283          return s.back() & 0x80;
    284       else
    285          return s.front() & 0x80;
    286    }
    287 
    288    ///
    289    /// Resize buffer \a v to size \a n using sign or zero extension
    290    /// according to \a ext.
    291    ///
    292    template<typename T>
    293    void
    294    extend(T &v, enum module::argument::ext_type ext, size_t n) {
    295       const size_t m = std::min(v.size(), n);
    296       const bool sign_ext = (ext == module::argument::sign_ext);
    297       const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
    298       T w(n, fill);
    299 
    300       if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
    301          std::copy_n(v.begin(), m, w.begin());
    302       else
    303          std::copy_n(v.end() - m, m, w.end() - m);
    304 
    305       std::swap(v, w);
    306    }
    307 
    308    ///
    309    /// Append buffer \a w to \a v.
    310    ///
    311    template<typename T>
    312    void
    313    insert(T &v, const T &w) {
    314       v.insert(v.end(), w.begin(), w.end());
    315    }
    316 
    317    ///
    318    /// Append \a n elements to the end of buffer \a v.
    319    ///
    320    template<typename T>
    321    size_t
    322    allocate(T &v, size_t n) {
    323       size_t pos = v.size();
    324       v.resize(pos + n);
    325       return pos;
    326    }
    327 }
    328 
    329 std::unique_ptr<kernel::argument>
    330 kernel::argument::create(const module::argument &marg) {
    331    switch (marg.type) {
    332    case module::argument::scalar:
    333       return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));
    334 
    335    case module::argument::global:
    336       return std::unique_ptr<kernel::argument>(new global_argument);
    337 
    338    case module::argument::local:
    339       return std::unique_ptr<kernel::argument>(new local_argument);
    340 
    341    case module::argument::constant:
    342       return std::unique_ptr<kernel::argument>(new constant_argument);
    343 
    344    case module::argument::image2d_rd:
    345    case module::argument::image3d_rd:
    346       return std::unique_ptr<kernel::argument>(new image_rd_argument);
    347 
    348    case module::argument::image2d_wr:
    349    case module::argument::image3d_wr:
    350       return std::unique_ptr<kernel::argument>(new image_wr_argument);
    351 
    352    case module::argument::sampler:
    353       return std::unique_ptr<kernel::argument>(new sampler_argument);
    354 
    355    }
    356    throw error(CL_INVALID_KERNEL_DEFINITION);
    357 }
    358 
    359 kernel::argument::argument() : _set(false) {
    360 }
    361 
    362 bool
    363 kernel::argument::set() const {
    364    return _set;
    365 }
    366 
    367 size_t
    368 kernel::argument::storage() const {
    369    return 0;
    370 }
    371 
    372 kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
    373 }
    374 
    375 void
    376 kernel::scalar_argument::set(size_t size, const void *value) {
    377    if (!value)
    378       throw error(CL_INVALID_ARG_VALUE);
    379 
    380    if (size != this->size)
    381       throw error(CL_INVALID_ARG_SIZE);
    382 
    383    v = { (uint8_t *)value, (uint8_t *)value + size };
    384    _set = true;
    385 }
    386 
    387 void
    388 kernel::scalar_argument::bind(exec_context &ctx,
    389                               const module::argument &marg) {
    390    auto w = v;
    391 
    392    extend(w, marg.ext_type, marg.target_size);
    393    byteswap(w, ctx.q->device().endianness());
    394    align(ctx.input, marg.target_align);
    395    insert(ctx.input, w);
    396 }
    397 
    398 void
    399 kernel::scalar_argument::unbind(exec_context &ctx) {
    400 }
    401 
    402 void
    403 kernel::global_argument::set(size_t size, const void *value) {
    404    if (size != sizeof(cl_mem))
    405       throw error(CL_INVALID_ARG_SIZE);
    406 
    407    buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
    408    _set = true;
    409 }
    410 
    411 void
    412 kernel::global_argument::bind(exec_context &ctx,
    413                               const module::argument &marg) {
    414    align(ctx.input, marg.target_align);
    415 
    416    if (buf) {
    417       const resource &r = buf->resource(*ctx.q);
    418       ctx.g_handles.push_back(ctx.input.size());
    419       ctx.g_buffers.push_back(r.pipe);
    420 
    421       // How to handle multi-demensional offsets?
    422       // We don't need to.  Buffer offsets are always
    423       // one-dimensional.
    424       auto v = bytes(r.offset[0]);
    425       extend(v, marg.ext_type, marg.target_size);
    426       byteswap(v, ctx.q->device().endianness());
    427       insert(ctx.input, v);
    428    } else {
    429       // Null pointer.
    430       allocate(ctx.input, marg.target_size);
    431    }
    432 }
    433 
    434 void
    435 kernel::global_argument::unbind(exec_context &ctx) {
    436 }
    437 
    438 size_t
    439 kernel::local_argument::storage() const {
    440    return _storage;
    441 }
    442 
    443 void
    444 kernel::local_argument::set(size_t size, const void *value) {
    445    if (value)
    446       throw error(CL_INVALID_ARG_VALUE);
    447 
    448    if (!size)
    449       throw error(CL_INVALID_ARG_SIZE);
    450 
    451    _storage = size;
    452    _set = true;
    453 }
    454 
    455 void
    456 kernel::local_argument::bind(exec_context &ctx,
    457                              const module::argument &marg) {
    458    auto v = bytes(ctx.mem_local);
    459 
    460    extend(v, module::argument::zero_ext, marg.target_size);
    461    byteswap(v, ctx.q->device().endianness());
    462    align(ctx.input, marg.target_align);
    463    insert(ctx.input, v);
    464 
    465    ctx.mem_local += _storage;
    466 }
    467 
    468 void
    469 kernel::local_argument::unbind(exec_context &ctx) {
    470 }
    471 
    472 void
    473 kernel::constant_argument::set(size_t size, const void *value) {
    474    if (size != sizeof(cl_mem))
    475       throw error(CL_INVALID_ARG_SIZE);
    476 
    477    buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
    478    _set = true;
    479 }
    480 
    481 void
    482 kernel::constant_argument::bind(exec_context &ctx,
    483                                 const module::argument &marg) {
    484    align(ctx.input, marg.target_align);
    485 
    486    if (buf) {
    487       resource &r = buf->resource(*ctx.q);
    488       auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);
    489 
    490       extend(v, module::argument::zero_ext, marg.target_size);
    491       byteswap(v, ctx.q->device().endianness());
    492       insert(ctx.input, v);
    493 
    494       st = r.bind_surface(*ctx.q, false);
    495       ctx.resources.push_back(st);
    496    } else {
    497       // Null pointer.
    498       allocate(ctx.input, marg.target_size);
    499    }
    500 }
    501 
    502 void
    503 kernel::constant_argument::unbind(exec_context &ctx) {
    504    if (buf)
    505       buf->resource(*ctx.q).unbind_surface(*ctx.q, st);
    506 }
    507 
    508 void
    509 kernel::image_rd_argument::set(size_t size, const void *value) {
    510    if (!value)
    511       throw error(CL_INVALID_ARG_VALUE);
    512 
    513    if (size != sizeof(cl_mem))
    514       throw error(CL_INVALID_ARG_SIZE);
    515 
    516    img = &obj<image>(*(cl_mem *)value);
    517    _set = true;
    518 }
    519 
    520 void
    521 kernel::image_rd_argument::bind(exec_context &ctx,
    522                                 const module::argument &marg) {
    523    auto v = bytes(ctx.sviews.size());
    524 
    525    extend(v, module::argument::zero_ext, marg.target_size);
    526    byteswap(v, ctx.q->device().endianness());
    527    align(ctx.input, marg.target_align);
    528    insert(ctx.input, v);
    529 
    530    st = img->resource(*ctx.q).bind_sampler_view(*ctx.q);
    531    ctx.sviews.push_back(st);
    532 }
    533 
    534 void
    535 kernel::image_rd_argument::unbind(exec_context &ctx) {
    536    img->resource(*ctx.q).unbind_sampler_view(*ctx.q, st);
    537 }
    538 
    539 void
    540 kernel::image_wr_argument::set(size_t size, const void *value) {
    541    if (!value)
    542       throw error(CL_INVALID_ARG_VALUE);
    543 
    544    if (size != sizeof(cl_mem))
    545       throw error(CL_INVALID_ARG_SIZE);
    546 
    547    img = &obj<image>(*(cl_mem *)value);
    548    _set = true;
    549 }
    550 
    551 void
    552 kernel::image_wr_argument::bind(exec_context &ctx,
    553                                 const module::argument &marg) {
    554    auto v = bytes(ctx.resources.size());
    555 
    556    extend(v, module::argument::zero_ext, marg.target_size);
    557    byteswap(v, ctx.q->device().endianness());
    558    align(ctx.input, marg.target_align);
    559    insert(ctx.input, v);
    560 
    561    st = img->resource(*ctx.q).bind_surface(*ctx.q, true);
    562    ctx.resources.push_back(st);
    563 }
    564 
    565 void
    566 kernel::image_wr_argument::unbind(exec_context &ctx) {
    567    img->resource(*ctx.q).unbind_surface(*ctx.q, st);
    568 }
    569 
    570 void
    571 kernel::sampler_argument::set(size_t size, const void *value) {
    572    if (!value)
    573       throw error(CL_INVALID_SAMPLER);
    574 
    575    if (size != sizeof(cl_sampler))
    576       throw error(CL_INVALID_ARG_SIZE);
    577 
    578    s = &obj(*(cl_sampler *)value);
    579    _set = true;
    580 }
    581 
    582 void
    583 kernel::sampler_argument::bind(exec_context &ctx,
    584                                const module::argument &marg) {
    585    st = s->bind(*ctx.q);
    586    ctx.samplers.push_back(st);
    587 }
    588 
    589 void
    590 kernel::sampler_argument::unbind(exec_context &ctx) {
    591    s->unbind(*ctx.q, st);
    592 }
    593