//
// Copyright 2012 Francisco Jerez
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
21 // 22 23 #include "core/kernel.hpp" 24 #include "core/resource.hpp" 25 #include "util/factor.hpp" 26 #include "util/u_math.h" 27 #include "pipe/p_context.h" 28 29 using namespace clover; 30 31 kernel::kernel(clover::program &prog, const std::string &name, 32 const std::vector<module::argument> &margs) : 33 program(prog), _name(name), exec(*this), 34 program_ref(prog._kernel_ref_counter) { 35 for (auto &marg : margs) { 36 if (marg.semantic == module::argument::general) 37 _args.emplace_back(argument::create(marg)); 38 } 39 } 40 41 template<typename V> 42 static inline std::vector<uint> 43 pad_vector(command_queue &q, const V &v, uint x) { 44 std::vector<uint> w { v.begin(), v.end() }; 45 w.resize(q.device().max_block_size().size(), x); 46 return w; 47 } 48 49 void 50 kernel::launch(command_queue &q, 51 const std::vector<size_t> &grid_offset, 52 const std::vector<size_t> &grid_size, 53 const std::vector<size_t> &block_size) { 54 const auto m = program().build(q.device()).binary; 55 const auto reduced_grid_size = 56 map(divides(), grid_size, block_size); 57 void *st = exec.bind(&q, grid_offset); 58 struct pipe_grid_info info = {}; 59 60 // The handles are created during exec_context::bind(), so we need make 61 // sure to call exec_context::bind() before retrieving them. 62 std::vector<uint32_t *> g_handles = map([&](size_t h) { 63 return (uint32_t *)&exec.input[h]; 64 }, exec.g_handles); 65 66 q.pipe->bind_compute_state(q.pipe, st); 67 q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 68 0, exec.samplers.size(), 69 exec.samplers.data()); 70 71 q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0, 72 exec.sviews.size(), exec.sviews.data()); 73 q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), 74 exec.resources.data()); 75 q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), 76 exec.g_buffers.data(), g_handles.data()); 77 78 // Fill information for the launch_grid() call. 
79 info.work_dim = grid_size.size(); 80 copy(pad_vector(q, block_size, 1), info.block); 81 copy(pad_vector(q, reduced_grid_size, 1), info.grid); 82 info.pc = find(name_equals(_name), m.syms).offset; 83 info.input = exec.input.data(); 84 85 q.pipe->launch_grid(q.pipe, &info); 86 87 q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL); 88 q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL); 89 q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0, 90 exec.sviews.size(), NULL); 91 q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0, 92 exec.samplers.size(), NULL); 93 94 q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER); 95 exec.unbind(); 96 } 97 98 size_t 99 kernel::mem_local() const { 100 size_t sz = 0; 101 102 for (auto &arg : args()) { 103 if (dynamic_cast<local_argument *>(&arg)) 104 sz += arg.storage(); 105 } 106 107 return sz; 108 } 109 110 size_t 111 kernel::mem_private() const { 112 return 0; 113 } 114 115 const std::string & 116 kernel::name() const { 117 return _name; 118 } 119 120 std::vector<size_t> 121 kernel::optimal_block_size(const command_queue &q, 122 const std::vector<size_t> &grid_size) const { 123 return factor::find_grid_optimal_factor<size_t>( 124 q.device().max_threads_per_block(), q.device().max_block_size(), 125 grid_size); 126 } 127 128 std::vector<size_t> 129 kernel::required_block_size() const { 130 return { 0, 0, 0 }; 131 } 132 133 kernel::argument_range 134 kernel::args() { 135 return map(derefs(), _args); 136 } 137 138 kernel::const_argument_range 139 kernel::args() const { 140 return map(derefs(), _args); 141 } 142 143 const module & 144 kernel::module(const command_queue &q) const { 145 return program().build(q.device()).binary; 146 } 147 148 kernel::exec_context::exec_context(kernel &kern) : 149 kern(kern), q(NULL), mem_local(0), st(NULL), cs() { 150 } 151 152 kernel::exec_context::~exec_context() { 153 if (st) 154 q->pipe->delete_compute_state(q->pipe, st); 155 } 156 157 void 
* 158 kernel::exec_context::bind(intrusive_ptr<command_queue> _q, 159 const std::vector<size_t> &grid_offset) { 160 std::swap(q, _q); 161 162 // Bind kernel arguments. 163 auto &m = kern.program().build(q->device()).binary; 164 auto margs = find(name_equals(kern.name()), m.syms).args; 165 auto msec = find(type_equals(module::section::text_executable), m.secs); 166 auto explicit_arg = kern._args.begin(); 167 168 for (auto &marg : margs) { 169 switch (marg.semantic) { 170 case module::argument::general: 171 (*(explicit_arg++))->bind(*this, marg); 172 break; 173 174 case module::argument::grid_dimension: { 175 const cl_uint dimension = grid_offset.size(); 176 auto arg = argument::create(marg); 177 178 arg->set(sizeof(dimension), &dimension); 179 arg->bind(*this, marg); 180 break; 181 } 182 case module::argument::grid_offset: { 183 for (cl_uint x : pad_vector(*q, grid_offset, 0)) { 184 auto arg = argument::create(marg); 185 186 arg->set(sizeof(x), &x); 187 arg->bind(*this, marg); 188 } 189 break; 190 } 191 case module::argument::image_size: { 192 auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get(); 193 std::vector<cl_uint> image_size{ 194 static_cast<cl_uint>(img->width()), 195 static_cast<cl_uint>(img->height()), 196 static_cast<cl_uint>(img->depth())}; 197 for (auto x : image_size) { 198 auto arg = argument::create(marg); 199 200 arg->set(sizeof(x), &x); 201 arg->bind(*this, marg); 202 } 203 break; 204 } 205 case module::argument::image_format: { 206 auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get(); 207 cl_image_format fmt = img->format(); 208 std::vector<cl_uint> image_format{ 209 static_cast<cl_uint>(fmt.image_channel_data_type), 210 static_cast<cl_uint>(fmt.image_channel_order)}; 211 for (auto x : image_format) { 212 auto arg = argument::create(marg); 213 214 arg->set(sizeof(x), &x); 215 arg->bind(*this, marg); 216 } 217 break; 218 } 219 } 220 } 221 222 // Create a new compute state if anything changed. 
223 if (!st || q != _q || 224 cs.req_local_mem != mem_local || 225 cs.req_input_mem != input.size()) { 226 if (st) 227 _q->pipe->delete_compute_state(_q->pipe, st); 228 229 cs.ir_type = q->device().ir_format(); 230 cs.prog = &(msec.data[0]); 231 cs.req_local_mem = mem_local; 232 cs.req_input_mem = input.size(); 233 st = q->pipe->create_compute_state(q->pipe, &cs); 234 } 235 236 return st; 237 } 238 239 void 240 kernel::exec_context::unbind() { 241 for (auto &arg : kern.args()) 242 arg.unbind(*this); 243 244 input.clear(); 245 samplers.clear(); 246 sviews.clear(); 247 resources.clear(); 248 g_buffers.clear(); 249 g_handles.clear(); 250 mem_local = 0; 251 } 252 253 namespace { 254 template<typename T> 255 std::vector<uint8_t> 256 bytes(const T& x) { 257 return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) }; 258 } 259 260 /// 261 /// Transform buffer \a v from the native byte order into the byte 262 /// order specified by \a e. 263 /// 264 template<typename T> 265 void 266 byteswap(T &v, pipe_endian e) { 267 if (PIPE_ENDIAN_NATIVE != e) 268 std::reverse(v.begin(), v.end()); 269 } 270 271 /// 272 /// Pad buffer \a v to the next multiple of \a n. 273 /// 274 template<typename T> 275 void 276 align(T &v, size_t n) { 277 v.resize(util_align_npot(v.size(), n)); 278 } 279 280 bool 281 msb(const std::vector<uint8_t> &s) { 282 if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE) 283 return s.back() & 0x80; 284 else 285 return s.front() & 0x80; 286 } 287 288 /// 289 /// Resize buffer \a v to size \a n using sign or zero extension 290 /// according to \a ext. 291 /// 292 template<typename T> 293 void 294 extend(T &v, enum module::argument::ext_type ext, size_t n) { 295 const size_t m = std::min(v.size(), n); 296 const bool sign_ext = (ext == module::argument::sign_ext); 297 const uint8_t fill = (sign_ext && msb(v) ? 
~0 : 0); 298 T w(n, fill); 299 300 if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE) 301 std::copy_n(v.begin(), m, w.begin()); 302 else 303 std::copy_n(v.end() - m, m, w.end() - m); 304 305 std::swap(v, w); 306 } 307 308 /// 309 /// Append buffer \a w to \a v. 310 /// 311 template<typename T> 312 void 313 insert(T &v, const T &w) { 314 v.insert(v.end(), w.begin(), w.end()); 315 } 316 317 /// 318 /// Append \a n elements to the end of buffer \a v. 319 /// 320 template<typename T> 321 size_t 322 allocate(T &v, size_t n) { 323 size_t pos = v.size(); 324 v.resize(pos + n); 325 return pos; 326 } 327 } 328 329 std::unique_ptr<kernel::argument> 330 kernel::argument::create(const module::argument &marg) { 331 switch (marg.type) { 332 case module::argument::scalar: 333 return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size)); 334 335 case module::argument::global: 336 return std::unique_ptr<kernel::argument>(new global_argument); 337 338 case module::argument::local: 339 return std::unique_ptr<kernel::argument>(new local_argument); 340 341 case module::argument::constant: 342 return std::unique_ptr<kernel::argument>(new constant_argument); 343 344 case module::argument::image2d_rd: 345 case module::argument::image3d_rd: 346 return std::unique_ptr<kernel::argument>(new image_rd_argument); 347 348 case module::argument::image2d_wr: 349 case module::argument::image3d_wr: 350 return std::unique_ptr<kernel::argument>(new image_wr_argument); 351 352 case module::argument::sampler: 353 return std::unique_ptr<kernel::argument>(new sampler_argument); 354 355 } 356 throw error(CL_INVALID_KERNEL_DEFINITION); 357 } 358 359 kernel::argument::argument() : _set(false) { 360 } 361 362 bool 363 kernel::argument::set() const { 364 return _set; 365 } 366 367 size_t 368 kernel::argument::storage() const { 369 return 0; 370 } 371 372 kernel::scalar_argument::scalar_argument(size_t size) : size(size) { 373 } 374 375 void 376 kernel::scalar_argument::set(size_t size, const void 
*value) { 377 if (!value) 378 throw error(CL_INVALID_ARG_VALUE); 379 380 if (size != this->size) 381 throw error(CL_INVALID_ARG_SIZE); 382 383 v = { (uint8_t *)value, (uint8_t *)value + size }; 384 _set = true; 385 } 386 387 void 388 kernel::scalar_argument::bind(exec_context &ctx, 389 const module::argument &marg) { 390 auto w = v; 391 392 extend(w, marg.ext_type, marg.target_size); 393 byteswap(w, ctx.q->device().endianness()); 394 align(ctx.input, marg.target_align); 395 insert(ctx.input, w); 396 } 397 398 void 399 kernel::scalar_argument::unbind(exec_context &ctx) { 400 } 401 402 void 403 kernel::global_argument::set(size_t size, const void *value) { 404 if (size != sizeof(cl_mem)) 405 throw error(CL_INVALID_ARG_SIZE); 406 407 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL); 408 _set = true; 409 } 410 411 void 412 kernel::global_argument::bind(exec_context &ctx, 413 const module::argument &marg) { 414 align(ctx.input, marg.target_align); 415 416 if (buf) { 417 const resource &r = buf->resource(*ctx.q); 418 ctx.g_handles.push_back(ctx.input.size()); 419 ctx.g_buffers.push_back(r.pipe); 420 421 // How to handle multi-demensional offsets? 422 // We don't need to. Buffer offsets are always 423 // one-dimensional. 424 auto v = bytes(r.offset[0]); 425 extend(v, marg.ext_type, marg.target_size); 426 byteswap(v, ctx.q->device().endianness()); 427 insert(ctx.input, v); 428 } else { 429 // Null pointer. 
430 allocate(ctx.input, marg.target_size); 431 } 432 } 433 434 void 435 kernel::global_argument::unbind(exec_context &ctx) { 436 } 437 438 size_t 439 kernel::local_argument::storage() const { 440 return _storage; 441 } 442 443 void 444 kernel::local_argument::set(size_t size, const void *value) { 445 if (value) 446 throw error(CL_INVALID_ARG_VALUE); 447 448 if (!size) 449 throw error(CL_INVALID_ARG_SIZE); 450 451 _storage = size; 452 _set = true; 453 } 454 455 void 456 kernel::local_argument::bind(exec_context &ctx, 457 const module::argument &marg) { 458 auto v = bytes(ctx.mem_local); 459 460 extend(v, module::argument::zero_ext, marg.target_size); 461 byteswap(v, ctx.q->device().endianness()); 462 align(ctx.input, marg.target_align); 463 insert(ctx.input, v); 464 465 ctx.mem_local += _storage; 466 } 467 468 void 469 kernel::local_argument::unbind(exec_context &ctx) { 470 } 471 472 void 473 kernel::constant_argument::set(size_t size, const void *value) { 474 if (size != sizeof(cl_mem)) 475 throw error(CL_INVALID_ARG_SIZE); 476 477 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL); 478 _set = true; 479 } 480 481 void 482 kernel::constant_argument::bind(exec_context &ctx, 483 const module::argument &marg) { 484 align(ctx.input, marg.target_align); 485 486 if (buf) { 487 resource &r = buf->resource(*ctx.q); 488 auto v = bytes(ctx.resources.size() << 24 | r.offset[0]); 489 490 extend(v, module::argument::zero_ext, marg.target_size); 491 byteswap(v, ctx.q->device().endianness()); 492 insert(ctx.input, v); 493 494 st = r.bind_surface(*ctx.q, false); 495 ctx.resources.push_back(st); 496 } else { 497 // Null pointer. 
498 allocate(ctx.input, marg.target_size); 499 } 500 } 501 502 void 503 kernel::constant_argument::unbind(exec_context &ctx) { 504 if (buf) 505 buf->resource(*ctx.q).unbind_surface(*ctx.q, st); 506 } 507 508 void 509 kernel::image_rd_argument::set(size_t size, const void *value) { 510 if (!value) 511 throw error(CL_INVALID_ARG_VALUE); 512 513 if (size != sizeof(cl_mem)) 514 throw error(CL_INVALID_ARG_SIZE); 515 516 img = &obj<image>(*(cl_mem *)value); 517 _set = true; 518 } 519 520 void 521 kernel::image_rd_argument::bind(exec_context &ctx, 522 const module::argument &marg) { 523 auto v = bytes(ctx.sviews.size()); 524 525 extend(v, module::argument::zero_ext, marg.target_size); 526 byteswap(v, ctx.q->device().endianness()); 527 align(ctx.input, marg.target_align); 528 insert(ctx.input, v); 529 530 st = img->resource(*ctx.q).bind_sampler_view(*ctx.q); 531 ctx.sviews.push_back(st); 532 } 533 534 void 535 kernel::image_rd_argument::unbind(exec_context &ctx) { 536 img->resource(*ctx.q).unbind_sampler_view(*ctx.q, st); 537 } 538 539 void 540 kernel::image_wr_argument::set(size_t size, const void *value) { 541 if (!value) 542 throw error(CL_INVALID_ARG_VALUE); 543 544 if (size != sizeof(cl_mem)) 545 throw error(CL_INVALID_ARG_SIZE); 546 547 img = &obj<image>(*(cl_mem *)value); 548 _set = true; 549 } 550 551 void 552 kernel::image_wr_argument::bind(exec_context &ctx, 553 const module::argument &marg) { 554 auto v = bytes(ctx.resources.size()); 555 556 extend(v, module::argument::zero_ext, marg.target_size); 557 byteswap(v, ctx.q->device().endianness()); 558 align(ctx.input, marg.target_align); 559 insert(ctx.input, v); 560 561 st = img->resource(*ctx.q).bind_surface(*ctx.q, true); 562 ctx.resources.push_back(st); 563 } 564 565 void 566 kernel::image_wr_argument::unbind(exec_context &ctx) { 567 img->resource(*ctx.q).unbind_surface(*ctx.q, st); 568 } 569 570 void 571 kernel::sampler_argument::set(size_t size, const void *value) { 572 if (!value) 573 throw 
error(CL_INVALID_SAMPLER); 574 575 if (size != sizeof(cl_sampler)) 576 throw error(CL_INVALID_ARG_SIZE); 577 578 s = &obj(*(cl_sampler *)value); 579 _set = true; 580 } 581 582 void 583 kernel::sampler_argument::bind(exec_context &ctx, 584 const module::argument &marg) { 585 st = s->bind(*ctx.q); 586 ctx.samplers.push_back(st); 587 } 588 589 void 590 kernel::sampler_argument::unbind(exec_context &ctx) { 591 s->unbind(*ctx.q, st); 592 } 593