Home | Sort by relevance | Sort by last modified time
    Searched refs:threads_per_block (Results 1 - 13 of 13) sorted by null

  /external/tensorflow/tensorflow/core/kernels/
adjust_hue_op_gpu.cu.cc 33 const int threads_per_block = config.thread_per_block; local
35 (number_of_elements + threads_per_block - 1) / threads_per_block;
37 <<<block_count, threads_per_block, 0, stream>>>(
adjust_saturation_op_gpu.cu.cc 35 const int threads_per_block = config.thread_per_block; local
37 (number_of_elements + threads_per_block - 1) / threads_per_block;
39 <<<block_count, threads_per_block, 0, stream>>>(
reduction_gpu_kernels.cu.h 517 const int threads_per_block = 128; local
518 const int warps_per_block = threads_per_block / 32;
521 RowReduceKernel<<<num_blocks, threads_per_block, 0, cu_stream>>>(
641 int threads_per_block = 128; local
642 int num_blocks = Eigen::divup(extent_y, threads_per_block);
644 ColumnReduceSimpleKernel<<<num_blocks, threads_per_block, 0, cu_stream>>>(
653 int threads_per_block = 128; local
655 (extent_x * extent_z + threads_per_block - 1) / threads_per_block;
659 ColumnReduceSimpleKernel<<<num_blocks, threads_per_block, 0, cu_stream>>>
    [all...]
  /external/tensorflow/tensorflow/compiler/xla/service/gpu/
partition_assignment.cc 41 launch_dims.threads_per_block());
66 int64 threads_per_block; local
68 threads_per_block = device_desc.threads_per_core_limit() /
78 threads_per_block = device_desc.threads_per_warp();
79 if (threads_per_block == 0) {
81 threads_per_block = 32;
85 if (num_elements < threads_per_block) {
86 threads_per_block = num_elements;
88 << threads_per_block << ") because the latter is smaller.";
91 int64 block_count = CeilOfRatio(num_elements, threads_per_block);
    [all...]
partition_assignment.h 41 LaunchDimensions(int64 block_count, int64 threads_per_block)
42 : block_count_(block_count), threads_per_block_(threads_per_block) {}
49 int64 threads_per_block() const { return threads_per_block_; } function in class:xla::gpu::LaunchDimensions
parallel_loop_emitter.cc 79 llvm_ir::AddRangeMetadata(0, launch_dimensions_.threads_per_block(),
87 ir_builder_->getInt64(launch_dimensions_.threads_per_block()), "",
91 // Add an @llvm.assume(linear_index < threads_per_block * num_blocks).
103 ir_builder_->getInt64(launch_dimensions_.threads_per_block() *
kernel_thunk.cc 95 stream, se::ThreadDim(launch_dimensions.threads_per_block()),
elemental_ir_emitter.cc 308 llvm::Value* threads_per_block = ir_builder_->CreateIntCast( local
311 ir_builder_->getIntNTy(128), /*isSigned=*/true, "threads_per_block");
313 ir_builder_->CreateNSWMul(block_id, threads_per_block),
ir_emitter_unnested.cc 135 launch_dims.threads_per_block());
    [all...]
  /external/tensorflow/tensorflow/stream_executor/
device_description.cc 143 uint64 element_count, uint64 *threads_per_block,
145 *threads_per_block = device_description.threads_per_block_limit();
146 *block_count = DivideCeil(element_count, *threads_per_block);
148 CHECK_LE(element_count, *threads_per_block);
149 *threads_per_block = element_count;
device_description.h 356 // Equivalent to ceil(double(element_count) / threads_per_block).
364 uint64 element_count, uint64 *threads_per_block,
  /external/tensorflow/tensorflow/stream_executor/cuda/
cuda_driver.h 451 CudaContext* context, CUfunction kernel, int threads_per_block,
cuda_driver.cc     [all...]

Completed in 146 milliseconds