Home | History | Annotate | Download | only in cuda

Lines Matching full:nthreads

261         template <int nthreads, // Number of threads which process one block histogram
274 __shared__ float sh_squares[nthreads * nblocks];
275 float* squares = sh_squares + threadIdx.z * nthreads;
281 float sum = reduce_smem<nthreads>(squares, elem * elem);
286 sum = reduce_smem<nthreads>(squares, elem * elem);
301 int nthreads = power_2up(block_hist_size);
302 dim3 threads(nthreads, 1, nblocks);
308 if (nthreads == 32)
310 else if (nthreads == 64)
312 else if (nthreads == 128)
314 else if (nthreads == 256)
316 else if (nthreads == 512)
332 template <int nthreads, // Number of threads per one histogram block
348 for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
355 __shared__ float products[nthreads * nblocks];
357 const int tid = threadIdx.z * nthreads + threadIdx.x;
359 nthreads>(products, product, tid, plus<float>());
370 const int nthreads = 256;
378 dim3 threads(nthreads, 1, nblocks);
381 cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
386 compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
394 template <int nthreads, // Number of threads per one histogram block
410 for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
417 __shared__ float products[nthreads * nblocks];
419 const int tid = threadIdx.z * nthreads + threadIdx.x;
421 reduce<nthreads>(products, product, tid, plus<float>());
432 const int nthreads = 256;
440 dim3 threads(nthreads, 1, nblocks);
443 cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));
446 classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
458 template <int nthreads>
470 for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
482 const int nthreads = 256;
488 dim3 threads(nthreads, 1);
492 extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
500 template <int nthreads>
513 for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
531 const int nthreads = 256;
537 dim3 threads(nthreads, 1);
541 extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
552 template <int nthreads, int correct_gamma>
560 __shared__ float sh_row[(nthreads + 2) * 3];
569 sh_row[threadIdx.x + 1 + (nthreads + 2)] = val.y;
570 sh_row[threadIdx.x + 1 + 2 * (nthreads + 2)] = val.z;
576 sh_row[(nthreads + 2)] = val.y;
577 sh_row[2 * (nthreads + 2)] = val.z;
584 sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y;
585 sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z;
594 b.y = sh_row[threadIdx.x + 2 + (nthreads + 2)];
595 b.z = sh_row[threadIdx.x + 2 + 2 * (nthreads + 2)];
597 a.y = sh_row[threadIdx.x + (nthreads + 2)];
598 a.z = sh_row[threadIdx.x + 2 * (nthreads + 2)];
659 const int nthreads = 256;
661 dim3 bdim(nthreads, 1);
665 compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
667 compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
674 template <int nthreads, int correct_gamma>
682 __shared__ float sh_row[nthreads + 2];
732 const int nthreads = 256;
734 dim3 bdim(nthreads, 1);
738 compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
740 compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);