Home | History | Annotate | Download | only in CodeGenCUDA
      1 // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s
      2 
      3 #include "Inputs/cuda.h"
      4 
      5 #define MAX_THREADS_PER_BLOCK 256
      6 #define MIN_BLOCKS_PER_MP     2
      7 
      8 // Test both max threads per block and Min cta per sm.
      9 extern "C" {
     10 __global__ void
     11 __launch_bounds__( MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
     12 Kernel1()
     13 {
     14 }
     15 }
     16 
     17 // CHECK: !{{[0-9]+}} = !{void ()* @Kernel1, !"maxntidx", i32 256}
     18 // CHECK: !{{[0-9]+}} = !{void ()* @Kernel1, !"minctasm", i32 2}
     19 
     20 // Test only max threads per block. Min cta per sm defaults to 0, and
     21 // CodeGen doesn't output a zero value for minctasm.
     22 extern "C" {
     23 __global__ void
     24 __launch_bounds__( MAX_THREADS_PER_BLOCK )
     25 Kernel2()
     26 {
     27 }
     28 }
     29 
     30 // CHECK: !{{[0-9]+}} = !{void ()* @Kernel2, !"maxntidx", i32 256}
     31 
     32 template <int max_threads_per_block>
     33 __global__ void
     34 __launch_bounds__(max_threads_per_block)
     35 Kernel3()
     36 {
     37 }
     38 
     39 template void Kernel3<MAX_THREADS_PER_BLOCK>();
     40 // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}
     41 
     42 template <int max_threads_per_block, int min_blocks_per_mp>
     43 __global__ void
     44 __launch_bounds__(max_threads_per_block, min_blocks_per_mp)
     45 Kernel4()
     46 {
     47 }
     48 template void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
     49 
     50 // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
     51 // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel4{{.*}}, !"minctasm", i32 2}
     52 
     53 const int constint = 100;
     54 template <int max_threads_per_block, int min_blocks_per_mp>
     55 __global__ void
     56 __launch_bounds__(max_threads_per_block + constint,
     57                   min_blocks_per_mp + max_threads_per_block)
     58 Kernel5()
     59 {
     60 }
     61 template void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
     62 
     63 // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
     64 // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel5{{.*}}, !"minctasm", i32 258}
     65 
     66 // Make sure we don't emit negative launch bounds values.
     67 __global__ void
     68 __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
     69 Kernel6()
     70 {
     71 }
     72 // CHECK-NOT: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel6{{.*}}, !"maxntidx",
     73 // CHECK:     !{{[0-9]+}} = !{void ()* @{{.*}}Kernel6{{.*}}, !"minctasm",
     74 
     75 __global__ void
     76 __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
     77 Kernel7()
     78 {
     79 }
     80 // CHECK:     !{{[0-9]+}} = !{void ()* @{{.*}}Kernel7{{.*}}, !"maxntidx",
     81 // CHECK-NOT: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel7{{.*}}, !"minctasm",
     82 
     83 const char constchar = 12;
     84 __global__ void __launch_bounds__(constint, constchar) Kernel8() {}
     85 // CHECK:     !{{[0-9]+}} = !{void ()* @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
     86 // CHECK:     !{{[0-9]+}} = !{void ()* @{{.*}}Kernel8{{.*}}, !"minctasm", i32 12
     87