Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
      2 ;
      3 ; This test checks that the lds input queue is empty at the end of
      4 ; the ALU clause.
      5 
      6 ; CHECK-LABEL: {{^}}lds_input_queue:
      7 ; CHECK: LDS_READ_RET * OQAP
      8 ; CHECK-NOT: ALU clause
      9 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
     10 
     11 @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4 ; two-element i32 array in addrspace(3); read by both test functions in this file
     12 
     13 define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { ; stores @local_mem[%index] + *%in to *%out
     14 entry:
     15   %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index ; address of @local_mem[%index]
     16   %1 = load i32, i32 addrspace(3)* %0 ; addrspace(3) read; the CHECK lines expect LDS_READ_RET followed by a MOV from OQAP
     17   call void @llvm.AMDGPU.barrier.local() ; barrier placed between the local read and the global load below
     18 
     19   ; The addrspace(1) load below will start a new clause for the vertex fetch
     20   %2 = load i32, i32 addrspace(1)* %in
     21   %3 = add i32 %1, %2 ; sum of the local value and the value loaded from %in
     22   store i32 %3, i32 addrspace(1)* %out ; write the sum through %out
     23   ret void
     24 }
     25 
     26 declare void @llvm.AMDGPU.barrier.local() ; barrier intrinsic called in @lds_input_queue
     27 
     28 ; The machine scheduler does not do proper alias analysis and assumes that
     29 ; loads from global values (Note that a global value is different from a
     30 ; value from global memory.  A global value is a value that is declared
     31 ; outside of a function, it can reside in any address space) alias with
     32 ; all other loads.
     33 ;
     34 ; This is a problem for scheduling the reads from the local data share (lds).
     35 ; These reads are implemented using two instructions.  The first copies the
     36 ; data from lds into the lds output queue, and the second moves the data from
     37 ; that queue into main memory.  These two instructions don't have to be
     38 ; scheduled one after the other, but they do need to be scheduled in the same
     39 ; clause.  The aliasing problem mentioned above causes problems when there is a
     40 ; load from global memory which immediately follows a load from a global value that
     41 ; has been declared in the local memory space:
     42 ;
     43 ;  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
     44 ;  %1 = load i32, i32 addrspace(3)* %0
     45 ;  %2 = load i32, i32 addrspace(1)* %in
     46 ;
     47 ; The instruction selection phase will generate ISA that looks like this:
     48 ; %OQAP = LDS_READ_RET
     49 ; %vreg0 = MOV %OQAP
     50 ; %vreg1 = VTX_READ_32
     51 ; %vreg2 = ADD_INT %vreg1, %vreg0
     52 ;
     53 ; The bottom scheduler will schedule the two ALU instructions first:
     54 ;
     55 ; UNSCHEDULED:
     56 ; %OQAP = LDS_READ_RET
     57 ; %vreg1 = VTX_READ_32
     58 ;
     59 ; SCHEDULED:
     60 ;
     61 ; vreg0 = MOV %OQAP
     62 ; vreg2 = ADD_INT %vreg1, %vreg0
     63 ;
     64 ; The lack of proper aliasing causes the local memory read (LDS_READ_RET)
     65 ; to consider the global memory read (VTX_READ_32) as a chain dependency, so
     66 ; the global memory read will always be scheduled first.  This will give us a
     67 ; final program which looks like this:
     68 ;
     69 ; Alu clause:
     70 ; %OQAP = LDS_READ_RET
     71 ; VTX clause:
     72 ; %vreg1 = VTX_READ_32
     73 ; Alu clause:
     74 ; vreg0 = MOV %OQAP
     75 ; vreg2 = ADD_INT %vreg1, %vreg0
     76 ;
     77 ; This is an illegal program because the OQAP def and use now occur in
     78 ; different ALU clauses.
     79 ;
     80 ; This test checks this scenario and makes sure it doesn't result in an
     81 ; illegal program.  For now, we have fixed this issue by merging the
     82 ; LDS_READ_RET and MOV together during instruction selection and then
     83 ; expanding them after scheduling.  Once the scheduler has better alias
     84 ; analysis, we should be able to keep these instructions separate before
     85 ; scheduling.
     86 ;
     87 ; CHECK-LABEL: {{^}}local_global_alias:
     88 ; CHECK: LDS_READ_RET
     89 ; CHECK-NOT: ALU clause
     90 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
     91 define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; stores *%in + @local_mem[0] to *%out
     92 entry:
     93   %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0 ; address of @local_mem[0] (constant index)
     94   %1 = load i32, i32 addrspace(3)* %0 ; load from the global value declared in addrspace(3)
     95   %2 = load i32, i32 addrspace(1)* %in ; addrspace(1) load immediately after the local load, with no barrier in between
     96   %3 = add i32 %2, %1 ; sum of the two loaded values
     97   store i32 %3, i32 addrspace(1)* %out ; write the sum through %out
     98   ret void
     99 }
    100