; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
;
; This test checks that the lds input queue is empty at the end of
; the ALU clause.
; NOTE(review): the directives below match OQAP, which the discussion further
; down calls the lds *output* queue -- confirm which name is intended.

; CHECK-LABEL: {{^}}lds_input_queue:
; CHECK: LDS_READ_RET * OQAP
; CHECK-NOT: ALU clause
; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP

; Two-element i32 array in address space 3 that the test functions load from.
@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4

; Loads @local_mem[%index], executes the local barrier intrinsic, then loads
; from %in, adds the two loaded values and stores the sum to %out.  The
; directives above require that the output contains no "ALU clause" text
; between the LDS_READ_RET and the MOV that reads OQAP.
define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
entry:
  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
  %1 = load i32, i32 addrspace(3)* %0
  call void @llvm.AMDGPU.barrier.local()

  ; This will start a new clause for the vertex fetch
  %2 = load i32, i32 addrspace(1)* %in
  %3 = add i32 %1, %2
  store i32 %3, i32 addrspace(1)* %out
  ret void
}

declare void @llvm.AMDGPU.barrier.local()

; The machine scheduler does not do proper alias analysis and assumes that
; loads from global values (Note that a global value is different than a
; value from global memory.  A global value is a value that is declared
; outside of a function, it can reside in any address space) alias with
; all other loads.
;
; This is a problem for scheduling the reads from the local data share (lds).
; These reads are implemented using two instructions.  The first copies the
; data from lds into the lds output queue, and the second moves the data from
; the output queue into a register.  These two instructions don't have to be
; scheduled one after the other, but they do need to be scheduled in the same
; clause.
; The aliasing problem mentioned above causes problems when there is a
; load from global memory which immediately follows a load from a global value
; that has been declared in the local memory space:
;
;  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
;  %1 = load i32, i32 addrspace(3)* %0
;  %2 = load i32, i32 addrspace(1)* %in
;
; The instruction selection phase will generate ISA that looks like this:
;
;  %OQAP = LDS_READ_RET
;  %vreg0 = MOV %OQAP
;  %vreg1 = VTX_READ_32
;  %vreg2 = ADD_INT %vreg1, %vreg0
;
; The bottom scheduler will schedule the two ALU instructions first:
;
; UNSCHEDULED:
;  %OQAP = LDS_READ_RET
;  %vreg1 = VTX_READ_32
;
; SCHEDULED:
;
;  %vreg0 = MOV %OQAP
;  %vreg2 = ADD_INT %vreg1, %vreg0
;
; The lack of proper alias analysis makes the scheduler see a chain dependency
; between the local memory read (LDS_READ_RET) and the global memory read
; (VTX_READ_32), so the global memory read will always be scheduled first.
; This will give us a final program which looks like this:
;
; ALU clause:
;  %OQAP = LDS_READ_RET
; VTX clause:
;  %vreg1 = VTX_READ_32
; ALU clause:
;  %vreg0 = MOV %OQAP
;  %vreg2 = ADD_INT %vreg1, %vreg0
;
; This is an illegal program because the OQAP def and use now occur in
; different ALU clauses.
;
; This test checks this scenario and makes sure it doesn't result in an
; illegal program.  For now, we have fixed this issue by merging the
; LDS_READ_RET and MOV together during instruction selection and then
; expanding them after scheduling.  Once the scheduler has better alias
; analysis, we should be able to keep these instructions separate before
; scheduling.
;
; CHECK-LABEL: {{^}}local_global_alias:
; CHECK: LDS_READ_RET
; CHECK-NOT: ALU clause
; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP

; Loads element 0 of @local_mem (address space 3) and then, with nothing in
; between, loads from %in (address space 1); the two values are added and the
; sum is stored to %out.  The directives above require that the output
; contains no "ALU clause" text between the LDS_READ_RET and the MOV that
; reads OQAP.
define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
  %1 = load i32, i32 addrspace(3)* %0
  %2 = load i32, i32 addrspace(1)* %in
  %3 = add i32 %2, %1
  store i32 %3, i32 addrspace(1)* %out
  ret void
}