Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s
      2 ; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA4 -check-prefix=SI %s
      3 ; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
      4 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=CI-ALLOCA16 -check-prefix=SI %s
      5 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=CI-PROMOTE -check-prefix=SI %s
      6 
      7 declare void @llvm.amdgcn.s.barrier() #0 ; barrier intrinsic; the test functions below call it between their stack-array store and the reload
      8 
      9 ; SI-LABEL: {{^}}private_access_f64_alloca:
     10 
     11 ; SI-ALLOCA16: buffer_store_dwordx2
     12 ; SI-ALLOCA16: buffer_load_dwordx2
     13 
     14 ; SI-ALLOCA4: buffer_store_dword v
     15 ; SI-ALLOCA4: buffer_store_dword v
     16 ; SI-ALLOCA4: buffer_load_dword v
     17 ; SI-ALLOCA4: buffer_load_dword v
     18 
     19 ; SI-PROMOTE: ds_write_b64
     20 ; SI-PROMOTE: ds_read_b64
     21 ; CI-PROMOTE: ds_write_b64
     22 ; CI-PROMOTE: ds_read_b64
     23 define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 { ; copies one double from %in to %out by way of a stack-array element chosen by %b
     24   %loaded = load double, double addrspace(1)* %in, align 8 ; read the input value from %in
     25   %slots = alloca [16 x double], align 8 ; 16-element stack array of double
     26   %slot = getelementptr inbounds [16 x double], [16 x double]* %slots, i32 0, i32 %b ; address of element %b (runtime argument)
     27   store double %loaded, double* %slot, align 8 ; write the value into the selected element
     28   call void @llvm.amdgcn.s.barrier() ; barrier sits between the store and the reload
     29   %reloaded = load double, double* %slot, align 8 ; read the same element back
     30   store double %reloaded, double addrspace(1)* %out, align 8 ; write the reloaded value to %out
     31   ret void
     32 }
     33 
     34 ; SI-LABEL: {{^}}private_access_v2f64_alloca:
     35 
     36 ; SI-ALLOCA16: buffer_store_dwordx4
     37 ; SI-ALLOCA16: buffer_load_dwordx4
     38 
     39 ; SI-ALLOCA4: buffer_store_dword v
     40 ; SI-ALLOCA4: buffer_store_dword v
     41 ; SI-ALLOCA4: buffer_store_dword v
     42 ; SI-ALLOCA4: buffer_store_dword v
     43 ; SI-ALLOCA4: buffer_load_dword v
     44 ; SI-ALLOCA4: buffer_load_dword v
     45 ; SI-ALLOCA4: buffer_load_dword v
     46 ; SI-ALLOCA4: buffer_load_dword v
     47 
     48 ; SI-PROMOTE: ds_write_b64
     49 ; SI-PROMOTE: ds_write_b64
     50 ; SI-PROMOTE: ds_read_b64
     51 ; SI-PROMOTE: ds_read_b64
     52 ; CI-PROMOTE: ds_write2_b64
     53 ; CI-PROMOTE: ds_read2_b64
     54 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 { ; copies one <2 x double> from %in to %out by way of a stack-array element chosen by %b
     55   %loaded = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 ; read the input vector from %in
     56   %slots = alloca [8 x <2 x double>], align 16 ; 8-element stack array of <2 x double>
     57   %slot = getelementptr inbounds [8 x <2 x double>], [8 x <2 x double>]* %slots, i32 0, i32 %b ; address of element %b (runtime argument)
     58   store <2 x double> %loaded, <2 x double>* %slot, align 16 ; write the vector into the selected element
     59   call void @llvm.amdgcn.s.barrier() ; barrier sits between the store and the reload
     60   %reloaded = load <2 x double>, <2 x double>* %slot, align 16 ; read the same element back
     61   store <2 x double> %reloaded, <2 x double> addrspace(1)* %out, align 16 ; write the reloaded vector to %out
     62   ret void
     63 }
     64 
     65 ; SI-LABEL: {{^}}private_access_i64_alloca:
     66 
     67 ; SI-ALLOCA16: buffer_store_dwordx2
     68 ; SI-ALLOCA16: buffer_load_dwordx2
     69 
     70 ; SI-ALLOCA4: buffer_store_dword v
     71 ; SI-ALLOCA4: buffer_store_dword v
     72 ; SI-ALLOCA4: buffer_load_dword v
     73 ; SI-ALLOCA4: buffer_load_dword v
     74 
     75 
     76 ; SI-PROMOTE: ds_write_b64
     77 ; SI-PROMOTE: ds_read_b64
     78 ; CI-PROMOTE: ds_write_b64
     79 ; CI-PROMOTE: ds_read_b64
     80 define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 { ; copies one i64 from %in to %out by way of a stack-array element chosen by %b
     81   %loaded = load i64, i64 addrspace(1)* %in, align 8 ; read the input value from %in
     82   %slots = alloca [8 x i64], align 8 ; 8-element stack array of i64
     83   %slot = getelementptr inbounds [8 x i64], [8 x i64]* %slots, i32 0, i32 %b ; address of element %b (runtime argument)
     84   store i64 %loaded, i64* %slot, align 8 ; write the value into the selected element
     85   call void @llvm.amdgcn.s.barrier() ; barrier sits between the store and the reload
     86   %reloaded = load i64, i64* %slot, align 8 ; read the same element back
     87   store i64 %reloaded, i64 addrspace(1)* %out, align 8 ; write the reloaded value to %out
     88   ret void
     89 }
     90 
     91 ; SI-LABEL: {{^}}private_access_v2i64_alloca:
     92 
     93 ; SI-ALLOCA16: buffer_store_dwordx4
     94 ; SI-ALLOCA16: buffer_load_dwordx4
     95 
     96 ; SI-ALLOCA4: buffer_store_dword v
     97 ; SI-ALLOCA4: buffer_store_dword v
     98 ; SI-ALLOCA4: buffer_store_dword v
     99 ; SI-ALLOCA4: buffer_store_dword v
    100 
    101 ; SI-ALLOCA4: buffer_load_dword v
    102 ; SI-ALLOCA4: buffer_load_dword v
    103 ; SI-ALLOCA4: buffer_load_dword v
    104 ; SI-ALLOCA4: buffer_load_dword v
    105 
    106 ; SI-PROMOTE: ds_write_b64
    107 ; SI-PROMOTE: ds_write_b64
    108 ; SI-PROMOTE: ds_read_b64
    109 ; SI-PROMOTE: ds_read_b64
    110 ; CI-PROMOTE: ds_write2_b64
    111 ; CI-PROMOTE: ds_read2_b64
    112 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 { ; copies one <2 x i64> from %in to %out by way of a stack-array element chosen by %b
    113   %loaded = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 ; read the input vector from %in
    114   %slots = alloca [8 x <2 x i64>], align 16 ; 8-element stack array of <2 x i64>
    115   %slot = getelementptr inbounds [8 x <2 x i64>], [8 x <2 x i64>]* %slots, i32 0, i32 %b ; address of element %b (runtime argument)
    116   store <2 x i64> %loaded, <2 x i64>* %slot, align 16 ; write the vector into the selected element
    117   call void @llvm.amdgcn.s.barrier() ; barrier sits between the store and the reload
    118   %reloaded = load <2 x i64>, <2 x i64>* %slot, align 16 ; read the same element back
    119   store <2 x i64> %reloaded, <2 x i64> addrspace(1)* %out, align 16 ; write the reloaded vector to %out
    120   ret void
    121 }
    122 
    123 attributes #0 = { convergent nounwind } ; attribute group used by the @llvm.amdgcn.s.barrier declaration
    124 attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="2" "amdgpu-max-work-group-size"="64" } ; attribute group shared by all four private_access_* test functions
    125