; LLVM CodeGen regression test (AMDGPU): 64-bit unsigned bitfield-extract (BFE) patterns.
      1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
      2 ; FIXME: Fails with -enable-var-scope
      3 
      4 ; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
      5 
; Extract the high bit of the low half
; Expected lowering: a single 32-bit shift of the low dword (no 64-bit shift),
; with the result's high dword materialized as 0.
; GCN-LABEL: {{^}}v_uextract_bit_31_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  ; NOTE(review): this test (and a few siblings) uses workgroup.id.x while most
  ; others use workitem.id.x — presumably intentional; confirm against upstream.
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31      ; move bit 31 down to bit 0
  %bit = and i64 %srl, 1          ; single-bit extract: (x >> 31) & 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
     22 
; Extract the high bit of the high half
; Expected lowering: load only the high dword (offset:4) and shift it by 31;
; the low half of the source is never read.
; GCN-LABEL: {{^}}v_uextract_bit_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 63      ; bit 63 = bit 31 of the high dword
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
     40 
; Extract bit 1 of the low half.
; Expected lowering: 32-bit v_bfe_u32 (offset 1, width 1) on the low dword.
; GCN-LABEL: {{^}}v_uextract_bit_1_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 1
  %bit = and i64 %srl, 1          ; (x >> 1) & 1 == bfe(x, 1, 1)
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
     56 
; Extract bit 20 of the low half.
; Expected lowering: 32-bit v_bfe_u32 (offset 20, width 1) on the low dword.
; GCN-LABEL: {{^}}v_uextract_bit_20_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 20
  %bit = and i64 %srl, 1          ; (x >> 20) & 1 == bfe(x, 20, 1)
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
     72 
; Extract bit 32 — the lowest bit of the high half.
; Expected lowering: load only the high dword (offset:4); no shift is needed,
; just an AND with 1.
; GCN-LABEL: {{^}}v_uextract_bit_32_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 32      ; bit 32 = bit 0 of the high dword
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
     89 
; Extract bit 33 — bit 1 of the high half.
; Expected lowering: load only the high dword (offset:4) and bfe(offset 1, width 1).
; GCN-LABEL: {{^}}v_uextract_bit_33_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33      ; bit 33 = bit 1 of the high dword
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
    106 
; Extract bits [20,21] of the low half (two-bit field).
; Expected lowering: 32-bit v_bfe_u32 (offset 20, width 2) on the low dword.
; GCN-LABEL: {{^}}v_uextract_bit_20_21_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 20
  %bit = and i64 %srl, 3          ; mask = 0b11 -> width-2 field
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
    122 
; Extract bits [1,30] of the low half (30-bit field).
; Expected lowering: 32-bit v_bfe_u32 (offset 1, width 30) on the low dword.
; GCN-LABEL: {{^}}v_uextract_bit_1_30_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 1
  %bit = and i64 %srl, 1073741823 ; 0x3fffffff -> 30-bit mask
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
    138 
; Extract bits [1,31] of the low half (31-bit field starting at bit 1).
; Expected lowering: the mask clears the one bit the high dword could have
; contributed, so this reduces to a plain 32-bit shift of the low dword.
; GCN-LABEL: {{^}}v_uextract_bit_1_31_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 1
  %bit = and i64 %srl, 2147483647 ; 0x7fffffff -> 31-bit mask
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
    154 
; Spans the dword boundary, so requires full shift.
; Truncated after the shift, so only low shift result is used.
; Expected lowering: v_alignbit_b32 performs the cross-dword funnel shift,
; then a 32-bit AND extracts the two-bit field.
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64:
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31      ; field [31,32] straddles the 32-bit boundary
  %bit = and i64 %srl, 3
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
    173 
; Two-bit field entirely within the high half.
; NOTE(review): the shift amount is 33, so this actually extracts bits [33,34]
; (bfe offset 1, width 2 on the high dword), not [32,33] as the name suggests;
; the CHECK lines agree with the code — verify naming against upstream.
; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %bit = and i64 %srl, 3
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
    190 
; 30-bit field starting at bit 30 — spans the dword boundary.
; Expected lowering: v_alignbit_b32 funnel shift followed by a 30-bit AND mask.
; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64:
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 30
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 30
  %bit = and i64 %srl, 1073741823 ; 0x3fffffff -> 30-bit mask
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
    208 
; 30-bit field starting at bit 33 — entirely within the high half.
; Expected lowering: load only the high dword (offset:4), bfe(offset 1, width 30).
; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %bit = and i64 %srl, 1073741823 ; 0x3fffffff -> 30-bit mask
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
    225 
; 32-bit field starting at bit 31 — the mask covers everything the shift leaves
; in the low dword, so no AND is expected, only the alignbit funnel shift.
; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %and = and i64 %srl, 4294967295 ; 0xffffffff -> keep low 32 bits of the shift
  ; NOTE(review): stores to %out, not %out.gep (which is dead here) — this is
  ; what the CHECK lines were generated from; confirm intent against upstream.
  store i64 %and, i64 addrspace(1)* %out
  ret void
}
    241 
; trunc applied before and mask
; Same single-bit extract as v_uextract_bit_31_i64, but the result is truncated
; to i32 first; expect a 32-bit shift and a dword (not dwordx2) store.
; GCN-LABEL: {{^}}v_uextract_bit_31_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN: buffer_store_dword v[[SHIFT]]
define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %trunc = trunc i64 %srl to i32  ; truncate before masking
  %bit = and i32 %trunc, 1
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}
    258 
; Truncated single-bit extract of bit 3.
; Expected lowering: 32-bit bfe(offset 3, width 1) on the low dword, dword store.
; GCN-LABEL: {{^}}v_uextract_bit_3_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 3
  %trunc = trunc i64 %srl to i32
  %bit = and i32 %trunc, 1
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}
    274 
; Truncated single-bit extract of bit 33 (bit 1 of the high half).
; Expected lowering: load only the high dword (offset:4), bfe(offset 1, width 1).
; GCN-LABEL: {{^}}v_uextract_bit_33_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %trunc = trunc i64 %srl to i32
  %bit = and i32 %trunc, 1
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}
    290 
; Truncated two-bit extract spanning the dword boundary.
; Expected lowering: alignbit funnel shift, then AND in place; the negative
; checks assert the shifted register is not touched again before the store.
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32:
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]]
; GCN-NOT: v[[SHRLO]]
; GCN: buffer_store_dword v[[SHRLO]]
define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %trunc = trunc i64 %srl to i32
  %bit = and i32 %trunc, 3
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}
    308 
; Mask 4 (0b100) is not a contiguous low mask, so this must NOT form a BFE;
; expect a plain 32-bit shift followed by an AND on the low dword.
; GCN-LABEL: {{^}}and_not_mask_i64:
; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}}
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
; GCN-NOT: v[[SHRLO]]
; GCN-NOT: v[[SHRHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 20
  %bit = and i64 %srl, 4          ; non-low-mask constant defeats the BFE pattern
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}
    328 
; The instruction count is the same with/without hasOneUse, but
; keeping the 32-bit and has a smaller encoding size than the bfe.
; The shift has two uses (stored raw and masked), so the full 64-bit
; v_lshr_b64 is kept and the masked result reuses its low half.
; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64:
; GCN-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 27
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 27
  %bit = and i64 %srl, 3
  store volatile i64 %srl, i64 addrspace(1)* %out
  store volatile i64 %bit, i64 addrspace(1)* %out
  ret void
}
    350 
; Multi-use shift by 34: both uses only read the high half, so a single
; high-dword load (offset:4) feeds a 32-bit shift (for the raw shift result)
; and a bfe (for the masked field); both high dwords stored are zero.
; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]]
; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 34      ; shift amount >= 32 -> only the high dword matters
  %bit = and i64 %srl, 7
  store volatile i64 %srl, i64 addrspace(1)* %out
  store volatile i64 %bit, i64 addrspace(1)* %out
  ret void
}
    370 
; Like the multi-use cases, but the second use reads the upper half of the
; shift result, which is provably zero (x >> 33 fits in 31 bits), so only the
; zero register is stored for it.
; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}}
; GCN: buffer_store_dword v[[ZERO]]
define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x
  %out1.gep = getelementptr i32, i32 addrspace(1)* %out1, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %bit = and i64 %srl, 7          ; 3-bit field: bits [33,35] of the source
  store volatile i64 %bit, i64 addrspace(1)* %out0.gep

  ; Upper 32 bits of %srl are always zero, so %srl.hi folds to 0.
  %srl.srl32 = lshr i64 %srl, 32
  %srl.hi = trunc i64 %srl.srl32 to i32
  store volatile i32 %srl.hi, i32 addrspace(1)* %out1.gep
  ret void
}
    392 
    393 declare i32 @llvm.amdgcn.workitem.id.x() #0
    394 
    395 declare i32 @llvm.amdgcn.workgroup.id.x() #0
    396 
    397 attributes #0 = { nounwind readnone }
    398 attributes #1 = { nounwind }
    399