; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

declare i32 @llvm.r600.read.tidig.x() #0
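; Note: llvm.r600.read.tidig.x returns the workitem id in the x dimension. The
; v_and_* tests below use it to build per-lane addresses so the loaded values
; are divergent and the and is selected to the VALU (v_and_b32) rather than
; the SALU (s_and_b32).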

; FUNC-LABEL: {{^}}test2:
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}

define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = and <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test4:
; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}

define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = and <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_and_i32:
; SI: s_and_b32
define amdgpu_kernel void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  %and = and i32 %a, %b
  store i32 %and, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}s_and_constant_i32:
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
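; Note: 1234567 = 0x12d687, which is not an inline constant, so s_and_b32 is
; expected to take it as a 32-bit literal operand.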
define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
  %and = and i32 %a, 1234567
  store i32 %and, i32 addrspace(1)* %out, align 4
  ret void
}

; FIXME: We should really duplicate the constant so that the SALU use
; can fold into the s_and_b32 and the VALU one is materialized
; directly without copying from the SGPR.

; Second use is a VGPR use of the constant.
; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_0:
; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687
; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; SI: buffer_store_dword [[VK]]
define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  %and = and i32 %a, 1234567

  ; Just to stop future replacement of copy to vgpr + store with VALU op.
  %foo = add i32 %and, %b
  store volatile i32 %foo, i32 addrspace(1)* %out
  store volatile i32 1234567, i32 addrspace(1)* %out
  ret void
}

; Second use is another SGPR use of the constant.
; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_1:
; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687
; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI: s_add_i32
; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI: v_mov_b32_e32 [[VADD:v[0-9]+]], [[ADD]]
; SI: buffer_store_dword [[VADD]]
define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  %and = and i32 %a, 1234567
  %foo = add i32 %and, 1234567
  %bar = add i32 %foo, %b
  store volatile i32 %bar, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr:
; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
  %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load i32, i32 addrspace(1)* %gep.a
  %b = load i32, i32 addrspace(1)* %gep.b
  %and = and i32 %a, %b
  store i32 %and, i32 addrspace(1)* %gep.out
  ret void
}

; FUNC-LABEL: {{^}}v_and_i32_sgpr_vgpr:
; SI-DAG: s_load_dword [[SA:s[0-9]+]]
; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
define amdgpu_kernel void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %b = load i32, i32 addrspace(1)* %gep.b
  %and = and i32 %a, %b
  store i32 %and, i32 addrspace(1)* %gep.out
  ret void
}

; FUNC-LABEL: {{^}}v_and_i32_vgpr_sgpr:
; SI-DAG: s_load_dword [[SA:s[0-9]+]]
; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  %a = load i32, i32 addrspace(1)* %gep.a
  %and = and i32 %a, %b
  store i32 %and, i32 addrspace(1)* %gep.out
  ret void
}

; FUNC-LABEL: {{^}}v_and_constant_i32:
; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
  %a = load i32, i32 addrspace(1)* %gep, align 4
  %and = and i32 %a, 1234567
  store i32 %and, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32:
; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
  %a = load i32, i32 addrspace(1)* %gep, align 4
  %and = and i32 %a, 64
  store i32 %and, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32:
; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
  %a = load i32, i32 addrspace(1)* %gep, align 4
  %and = and i32 %a, -16
  store i32 %and, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}s_and_i64:
; SI: s_and_b64
define amdgpu_kernel void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
  %and = and i64 %a, %b
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_i1:
; SI: s_load_dword [[LOAD:s[0-9]+]]
; SI: s_lshr_b32 [[B_SHIFT:s[0-9]+]], [[LOAD]], 8
; SI: s_and_b32 [[AND:s[0-9]+]], [[LOAD]], [[B_SHIFT]]
; SI: s_and_b32 [[AND_TRUNC:s[0-9]+]], [[AND]], 1{{$}}
; SI: v_mov_b32_e32 [[V_AND_TRUNC:v[0-9]+]], [[AND_TRUNC]]
; SI: buffer_store_byte [[V_AND_TRUNC]]
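; Note: the two i1 arguments appear to land in adjacent bytes of the same
; kernarg dword, hence the single s_load_dword, the shift by 8 to extract %b,
; and the final mask with 1 to keep only the low bit before the byte store.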
define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
  %and = and i1 %a, %b
  store i1 %and, i1 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_and_constant_i64:
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}}
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}}
; SI: buffer_store_dwordx2
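; Note: 549756338176 = 0x0000008000080000, which cannot be expressed as a
; 64-bit inline constant or sign-extended 32-bit literal, so the and is split
; per dword: low & 0x80000 and high & 0x80, matching the two checks above.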
define amdgpu_kernel void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
  %and = and i64 %a, 549756338176
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_multi_use_constant_i64:
; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}}
; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}}
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}
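; Note: no RUN line enables the XSI prefix, so these lines are not checked;
; they appear to document the preferred codegen (materialize the constant once
; and use a single s_and_b64) without enforcing it.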
define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
  %and0 = and i64 %a, 549756338176
  %and1 = and i64 %b, 549756338176
  store volatile i64 %and0, i64 addrspace(1)* %out
  store volatile i64 %and1, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_and_32_bit_constant_i64:
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
; SI-NOT: and
; SI: buffer_store_dwordx2
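; Note: the mask's high 32 bits are zero, so only the low dword needs an and
; (with the literal 0x12d687); the high dword of the result is simply zero.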
define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i32, i64 %a) {
  %and = and i64 %a, 1234567
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_multi_use_inline_imm_i64:
; SI: s_load_dwordx2
; SI: s_load_dword [[A:s[0-9]+]]
; SI: s_load_dword [[B:s[0-9]+]]
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_lshl_b32 [[A]], [[A]], 1
; SI: s_lshl_b32 [[B]], [[B]], 1
; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
; SI-NOT: and
; SI: buffer_store_dwordx2
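; Note: 62 is an inline constant and keeps only bits 1-5 of the shifted value,
; so the shift-and-mask narrows to 32 bits: only the low dword of each i64
; argument is loaded (s_load_dword), and the shift and and use the 32-bit
; SALU forms shown above.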
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
  %shl.a = shl i64 %a, 1
  %shl.b = shl i64 %b, 1
  %and0 = and i64 %shl.a, 62
  %and1 = and i64 %shl.b, 62
  %add0 = add i64 %and0, %c
  %add1 = add i64 %and1, %c
  store volatile i64 %add0, i64 addrspace(1)* %out
  store volatile i64 %add1, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v_and_i64:
; SI: v_and_b32
; SI: v_and_b32
define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.a, align 8
  %gep.b = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %b = load i64, i64 addrspace(1)* %gep.b, align 8
  %and = and i64 %a, %b
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_and_constant_i64:
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}}
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
; SI: buffer_store_dwordx2
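; Note: 1231231234567 = 0x0000011eab19b207, so the low dword is anded with the
; literal 0xab19b207 and the high dword with 0x11e.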
define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.a, align 8
  %and = and i64 %a, 1231231234567
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64:
; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}}
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI0]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO1]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]]
; SI: buffer_store_dwordx2
; SI: buffer_store_dwordx2
define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
  %a = load volatile i64, i64 addrspace(1)* %aptr
  %b = load volatile i64, i64 addrspace(1)* %aptr
  %and0 = and i64 %a, 1231231234567
  %and1 = and i64 %b, 1231231234567
  store volatile i64 %and0, i64 addrspace(1)* %out
  store volatile i64 %and1, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v_and_multi_use_inline_imm_i64:
; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
; SI-NOT: and
; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
; SI-NOT: and
; SI: v_and_b32_e32 v[[RESLO0:[0-9]+]], 63, v[[LO0]]
; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
; SI-NOT: and
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
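; Note: 63 is an inline constant and the mask's high 32 bits are zero, so each
; value only needs a v_and on its low dword; the stored high dwords are zero.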
define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
  %a = load volatile i64, i64 addrspace(1)* %aptr
  %b = load volatile i64, i64 addrspace(1)* %aptr
  %and0 = and i64 %a, 63
  %and1 = and i64 %b, 63
  store volatile i64 %and0, i64 addrspace(1)* %out
  store volatile i64 %and1, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant:
; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; SI-NOT: and
; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.a, align 8
  %and = and i64 %a, 1234567
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
; SI: {{buffer|flat}}_load_dword v{{[0-9]+}}
; SI-NOT: and
; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.a, align 8
  %and = and i64 %a, 64
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FIXME: Should be able to reduce load width
; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64:
; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; SI-NOT: and
; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
; SI-NOT: and
; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
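; Note: -8 sign-extends to 0xfffffffffffffff8, so the mask's high dword is all
; ones (a no-op on the high half) and only the low dword is anded with the
; inline constant -8; the FIXME above notes the 64-bit load could be narrowed.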
define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
  %tid = call i32 @llvm.r600.read.tidig.x() #0
  %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.a, align 8
  %and = and i64 %a, -8
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64:
; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64
; SI-NOT: and
; SI: buffer_store_dword
define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 64
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
; SI: s_load_dword [[A:s[0-9]+]]
; SI: s_lshl_b32 [[A]], [[A]], 1{{$}}
; SI-NOT: and
; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64
; SI-NOT: and
; SI: s_add_u32
; SI-NEXT: s_addc_u32
define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i32, i64 %b) {
  %shl = shl i64 %a, 1
  %and = and i64 %shl, 64
  %add = add i64 %and, %b
  store i64 %add, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64:
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 1
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64:
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0

; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
; SI-NOT: and
; SI: buffer_store_dwordx2
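; Note: 4607182418800017408 = 0x3ff0000000000000, the f64 encoding of 1.0. Its
; low dword is zero, so only the high dword needs an and, and 0x3ff00000 is not
; an f32 inline constant, so a 32-bit literal is used (the unchecked XSI line
; documents the ideal s_and_b64 with the f64 inline 1.0).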
define amdgpu_kernel void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 4607182418800017408
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64:
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0

; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 13830554455654793216
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64:
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5

; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 4602678819172646912
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64:
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5

; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 13826050856027422720
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64:
; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
; SI-NOT: and
; SI: buffer_store_dwordx2
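; Note: the f64 encoding of 2.0 is 0x4000000000000000; its high dword
; 0x40000000 is also the f32 encoding of 2.0, which is a valid 32-bit inline
; constant, so the operand is printed as 2.0 rather than a hex literal.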
define amdgpu_kernel void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 4611686018427387904
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64:
; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 13835058055282163712
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64:
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0

; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
; SI-NOT: and
; SI: buffer_store_dwordx2
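; Note: the f64 encoding of 4.0 is 0x4010000000000000; unlike the 2.0 case, its
; high dword 0x40100000 is not the f32 encoding of 4.0 (0x40800000), so no
; inline constant applies and a 32-bit literal is emitted instead.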
define amdgpu_kernel void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 4616189618054758400
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64:
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0

; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 13839561654909534208
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; Test with the 64-bit integer bitpattern for a 32-bit float in the
; low 32 bits, which is not a valid 64-bit inline immediate.
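; Note: 1082130432 = 0x40800000, the f32 encoding of 4.0, zero-extended to 64
; bits. Only the low dword needs an and, and because 0x40800000 is an f32
; inline constant the operand prints as 4.0.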

; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64:
; SI: s_load_dwordx2
; SI: s_load_dword s
; SI-NOT: and
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 1082130432
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64:
; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, -1065353216
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; Shift the f32 bit pattern into the upper 32 bits.
; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64:
; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
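; Note: 4647714815446351872 = 0x4080000000000000, i.e. the f32 4.0 bit pattern
; in the high dword, so here the high dword is anded with the inline constant
; 4.0 while the low dword of the result is zero.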
define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 4647714815446351872
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64:
; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
  %and = and i64 %a, 13871086852301127680
  store i64 %and, i64 addrspace(1)* %out, align 8
  ret void
}

attributes #0 = { nounwind readnone }