Home | History | Annotate | Download | only in AMDGPU
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
      3 ; RUN: llc -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM %s
      4 
      5 ; Loosely based on test/CodeGen/{X86,AArch64}/extract-lowbits.ll,
      6 ; but with all 64-bit tests, and tests with loads dropped.
      7 
      8 ; Patterns:
      9 ;   a) x &  (1 << nbits) - 1
     10 ;   b) x & ~(-1 << nbits)
     11 ;   c) x &  (-1 >> (32 - nbits))
     12 ;   d) x << (32 - nbits) >> (32 - nbits)
     13 ; are equivalent.
     14 
     15 ; ---------------------------------------------------------------------------- ;
     16 ; Pattern a. 32-bit
     17 ; ---------------------------------------------------------------------------- ;
     18 
     19 define amdgpu_kernel void @bzhi32_a0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
        ; Pattern a, base case: masks %val with (1 << %numlowbits) - 1 and stores
        ; the result through %out.
        ; The autogenerated checks below expect, for both check prefixes, that the
        ; masking shows up as a single BFE_UINT; the only other ALU instruction
        ; expected is an LSHR by literal 2.
     20 ; EG-LABEL: bzhi32_a0:
     21 ; EG:       ; %bb.0:
     22 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
     23 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
     24 ; EG-NEXT:    CF_END
     25 ; EG-NEXT:    PAD
     26 ; EG-NEXT:    ALU clause starting at 4:
     27 ; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
     28 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
     29 ; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
     30 ;
     31 ; CM-LABEL: bzhi32_a0:
     32 ; CM:       ; %bb.0:
     33 ; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
     34 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
     35 ; CM-NEXT:    CF_END
     36 ; CM-NEXT:    PAD
     37 ; CM-NEXT:    ALU clause starting at 4:
     38 ; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
     39 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
     40 ; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
     41   %onebit = shl i32 1, %numlowbits ; 1 << numlowbits
     42   %mask = add nsw i32 %onebit, -1 ; (1 << numlowbits) - 1
     43   %masked = and i32 %mask, %val ; mask is the first operand here
     44   store i32 %masked, i32 addrspace(1)* %out
     45   ret void
     46 }
     47 
     48 define amdgpu_kernel void @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
        ; Pattern a with a narrow bit count: %numlowbits is an i8 argument marked
        ; zeroext and is widened to i32 with zext before building the
        ; (1 << n) - 1 mask.
        ; The autogenerated checks expect an 8-bit fetch (VTX_READ_8), a BFE_INT
        ; with literal 8 on the fetched value, and then a BFE_UINT that takes its
        ; last operand from the previous result (PV.W).
     49 ; EG-LABEL: bzhi32_a1_indexzext:
     50 ; EG:       ; %bb.0:
     51 ; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
     52 ; EG-NEXT:    TEX 0 @6
     53 ; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
     54 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
     55 ; EG-NEXT:    CF_END
     56 ; EG-NEXT:    PAD
     57 ; EG-NEXT:    Fetch clause starting at 6:
     58 ; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
     59 ; EG-NEXT:    ALU clause starting at 8:
     60 ; EG-NEXT:     MOV * T0.X, 0.0,
     61 ; EG-NEXT:    ALU clause starting at 9:
     62 ; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
     63 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
     64 ; EG-NEXT:     BFE_UINT T0.X, KC0[2].Y, 0.0, PV.W,
     65 ; EG-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
     66 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
     67 ;
     68 ; CM-LABEL: bzhi32_a1_indexzext:
     69 ; CM:       ; %bb.0:
     70 ; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
     71 ; CM-NEXT:    TEX 0 @6
     72 ; CM-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
     73 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
     74 ; CM-NEXT:    CF_END
     75 ; CM-NEXT:    PAD
     76 ; CM-NEXT:    Fetch clause starting at 6:
     77 ; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
     78 ; CM-NEXT:    ALU clause starting at 8:
     79 ; CM-NEXT:     MOV * T0.X, 0.0,
     80 ; CM-NEXT:    ALU clause starting at 9:
     81 ; CM-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
     82 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
     83 ; CM-NEXT:     BFE_UINT * T0.X, KC0[2].Y, 0.0, PV.W,
     84 ; CM-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
     85 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
     86   %conv = zext i8 %numlowbits to i32 ; widen the i8 bit count to i32
     87   %onebit = shl i32 1, %conv ; 1 << numlowbits
     88   %mask = add nsw i32 %onebit, -1 ; (1 << numlowbits) - 1
     89   %masked = and i32 %mask, %val
     90   store i32 %masked, i32 addrspace(1)* %out
     91   ret void
     92 }
     93 
     94 define amdgpu_kernel void @bzhi32_a4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
        ; Pattern a with the operands of the final `and` swapped (%val first,
        ; mask second); everything else matches bzhi32_a0.
        ; The autogenerated checks expect the same output as for bzhi32_a0: one
        ; LSHR by literal 2 and one BFE_UINT.
     95 ; EG-LABEL: bzhi32_a4_commutative:
     96 ; EG:       ; %bb.0:
     97 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
     98 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
     99 ; EG-NEXT:    CF_END
    100 ; EG-NEXT:    PAD
    101 ; EG-NEXT:    ALU clause starting at 4:
    102 ; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    103 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    104 ; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    105 ;
    106 ; CM-LABEL: bzhi32_a4_commutative:
    107 ; CM:       ; %bb.0:
    108 ; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
    109 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
    110 ; CM-NEXT:    CF_END
    111 ; CM-NEXT:    PAD
    112 ; CM-NEXT:    ALU clause starting at 4:
    113 ; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    114 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    115 ; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    116   %onebit = shl i32 1, %numlowbits ; 1 << numlowbits
    117   %mask = add nsw i32 %onebit, -1 ; (1 << numlowbits) - 1
    118   %masked = and i32 %val, %mask ; swapped order
    119   store i32 %masked, i32 addrspace(1)* %out
    120   ret void
    121 }
    122 
    123 ; ---------------------------------------------------------------------------- ;
    124 ; Pattern b. 32-bit
    125 ; ---------------------------------------------------------------------------- ;
    126 
    127 define amdgpu_kernel void @bzhi32_b0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
        ; Pattern b, base case: masks %val with ~(-1 << %numlowbits) and stores
        ; the result through %out.
        ; The autogenerated checks expect, for both check prefixes, one LSHR by
        ; literal 2 and one BFE_UINT, with no separate shift, xor or and.
    128 ; EG-LABEL: bzhi32_b0:
    129 ; EG:       ; %bb.0:
    130 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
    131 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
    132 ; EG-NEXT:    CF_END
    133 ; EG-NEXT:    PAD
    134 ; EG-NEXT:    ALU clause starting at 4:
    135 ; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    136 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    137 ; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    138 ;
    139 ; CM-LABEL: bzhi32_b0:
    140 ; CM:       ; %bb.0:
    141 ; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
    142 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
    143 ; CM-NEXT:    CF_END
    144 ; CM-NEXT:    PAD
    145 ; CM-NEXT:    ALU clause starting at 4:
    146 ; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    147 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    148 ; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    149   %notmask = shl i32 -1, %numlowbits ; -1 << numlowbits
    150   %mask = xor i32 %notmask, -1 ; bitwise NOT of %notmask
    151   %masked = and i32 %mask, %val ; mask is the first operand here
    152   store i32 %masked, i32 addrspace(1)* %out
    153   ret void
    154 }
    155 
    156 define amdgpu_kernel void @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
        ; Pattern b with a narrow bit count: %numlowbits is an i8 argument marked
        ; zeroext and is widened to i32 with zext before building the
        ; ~(-1 << n) mask.
        ; The autogenerated checks expect an 8-bit fetch (VTX_READ_8), a BFE_INT
        ; with literal 8 on the fetched value, and then a BFE_UINT that takes its
        ; last operand from the previous result (PV.W).
    157 ; EG-LABEL: bzhi32_b1_indexzext:
    158 ; EG:       ; %bb.0:
    159 ; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
    160 ; EG-NEXT:    TEX 0 @6
    161 ; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
    162 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
    163 ; EG-NEXT:    CF_END
    164 ; EG-NEXT:    PAD
    165 ; EG-NEXT:    Fetch clause starting at 6:
    166 ; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
    167 ; EG-NEXT:    ALU clause starting at 8:
    168 ; EG-NEXT:     MOV * T0.X, 0.0,
    169 ; EG-NEXT:    ALU clause starting at 9:
    170 ; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
    171 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
    172 ; EG-NEXT:     BFE_UINT T0.X, KC0[2].Y, 0.0, PV.W,
    173 ; EG-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
    174 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    175 ;
    176 ; CM-LABEL: bzhi32_b1_indexzext:
    177 ; CM:       ; %bb.0:
    178 ; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
    179 ; CM-NEXT:    TEX 0 @6
    180 ; CM-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
    181 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
    182 ; CM-NEXT:    CF_END
    183 ; CM-NEXT:    PAD
    184 ; CM-NEXT:    Fetch clause starting at 6:
    185 ; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
    186 ; CM-NEXT:    ALU clause starting at 8:
    187 ; CM-NEXT:     MOV * T0.X, 0.0,
    188 ; CM-NEXT:    ALU clause starting at 9:
    189 ; CM-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
    190 ; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
    191 ; CM-NEXT:     BFE_UINT * T0.X, KC0[2].Y, 0.0, PV.W,
    192 ; CM-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
    193 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    194   %conv = zext i8 %numlowbits to i32 ; widen the i8 bit count to i32
    195   %notmask = shl i32 -1, %conv ; -1 << numlowbits
    196   %mask = xor i32 %notmask, -1 ; bitwise NOT of %notmask
    197   %masked = and i32 %mask, %val
    198   store i32 %masked, i32 addrspace(1)* %out
    199   ret void
    200 }
    201 
    202 define amdgpu_kernel void @bzhi32_b4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
        ; Pattern b with the operands of the final `and` swapped (%val first,
        ; mask second); everything else matches bzhi32_b0.
        ; The autogenerated checks expect the same output as for bzhi32_b0: one
        ; LSHR by literal 2 and one BFE_UINT.
    203 ; EG-LABEL: bzhi32_b4_commutative:
    204 ; EG:       ; %bb.0:
    205 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
    206 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
    207 ; EG-NEXT:    CF_END
    208 ; EG-NEXT:    PAD
    209 ; EG-NEXT:    ALU clause starting at 4:
    210 ; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    211 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    212 ; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    213 ;
    214 ; CM-LABEL: bzhi32_b4_commutative:
    215 ; CM:       ; %bb.0:
    216 ; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
    217 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
    218 ; CM-NEXT:    CF_END
    219 ; CM-NEXT:    PAD
    220 ; CM-NEXT:    ALU clause starting at 4:
    221 ; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    222 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    223 ; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    224   %notmask = shl i32 -1, %numlowbits ; -1 << numlowbits
    225   %mask = xor i32 %notmask, -1 ; bitwise NOT of %notmask
    226   %masked = and i32 %val, %mask ; swapped order
    227   store i32 %masked, i32 addrspace(1)* %out
    228   ret void
    229 }
    230 
    231 ; ---------------------------------------------------------------------------- ;
    232 ; Pattern c. 32-bit
    233 ; ---------------------------------------------------------------------------- ;
    234 
    235 define amdgpu_kernel void @bzhi32_c0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
        ; Pattern c, base case: masks %val with -1 >> (32 - %numlowbits) using a
        ; logical shift right, and stores the result through %out.
        ; The autogenerated checks expect, for both check prefixes, one LSHR by
        ; literal 2 and one BFE_UINT, with no separate subtract, shift or and.
    236 ; EG-LABEL: bzhi32_c0:
    237 ; EG:       ; %bb.0:
    238 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
    239 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
    240 ; EG-NEXT:    CF_END
    241 ; EG-NEXT:    PAD
    242 ; EG-NEXT:    ALU clause starting at 4:
    243 ; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    244 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    245 ; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    246 ;
    247 ; CM-LABEL: bzhi32_c0:
    248 ; CM:       ; %bb.0:
    249 ; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
    250 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
    251 ; CM-NEXT:    CF_END
    252 ; CM-NEXT:    PAD
    253 ; CM-NEXT:    ALU clause starting at 4:
    254 ; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    255 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    256 ; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    257   %numhighbits = sub i32 32, %numlowbits ; 32 - numlowbits
    258   %mask = lshr i32 -1, %numhighbits ; -1 >> (32 - numlowbits), logical shift
    259   %masked = and i32 %mask, %val ; mask is the first operand here
    260   store i32 %masked, i32 addrspace(1)* %out
    261   ret void
    262 }
    263 
    264 define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) {
        ; Pattern c with a narrow bit count: %numlowbits is a plain i8 argument
        ; (no zeroext attribute in this signature). The subtraction from 32 is
        ; done in i8 and only its result is zero-extended to i32 for the shift.
        ; The autogenerated checks expect no BFE_UINT here. After an 8-bit fetch
        ; (VTX_READ_8) they list SUB_INT from literal 32, AND_INT with literal
        ; 255, LSHR of literal -1, and a final AND_INT.
    265 ; EG-LABEL: bzhi32_c1_indexzext:
    266 ; EG:       ; %bb.0:
    267 ; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
    268 ; EG-NEXT:    TEX 0 @6
    269 ; EG-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
    270 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
    271 ; EG-NEXT:    CF_END
    272 ; EG-NEXT:    PAD
    273 ; EG-NEXT:    Fetch clause starting at 6:
    274 ; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
    275 ; EG-NEXT:    ALU clause starting at 8:
    276 ; EG-NEXT:     MOV * T0.X, 0.0,
    277 ; EG-NEXT:    ALU clause starting at 9:
    278 ; EG-NEXT:     SUB_INT * T0.W, literal.x, T0.X,
    279 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
    280 ; EG-NEXT:     AND_INT * T0.W, PV.W, literal.x,
    281 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
    282 ; EG-NEXT:     LSHR * T0.W, literal.x, PV.W,
    283 ; EG-NEXT:    -1(nan), 0(0.000000e+00)
    284 ; EG-NEXT:     AND_INT T0.X, PV.W, KC0[2].Y,
    285 ; EG-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
    286 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    287 ;
    288 ; CM-LABEL: bzhi32_c1_indexzext:
    289 ; CM:       ; %bb.0:
    290 ; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
    291 ; CM-NEXT:    TEX 0 @6
    292 ; CM-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
    293 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
    294 ; CM-NEXT:    CF_END
    295 ; CM-NEXT:    PAD
    296 ; CM-NEXT:    Fetch clause starting at 6:
    297 ; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
    298 ; CM-NEXT:    ALU clause starting at 8:
    299 ; CM-NEXT:     MOV * T0.X, 0.0,
    300 ; CM-NEXT:    ALU clause starting at 9:
    301 ; CM-NEXT:     SUB_INT * T0.W, literal.x, T0.X,
    302 ; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
    303 ; CM-NEXT:     AND_INT * T0.W, PV.W, literal.x,
    304 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
    305 ; CM-NEXT:     LSHR * T0.W, literal.x, PV.W,
    306 ; CM-NEXT:    -1(nan), 0(0.000000e+00)
    307 ; CM-NEXT:     AND_INT * T0.X, PV.W, KC0[2].Y,
    308 ; CM-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
    309 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    310   %numhighbits = sub i8 32, %numlowbits ; 32 - numlowbits, computed in i8
    311   %sh_prom = zext i8 %numhighbits to i32 ; widen the shift amount to i32
    312   %mask = lshr i32 -1, %sh_prom ; -1 >> shift amount, logical shift
    313   %masked = and i32 %mask, %val
    314   store i32 %masked, i32 addrspace(1)* %out
    315   ret void
    316 }
    317 
    318 define amdgpu_kernel void @bzhi32_c4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
        ; Pattern c with the operands of the final `and` swapped (%val first,
        ; mask second); everything else matches bzhi32_c0.
        ; The autogenerated checks expect the same output as for bzhi32_c0: one
        ; LSHR by literal 2 and one BFE_UINT.
    319 ; EG-LABEL: bzhi32_c4_commutative:
    320 ; EG:       ; %bb.0:
    321 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
    322 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
    323 ; EG-NEXT:    CF_END
    324 ; EG-NEXT:    PAD
    325 ; EG-NEXT:    ALU clause starting at 4:
    326 ; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    327 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    328 ; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    329 ;
    330 ; CM-LABEL: bzhi32_c4_commutative:
    331 ; CM:       ; %bb.0:
    332 ; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
    333 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
    334 ; CM-NEXT:    CF_END
    335 ; CM-NEXT:    PAD
    336 ; CM-NEXT:    ALU clause starting at 4:
    337 ; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    338 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    339 ; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    340   %numhighbits = sub i32 32, %numlowbits ; 32 - numlowbits
    341   %mask = lshr i32 -1, %numhighbits ; -1 >> (32 - numlowbits), logical shift
    342   %masked = and i32 %val, %mask ; swapped order
    343   store i32 %masked, i32 addrspace(1)* %out
    344   ret void
    345 }
    346 
    347 ; ---------------------------------------------------------------------------- ;
    348 ; Pattern d. 32-bit.
    349 ; ---------------------------------------------------------------------------- ;
    350 
    351 define amdgpu_kernel void @bzhi32_d0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
        ; Pattern d, base case: shifts %val left by (32 - %numlowbits) and then
        ; logically right by the same amount, with no explicit mask or `and`,
        ; and stores the result through %out.
        ; The autogenerated checks expect, for both check prefixes, one LSHR by
        ; literal 2 and one BFE_UINT, with no separate shift pair.
    352 ; EG-LABEL: bzhi32_d0:
    353 ; EG:       ; %bb.0:
    354 ; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
    355 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
    356 ; EG-NEXT:    CF_END
    357 ; EG-NEXT:    PAD
    358 ; EG-NEXT:    ALU clause starting at 4:
    359 ; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    360 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    361 ; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    362 ;
    363 ; CM-LABEL: bzhi32_d0:
    364 ; CM:       ; %bb.0:
    365 ; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
    366 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
    367 ; CM-NEXT:    CF_END
    368 ; CM-NEXT:    PAD
    369 ; CM-NEXT:    ALU clause starting at 4:
    370 ; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
    371 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    372 ; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
    373   %numhighbits = sub i32 32, %numlowbits ; 32 - numlowbits
    374   %highbitscleared = shl i32 %val, %numhighbits ; shift the unwanted high bits out
    375   %masked = lshr i32 %highbitscleared, %numhighbits ; shift back; zeros fill the top
    376   store i32 %masked, i32 addrspace(1)* %out
    377   ret void
    378 }
    379 
    380 define amdgpu_kernel void @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) {
        ; Pattern d with a narrow bit count: %numlowbits is a plain i8 argument
        ; (no zeroext attribute in this signature). The subtraction from 32 is
        ; done in i8 and only its result is zero-extended to i32, then used as
        ; the amount for both shifts.
        ; The autogenerated checks expect no BFE_UINT here. After an 8-bit fetch
        ; (VTX_READ_8) they list SUB_INT from literal 32, AND_INT with literal
        ; 255, LSHL, and LSHR.
    381 ; EG-LABEL: bzhi32_d1_indexzext:
    382 ; EG:       ; %bb.0:
    383 ; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
    384 ; EG-NEXT:    TEX 0 @6
    385 ; EG-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
    386 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
    387 ; EG-NEXT:    CF_END
    388 ; EG-NEXT:    PAD
    389 ; EG-NEXT:    Fetch clause starting at 6:
    390 ; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
    391 ; EG-NEXT:    ALU clause starting at 8:
    392 ; EG-NEXT:     MOV * T0.X, 0.0,
    393 ; EG-NEXT:    ALU clause starting at 9:
    394 ; EG-NEXT:     SUB_INT * T0.W, literal.x, T0.X,
    395 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
    396 ; EG-NEXT:     AND_INT * T0.W, PV.W, literal.x,
    397 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
    398 ; EG-NEXT:     LSHL * T1.W, KC0[2].Y, PV.W,
    399 ; EG-NEXT:     LSHR T0.X, PV.W, T0.W,
    400 ; EG-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
    401 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    402 ;
    403 ; CM-LABEL: bzhi32_d1_indexzext:
    404 ; CM:       ; %bb.0:
    405 ; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
    406 ; CM-NEXT:    TEX 0 @6
    407 ; CM-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
    408 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
    409 ; CM-NEXT:    CF_END
    410 ; CM-NEXT:    PAD
    411 ; CM-NEXT:    Fetch clause starting at 6:
    412 ; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
    413 ; CM-NEXT:    ALU clause starting at 8:
    414 ; CM-NEXT:     MOV * T0.X, 0.0,
    415 ; CM-NEXT:    ALU clause starting at 9:
    416 ; CM-NEXT:     SUB_INT * T0.W, literal.x, T0.X,
    417 ; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
    418 ; CM-NEXT:     AND_INT * T0.W, PV.W, literal.x,
    419 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
    420 ; CM-NEXT:     LSHL * T1.W, KC0[2].Y, PV.W,
    421 ; CM-NEXT:     LSHR * T0.X, PV.W, T0.W,
    422 ; CM-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
    423 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
    424   %numhighbits = sub i8 32, %numlowbits ; 32 - numlowbits, computed in i8
    425   %sh_prom = zext i8 %numhighbits to i32 ; widen the shift amount to i32
    426   %highbitscleared = shl i32 %val, %sh_prom ; shift the unwanted high bits out
    427   %masked = lshr i32 %highbitscleared, %sh_prom ; shift back; zeros fill the top
    428   store i32 %masked, i32 addrspace(1)* %out
    429   ret void
    430 }
    431