; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s

; FUNC-LABEL: {{^}}store_i1:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
define amdgpu_kernel void @store_i1(i1 addrspace(5)* %out) {
entry:
  store i1 true, i1 addrspace(5)* %out
  ret void
}

; i8 store
; FUNC-LABEL: {{^}}store_i8:
; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x
; EG-NEXT: 2
; EG: MOVA_INT * AR.x (MASKED)
; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x

; IG 0: Get the byte index and truncate the value
; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG-NEXT: 3(4.203895e-45)

; EG: LSHL * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], literal.x, PV.W
; EG-NEXT: 255(3.573311e-43)

; EG: NOT_INT
; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]]
; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]]
; TODO: Is the reload necessary?
; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]]
; EG: MOV * T(0 + AR.x).X+, [[RES]]

; SI: buffer_store_byte

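; Since EG has no sub-dword private store, the byte store is expanded to a
; dword read-modify-write; the EG checks above trace that sequence. A sketch
; of the expansion (illustrative pseudo-code only, not matched by FileCheck):
;   old   = load dword at (out & ~3)        ; MOVA_INT + MOV from AR.x
;   shift = (out & 3) << 3                  ; AND_INT, LSHL by literal 3
;   mask  = 255 << shift                    ; LSHL of literal 255
;   new   = (old & ~mask) | (in << shift)   ; NOT_INT, AND_INT, OR_INT
;   store dword new at (out & ~3)           ; MOVA_INT + MOV to AR.x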
define amdgpu_kernel void @store_i8(i8 addrspace(5)* %out, i8 %in) {
entry:
  store i8 %in, i8 addrspace(5)* %out
  ret void
}

; i16 store
; FUNC-LABEL: {{^}}store_i16:
; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x
; EG-NEXT: 2
; EG: MOVA_INT * AR.x (MASKED)
; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x

; EG: VTX_READ_16

; IG 0: Get the byte index and truncate the value
; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG-NEXT: 3(4.203895e-45)

; EG: NOT_INT
; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]]
; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]]
; TODO: Is the reload necessary?
; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]]
; EG: MOV * T(0 + AR.x).X+, [[RES]]

; SI: buffer_store_short
define amdgpu_kernel void @store_i16(i16 addrspace(5)* %out, i16 %in) {
entry:
  store i16 %in, i16 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i24:
; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; SI-DAG: buffer_store_byte
; SI-DAG: buffer_store_short

; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store can be eliminated
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; TODO: This load and store can be eliminated
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
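; The SI checks above show the i24 store split into two pieces: the low 16
; bits go out with a short store and bits [23:16] (the s_lshr_b32 by 16)
; with a byte store. The equivalent split, sketched as IR (illustrative
; only, these instructions are not in this test):
;   %lo = trunc i24 %in to i16   ; -> buffer_store_short at %out
;   %sh = lshr i24 %in, 16
;   %hi = trunc i24 %sh to i8    ; -> buffer_store_byte at %out + 2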
define amdgpu_kernel void @store_i24(i24 addrspace(5)* %out, i24 %in) {
entry:
  store i24 %in, i24 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i25:
; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
; SI: buffer_store_dword [[VAND]]

; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT
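; i25 is promoted to i32, so no read-modify-write is needed: the value is
; masked with 0x1ffffff (2^25 - 1, keeping the low 25 bits) and written
; with a single full-dword store, as the checks above verify.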
define amdgpu_kernel void @store_i25(i25 addrspace(5)* %out, i25 %in) {
entry:
  store i25 %in, i25 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v2i8:
; v2i8 is naturally 2B aligned, treat as i16
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT

; SI: buffer_store_short
define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v2i8_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(5)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}store_v2i16:
; v2i16 is naturally 4B aligned, treat as i32
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT

; SI: buffer_store_dword
define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v2i16_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_short
; SI: buffer_store_short
define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(5)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(5)* %out, align 2
  ret void
}

; FUNC-LABEL: {{^}}store_v4i8:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG-NOT: MOVA_INT

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT

; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v4i8_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(5)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}store_v8i8_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(5)* %out, <8 x i32> %in) {
entry:
  %0 = trunc <8 x i32> %in to <8 x i8>
  store <8 x i8> %0, <8 x i8> addrspace(5)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}store_v4i8_halfaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; Note: This load and store cannot be eliminated;
;       they might address different locations
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_short
; SI: buffer_store_short
; SI-NOT: buffer_store_dword
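; With only 2-byte alignment, the 4-byte vector store is split into two
; halves of two packed bytes each; sketched (illustrative only):
;   bytes 0-1 -> buffer_store_short at %out
;   bytes 2-3 -> buffer_store_short at %out + 2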
define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(5)* %out, align 2
  ret void
}

; floating-point store
; FUNC-LABEL: {{^}}store_f32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_dword

define amdgpu_kernel void @store_f32(float addrspace(5)* %out, float %in) {
  store float %in, float addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v4i16:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(5)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i16>
  store <4 x i16> %0, <4 x i16> addrspace(5)* %out
  ret void
}

; vec2 floating-point stores
; FUNC-LABEL: {{^}}store_v2f32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword

define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(5)* %out, float %a, float %b) {
entry:
  %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
  %1 = insertelement <2 x float> %0, float %b, i32 1
  store <2 x float> %1, <2 x float> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v3i32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI-DAG: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword

define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(5)* %out, <3 x i32> %a) nounwind {
  store <3 x i32> %a, <3 x i32> addrspace(5)* %out, align 16
  ret void
}

; FUNC-LABEL: {{^}}store_v4i32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(5)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_v4i32_unaligned:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(5)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(5)* %out, align 4
  ret void
}

; v4f32 store
; FUNC-LABEL: {{^}}store_v4f32:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(5)* %out, <4 x float> addrspace(5)* %in) {
  %1 = load <4 x float>, <4 x float> addrspace(5)* %in
  store <4 x float> %1, <4 x float> addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i64_i8:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_byte
define amdgpu_kernel void @store_i64_i8(i8 addrspace(5)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i8
  store i8 %0, i8 addrspace(5)* %out
  ret void
}

; FUNC-LABEL: {{^}}store_i64_i16:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; SI: buffer_store_short
define amdgpu_kernel void @store_i64_i16(i16 addrspace(5)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i16
  store i16 %0, i16 addrspace(5)* %out
  ret void
}

; The stores in this function are combined by the optimizer to create a
; 64-bit store with 32-bit alignment. This is legal and the legalizer
; should not try to split the 64-bit store back into two 32-bit stores.

; FUNC-LABEL: {{^}}vecload2:
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x2?
; XSI: buffer_store_dwordx2
; SI: buffer_store_dword
; SI: buffer_store_dword
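; Illustrative only: the merged form described above would be a single
; 64-bit store that keeps the original 4-byte alignment, e.g. something like
;   store <2 x i32> %v, <2 x i32> addrspace(5)* %p, align 4
; (%v and %p are hypothetical names; whether the combine produces a vector
; or an i64 store is not checked here). Selection currently still emits two
; dword stores, hence the TODO above.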
define amdgpu_kernel void @vecload2(i32 addrspace(5)* nocapture %out, i32 addrspace(4)* nocapture %mem) #0 {
entry:
  %0 = load i32, i32 addrspace(4)* %mem, align 4
  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(4)* %mem, i64 1
  %1 = load i32, i32 addrspace(4)* %arrayidx1.i, align 4
  store i32 %0, i32 addrspace(5)* %out, align 4
  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1
  store i32 %1, i32 addrspace(5)* %arrayidx1, align 4
  ret void
}

; When i128 was a legal type, this program generated "cannot select" errors.

; FUNC-LABEL: {{^}}"i128-const-store":
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
; EG: MOVA_INT
; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,

; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,

; TODO: why not x4?
; XSI: buffer_store_dwordx4
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @i128-const-store(i32 addrspace(5)* %out) {
entry:
  store i32 1, i32 addrspace(5)* %out, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1
  store i32 1, i32 addrspace(5)* %arrayidx2, align 4
  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 2
  store i32 2, i32 addrspace(5)* %arrayidx4, align 4
  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 3
  store i32 2, i32 addrspace(5)* %arrayidx6, align 4
  ret void
}

attributes #0 = { nounwind }