/*
 * Copyright © 2013-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "isl/isl.h"
#include "brw_fs_surface_builder.h"
#include "brw_fs.h"

using namespace brw;

namespace brw {
   namespace surface_access {
      namespace {
         /**
          * Generate a logical send opcode for a surface message and return
          * the result.
          */
         fs_reg
         emit_send(const fs_builder &bld, enum opcode opcode,
                   const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
                   unsigned dims, unsigned arg, unsigned rsize,
                   brw_predicate pred = BRW_PREDICATE_NONE)
         {
            /* Reduce the dynamically uniform surface index to a single
             * scalar.
             */
            const fs_reg usurface = bld.emit_uniformize(surface);
            const fs_reg srcs[] = {
               addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
            };
            const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
            fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));

            inst->size_written = rsize * dst.component_size(inst->exec_size);
            inst->predicate = pred;
            return dst;
         }
      }

      /**
       * Emit an untyped surface read opcode.  \p dims determines the number
       * of components of the address and \p size the number of components of
       * the returned value.
       */
      fs_reg
      emit_untyped_read(const fs_builder &bld,
                        const fs_reg &surface, const fs_reg &addr,
                        unsigned dims, unsigned size,
                        brw_predicate pred)
      {
         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size, pred);
      }

      /**
       * Emit an untyped surface write opcode.  \p dims determines the number
       * of components of the address and \p size the number of components of
       * the argument.
       */
      void
      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
                         const fs_reg &addr, const fs_reg &src,
                         unsigned dims, unsigned size,
                         brw_predicate pred)
      {
         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0, pred);
      }

      /**
       * Emit an untyped surface atomic opcode.  \p dims determines the number
       * of components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_untyped_atomic(const fs_builder &bld,
                          const fs_reg &surface, const fs_reg &addr,
                          const fs_reg &src0, const fs_reg &src1,
                          unsigned dims, unsigned rsize, unsigned op,
                          brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize, pred);
      }
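
      /* E.g. a two-source atomic such as BRW_AOP_CMPWR provides both src0
       * and src1 through the payload built above, a one-source atomic such
       * as BRW_AOP_ADD passes src1 as BAD_FILE, and a no-source atomic such
       * as BRW_AOP_INC passes both as BAD_FILE, giving n == 2, 1 and 0
       * respectively.
       */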

      /**
       * Emit a typed surface read opcode.  \p dims determines the number of
       * components of the address and \p size the number of components of the
       * returned value.
       */
      fs_reg
      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
                      const fs_reg &addr, unsigned dims, unsigned size)
      {
         return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
                          addr, fs_reg(), surface, dims, size, size);
      }

      /**
       * Emit a typed surface write opcode.  \p dims determines the number of
       * components of the address and \p size the number of components of the
       * argument.
       */
      void
      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned dims, unsigned size)
      {
         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
                   addr, src, surface, dims, size, 0);
      }

      /**
       * Emit a typed surface atomic opcode.  \p dims determines the number of
       * components of the address and \p rsize the number of components of
       * the returned value (either zero or one).
       */
      fs_reg
      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
                        const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned dims, unsigned rsize, unsigned op,
                        brw_predicate pred)
      {
         /* FINISHME: Factor out this frequently recurring pattern into a
          * helper function.
          */
         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
         const fs_reg srcs[] = { src0, src1 };
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);

         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
                          addr, tmp, surface, dims, op, rsize);
      }
   }
}

namespace {
   namespace image_format_info {
      /* The higher compiler layers use the GL enums for image formats even if
       * they come in from SPIR-V or Vulkan.  We need to turn them into an ISL
       * enum before we can use them.
       */
      enum isl_format
      isl_format_for_gl_format(uint32_t gl_format)
      {
         switch (gl_format) {
         case GL_R8:             return ISL_FORMAT_R8_UNORM;
         case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
         case GL_R8UI:           return ISL_FORMAT_R8_UINT;
         case GL_R8I:            return ISL_FORMAT_R8_SINT;
         case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
         case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
         case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
         case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
         case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
         case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
         case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
         case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
         case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
         case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
         case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
         case GL_R16:            return ISL_FORMAT_R16_UNORM;
         case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
         case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
         case GL_R16UI:          return ISL_FORMAT_R16_UINT;
         case GL_R16I:           return ISL_FORMAT_R16_SINT;
         case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
         case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
         case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
         case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
         case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
         case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
         case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
         case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
         case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
         case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
         case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
         case GL_R32UI:          return ISL_FORMAT_R32_UINT;
         case GL_R32I:           return ISL_FORMAT_R32_SINT;
         case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
         case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
         case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
         case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
         case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
         case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
         case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
         default:
            assert(!"Invalid image format");
            return ISL_FORMAT_UNSUPPORTED;
         }
      }

      /**
       * Simple 4-tuple of scalars used to pass around per-color component
       * values.
       */
      struct color_u {
         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x)
         {
         }

         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
            r(r), g(g), b(b), a(a)
         {
         }

         unsigned
         operator[](unsigned i) const
         {
            const unsigned xs[] = { r, g, b, a };
            return xs[i];
         }

         unsigned r, g, b, a;
      };

      /**
       * Return the per-channel bitfield widths for a given image format.
       */
      inline color_u
      get_bit_widths(isl_format format)
      {
         const isl_format_layout *fmtl = isl_format_get_layout(format);

         return color_u(fmtl->channels.r.bits,
                        fmtl->channels.g.bits,
                        fmtl->channels.b.bits,
                        fmtl->channels.a.bits);
      }

      /**
       * Return the per-channel bitfield shifts for a given image format.
       */
      inline color_u
      get_bit_shifts(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return color_u(0, widths.r, widths.r + widths.g,
                        widths.r + widths.g + widths.b);
      }
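
      /* E.g. for ISL_FORMAT_R10G10B10A2_UNORM the per-channel widths are
       * (10, 10, 10, 2) and the shifts are therefore (0, 10, 20, 30), i.e.
       * all four components live in a single 32-bit dword.
       */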

      /**
       * Return true if all present components have the same bit width.
       */
      inline bool
      is_homogeneous(isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         return ((widths.g == 0 || widths.g == widths.r) &&
                 (widths.b == 0 || widths.b == widths.r) &&
                 (widths.a == 0 || widths.a == widths.r));
      }

      /**
       * Return true if the format conversion boils down to a trivial copy.
       */
      inline bool
      is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
      {
         return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
                 format == isl_lower_storage_image_format(devinfo, format);
      }

      /**
       * Return true if the hardware natively supports some format with
       * compatible bitfield layout, but possibly different data types.
       */
      inline bool
      has_supported_bit_layout(const gen_device_info *devinfo,
                               isl_format format)
      {
         const color_u widths = get_bit_widths(format);
         const color_u lower_widths = get_bit_widths(
            isl_lower_storage_image_format(devinfo, format));

         return (widths.r == lower_widths.r &&
                 widths.g == lower_widths.g &&
                 widths.b == lower_widths.b &&
                 widths.a == lower_widths.a);
      }

      /**
       * Return true if we are required to spread individual components over
       * several components of the format used by the hardware (RG32 and
       * friends implemented as RGBA16UI).
       */
      inline bool
      has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (isl_format_get_num_channels(format) <
                 isl_format_get_num_channels(lower_format));
      }

      /**
       * Return true if the hardware returns garbage in the unused high bits
       * of each component.  This may happen on IVB because we rely on the
       * undocumented behavior that typed reads from surfaces of the
       * unsupported R8 and R16 formats return useful data in their least
       * significant bits.
       */
      inline bool
      has_undefined_high_bits(const gen_device_info *devinfo,
                              isl_format format)
      {
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);

         return (devinfo->gen == 7 && !devinfo->is_haswell &&
                 (lower_format == ISL_FORMAT_R16_UINT ||
                  lower_format == ISL_FORMAT_R8_UINT));
      }

      /**
       * Return true if the format represents values as signed integers
       * requiring sign extension when unpacking.
       */
      inline bool
      needs_sign_extension(isl_format format)
      {
         return isl_format_has_snorm_channel(format) ||
                isl_format_has_sint_channel(format);
      }
   }

   namespace image_validity {
      /**
       * Check whether the bound image is suitable for untyped access.
       */
      brw_predicate
      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
                               brw_predicate pred)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check whether the first stride component (i.e. the Bpp value)
             * is greater than four, which on Gen7 indicates that a surface of
             * type RAW has been bound for untyped access.  Reading or writing
             * to a surface of type other than RAW using untyped surface
             * messages causes a hang on IVB and VLV.
             */
            set_predicate(pred,
                          bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
                                  BRW_CONDITIONAL_G));

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent generations handle the format mismatch
             * gracefully.
             */
            return pred;
         }
      }

      /**
       * Check whether there is an image bound at the given index and write
       * the comparison result to f0.0.  Returns an appropriate predication
       * mode to use on subsequent image operations.
       */
      brw_predicate
      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         if (devinfo->gen == 7 && !devinfo->is_haswell) {
            /* Check the first component of the size field to find out if the
             * image is bound.  Necessary on IVB for typed atomics because
             * they don't seem to respect null surfaces and will happily
             * corrupt or read random memory when no image is bound.
             */
            bld.CMP(bld.null_reg_ud(),
                    retype(size, BRW_REGISTER_TYPE_UD),
                    brw_imm_d(0), BRW_CONDITIONAL_NZ);

            return BRW_PREDICATE_NORMAL;
         } else {
            /* More recent platforms implement compliant behavior when a null
             * surface is bound.
             */
            return BRW_PREDICATE_NONE;
         }
      }

      /**
       * Check whether the provided coordinates are within the image bounds
       * and write the comparison result to f0.0.  Returns an appropriate
       * predication mode to use on subsequent image operations.
       */
      brw_predicate
      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
                        const fs_reg &addr, unsigned dims)
      {
         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);

         for (unsigned c = 0; c < dims; ++c)
            set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
                          bld.CMP(bld.null_reg_ud(),
                                  offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
                                  offset(size, bld, c),
                                  BRW_CONDITIONAL_L));

         return BRW_PREDICATE_NORMAL;
      }
   }

   namespace image_coordinates {
      /**
       * Return the total number of coordinates needed to address a texel of
       * the surface, which may be more than the sum of \p surf_dims and \p
       * arr_dims if padding is required.
       */
      unsigned
      num_image_coordinates(const fs_builder &bld,
                            unsigned surf_dims, unsigned arr_dims,
                            isl_format format)
      {
         /* HSW in vec4 mode and our software coordinate handling for untyped
          * reads want the array index to be at the Z component.
          */
         const bool array_index_at_z =
            format != ISL_FORMAT_UNSUPPORTED &&
            !isl_has_matching_typed_storage_image_format(
               bld.shader->devinfo, format);
         const unsigned zero_dims =
            ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);

         return surf_dims + zero_dims + arr_dims;
      }
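
      /* E.g. a 1-D array image (surf_dims == 1, arr_dims == 1) of a format
       * with no matching typed storage format needs three coordinates: the X
       * coordinate, a zero padding component in Y inserted by
       * emit_image_coordinates() below, and the array index in Z.
       */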

      /**
       * Transform image coordinates into the form expected by the
       * implementation.
       */
      fs_reg
      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
                             unsigned surf_dims, unsigned arr_dims,
                             isl_format format)
      {
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (dims > surf_dims + arr_dims) {
            assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
            /* The array index is required to be passed in as the Z component,
             * so insert a zero at the Y component to shift it to the right
             * position.
             *
             * FINISHME: Factor out this frequently recurring pattern into a
             * helper function.
             */
            const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
            const fs_reg dst = bld.vgrf(addr.type, dims);
            bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
            return dst;
         } else {
            return addr;
         }
      }

      /**
       * Calculate the offset in memory of the texel given by \p coord.
       *
       * This is meant to be used with untyped surface messages to access a
       * tiled surface, which involves manually taking into account the
       * tiling and swizzling modes of the surface, so it will hopefully not
       * be needed very often.
       *
       * The tiling algorithm implemented here matches either the X or Y
       * tiling layouts supported by the hardware depending on the tiling
       * coefficients passed to the program as uniforms.  See Volume 1 Part 2
       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
       * explanation of the hardware tiling format.
       */
      fs_reg
      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
                               const fs_reg &coord, unsigned dims)
      {
         const gen_device_info *devinfo = bld.shader->devinfo;
         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);

         /* Shift the coordinates by the fixed surface offset.  It may be
          * non-zero if the image is a single slice of a higher-dimensional
          * surface, or if a non-zero mipmap level of the surface is bound to
          * the pipeline.  The offset needs to be applied here rather than at
          * surface state set-up time because the desired slice-level may
          * start mid-tile, so simply shifting the surface base address
          * wouldn't give a well-formed tiled surface in the general case.
          */
         for (unsigned c = 0; c < 2; ++c)
            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
                    (c < dims ?
                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
                     fs_reg(brw_imm_d(0))));

         /* The layout of 3-D textures in memory is sort-of like a tiling
          * format.  At each miplevel, the slices are arranged in rows of
          * 2^level slices per row.  The slice row is stored in tmp.y and
          * the slice within the row is stored in tmp.x.
          *
          * The layout of 2-D array textures and cubemaps is much simpler:
          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
          * stored in memory as an array of slices, each one being a 2-D
          * arrangement of miplevels, or as a 2-D arrangement of miplevels,
          * each one being an array of slices.  In either case the separation
          * between slices of the same LOD is equal to the qpitch value
          * provided as stride.w.
          *
          * This code can be made to handle both 2-D array textures and 3-D
          * textures by passing in the miplevel as tile.z for 3-D textures
          * and 0 in tile.z for 2-D array textures.
          *
          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
          * of the hardware 3D texture and 2D array layouts.
          */
         if (dims > 2) {
            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
             * index.
             */
            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
            bld.SHR(offset(tmp, bld, 1),
                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
                    offset(tile, bld, 2));

            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
             * slice offset.
             */
            for (unsigned c = 0; c < 2; ++c) {
               bld.MUL(offset(tmp, bld, c),
                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
               bld.ADD(offset(addr, bld, c),
                       offset(addr, bld, c), offset(tmp, bld, c));
            }
         }

         if (dims > 1) {
            /* Calculate the major/minor x and y indices.  In order to
             * accommodate both X and Y tiling, the Y-major tiling format is
             * treated as being a bunch of narrow X-tiles placed next to each
             * other.  This means that the tile width for Y-tiling is actually
             * the width of one sub-column of the Y-major tile where each 4K
             * tile has 8 512B sub-columns.
             *
             * The major Y value is the row of tiles in which the pixel lives.
             * The major X value is the tile sub-column in which the pixel
             * lives; for X tiling, this is the same as the tile column, for Y
             * tiling, each tile has 8 sub-columns.  The minor X and Y indices
             * are the position within the sub-column.
             */
            for (unsigned c = 0; c < 2; ++c) {
               /* Calculate the minor x and y indices. */
               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
                       brw_imm_d(0), offset(addr, bld, c));

               /* Calculate the major x and y indices. */
               bld.SHR(offset(major, bld, c),
                       offset(addr, bld, c), offset(tile, bld, c));
            }

            /* Calculate the texel index from the start of the tile row and
             * the vertical coordinate of the row.
             * Equivalent to:
             *   tmp.x = (major.x << tile.y << tile.x) +
             *           (minor.y << tile.x) + minor.x
             *   tmp.y = major.y << tile.y
             */
            bld.SHL(tmp, major, offset(tile, bld, 1));
            bld.ADD(tmp, tmp, offset(minor, bld, 1));
            bld.SHL(tmp, tmp, offset(tile, bld, 0));
            bld.ADD(tmp, tmp, minor);
            bld.SHL(offset(tmp, bld, 1),
                    offset(major, bld, 1), offset(tile, bld, 1));

            /* Add it to the start of the tile row. */
            bld.MUL(offset(tmp, bld, 1),
                    offset(tmp, bld, 1), offset(stride, bld, 1));
            bld.ADD(tmp, tmp, offset(tmp, bld, 1));

            /* Multiply by the Bpp value. */
            bld.MUL(dst, tmp, stride);

            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
               /* Take into account the two dynamically specified shifts.
                * Both are needed to implement swizzling of X-tiled
                * surfaces.  For Y-tiled surfaces only one bit needs to be
                * XOR-ed with bit 6 of the memory address, so a swz value of
                * 0xff (actually interpreted as 31 by the hardware) will be
                * provided to cause the relevant bit of tmp.y to be zero and
                * turn the first XOR into the identity.  For linear surfaces
                * or platforms lacking address swizzling both shifts will be
                * 0xff, causing the relevant bits of both tmp.x and .y to be
                * zero, which effectively disables swizzling.
                */
               for (unsigned c = 0; c < 2; ++c)
                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));

               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
               bld.AND(tmp, tmp, brw_imm_d(1 << 6));
               bld.XOR(dst, dst, tmp);
            }

         } else {
            /* Multiply by the Bpp/stride value.  Note that the addr.y may be
             * non-zero even if the image is one-dimensional because a
             * vertical offset may have been applied above to select a
             * non-zero slice or level of a higher-dimensional texture.
             */
            bld.MUL(offset(addr, bld, 1),
                    offset(addr, bld, 1), offset(stride, bld, 1));
            bld.ADD(addr, addr, offset(addr, bld, 1));
            bld.MUL(dst, addr, stride);
         }

         return dst;
      }
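
      /* A quick sanity check of the arithmetic above, assuming the Y-tiling
       * coefficients implied by the comments (tile.x == 4 for the 16-byte
       * sub-column width, tile.y == 5 for the 32-row tile height): a byte
       * address of (17, 3) within the tile row gives minor == (1, 3) and
       * major == (1, 0), so
       *   tmp.x == (1 << 5 << 4) + (3 << 4) + 1 == 561,
       * i.e. byte 49 of sub-column 1, which is indeed row 3 * 16 + column 1
       * of that 512-byte sub-column.
       */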
   }

   namespace image_format_conversion {
      using image_format_info::color_u;

      namespace {
         /**
          * Maximum representable value in an unsigned integer with the given
          * number of bits.
          */
         inline unsigned
         scale(unsigned n)
         {
            return (1 << n) - 1;
         }
      }

      /**
       * Pack the vector \p src in a bitfield given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_pack(const fs_builder &bld, const fs_reg &src,
                const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         bool seen[4] = {};

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);

               /* Shift each component left to the correct bitfield position. */
               bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));

               /* Add everything up. */
               if (seen[shifts[c] / 32]) {
                  bld.OR(offset(dst, bld, shifts[c] / 32),
                         offset(dst, bld, shifts[c] / 32), tmp);
               } else {
                  bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
                  seen[shifts[c] / 32] = true;
               }
            }
         }

         return dst;
      }
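
      /* A worked example: packing an RGB10_A2-style vector with
       * shifts == (0, 10, 20, 30) and widths == (10, 10, 10, 2) SHLs each
       * component into place and, since shifts[c] / 32 == 0 for all four
       * components, MOVs the first one into dword 0 and ORs in the other
       * three.
       */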

      /**
       * Unpack a vector from the bitfield \p src given the per-component bit
       * shifts and widths.  Note that bitfield components are not allowed to
       * cross 32-bit boundaries.
       */
      fs_reg
      emit_unpack(const fs_builder &bld, const fs_reg &src,
                  const color_u &shifts, const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Shift left to discard the most significant bits. */
               bld.SHL(offset(dst, bld, c),
                       offset(src, bld, shifts[c] / 32),
                       brw_imm_ud(32 - shifts[c] % 32 - widths[c]));

               /* Shift back to the least significant bits using an arithmetic
                * shift to get sign extension on signed types.
                */
               bld.ASR(offset(dst, bld, c),
                       offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
            }
         }

         return dst;
      }
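
      /* A worked example: unpacking the green channel of an RGBA16-style
       * bitfield (shifts == (0, 16, 32, 48), all widths 16) reads dword 0
       * (16 / 32 == 0), shifts left by 32 - 16 - 16 == 0 and then
       * arithmetic-shifts right by 32 - 16 == 16, sign-extending the value
       * when dst.type is signed.
       */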

      /**
       * Convert an integer vector into another integer vector of the
       * specified bit widths, properly handling overflow.
       */
      fs_reg
      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         assert(src.type == dst.type);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp to the maximum value. */
               bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
                               brw_imm_d((int)scale(widths[c] - s)),
                               BRW_CONDITIONAL_L);

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                  brw_imm_d(-(int)scale(widths[c] - s) - 1),
                                  BRW_CONDITIONAL_GE);

               /* Mask off all but the bits we actually want.  Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }
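
      /* A worked example: converting -200 to a signed 8-bit channel
       * (widths[c] == 8, s == 1) clamps to at most scale(7) == 127 and at
       * least -scale(7) - 1 == -128, giving -128, then ANDs with
       * scale(8) == 0xff so the hardware sees 0x80 instead of a negative
       * 32-bit value it would clamp to +255.
       */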

      /**
       * Convert a normalized fixed-point vector of the specified signedness
       * and bit widths into a floating point vector.
       */
      fs_reg
      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
                               const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Convert to float. */
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Divide by the normalization constants. */
               bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
                       brw_imm_f(1.0f / scale(widths[c] - s)));

               /* Clamp to the minimum value. */
               if (is_signed)
                  bld.emit_minmax(offset(dst, bld, c),
                                  offset(dst, bld, c), brw_imm_f(-1.0f),
                                  BRW_CONDITIONAL_GE);
            }
         }
         return dst;
      }
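
      /* A worked example: unpacking an SNORM8 texel holding the
       * sign-extended value -127 (widths[c] == 8, s == 1) converts it to
       * -127.0f and multiplies by 1.0f / scale(7) == 1.0f / 127, giving
       * exactly -1.0f, which the final clamp leaves untouched.
       */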

      /**
       * Convert a floating-point vector into a normalized fixed-point vector
       * of the specified signedness and bit widths.
       */
      fs_reg
      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
                             const color_u &widths, bool is_signed)
      {
         const unsigned s = (is_signed ? 1 : 0);
         const fs_reg dst = bld.vgrf(
            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               /* Clamp the normalized floating-point argument. */
               if (is_signed) {
                  bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
                                  brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);

                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(1.0f), BRW_CONDITIONAL_L);
               } else {
                  set_saturate(true, bld.MOV(offset(fdst, bld, c),
                                             offset(src, bld, c)));
               }

               /* Multiply by the normalization constants. */
               bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
                       brw_imm_f((float)scale(widths[c] - s)));

               /* Convert to integer. */
               bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
               bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));

               /* Mask off all but the bits we actually want.  Otherwise, if
                * we pass a negative number into the hardware when it's
                * expecting something like UINT8, it will happily clamp it to
                * +255 for us.
                */
               if (is_signed && widths[c] < 32)
                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_d(scale(widths[c])));
            }
         }

         return dst;
      }
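
      /* A worked example: storing -0.5f to an SNORM8 channel clamps it to
       * [-1.0f, 1.0f], scales by scale(7) == 127 to -63.5f, rounds to even
       * with RNDE giving -64.0f, converts that to the integer -64 and ANDs
       * with scale(8) == 0xff, leaving the bit pattern 0xc0.
       */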

      /**
       * Convert a floating point vector of the specified bit widths into a
       * 32-bit floating point vector.
       */
      fs_reg
      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
                              const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(dst, bld, c), offset(src, bld, c));

               /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
                * This works because they have a 5-bit exponent just like the
                * 16-bit floating point format, and they have no sign bit.
                */
               if (widths[c] < 16)
                  bld.SHL(offset(dst, bld, c),
                          offset(dst, bld, c), brw_imm_ud(15 - widths[c]));

               /* Convert to 32-bit floating point. */
               bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
            }
         }

         return fdst;
      }
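
      /* A worked example: an 11-bit float channel (5-bit exponent, 6-bit
       * mantissa, no sign bit) is shifted left by 15 - 11 == 4 so that its
       * exponent lands in bits 14:10, exactly where F16TO32 expects a
       * half-float exponent, with the missing low mantissa bits reading as
       * zero.
       */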

      /**
       * Convert a vector into a floating point vector of the specified bit
       * widths.
       */
      fs_reg
      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
                            const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
         const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);

         for (unsigned c = 0; c < 4; ++c) {
            if (widths[c]) {
               bld.MOV(offset(fdst, bld, c), offset(src, bld, c));

               /* Clamp to the minimum value. */
               if (widths[c] < 16)
                  bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                  brw_imm_f(0.0f), BRW_CONDITIONAL_GE);

               /* Convert to 16-bit floating-point. */
               bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));

               /* Discard the least significant bits to get floating point
                * numbers of the requested width.  This works because the
                * 10-bit and 11-bit floating point formats have a 5-bit
                * exponent just like the 16-bit format, and they have no sign
                * bit.
                */
               if (widths[c] < 16)
                  bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
                          brw_imm_ud(15 - widths[c]));
            }
         }

         return dst;
      }
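
      /* A worked example of the reverse direction: writing an 11-bit float
       * channel clamps negative inputs to 0.0f (the format is unsigned),
       * converts to half-float with F32TO16 and shifts right by
       * 15 - 11 == 4, discarding the four least significant mantissa bits;
       * the sign bit is already zero thanks to the clamp.
       */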

      /**
       * Fill missing components of a vector with 0, 0, 0, 1.
       */
      fs_reg
      emit_pad(const fs_builder &bld, const fs_reg &src,
               const color_u &widths)
      {
         const fs_reg dst = bld.vgrf(src.type, 4);
         const unsigned pad[] = { 0, 0, 0, 1 };

         for (unsigned c = 0; c < 4; ++c)
            bld.MOV(offset(dst, bld, c),
                    widths[c] ? offset(src, bld, c)
                              : fs_reg(brw_imm_ud(pad[c])));

         return dst;
      }
   }
}

namespace brw {
   namespace image_access {
      /**
       * Load a vector from a surface of the given format and dimensionality
       * at the given coordinates.  \p surf_dims and \p arr_dims give the
       * number of non-array and array coordinates of the image respectively.
       */
      fs_reg
      emit_image_load(const fs_builder &bld,
                      const fs_reg &image, const fs_reg &addr,
                      unsigned surf_dims, unsigned arr_dims,
                      unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const gen_device_info *devinfo = bld.shader->devinfo;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const isl_format lower_format =
            isl_lower_storage_image_format(devinfo, format);
         fs_reg tmp;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
            /* Hopefully we get here most of the time... */
            tmp = emit_typed_read(bld, image, saddr, dims,
                                  isl_format_get_num_channels(lower_format));
         } else {
            /* Untyped surface reads return 32 bits of the surface per
             * component, without any sort of unpacking or type conversion,
             */
            const unsigned size = isl_format_get_layout(format)->bpb / 32;
            /* they don't properly handle out-of-bounds access, so we have to
             * check manually if the coordinates are valid and predicate the
             * surface read on the result,
             */
            const brw_predicate pred =
               emit_untyped_image_check(bld, image,
                                        emit_bounds_check(bld, image,
                                                          saddr, dims));

            /* and since they don't know about surface coordinates, we need
             * to convert them to a raw memory offset.
             */
            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);

            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);

            /* An out-of-bounds surface access should give zero as the
             * result.
             */
            for (unsigned c = 0; c < size; ++c)
               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
                                           offset(tmp, bld, c), brw_imm_d(0)));
         }

         /* Set the register type to D instead of UD if the data type is
          * represented as a signed integer in memory so that sign extension
          * is handled correctly by unpack.
          */
         if (needs_sign_extension(format))
            tmp = retype(tmp, BRW_REGISTER_TYPE_D);

         if (!has_supported_bit_layout(devinfo, format)) {
            /* Unpack individual vector components from the bitfield if the
             * hardware is unable to do it for us.
             */
            if (has_split_bit_layout(devinfo, format))
               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
                               get_bit_widths(lower_format));
            else
               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
                                 get_bit_widths(format));

         } else if ((needs_sign_extension(format) &&
                     !is_conversion_trivial(devinfo, format)) ||
                    has_undefined_high_bits(devinfo, format)) {
            /* Perform a trivial unpack even though the bit layout matches in
             * order to get the most significant bits of each component
             * initialized properly.
             */
            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
                              get_bit_widths(format));
         }

         if (!isl_format_has_int_channel(format)) {
            if (is_conversion_trivial(devinfo, format)) {
               /* Just need to cast the vector to the target type. */
               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
            } else {
               /* Do the right sort of type conversion to float. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_from_float(
                     bld, tmp, get_bit_widths(format));
               else
                  tmp = emit_convert_from_scaled(
                     bld, tmp, get_bit_widths(format),
                     isl_format_has_snorm_channel(format));
            }
         }

         /* Initialize missing components of the result. */
         return emit_pad(bld, tmp, get_bit_widths(format));
      }

      /**
       * Store a vector in a surface of the given format and dimensionality at
       * the given coordinates.  \p surf_dims and \p arr_dims give the number
       * of non-array and array coordinates of the image respectively.
       */
      void
      emit_image_store(const fs_builder &bld, const fs_reg &image,
                       const fs_reg &addr, const fs_reg &src,
                       unsigned surf_dims, unsigned arr_dims,
                       unsigned gl_format)
      {
         using namespace image_format_info;
         using namespace image_format_conversion;
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         const isl_format format = isl_format_for_gl_format(gl_format);
         const gen_device_info *devinfo = bld.shader->devinfo;

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims, format);

         if (gl_format == GL_NONE) {
            /* We don't know what the format is, but that's fine because it
             * implies write-only access, and typed surface writes are always
             * able to take care of type conversion and packing for us.
             */
            emit_typed_write(bld, image, saddr, src, dims, 4);

         } else {
            const isl_format lower_format =
               isl_lower_storage_image_format(devinfo, format);
            fs_reg tmp = src;

            if (!is_conversion_trivial(devinfo, format)) {
               /* Do the right sort of type conversion. */
               if (isl_format_has_float_channel(format))
                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));

               else if (isl_format_has_int_channel(format))
                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
                                                isl_format_has_sint_channel(format));

               else
                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
                                               isl_format_has_snorm_channel(format));
            }

            /* We're down to bit manipulation at this point. */
            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);

            if (!has_supported_bit_layout(devinfo, format)) {
               /* Pack the vector components into a bitfield if the hardware
                * is unable to do it for us.
                */
               if (has_split_bit_layout(devinfo, format))
                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
                                    get_bit_widths(lower_format));

               else
                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
                                  get_bit_widths(format));
            }

            if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
               /* Hopefully we get here most of the time... */
               emit_typed_write(bld, image, saddr, tmp, dims,
                                isl_format_get_num_channels(lower_format));

            } else {
               /* Untyped surface writes store 32 bits of the surface per
                * component, without any sort of packing or type conversion,
                */
               const unsigned size = isl_format_get_layout(format)->bpb / 32;

               /* they don't properly handle out-of-bounds access, so we have
                * to check manually if the coordinates are valid and predicate
                * the surface write on the result,
                */
               const brw_predicate pred =
                  emit_untyped_image_check(bld, image,
                                           emit_bounds_check(bld, image,
                                                             saddr, dims));

               /* and, phew, since they don't know about surface coordinates,
                * we need to convert them to a raw memory offset.
                */
               const fs_reg laddr = emit_address_calculation(
                  bld, image, saddr, dims);

               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
            }
         }
      }

      /**
       * Perform an atomic read-modify-write operation in a surface of the
       * given dimensionality at the given coordinates.  \p surf_dims and \p
       * arr_dims give the number of non-array and array coordinates of the
       * image respectively.  Main building block of the imageAtomic GLSL
       * built-ins.
       */
      fs_reg
      emit_image_atomic(const fs_builder &bld,
                        const fs_reg &image, const fs_reg &addr,
                        const fs_reg &src0, const fs_reg &src1,
                        unsigned surf_dims, unsigned arr_dims,
                        unsigned rsize, unsigned op)
      {
         using namespace image_validity;
         using namespace image_coordinates;
         using namespace surface_access;
         /* Avoid performing an atomic operation on an unbound surface. */
         const brw_predicate pred = emit_typed_atomic_check(bld, image);

         /* Transform the image coordinates into actual surface coordinates. */
         const fs_reg saddr =
            emit_image_coordinates(bld, addr, surf_dims, arr_dims,
                                   ISL_FORMAT_R32_UINT);
         const unsigned dims =
            num_image_coordinates(bld, surf_dims, arr_dims,
                                  ISL_FORMAT_R32_UINT);

         /* Thankfully we can do without untyped atomics here. */
         const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
                                              dims, rsize, op, pred);

         /* An unbound surface access should give zero as the result. */
         if (rsize && pred)
            set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));

         return retype(tmp, src0.type);
      }
   }
}