/*
 * Copyright (c) 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

/** @file hsw_queryobj.c
 *
 * Support for query buffer objects (GL_ARB_query_buffer_object) on Haswell+.
 */
#include "main/imports.h"

#include "brw_context.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"

/*
 * GPR0 = 80 * GPR0;
 */
static void
mult_gpr0_by_80(struct brw_context *brw)
{
   static const uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R0),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      /* GPR1 = 16 * GPR0 */
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R2, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R2),
      MI_MATH_ALU2(LOAD, SRCB, R2),
      MI_MATH_ALU0(ADD),
      /* GPR2 = 64 * GPR0 */
      MI_MATH_ALU2(STORE, R2, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R2),
      MI_MATH_ALU0(ADD),
      /* GPR0 = 80 * GPR0 */
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct brw_context *brw, uint32_t n)
{
   static const uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(AND),
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   assert(n < 64);
   brw_load_register_imm64(brw, HSW_CS_GPR(1), (1ull << n) - 1);

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct brw_context *brw)
{
   /* First we mask 34 bits of GPR0 to prevent overflow */
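   /* (A value that fits in 34 bits can be shifted left by 30 bits without
    * exceeding the 64-bit GPR, so nothing spills out the top.)
    */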
   keep_gpr0_lower_n_bits(brw, 34);

   static const uint32_t shl_maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R0),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_maths);
   const uint32_t batch_len = cmd_len * outer_count;

   BEGIN_BATCH(batch_len);

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));
      for (int i = 0; i < inner_count; i++)
         for (int m = 0; m < ARRAY_SIZE(shl_maths); m++)
            OUT_BATCH(shl_maths[m]);
   }

   ADVANCE_BATCH();
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct brw_context *brw)
{
   shl_gpr0_by_30_bits(brw);
   brw_load_register_reg(brw, HSW_CS_GPR(0) + 4, HSW_CS_GPR(0));
   brw_load_register_imm32(brw, HSW_CS_GPR(0) + 4, 0);
}

/*
 * GPR0 = (GPR0 == 0) ? 0 : 1;
 */
static void
gpr0_to_bool(struct brw_context *brw)
{
   static const uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU1(LOAD0, SRCB),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STOREINV, R0, ZF),
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(AND),
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   brw_load_register_imm64(brw, HSW_CS_GPR(1), 1ull);

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

static void
hsw_result_to_gpr0(struct gl_context *ctx, struct brw_query_object *query,
                   struct gl_buffer_object *buf, intptr_t offset,
                   GLenum pname, GLenum ptype)
{
   struct brw_context *brw = brw_context(ctx);

   assert(query->bo);
   assert(pname != GL_QUERY_TARGET);

   if (pname == GL_QUERY_RESULT_AVAILABLE) {
      /* The query result availability is stored at offset
       * 2 * sizeof(uint64_t) of the buffer.
       */
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(0),
                              query->bo,
                              I915_GEM_DOMAIN_INSTRUCTION,
                              I915_GEM_DOMAIN_INSTRUCTION,
                              2 * sizeof(uint64_t));
      return;
   }

   if (pname == GL_QUERY_RESULT) {
      /* Since GL_QUERY_RESULT_NO_WAIT wasn't used, they want us to stall to
       * make sure the query is available.
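       * The stall makes the command streamer wait until the query result
       * has actually been written to query->bo before the register loads
       * below read it back.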
       */
      brw_emit_pipe_control_flush(brw,
                                  PIPE_CONTROL_CS_STALL |
                                  PIPE_CONTROL_STALL_AT_SCOREBOARD);
   }

   if (query->Base.Target == GL_TIMESTAMP) {
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(0),
                              query->bo,
                              I915_GEM_DOMAIN_INSTRUCTION,
                              I915_GEM_DOMAIN_INSTRUCTION,
                              0 * sizeof(uint64_t));
   } else {
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(1),
                              query->bo,
                              I915_GEM_DOMAIN_INSTRUCTION,
                              I915_GEM_DOMAIN_INSTRUCTION,
                              0 * sizeof(uint64_t));
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(2),
                              query->bo,
                              I915_GEM_DOMAIN_INSTRUCTION,
                              I915_GEM_DOMAIN_INSTRUCTION,
                              1 * sizeof(uint64_t));

      /* GPR0 = GPR2 - GPR1 (end snapshot minus begin snapshot) */
      BEGIN_BATCH(5);
      OUT_BATCH(HSW_MI_MATH | (5 - 2));

      OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R2));
      OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
      OUT_BATCH(MI_MATH_ALU0(SUB));
      OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));

      ADVANCE_BATCH();
   }

   switch (query->Base.Target) {
   case GL_FRAGMENT_SHADER_INVOCATIONS_ARB:
      /* Implement the "WaDividePSInvocationCountBy4:HSW,BDW" workaround:
       * "Invocation counter is 4 times actual.  WA: SW to divide HW reported
       *  PS Invocations value by 4."
       *
       * Prior to Haswell, invocation count was counted by the WM, and it
       * buggily counted invocations in units of subspans (2x2 unit). To get
       * the correct value, the CS multiplied this by 4. With HSW the logic
       * moved, and correctly emitted the number of pixel shader invocations,
       * but whoever moved it forgot to undo the multiply by 4.
       */
      if (brw->gen == 8 || brw->is_haswell)
         shr_gpr0_by_2_bits(brw);
      break;
   case GL_TIME_ELAPSED:
   case GL_TIMESTAMP:
      mult_gpr0_by_80(brw);
      if (query->Base.Target == GL_TIMESTAMP) {
         keep_gpr0_lower_n_bits(brw, 36);
      }
      break;
   case GL_ANY_SAMPLES_PASSED:
   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
      gpr0_to_bool(brw);
      break;
   }
}

/*
 * Store immediate data into the user buffer using the requested size.
 */
static void
store_query_result_imm(struct brw_context *brw, drm_intel_bo *bo,
                       uint32_t offset, GLenum ptype, uint64_t imm)
{
   switch (ptype) {
   case GL_INT:
   case GL_UNSIGNED_INT:
      brw_store_data_imm32(brw, bo, offset, imm);
      break;
   case GL_INT64_ARB:
   case GL_UNSIGNED_INT64_ARB:
      brw_store_data_imm64(brw, bo, offset, imm);
      break;
   default:
      unreachable("Unexpected result type");
   }
}

static void
set_predicate(struct brw_context *brw, drm_intel_bo *query_bo)
{
   brw_load_register_imm64(brw, MI_PREDICATE_SRC1, 0ull);

   /* Load query availability into SRC0 */
   brw_load_register_mem64(brw, MI_PREDICATE_SRC0, query_bo,
                           I915_GEM_DOMAIN_INSTRUCTION, 0,
                           2 * sizeof(uint64_t));

   /* predicate = !(query_availability == 0); */
   BEGIN_BATCH(1);
   OUT_BATCH(GEN7_MI_PREDICATE |
             MI_PREDICATE_LOADOP_LOADINV |
             MI_PREDICATE_COMBINEOP_SET |
             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
   ADVANCE_BATCH();
}

/*
 * Store data from the register into the user buffer using the requested size.
 * The write also enables the predication to prevent writing the result if the
 * query has not finished yet.
 */
static void
store_query_result_reg(struct brw_context *brw, drm_intel_bo *bo,
                       uint32_t offset, GLenum ptype, uint32_t reg,
                       const bool pipelined)
{
   uint32_t cmd_size = brw->gen >= 8 ? 4 : 3;
   uint32_t dwords = (ptype == GL_INT || ptype == GL_UNSIGNED_INT) ?
      1 : 2;
   assert(brw->gen >= 6);

   BEGIN_BATCH(dwords * cmd_size);
   for (int i = 0; i < dwords; i++) {
      OUT_BATCH(MI_STORE_REGISTER_MEM |
                (pipelined ? MI_STORE_REGISTER_MEM_PREDICATE : 0) |
                (cmd_size - 2));
      OUT_BATCH(reg + 4 * i);
      if (brw->gen >= 8) {
         OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION,
                     I915_GEM_DOMAIN_INSTRUCTION, offset + 4 * i);
      } else {
         OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION,
                   I915_GEM_DOMAIN_INSTRUCTION, offset + 4 * i);
      }
   }
   ADVANCE_BATCH();
}

static void
hsw_store_query_result(struct gl_context *ctx, struct gl_query_object *q,
                       struct gl_buffer_object *buf, intptr_t offset,
                       GLenum pname, GLenum ptype)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_query_object *query = (struct brw_query_object *)q;
   struct intel_buffer_object *bo = intel_buffer_object(buf);
   const bool pipelined = brw_is_query_pipelined(query);

   if (pname == GL_QUERY_TARGET) {
      store_query_result_imm(brw, bo->buffer, offset, ptype,
                             query->Base.Target);
      return;
   } else if (pname == GL_QUERY_RESULT_AVAILABLE && !pipelined) {
      store_query_result_imm(brw, bo->buffer, offset, ptype, 1ull);
   } else if (query->bo) {
      /* The query bo is still around.  Therefore, we:
       *
       * 1. Compute the current result in GPR0
       * 2. Set the command streamer predicate based on query availability
       * 3. (With predication) Write GPR0 to the requested buffer
       */
      hsw_result_to_gpr0(ctx, query, buf, offset, pname, ptype);
      if (pipelined)
         set_predicate(brw, query->bo);
      store_query_result_reg(brw, bo->buffer, offset, ptype, HSW_CS_GPR(0),
                             pipelined);
   } else {
      /* The query bo is gone, so the query must have been processed into
       * client memory.  In this case we can fill the buffer location with the
       * requested data using MI_STORE_DATA_IMM.
       */
      switch (pname) {
      case GL_QUERY_RESULT_AVAILABLE:
         store_query_result_imm(brw, bo->buffer, offset, ptype, 1ull);
         break;
      case GL_QUERY_RESULT_NO_WAIT:
      case GL_QUERY_RESULT:
         store_query_result_imm(brw, bo->buffer, offset, ptype,
                                q->Result);
         break;
      default:
         unreachable("Unexpected result type");
      }
   }
}

/* Initialize hsw+-specific query object functions. */
void hsw_init_queryobj_functions(struct dd_function_table *functions)
{
   gen6_init_queryobj_functions(functions);
   functions->StoreQueryResult = hsw_store_query_result;
}