/*
 * Copyright (c) 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

/** @file hsw_queryobj.c
 *
 * Support for query buffer objects (GL_ARB_query_buffer_object) on Haswell+.
 */
#include "main/imports.h"

#include "brw_context.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"

/*
 * GPR0 = 80 * GPR0;
 */
static void
mult_gpr0_by_80(struct brw_context *brw)
{
   static const uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R0),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      /* GPR1 = 16 * GPR0 */
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R2, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R2),
      MI_MATH_ALU2(LOAD, SRCB, R2),
      MI_MATH_ALU0(ADD),
      /* GPR2 = 64 * GPR0 */
      MI_MATH_ALU2(STORE, R2, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R2),
      MI_MATH_ALU0(ADD),
      /* GPR0 = 80 * GPR0 */
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct brw_context *brw, uint32_t n)
{
   static const uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(AND),
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   assert(n < 64);
   brw_load_register_imm64(brw, HSW_CS_GPR(1), (1ull << n) - 1);

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct brw_context *brw)
{
   /* First we mask 34 bits of GPR0 to prevent overflow */
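   /* 34 masked bits plus a 30-bit shift fill exactly 64 bits, so the
    * repeated additions below cannot overflow the 64-bit GPR.
    */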
   keep_gpr0_lower_n_bits(brw, 34);

   static const uint32_t shl_maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R0),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_maths);
   const uint32_t batch_len = cmd_len * outer_count;

   BEGIN_BATCH(batch_len);

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));
      for (int i = 0; i < inner_count; i++)
         for (int m = 0; m < ARRAY_SIZE(shl_maths); m++)
            OUT_BATCH(shl_maths[m]);
   }

   ADVANCE_BATCH();
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct brw_context *brw)
{
   shl_gpr0_by_30_bits(brw);
   brw_load_register_reg(brw, HSW_CS_GPR(0) + 4, HSW_CS_GPR(0));
   brw_load_register_imm32(brw, HSW_CS_GPR(0) + 4, 0);
}

/*
 * GPR0 = (GPR0 == 0) ? 0 : 1;
 */
static void
gpr0_to_bool(struct brw_context *brw)
{
   static const uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU1(LOAD0, SRCB),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STOREINV, R0, ZF),
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(AND),
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   brw_load_register_imm64(brw, HSW_CS_GPR(1), 1ull);

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

static void
load_overflow_data_to_cs_gprs(struct brw_context *brw,
                              struct brw_query_object *query,
                              int idx)
{
   int offset = idx * sizeof(uint64_t) * 4;

   brw_load_register_mem64(brw, HSW_CS_GPR(1), query->bo, offset);

   offset += sizeof(uint64_t);
   brw_load_register_mem64(brw, HSW_CS_GPR(2), query->bo, offset);

   offset += sizeof(uint64_t);
   brw_load_register_mem64(brw, HSW_CS_GPR(3), query->bo, offset);

   offset += sizeof(uint64_t);
   brw_load_register_mem64(brw, HSW_CS_GPR(4), query->bo, offset);
}

/*
 * R3 = R4 - R3;
 * R1 = R2 - R1;
 * R1 = R3 - R1;
 * R0 = R0 | R1;
 */
static void
calc_overflow_for_stream(struct brw_context *brw)
{
   static const uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R4),
      MI_MATH_ALU2(LOAD, SRCB, R3),
      MI_MATH_ALU0(SUB),
      MI_MATH_ALU2(STORE, R3, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R2),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(SUB),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R3),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(SUB),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R0),
      MI_MATH_ALU0(OR),
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

static void
calc_overflow_to_gpr0(struct brw_context *brw, struct brw_query_object *query,
                      int count)
{
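   /* Clear GPR0, then accumulate the per-stream overflow results into it:
    * each stream ORs in a non-zero value iff its two counter deltas differ,
    * i.e. iff that stream overflowed.
    */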
   brw_load_register_imm64(brw, HSW_CS_GPR(0), 0ull);

   for (int i = 0; i < count; i++) {
      load_overflow_data_to_cs_gprs(brw, query, i);
      calc_overflow_for_stream(brw);
   }
}

/*
 * Take a query and calculate whether there was overflow during transform
 * feedback. Store the result in the GPR0 register.
 */
void
hsw_overflow_result_to_gpr0(struct brw_context *brw,
                            struct brw_query_object *query,
                            int count)
{
   calc_overflow_to_gpr0(brw, query, count);
   gpr0_to_bool(brw);
}

static void
hsw_result_to_gpr0(struct gl_context *ctx, struct brw_query_object *query,
                   struct gl_buffer_object *buf, intptr_t offset,
                   GLenum pname, GLenum ptype)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   assert(query->bo);
   assert(pname != GL_QUERY_TARGET);

   if (pname == GL_QUERY_RESULT_AVAILABLE) {
      /* The query result availability is stored at offset
       * 2 * sizeof(uint64_t) of the buffer.
       */
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(0),
                              query->bo,
                              2 * sizeof(uint64_t));
      return;
   }

   if (pname == GL_QUERY_RESULT) {
      /* Since GL_QUERY_RESULT_NO_WAIT wasn't used, they want us to stall to
       * make sure the query is available.
       */
      brw_emit_pipe_control_flush(brw,
                                  PIPE_CONTROL_CS_STALL |
                                  PIPE_CONTROL_STALL_AT_SCOREBOARD);
   }

   if (query->Base.Target == GL_TIMESTAMP) {
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(0),
                              query->bo,
                              0 * sizeof(uint64_t));
   } else if (query->Base.Target == GL_TRANSFORM_FEEDBACK_STREAM_OVERFLOW_ARB
              || query->Base.Target == GL_TRANSFORM_FEEDBACK_OVERFLOW_ARB) {
      /* Don't do anything in advance here, since the math for this is a
       * little more complex.
       */
   } else {
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(1),
                              query->bo,
                              0 * sizeof(uint64_t));
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(2),
                              query->bo,
                              1 * sizeof(uint64_t));

      BEGIN_BATCH(5);
      OUT_BATCH(HSW_MI_MATH | (5 - 2));

      OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R2));
      OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
      OUT_BATCH(MI_MATH_ALU0(SUB));
      OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));

      ADVANCE_BATCH();
   }

   switch (query->Base.Target) {
   case GL_FRAGMENT_SHADER_INVOCATIONS_ARB:
      /* Implement the "WaDividePSInvocationCountBy4:HSW,BDW" workaround:
       * "Invocation counter is 4 times actual. WA: SW to divide HW reported
       *  PS Invocations value by 4."
       *
       * Prior to Haswell, invocation count was counted by the WM, and it
       * buggily counted invocations in units of subspans (2x2 units). To get
       * the correct value, the CS multiplied this by 4. With HSW the logic
       * moved and correctly emitted the number of pixel shader invocations,
       * but whoever moved it forgot to undo the multiply by 4.
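       *
       * Dividing by 4 is done below with shr_gpr0_by_2_bits(): GPR0 is
       * masked to its low 34 bits, shifted left by 30, and the upper 32 bits
       * of GPR0 are then moved into the lower 32 bits, which amounts to a
       * right shift by 2.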
       */
      if (devinfo->gen == 8 || devinfo->is_haswell)
         shr_gpr0_by_2_bits(brw);
      break;
   case GL_TIME_ELAPSED:
   case GL_TIMESTAMP:
      mult_gpr0_by_80(brw);
      if (query->Base.Target == GL_TIMESTAMP) {
         keep_gpr0_lower_n_bits(brw, 36);
      }
      break;
   case GL_ANY_SAMPLES_PASSED:
   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
      gpr0_to_bool(brw);
      break;
   case GL_TRANSFORM_FEEDBACK_STREAM_OVERFLOW_ARB:
      hsw_overflow_result_to_gpr0(brw, query, 1);
      break;
   case GL_TRANSFORM_FEEDBACK_OVERFLOW_ARB:
      hsw_overflow_result_to_gpr0(brw, query, MAX_VERTEX_STREAMS);
      break;
   }
}

/*
 * Store immediate data into the user buffer using the requested size.
 */
static void
store_query_result_imm(struct brw_context *brw, struct brw_bo *bo,
                       uint32_t offset, GLenum ptype, uint64_t imm)
{
   switch (ptype) {
   case GL_INT:
   case GL_UNSIGNED_INT:
      brw_store_data_imm32(brw, bo, offset, imm);
      break;
   case GL_INT64_ARB:
   case GL_UNSIGNED_INT64_ARB:
      brw_store_data_imm64(brw, bo, offset, imm);
      break;
   default:
      unreachable("Unexpected result type");
   }
}

static void
set_predicate(struct brw_context *brw, struct brw_bo *query_bo)
{
   brw_load_register_imm64(brw, MI_PREDICATE_SRC1, 0ull);

   /* Load query availability into SRC0 */
   brw_load_register_mem64(brw, MI_PREDICATE_SRC0, query_bo,
                           2 * sizeof(uint64_t));

   /* predicate = !(query_availability == 0); */
   BEGIN_BATCH(1);
   OUT_BATCH(GEN7_MI_PREDICATE |
             MI_PREDICATE_LOADOP_LOADINV |
             MI_PREDICATE_COMBINEOP_SET |
             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
   ADVANCE_BATCH();
}

/*
 * Store data from the register into the user buffer using the requested size.
 * When the query is pipelined, the write is predicated so that the result is
 * not written if the query has not finished yet.
 */
static void
store_query_result_reg(struct brw_context *brw, struct brw_bo *bo,
                       uint32_t offset, GLenum ptype, uint32_t reg,
                       const bool pipelined)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint32_t cmd_size = devinfo->gen >= 8 ? 4 : 3;
   uint32_t dwords = (ptype == GL_INT || ptype == GL_UNSIGNED_INT) ? 1 : 2;
   assert(devinfo->gen >= 6);

   BEGIN_BATCH(dwords * cmd_size);
   for (int i = 0; i < dwords; i++) {
      OUT_BATCH(MI_STORE_REGISTER_MEM |
                (pipelined ? MI_STORE_REGISTER_MEM_PREDICATE : 0) |
                (cmd_size - 2));
      OUT_BATCH(reg + 4 * i);
      if (devinfo->gen >= 8) {
         OUT_RELOC64(bo, RELOC_WRITE, offset + 4 * i);
      } else {
         OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + 4 * i);
      }
   }
   ADVANCE_BATCH();
}

static void
hsw_store_query_result(struct gl_context *ctx, struct gl_query_object *q,
                       struct gl_buffer_object *buf, intptr_t offset,
                       GLenum pname, GLenum ptype)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_query_object *query = (struct brw_query_object *)q;
   struct intel_buffer_object *bo = intel_buffer_object(buf);
   const bool pipelined = brw_is_query_pipelined(query);

   if (pname == GL_QUERY_TARGET) {
      store_query_result_imm(brw, bo->buffer, offset, ptype,
                             query->Base.Target);
      return;
   } else if (pname == GL_QUERY_RESULT_AVAILABLE && !pipelined) {
      store_query_result_imm(brw, bo->buffer, offset, ptype, 1ull);
   } else if (query->bo) {
      /* The query bo is still around. Therefore, we:
       *
       *  1. Compute the current result in GPR0
       *  2. Set the command streamer predicate based on query availability
       *  3. (With predication) Write GPR0 to the requested buffer
       */
      hsw_result_to_gpr0(ctx, query, buf, offset, pname, ptype);
      if (pipelined)
         set_predicate(brw, query->bo);
      store_query_result_reg(brw, bo->buffer, offset, ptype, HSW_CS_GPR(0),
                             pipelined);
   } else {
      /* The query bo is gone, so the query must have been processed into
       * client memory. In this case we can fill the buffer location with the
       * requested data using MI_STORE_DATA_IMM.
       */
      switch (pname) {
      case GL_QUERY_RESULT_AVAILABLE:
         store_query_result_imm(brw, bo->buffer, offset, ptype, 1ull);
         break;
      case GL_QUERY_RESULT_NO_WAIT:
      case GL_QUERY_RESULT:
         store_query_result_imm(brw, bo->buffer, offset, ptype,
                                q->Result);
         break;
      default:
         unreachable("Unexpected result type");
      }
   }
}

/* Initialize hsw+-specific query object functions. */
void hsw_init_queryobj_functions(struct dd_function_table *functions)
{
   gen6_init_queryobj_functions(functions);
   functions->StoreQueryResult = hsw_store_query_result;
}