Discussion:
[Mesa-dev] [PATCH 2/4] i965: Use I915_EXEC_NO_RELOC
Chris Wilson
2017-06-19 10:06:48 UTC
Permalink
If we correctly fill the batch with the right relocation value, and that
matches the expected location of the object, we can then tell the kernel
it can forgo checking each individual relocation by only checking
whether the object moved.
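
For illustration, a minimal sketch of that contract (assuming the uapi
structs from drm/i915_drm.h; the helper name is invented, not from this
patch): the value baked into the batch, the reloc's presumed_offset, and
the execobject's offset must all agree.

#include <stdint.h>
#include <drm/i915_drm.h>

/* Sketch only: emit one relocation so the NO_RELOC invariant holds --
 * batch value == presumed_offset + delta == execobject.offset + delta.
 * If the kernel later moves the target, it processes the relocations
 * the old way.
 */
static void
emit_presumed_reloc(uint32_t *batch_map, uint32_t batch_dword,
                    struct drm_i915_gem_relocation_entry *reloc,
                    const struct drm_i915_gem_exec_object2 *target,
                    uint32_t delta)
{
   uint64_t presumed = target->offset;         /* last known GPU address */

   batch_map[batch_dword] = presumed + delta;  /* value written in the batch */

   reloc->offset = batch_dword * sizeof(uint32_t);
   reloc->delta = delta;
   reloc->presumed_offset = presumed;          /* what NO_RELOC compares */
}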

v2: Rebase to apply ahead of I915_EXEC_HANDLE_LUT

Signed-off-by: Chris Wilson <***@chris-wilson.co.uk>
Cc: Kenneth Graunke <***@whitecape.org>
Cc: Matt Turner <***@gmail.com>
Cc: Jason Ekstrand <***@intel.com>
---
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 53 +++++++++++++++++++++------
1 file changed, 41 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index ca7d6b81b1..7129209c26 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -517,19 +517,19 @@ throttle(struct brw_context *brw)

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

-static void
+static unsigned int
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
if (bo != batch->bo) {
unsigned int index = READ_ONCE(bo->index);

if (index < batch->exec_count && batch->exec_bos[index] == bo)
- return;
+ return index;

/* May have been shared between multiple active batches */
for (index = 0; index < batch->exec_count; index++) {
if (batch->exec_bos[index] == bo)
- return;
+ return index;
}

brw_bo_reference(bo);
@@ -563,8 +563,9 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)

bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
- batch->exec_count++;
batch->aperture_space += bo->size;
+
+ return batch->exec_count++;
}

static int
@@ -642,12 +643,25 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
}

if (!brw->screen->no_hw) {
- int flags;
-
+ unsigned int flags;
+
+ /* The requirements for using I915_EXEC_NO_RELOC are:
+ *
+ * The addresses written in the objects must match the corresponding
+ * reloc.presumed_offset which in turn must match the corresponding
+ * execobject.offset.
+ *
+ * Any render targets written to in the batch must be flagged with
+ * EXEC_OBJECT_WRITE.
+ *
+ * To avoid stalling, execobject.offset should match the current
+ * address of that object within the active context.
+ */
+ flags = I915_EXEC_NO_RELOC;
if (brw->gen >= 6 && batch->ring == BLT_RING) {
- flags = I915_EXEC_BLT;
+ flags |= I915_EXEC_BLT;
} else {
- flags = I915_EXEC_RENDER;
+ flags |= I915_EXEC_RENDER;
}
if (batch->needs_sol_reset)
flags |= I915_EXEC_GEN7_SOL_RESET;
@@ -779,16 +793,31 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
assert(_mesa_bitcount(write_domain) <= 1);

- if (target != batch->bo)
- add_exec_bo(batch, target);
+ if (target != batch->bo) {
+ unsigned int index = add_exec_bo(batch, target);
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+ if (write_domain) {
+ exec->flags |= EXEC_OBJECT_WRITE;
+
+ /* PIPECONTROL needs a w/a on gen6 */
+ if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+ struct brw_context *brw = container_of(batch, brw, batch);
+ if (brw->gen == 6)
+ exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+ }
+ }
+
+ offset64 = exec->offset;
+ } else {
+ offset64 = target->offset64;
+ }

struct drm_i915_gem_relocation_entry *reloc =
&batch->relocs[batch->reloc_count];

batch->reloc_count++;

- /* ensure gcc doesn't reload */
- offset64 = *((volatile uint64_t *)&target->offset64);
reloc->offset = batch_offset;
reloc->delta = target_offset;
reloc->target_handle = target->gem_handle;
--
2.11.0
Chris Wilson
2017-06-19 10:06:49 UTC
Permalink
To avoid a forward declaration in the next patch, move the definition of
add_exec_bo() earlier.

Signed-off-by: Chris Wilson <***@chris-wilson.co.uk>
Cc: Kenneth Graunke <***@whitecape.org>
Cc: Matt Turner <***@gmail.com>
Cc: Jason Ekstrand <***@intel.com>
---
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 106 +++++++++++++-------------
1 file changed, 53 insertions(+), 53 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 7129209c26..15aaf01e52 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -87,6 +87,59 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
}
}

+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
+static unsigned int
+add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
+{
+ if (bo != batch->bo) {
+ unsigned int index = READ_ONCE(bo->index);
+
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return index;
+
+ /* May have been shared between multiple active batches */
+ for (index = 0; index < batch->exec_count; index++) {
+ if (batch->exec_bos[index] == bo)
+ return index;
+ }
+
+ brw_bo_reference(bo);
+ }
+
+ if (batch->exec_count == batch->exec_array_size) {
+ batch->exec_array_size *= 2;
+ batch->exec_bos =
+ realloc(batch->exec_bos,
+ batch->exec_array_size * sizeof(batch->exec_bos[0]));
+ batch->exec_objects =
+ realloc(batch->exec_objects,
+ batch->exec_array_size * sizeof(batch->exec_objects[0]));
+ }
+
+ struct drm_i915_gem_exec_object2 *validation_entry =
+ &batch->exec_objects[batch->exec_count];
+ validation_entry->handle = bo->gem_handle;
+ if (bo == batch->bo) {
+ validation_entry->relocation_count = batch->reloc_count;
+ validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
+ } else {
+ validation_entry->relocation_count = 0;
+ validation_entry->relocs_ptr = 0;
+ }
+ validation_entry->alignment = bo->align;
+ validation_entry->offset = bo->offset64;
+ validation_entry->flags = bo->kflags;
+ validation_entry->rsvd1 = 0;
+ validation_entry->rsvd2 = 0;
+
+ bo->index = batch->exec_count;
+ batch->exec_bos[batch->exec_count] = bo;
+ batch->aperture_space += bo->size;
+
+ return batch->exec_count++;
+}
+
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
@@ -515,59 +568,6 @@ throttle(struct brw_context *brw)
}
}

-#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-
-static unsigned int
-add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
-{
- if (bo != batch->bo) {
- unsigned int index = READ_ONCE(bo->index);
-
- if (index < batch->exec_count && batch->exec_bos[index] == bo)
- return index;
-
- /* May have been shared between multiple active batches */
- for (index = 0; index < batch->exec_count; index++) {
- if (batch->exec_bos[index] == bo)
- return index;
- }
-
- brw_bo_reference(bo);
- }
-
- if (batch->exec_count == batch->exec_array_size) {
- batch->exec_array_size *= 2;
- batch->exec_bos =
- realloc(batch->exec_bos,
- batch->exec_array_size * sizeof(batch->exec_bos[0]));
- batch->exec_objects =
- realloc(batch->exec_objects,
- batch->exec_array_size * sizeof(batch->exec_objects[0]));
- }
-
- struct drm_i915_gem_exec_object2 *validation_entry =
- &batch->exec_objects[batch->exec_count];
- validation_entry->handle = bo->gem_handle;
- if (bo == batch->bo) {
- validation_entry->relocation_count = batch->reloc_count;
- validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
- } else {
- validation_entry->relocation_count = 0;
- validation_entry->relocs_ptr = 0;
- }
- validation_entry->alignment = bo->align;
- validation_entry->offset = bo->offset64;
- validation_entry->flags = bo->kflags;
- validation_entry->rsvd1 = 0;
- validation_entry->rsvd2 = 0;
-
- bo->index = batch->exec_count;
- batch->exec_bos[batch->exec_count] = bo;
- batch->aperture_space += bo->size;
-
- return batch->exec_count++;
-}
-
static int
execbuffer(int fd,
struct intel_batchbuffer *batch,
--
2.11.0
Daniel Vetter
2017-07-07 10:19:55 UTC
Permalink
Post by Chris Wilson
To avoid a forward declaration in the next patch, move the definition of
add_exec_bo() earlier.
Needs to be rebased when patch 2 is updated, but ack.
-Daniel
Post by Chris Wilson
---
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 106 +++++++++++++-------------
1 file changed, 53 insertions(+), 53 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 7129209c26..15aaf01e52 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -87,6 +87,59 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
}
}
+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
+static unsigned int
+add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
+{
+ if (bo != batch->bo) {
+ unsigned int index = READ_ONCE(bo->index);
+
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return index;
+
+ /* May have been shared between multiple active batches */
+ for (index = 0; index < batch->exec_count; index++) {
+ if (batch->exec_bos[index] == bo)
+ return index;
+ }
+
+ brw_bo_reference(bo);
+ }
+
+ if (batch->exec_count == batch->exec_array_size) {
+ batch->exec_array_size *= 2;
+ batch->exec_bos =
+ realloc(batch->exec_bos,
+ batch->exec_array_size * sizeof(batch->exec_bos[0]));
+ batch->exec_objects =
+ realloc(batch->exec_objects,
+ batch->exec_array_size * sizeof(batch->exec_objects[0]));
+ }
+
+ struct drm_i915_gem_exec_object2 *validation_entry =
+ &batch->exec_objects[batch->exec_count];
+ validation_entry->handle = bo->gem_handle;
+ if (bo == batch->bo) {
+ validation_entry->relocation_count = batch->reloc_count;
+ validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
+ } else {
+ validation_entry->relocation_count = 0;
+ validation_entry->relocs_ptr = 0;
+ }
+ validation_entry->alignment = bo->align;
+ validation_entry->offset = bo->offset64;
+ validation_entry->flags = bo->kflags;
+ validation_entry->rsvd1 = 0;
+ validation_entry->rsvd2 = 0;
+
+ bo->index = batch->exec_count;
+ batch->exec_bos[batch->exec_count] = bo;
+ batch->aperture_space += bo->size;
+
+ return batch->exec_count++;
+}
+
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
@@ -515,59 +568,6 @@ throttle(struct brw_context *brw)
}
}
-#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-
-static unsigned int
-add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
-{
- if (bo != batch->bo) {
- unsigned int index = READ_ONCE(bo->index);
-
- if (index < batch->exec_count && batch->exec_bos[index] == bo)
- return index;
-
- /* May have been shared between multiple active batches */
- for (index = 0; index < batch->exec_count; index++) {
- if (batch->exec_bos[index] == bo)
- return index;
- }
-
- brw_bo_reference(bo);
- }
-
- if (batch->exec_count == batch->exec_array_size) {
- batch->exec_array_size *= 2;
- batch->exec_bos =
- realloc(batch->exec_bos,
- batch->exec_array_size * sizeof(batch->exec_bos[0]));
- batch->exec_objects =
- realloc(batch->exec_objects,
- batch->exec_array_size * sizeof(batch->exec_objects[0]));
- }
-
- struct drm_i915_gem_exec_object2 *validation_entry =
- &batch->exec_objects[batch->exec_count];
- validation_entry->handle = bo->gem_handle;
- if (bo == batch->bo) {
- validation_entry->relocation_count = batch->reloc_count;
- validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
- } else {
- validation_entry->relocation_count = 0;
- validation_entry->relocs_ptr = 0;
- }
- validation_entry->alignment = bo->align;
- validation_entry->offset = bo->offset64;
- validation_entry->flags = bo->kflags;
- validation_entry->rsvd1 = 0;
- validation_entry->rsvd2 = 0;
-
- bo->index = batch->exec_count;
- batch->exec_bos[batch->exec_count] = bo;
- batch->aperture_space += bo->size;
-
- return batch->exec_count++;
-}
-
static int
execbuffer(int fd,
struct intel_batchbuffer *batch,
--
2.11.0
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
Chris Wilson
2017-06-19 10:06:50 UTC
Permalink
Passing the index of the target buffer via the reloc.target_handle is
marginally more efficient for the kernel (it can avoid some allocations,
and can use a direct lookup rather than a hash or search). It is also
useful for ourselves as we can use the index into our exec_bos for other
tasks.

v2: Only enable HANDLE_LUT if we can use BATCH_FIRST and thereby avoid
a post-processing loop to fixup the relocations.
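
For a rough picture of the kernel-side win (a sketch with invented names,
not the actual i915 lookup code): with HANDLE_LUT the target_handle is
already a slot in the validation list, so resolving a reloc target is a
bounds-checked array load rather than a handle lookup.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <drm/i915_drm.h>

/* Sketch only. Without HANDLE_LUT the kernel must translate a GEM handle
 * to an object, modelled here as a linear scan standing in for the real
 * hash lookup.
 */
static struct drm_i915_gem_exec_object2 *
resolve_reloc_target(struct drm_i915_gem_exec_object2 *exec,
                     unsigned int count, uint32_t target_handle,
                     bool handle_lut)
{
   if (handle_lut)
      return target_handle < count ? &exec[target_handle] : NULL;

   for (unsigned int i = 0; i < count; i++)
      if (exec[i].handle == target_handle)
         return &exec[i];
   return NULL;
}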

Signed-off-by: Chris Wilson <***@chris-wilson.co.uk>
Cc: Kenneth Graunke <***@whitecape.org>
Cc: Matt Turner <***@gmail.com>
Cc: Jason Ekstrand <***@intel.com>
---
src/mesa/drivers/dri/i965/brw_context.h | 1 +
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 82 ++++++++++++++++++++-------
2 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 2fb2cab918..93ddd0825a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -450,6 +450,7 @@ struct intel_batchbuffer {

uint32_t state_batch_offset;
enum brw_gpu_ring ring;
+ bool has_batch_first;
bool needs_sol_reset;
bool state_base_address_emitted;

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 15aaf01e52..5fa849c5a5 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -40,6 +40,10 @@

#define FILE_DEBUG_FLAG DEBUG_BUFMGR

+#define DBG_NO_BATCH_FIRST 0
+#define I915_PARAM_HAS_EXEC_BATCH_FIRST 48
+#define I915_EXEC_BATCH_FIRST (1 << 18)
+
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
@@ -57,13 +61,33 @@ uint_key_hash(const void *key)
return (uintptr_t) key;
}

+static int gem_param(int fd, int name)
+{
+ drm_i915_getparam_t gp;
+ int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
+
+ memset(&gp, 0, sizeof(gp));
+ gp.param = name;
+ gp.value = &v;
+ if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
+ return -1;
+
+ return v;
+}
+
+static bool test_has_batch_first(int fd)
+{
+ if (DBG_NO_BATCH_FIRST)
+ return DBG_NO_BATCH_FIRST < 0;
+
+ return gem_param(fd, I915_PARAM_HAS_EXEC_BATCH_FIRST) > 0;
+}
+
void
intel_batchbuffer_init(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
bool has_llc)
{
- intel_batchbuffer_reset(batch, bufmgr, has_llc);
-
if (!has_llc) {
batch->cpu_map = malloc(BATCH_SZ);
batch->map = batch->cpu_map;
@@ -85,6 +109,12 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
batch->state_batch_sizes =
_mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
}
+
+ struct brw_context *brw = container_of(batch, brw, batch);
+ __DRIscreen *dri_screen = brw->screen->driScrnPriv;
+ batch->has_batch_first = test_has_batch_first(dri_screen->fd);
+
+ intel_batchbuffer_reset(batch, bufmgr, has_llc);
}

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
@@ -117,21 +147,12 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
batch->exec_array_size * sizeof(batch->exec_objects[0]));
}

- struct drm_i915_gem_exec_object2 *validation_entry =
- &batch->exec_objects[batch->exec_count];
- validation_entry->handle = bo->gem_handle;
- if (bo == batch->bo) {
- validation_entry->relocation_count = batch->reloc_count;
- validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
- } else {
- validation_entry->relocation_count = 0;
- validation_entry->relocs_ptr = 0;
- }
- validation_entry->alignment = bo->align;
- validation_entry->offset = bo->offset64;
- validation_entry->flags = bo->kflags;
- validation_entry->rsvd1 = 0;
- validation_entry->rsvd2 = 0;
+ struct drm_i915_gem_exec_object2 *exec =
+ memset(&batch->exec_objects[batch->exec_count], 0, sizeof(*exec));
+ exec->handle = bo->gem_handle;
+ exec->alignment = bo->align;
+ exec->offset = bo->offset64;
+ exec->flags = bo->kflags;

bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
@@ -157,6 +178,11 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch,
}
batch->map_next = batch->map;

+ if (batch->has_batch_first) {
+ add_exec_bo(batch, batch->bo);
+ assert(batch->bo->index == 0);
+ }
+
batch->reserved_space = BATCH_RESERVED;
batch->state_batch_offset = batch->bo->size;
batch->needs_sol_reset = false;
@@ -663,15 +689,25 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
} else {
flags |= I915_EXEC_RENDER;
}
+
if (batch->needs_sol_reset)
flags |= I915_EXEC_GEN7_SOL_RESET;

+ unsigned int index;
+ if (batch->has_batch_first) {
+ flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
+ index = 0;
+ } else {
+ index = add_exec_bo(batch, batch->bo);
+ }
+
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+ exec->relocation_count = batch->reloc_count;
+ exec->relocs_ptr = (uintptr_t) batch->relocs;
+
if (ret == 0) {
uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;

- /* Add the batch itself to the end of the validation list */
- add_exec_bo(batch, batch->bo);
-
ret = execbuffer(dri_screen->fd, batch, hw_ctx,
4 * USED_BATCH(*batch),
in_fence_fd, out_fence_fd, flags);
@@ -793,8 +829,9 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
assert(_mesa_bitcount(write_domain) <= 1);

+ unsigned int index;
if (target != batch->bo) {
- unsigned int index = add_exec_bo(batch, target);
+ index = add_exec_bo(batch, target);
struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];

if (write_domain) {
@@ -811,6 +848,7 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
offset64 = exec->offset;
} else {
offset64 = target->offset64;
+ index = target->index;
}

struct drm_i915_gem_relocation_entry *reloc =
@@ -820,7 +858,7 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,

reloc->offset = batch_offset;
reloc->delta = target_offset;
- reloc->target_handle = target->gem_handle;
+ reloc->target_handle = batch->has_batch_first ? index : target->gem_handle;
reloc->read_domains = read_domains;
reloc->write_domain = write_domain;
reloc->presumed_offset = offset64;
--
2.11.0
Daniel Vetter
2017-07-07 10:31:46 UTC
Permalink
Post by Chris Wilson
Passing the index of the target buffer via the reloc.target_handle is
marginally more efficient for the kernel (it can avoid some allocations,
and can use a direct lookup rather than a hash or search). It is also
useful for ourselves as we can use the index into our exec_bos for other
tasks.
v2: Only enable HANDLE_LUT if we can use BATCH_FIRST and thereby avoid
a post-processing loop to fixup the relocations.
---
src/mesa/drivers/dri/i965/brw_context.h | 1 +
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 82 ++++++++++++++++++++-------
2 files changed, 61 insertions(+), 22 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 2fb2cab918..93ddd0825a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -450,6 +450,7 @@ struct intel_batchbuffer {
uint32_t state_batch_offset;
enum brw_gpu_ring ring;
+ bool has_batch_first;
bool needs_sol_reset;
bool state_base_address_emitted;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 15aaf01e52..5fa849c5a5 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -40,6 +40,10 @@
#define FILE_DEBUG_FLAG DEBUG_BUFMGR
+#define DBG_NO_BATCH_FIRST 0
+#define I915_PARAM_HAS_EXEC_BATCH_FIRST 48
+#define I915_EXEC_BATCH_FIRST (1 << 18)
Needs an #ifndef I think, to avoid troubles when updating libdrm. Or just
properly update mesa's copy of i915_drm.h, that would be much better.
Post by Chris Wilson
+
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
@@ -57,13 +61,33 @@ uint_key_hash(const void *key)
return (uintptr_t) key;
}
+static int gem_param(int fd, int name)
+{
+ drm_i915_getparam_t gp;
+ int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
+
+ memset(&gp, 0, sizeof(gp));
+ gp.param = name;
+ gp.value = &v;
+ if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
+ return -1;
+
+ return v;
+}
Afaict this exists as intel_get_param already, why can't we use that?
Post by Chris Wilson
+
+static bool test_has_batch_first(int fd)
+{
+ if (DBG_NO_BATCH_FIRST)
+ return DBG_NO_BATCH_FIRST < 0;
+
+ return gem_param(fd, I915_PARAM_HAS_EXEC_BATCH_FIRST) > 0;
+}
+
void
intel_batchbuffer_init(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
bool has_llc)
{
- intel_batchbuffer_reset(batch, bufmgr, has_llc);
-
if (!has_llc) {
batch->cpu_map = malloc(BATCH_SZ);
batch->map = batch->cpu_map;
@@ -85,6 +109,12 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
batch->state_batch_sizes =
_mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
}
+
+ struct brw_context *brw = container_of(batch, brw, batch);
+ __DRIscreen *dri_screen = brw->screen->driScrnPriv;
+ batch->has_batch_first = test_has_batch_first(dri_screen->fd);
+
+ intel_batchbuffer_reset(batch, bufmgr, has_llc);
}
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
@@ -117,21 +147,12 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
batch->exec_array_size * sizeof(batch->exec_objects[0]));
}
- struct drm_i915_gem_exec_object2 *validation_entry =
- &batch->exec_objects[batch->exec_count];
- validation_entry->handle = bo->gem_handle;
- if (bo == batch->bo) {
- validation_entry->relocation_count = batch->reloc_count;
- validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
- } else {
- validation_entry->relocation_count = 0;
- validation_entry->relocs_ptr = 0;
- }
- validation_entry->alignment = bo->align;
- validation_entry->offset = bo->offset64;
- validation_entry->flags = bo->kflags;
- validation_entry->rsvd1 = 0;
- validation_entry->rsvd2 = 0;
+ struct drm_i915_gem_exec_object2 *exec =
+ memset(&batch->exec_objects[batch->exec_count], 0, sizeof(*exec));
+ exec->handle = bo->gem_handle;
+ exec->alignment = bo->align;
+ exec->offset = bo->offset64;
+ exec->flags = bo->kflags;
bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
@@ -157,6 +178,11 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch,
}
batch->map_next = batch->map;
+ if (batch->has_batch_first) {
+ add_exec_bo(batch, batch->bo);
+ assert(batch->bo->index == 0);
+ }
+
batch->reserved_space = BATCH_RESERVED;
batch->state_batch_offset = batch->bo->size;
batch->needs_sol_reset = false;
@@ -663,15 +689,25 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
} else {
flags |= I915_EXEC_RENDER;
}
+
if (batch->needs_sol_reset)
flags |= I915_EXEC_GEN7_SOL_RESET;
+ unsigned int index;
+ if (batch->has_batch_first) {
+ flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
+ index = 0;
+ } else {
+ index = add_exec_bo(batch, batch->bo);
+ }
+
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+ exec->relocation_count = batch->reloc_count;
+ exec->relocs_ptr = (uintptr_t) batch->relocs;
+
if (ret == 0) {
uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;
- /* Add the batch itself to the end of the validation list */
- add_exec_bo(batch, batch->bo);
-
ret = execbuffer(dri_screen->fd, batch, hw_ctx,
4 * USED_BATCH(*batch),
in_fence_fd, out_fence_fd, flags);
@@ -793,8 +829,9 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
assert(_mesa_bitcount(write_domain) <= 1);
+ unsigned int index;
if (target != batch->bo) {
- unsigned int index = add_exec_bo(batch, target);
+ index = add_exec_bo(batch, target);
struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
if (write_domain) {
@@ -811,6 +848,7 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
offset64 = exec->offset;
} else {
offset64 = target->offset64;
+ index = target->index;
index = 0; Yes, the batch isn't ever shared, but I think it's better to
make this obviously safe.
Post by Chris Wilson
}
struct drm_i915_gem_relocation_entry *reloc =
@@ -820,7 +858,7 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
reloc->offset = batch_offset;
reloc->delta = target_offset;
- reloc->target_handle = target->gem_handle;
+ reloc->target_handle = batch->has_batch_first ? index : target->gem_handle;
reloc->read_domains = read_domains;
reloc->write_domain = write_domain;
reloc->presumed_offset = offset64;
--
2.11.0
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
Chris Wilson
2017-07-07 10:41:55 UTC
Permalink
Quoting Daniel Vetter (2017-07-07 11:31:46)
Post by Daniel Vetter
Post by Chris Wilson
Passing the index of the target buffer via the reloc.target_handle is
marginally more efficient for the kernel (it can avoid some allocations,
and can use a direct lookup rather than a hash or search). It is also
useful for ourselves as we can use the index into our exec_bos for other
tasks.
v2: Only enable HANDLE_LUT if we can use BATCH_FIRST and thereby avoid
a post-processing loop to fixup the relocations.
---
src/mesa/drivers/dri/i965/brw_context.h | 1 +
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 82 ++++++++++++++++++++-------
2 files changed, 61 insertions(+), 22 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 2fb2cab918..93ddd0825a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -450,6 +450,7 @@ struct intel_batchbuffer {
uint32_t state_batch_offset;
enum brw_gpu_ring ring;
+ bool has_batch_first;
bool needs_sol_reset;
bool state_base_address_emitted;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 15aaf01e52..5fa849c5a5 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -40,6 +40,10 @@
#define FILE_DEBUG_FLAG DEBUG_BUFMGR
+#define DBG_NO_BATCH_FIRST 0
+#define I915_PARAM_HAS_EXEC_BATCH_FIRST 48
+#define I915_EXEC_BATCH_FIRST (1 << 18)
Needs an #ifndef I think, to avoid troubles when updating libdrm. Or just
properly update mesa's copy of i915_drm.h, that would be much better.
Because src/include/drm/i915_drm.h didn't exist at the time. In the current
version of the patch, this pair are no longer required as they are
already defined.
Post by Daniel Vetter
Post by Chris Wilson
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
@@ -57,13 +61,33 @@ uint_key_hash(const void *key)
return (uintptr_t) key;
}
+static int gem_param(int fd, int name)
+{
+ drm_i915_getparam_t gp;
+ int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
+
+ memset(&gp, 0, sizeof(gp));
+ gp.param = name;
+ gp.value = &v;
+ if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
+ return -1;
+
+ return v;
+}
Afaict this exists as intel_get_param already, why can't we use that?
Because it didn't exist when I wrote the patch. (I'm planning for
kern_info to be queried once alongside dev_info.)
Post by Daniel Vetter
Post by Chris Wilson
@@ -793,8 +829,9 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
assert(_mesa_bitcount(write_domain) <= 1);
+ unsigned int index;
if (target != batch->bo) {
- unsigned int index = add_exec_bo(batch, target);
+ index = add_exec_bo(batch, target);
struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
if (write_domain) {
@@ -811,6 +848,7 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
offset64 = exec->offset;
} else {
offset64 = target->offset64;
+ index = target->index;
index = 0; Yes, the batch isn't ever shared, but I think it's better to
make this obviously safe.
It's safer using the tracking than adding the presumption, surely?
That way this patch (in the original ordering) was just as happy with
using the batch in the last_slot and doing the fixups.
-Chris
Daniel Vetter
2017-07-07 10:34:36 UTC
Permalink
Post by Chris Wilson
Passing the index of the target buffer via the reloc.target_handle is
marginally more efficient for the kernel (it can avoid some allocations,
and can use a direct lookup rather than a hash or search). It is also
useful for ourselves as we can use the index into our exec_bos for other
tasks.
v2: Only enable HANDLE_LUT if we can use BATCH_FIRST and thereby avoid
a post-processing loop to fixup the relocations.
Bikeshed for clarity: s/has_batch_first/exec_lut/ and a comment explaining
that we need both LUT and BATCH_FIRST and why. That would make some of the
if (batch->exec_lut) paths easier to grok on a quick reading.
-Daniel
Post by Chris Wilson
---
src/mesa/drivers/dri/i965/brw_context.h | 1 +
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 82 ++++++++++++++++++++-------
2 files changed, 61 insertions(+), 22 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 2fb2cab918..93ddd0825a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -450,6 +450,7 @@ struct intel_batchbuffer {
uint32_t state_batch_offset;
enum brw_gpu_ring ring;
+ bool has_batch_first;
bool needs_sol_reset;
bool state_base_address_emitted;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 15aaf01e52..5fa849c5a5 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -40,6 +40,10 @@
#define FILE_DEBUG_FLAG DEBUG_BUFMGR
+#define DBG_NO_BATCH_FIRST 0
+#define I915_PARAM_HAS_EXEC_BATCH_FIRST 48
+#define I915_EXEC_BATCH_FIRST (1 << 18)
+
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
@@ -57,13 +61,33 @@ uint_key_hash(const void *key)
return (uintptr_t) key;
}
+static int gem_param(int fd, int name)
+{
+ drm_i915_getparam_t gp;
+ int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
+
+ memset(&gp, 0, sizeof(gp));
+ gp.param = name;
+ gp.value = &v;
+ if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
+ return -1;
+
+ return v;
+}
+
+static bool test_has_batch_first(int fd)
+{
+ if (DBG_NO_BATCH_FIRST)
+ return DBG_NO_BATCH_FIRST < 0;
+
+ return gem_param(fd, I915_PARAM_HAS_EXEC_BATCH_FIRST) > 0;
+}
+
void
intel_batchbuffer_init(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
bool has_llc)
{
- intel_batchbuffer_reset(batch, bufmgr, has_llc);
-
if (!has_llc) {
batch->cpu_map = malloc(BATCH_SZ);
batch->map = batch->cpu_map;
@@ -85,6 +109,12 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
batch->state_batch_sizes =
_mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
}
+
+ struct brw_context *brw = container_of(batch, brw, batch);
+ __DRIscreen *dri_screen = brw->screen->driScrnPriv;
+ batch->has_batch_first = test_has_batch_first(dri_screen->fd);
+
+ intel_batchbuffer_reset(batch, bufmgr, has_llc);
}
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
@@ -117,21 +147,12 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
batch->exec_array_size * sizeof(batch->exec_objects[0]));
}
- struct drm_i915_gem_exec_object2 *validation_entry =
- &batch->exec_objects[batch->exec_count];
- validation_entry->handle = bo->gem_handle;
- if (bo == batch->bo) {
- validation_entry->relocation_count = batch->reloc_count;
- validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
- } else {
- validation_entry->relocation_count = 0;
- validation_entry->relocs_ptr = 0;
- }
- validation_entry->alignment = bo->align;
- validation_entry->offset = bo->offset64;
- validation_entry->flags = bo->kflags;
- validation_entry->rsvd1 = 0;
- validation_entry->rsvd2 = 0;
+ struct drm_i915_gem_exec_object2 *exec =
+ memset(&batch->exec_objects[batch->exec_count], 0, sizeof(*exec));
+ exec->handle = bo->gem_handle;
+ exec->alignment = bo->align;
+ exec->offset = bo->offset64;
+ exec->flags = bo->kflags;
bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
@@ -157,6 +178,11 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch,
}
batch->map_next = batch->map;
+ if (batch->has_batch_first) {
+ add_exec_bo(batch, batch->bo);
+ assert(batch->bo->index == 0);
+ }
+
batch->reserved_space = BATCH_RESERVED;
batch->state_batch_offset = batch->bo->size;
batch->needs_sol_reset = false;
@@ -663,15 +689,25 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
} else {
flags |= I915_EXEC_RENDER;
}
+
if (batch->needs_sol_reset)
flags |= I915_EXEC_GEN7_SOL_RESET;
+ unsigned int index;
+ if (batch->has_batch_first) {
+ flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
+ index = 0;
+ } else {
+ index = add_exec_bo(batch, batch->bo);
+ }
+
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+ exec->relocation_count = batch->reloc_count;
+ exec->relocs_ptr = (uintptr_t) batch->relocs;
+
if (ret == 0) {
uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;
- /* Add the batch itself to the end of the validation list */
- add_exec_bo(batch, batch->bo);
-
ret = execbuffer(dri_screen->fd, batch, hw_ctx,
4 * USED_BATCH(*batch),
in_fence_fd, out_fence_fd, flags);
@@ -793,8 +829,9 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
assert(_mesa_bitcount(write_domain) <= 1);
+ unsigned int index;
if (target != batch->bo) {
- unsigned int index = add_exec_bo(batch, target);
+ index = add_exec_bo(batch, target);
struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
if (write_domain) {
@@ -811,6 +848,7 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
offset64 = exec->offset;
} else {
offset64 = target->offset64;
+ index = target->index;
}
struct drm_i915_gem_relocation_entry *reloc =
@@ -820,7 +858,7 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
reloc->offset = batch_offset;
reloc->delta = target_offset;
- reloc->target_handle = target->gem_handle;
+ reloc->target_handle = batch->has_batch_first ? index : target->gem_handle;
reloc->read_domains = read_domains;
reloc->write_domain = write_domain;
reloc->presumed_offset = offset64;
--
2.11.0
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
Kenneth Graunke
2017-06-19 19:28:31 UTC
Permalink
Post by Chris Wilson
If we correctly fill the batch with the right relocation value, and that
matches the expected location of the object, we can then tell the kernel
it can forgo checking each individual relocation by only checking
whether the object moved.
v2: Rebase to apply ahead of I915_EXEC_HANDLE_LUT
---
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 53 +++++++++++++++++++++------
1 file changed, 41 insertions(+), 12 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index ca7d6b81b1..7129209c26 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -517,19 +517,19 @@ throttle(struct brw_context *brw)
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-static void
+static unsigned int
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
if (bo != batch->bo) {
unsigned int index = READ_ONCE(bo->index);
if (index < batch->exec_count && batch->exec_bos[index] == bo)
- return;
+ return index;
/* May have been shared between multiple active batches */
for (index = 0; index < batch->exec_count; index++) {
if (batch->exec_bos[index] == bo)
- return;
+ return index;
}
brw_bo_reference(bo);
@@ -563,8 +563,9 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
- batch->exec_count++;
batch->aperture_space += bo->size;
+
+ return batch->exec_count++;
}
static int
@@ -642,12 +643,25 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
}
if (!brw->screen->no_hw) {
- int flags;
-
+ unsigned int flags;
+
+ /* The requirements for using I915_EXEC_NO_RELOC are:
+ *
+ * The addresses written in the objects must match the corresponding
+ * reloc.presumed_offset which in turn must match the corresponding
+ * execobject.offset.
+ *
+ * Any render targets written to in the batch must be flagged with
+ * EXEC_OBJECT_WRITE.
+ *
+ * To avoid stalling, execobject.offset should match the current
+ * address of that object within the active context.
+ */
+ flags = I915_EXEC_NO_RELOC;
if (brw->gen >= 6 && batch->ring == BLT_RING) {
- flags = I915_EXEC_BLT;
+ flags |= I915_EXEC_BLT;
} else {
- flags = I915_EXEC_RENDER;
+ flags |= I915_EXEC_RENDER;
}
if (batch->needs_sol_reset)
flags |= I915_EXEC_GEN7_SOL_RESET;
@@ -779,16 +793,31 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
assert(_mesa_bitcount(write_domain) <= 1);
- if (target != batch->bo)
- add_exec_bo(batch, target);
+ if (target != batch->bo) {
+ unsigned int index = add_exec_bo(batch, target);
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+ if (write_domain) {
+ exec->flags |= EXEC_OBJECT_WRITE;
+
+ /* PIPECONTROL needs a w/a on gen6 */
+ if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+ struct brw_context *brw = container_of(batch, brw, batch);
+ if (brw->gen == 6)
+ exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+ }
+ }
+
+ offset64 = exec->offset;
I don't think this works. brw_bo still has a single offset64 value that's
shared across all contexts. So, in a multi-context/multi-threaded case, we
may put a bogus offset in the validation list.

Pulling the value out of the validation list is nice, but it's still
potentially bogus.

I think we still need per-context-bo fields. I have a patch series in
progress to do that, but I moved it down the priority list a ways...
need to get back to that...
Post by Chris Wilson
+ } else {
+ offset64 = target->offset64;
+ }
struct drm_i915_gem_relocation_entry *reloc =
&batch->relocs[batch->reloc_count];
batch->reloc_count++;
- /* ensure gcc doesn't reload */
- offset64 = *((volatile uint64_t *)&target->offset64);
reloc->offset = batch_offset;
reloc->delta = target_offset;
reloc->target_handle = target->gem_handle;
Chris Wilson
2017-06-19 19:53:33 UTC
Permalink
Quoting Kenneth Graunke (2017-06-19 20:28:31)
Post by Kenneth Graunke
Post by Chris Wilson
- if (target != batch->bo)
- add_exec_bo(batch, target);
+ if (target != batch->bo) {
+ unsigned int index = add_exec_bo(batch, target);
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+ if (write_domain) {
+ exec->flags |= EXEC_OBJECT_WRITE;
+
+ /* PIPECONTROL needs a w/a on gen6 */
+ if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+ struct brw_context *brw = container_of(batch, brw, batch);
+ if (brw->gen == 6)
+ exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+ }
+ }
+
+ offset64 = exec->offset;
I don't think this works. brw_bo still has a single offset64 value that's
shared across all contexts. So, in a multi-context/multi-threaded case, we
may put a bogus offset in the validation list.
But the value that we put there is the *only* value that we then use for
all of the relocation entries. We are consistently wrong which is what
is mandated by the interface, i.e. we don't prevent a relocation stall
(unless the kernel can move the buffers for us) but more importantly the
relocation.presumed_offset == execobject.offset == batch[reloc.offset]
for all relocations.
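
As a sketch of that invariant in code (names invented; consistency, not
correctness of the address, is what NO_RELOC checks):

#include <stdbool.h>
#include <stdint.h>
#include <drm/i915_drm.h>

/* "Consistently wrong" still satisfies NO_RELOC. */
static bool
reloc_consistent(const uint32_t *batch_map,
                 const struct drm_i915_gem_relocation_entry *reloc,
                 const struct drm_i915_gem_exec_object2 *obj)
{
   return reloc->presumed_offset == obj->offset &&
          batch_map[reloc->offset / sizeof(uint32_t)] ==
             (uint32_t)(reloc->presumed_offset + reloc->delta);
}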
Post by Kenneth Graunke
Pulling the value out of the validation list is nice, but it's still
potentially bogus.
I think we still need per-context-bo fields. I have a patch series in
progress to do that, but I moved it down the priority list a ways...
need to get back to that...
Per-context tracking will allow us to avoid stalls between contexts,
which will be an improvement above and beyond this patch. This patch
will work even with conflicting contexts, just no better than today. For
the single context (or no conflict cases), we do realise the improvement
from avoiding always having to inspect reloc[].

Hope that helps,
-Chris
Jason Ekstrand
2017-06-19 21:00:45 UTC
Permalink
Post by Chris Wilson
Quoting Kenneth Graunke (2017-06-19 20:28:31)
Post by Kenneth Graunke
Post by Chris Wilson
- if (target != batch->bo)
- add_exec_bo(batch, target);
+ if (target != batch->bo) {
+ unsigned int index = add_exec_bo(batch, target);
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+ if (write_domain) {
+ exec->flags |= EXEC_OBJECT_WRITE;
+
+ /* PIPECONTROL needs a w/a on gen6 */
+ if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+ struct brw_context *brw = container_of(batch, brw, batch);
+ if (brw->gen == 6)
+ exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+ }
+ }
+
+ offset64 = exec->offset;
I don't think this works. brw_bo still has a single offset64 value that's
shared across all contexts. So, in a multi-context/multi-threaded case, we
may put a bogus offset in the validation list.
But the value that we put there is the *only* value that we then use for
all of the relocation entries. We are consistently wrong which is what
is mandated by the interface, i.e. we don't prevent a relocation stall
(unless the kernel can move the buffers for us) but more importantly the
relocation.presumed_offset == execobject.offset == batch[reloc.offset]
for all relocations.
Is the BO address in the kernel per-context or per-drm-fd? If it's the
latter then we may be safe, though it sounds scary. If it's the former, then
we definitely have a problem.
Post by Chris Wilson
Post by Kenneth Graunke
Pulling the value out of the validation list is nice, but it's still
potentially bogus.
I think we still need per-context-bo fields. I have a patch series in
progress to do that, but I moved it down the priority list a ways...
need to get back to that...
Per-context tracking will allow us to avoid stalls between contexts,
which will be an improvement above and beyond this patch. This patch
will work even with conflicting contexts, just no better than today. For
the single context (or no conflict cases), we do realise the improvement
from avoiding always having to inspect reloc[].
Hope that helps,
-Chris
Chris Wilson
2017-06-19 22:32:21 UTC
Permalink
Quoting Jason Ekstrand (2017-06-19 22:00:45)
Post by Chris Wilson
Quoting Kenneth Graunke (2017-06-19 20:28:31)
-   if (target != batch->bo)
-      add_exec_bo(batch, target);
+   if (target != batch->bo) {
+      unsigned int index = add_exec_bo(batch, target);
+      struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+      if (write_domain) {
+         exec->flags |= EXEC_OBJECT_WRITE;
+
+         /* PIPECONTROL needs a w/a on gen6 */
+         if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+            struct brw_context *brw = container_of(batch, brw, batch);
+            if (brw->gen == 6)
+               exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+         }
+      }
+
+      offset64 = exec->offset;
I don't think this works. brw_bo still has a single offset64 value that's
shared across all contexts. So, in a multi-context/multi-threaded case, we
may put a bogus offset in the validation list.
But the value that we put there is the *only* value that we then use for
all of the relocation entries. We are consistently wrong which is what
is mandated by the interface, i.e. we don't prevent a relocation stall
(unless the kernel can move the buffers for us) but more importantly the
relocation.presumed_offset == execobject.offset == batch[reloc.offset]
for all relocations.
Is the BO address in the kernel per-context or per-drm-fd? If it's the latter
then we may be safe, though it sounds scary. If it's the former, then we
definitely have a problem.
The binding of a buffer into its ppGTT is local to that context. The
kernel allocates an address for a buffer separately for each context.
(At the start, the motivating factor was to keep the shared GGTT with
its mappable/non-mappable restrictions entirely separate from ppGTT).
One trick the kernel finally just learnt is to use the supplied offset
for new buffers if it is valid and available. That reduces the impact of
using an offset from context1 in context2, so long as that address was
not already allocated to something else. (So hit and miss in a long
running clients, but any hit is better than always missing.)

However, with softpin I suggest mesa allocates an address at the fd
level and assign that for all contexts using the bo. It does
dramatically reduce the working space, but it still should be many times
physical, except for the complications with 32b restrictions. It was
definitely a very simple proof-of-concept patch (device level monotonic
allocator, falling back to kernel on exhaustion).
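
A sketch of that proof-of-concept (all names hypothetical; a device-level
monotonic allocator with fallback to kernel relocation on exhaustion):

#include <stdbool.h>
#include <stdint.h>

struct softpin_heap {
   uint64_t next;   /* next free address, monotonically increasing */
   uint64_t end;    /* top of the usable ppGTT range */
};

static bool
softpin_alloc(struct softpin_heap *heap, uint64_t size, uint64_t align,
              uint64_t *out_addr)
{
   uint64_t addr = (heap->next + align - 1) & ~(align - 1);

   if (addr + size > heap->end)
      return false;   /* exhausted: fall back to kernel relocations */

   heap->next = addr + size;
   *out_addr = addr;   /* same address for every context sharing the bo */
   return true;
}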
-Chris
Daniel Vetter
2017-07-05 08:57:20 UTC
Permalink
Post by Chris Wilson
Quoting Kenneth Graunke (2017-06-19 20:28:31)
Post by Kenneth Graunke
Post by Chris Wilson
- if (target != batch->bo)
- add_exec_bo(batch, target);
+ if (target != batch->bo) {
+ unsigned int index = add_exec_bo(batch, target);
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+ if (write_domain) {
+ exec->flags |= EXEC_OBJECT_WRITE;
+
+ /* PIPECONTROL needs a w/a on gen6 */
+ if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+ struct brw_context *brw = container_of(batch, brw, batch);
+ if (brw->gen == 6)
+ exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+ }
+ }
+
+ offset64 = exec->offset;
I don't think this works. brw_bo still has a single offset64 value that's
shared across all contexts. So, in a multi-context/multi-threaded case, we
may put a bogus offset in the validation list.
But the value that we put there is the *only* value that we then use for
all of the relocation entries. We are consistently wrong which is what
is mandated by the interface, i.e. we don't prevent a relocation stall
(unless the kernel can move the buffers for us) but more importantly the
relocation.presumed_offset == execobject.offset == batch[reloc.offset]
for all relocations.
Yeah the only invariant NO_RELOC needs is that for all relocations
targeting the same buffer in a batch the relocation.presumed_offset must
match execobject.offset. It doesn't matter at all what value you put in
there, you could stuff random() in it and the kernel will fix things up.

It won't be fast ofc, because you're going to thrash relocs and hence
stall like crazy.
Post by Chris Wilson
Post by Kenneth Graunke
Pulling the value out of the validation list is nice, but it's still
potentially bogus.
I think we still need per-context-bo fields. I have a patch series in
progress to do that, but I moved it down the priority list a ways...
need to get back to that...
Per-context tracking will allow us to avoid stalls between contexts,
which will be an improvement above and beyond this patch. This patch
will work even with conflicting contexts, just no better than today. For
the single context (or no conflict cases), we do realise the improvement
from avoiding always having to inspect reloc[].
Yup, for single contexts and disjoint bo sets in multi-context this will
be a great improvement. For shared buffers that somehow ended up with
different offsets in different contexts you're still going to thrash the
relocs every time and simply fall back to the old paths where you have to
process every reloc individually in the kernel. I think this here should
be a strict improvement over what we have, and should give the full
NO_RELOC benefits for all common cases (well maybe not browsers with too
many tabs and just a single gl renderer).
-Daniel
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
Daniel Vetter
2017-07-07 09:55:49 UTC
Permalink
Borrow a trick from anv, and use the last known index for the bo to skip
a search of the batch->exec_bo when adding a new relocation. In defence
against the bo being used in multiple batches simultaneously, we check
that this slot exists and points back to us.
---
src/mesa/drivers/dri/i965/brw_bufmgr.h | 5 +++++
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 14 ++++++++++++--
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 48488bc33b..dd3a37040a 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -76,6 +76,11 @@ struct brw_bo {
uint64_t offset64;
/**
+ * Index of this buffer inside the batch, -1 when not in a batch.
+ */
+ unsigned int index;
+
+ /**
* Boolean of whether the GPU is definitely not accessing the buffer.
*
* This is only valid when reusable, since non-reusable
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 62d2fe8ef3..ca7d6b81b1 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -515,12 +515,20 @@ throttle(struct brw_context *brw)
}
}
+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
static void
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
if (bo != batch->bo) {
- for (int i = 0; i < batch->exec_count; i++) {
- if (batch->exec_bos[i] == bo)
+ unsigned int index = READ_ONCE(bo->index);
+
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return;
+
+ /* May have been shared between multiple active batches */
+ for (index = 0; index < batch->exec_count; index++) {
+ if (batch->exec_bos[index] == bo)
return;
}
@@ -553,6 +561,7 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
validation_entry->rsvd1 = 0;
validation_entry->rsvd2 = 0;
+ bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
batch->exec_count++;
batch->aperture_space += bo->size;
@@ -597,6 +606,7 @@ execbuffer(int fd,
struct brw_bo *bo = batch->exec_bos[i];
bo->idle = false;
+ bo->index = -1;
Since we check for matching backpointer I don't think we even need to
clear this here, but it's cheap. For consistency I think we should also
clear when allocating a bo:


diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index da12a131526a..0c3199690257 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -325,6 +325,7 @@ retry:

bo->size = bo_size;
bo->idle = true;
+ bo->index = -1;

memclear(create);
create.size = bo_size;
/* Update brw_bo::offset64 */
if (batch->exec_objects[i].offset != bo->offset64) {
--
2.11.0
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
Chris Wilson
2017-07-07 10:04:11 UTC
Permalink
Quoting Daniel Vetter (2017-07-07 10:55:49)
Post by Daniel Vetter
Borrow a trick from anv, and use the last known index for the bo to skip
a search of the batch->exec_bo when adding a new relocation. In defence
against the bo being used in multiple batches simultaneously, we check
that this slot exists and points back to us.
---
src/mesa/drivers/dri/i965/brw_bufmgr.h | 5 +++++
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 14 ++++++++++++--
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 48488bc33b..dd3a37040a 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -76,6 +76,11 @@ struct brw_bo {
uint64_t offset64;
/**
+ * Index of this buffer inside the batch, -1 when not in a batch.
+ */
+ unsigned int index;
+
+ /**
* Boolean of whether the GPU is definitely not accessing the buffer.
*
* This is only valid when reusable, since non-reusable
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 62d2fe8ef3..ca7d6b81b1 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -515,12 +515,20 @@ throttle(struct brw_context *brw)
}
}
+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
static void
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
if (bo != batch->bo) {
- for (int i = 0; i < batch->exec_count; i++) {
- if (batch->exec_bos[i] == bo)
+ unsigned int index = READ_ONCE(bo->index);
+
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return;
+
+ /* May have been shared between multiple active batches */
+ for (index = 0; index < batch->exec_count; index++) {
+ if (batch->exec_bos[index] == bo)
return;
}
@@ -553,6 +561,7 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
validation_entry->rsvd1 = 0;
validation_entry->rsvd2 = 0;
+ bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
batch->exec_count++;
batch->aperture_space += bo->size;
@@ -597,6 +606,7 @@ execbuffer(int fd,
struct brw_bo *bo = batch->exec_bos[i];
bo->idle = false;
+ bo->index = -1;
Since we check for matching backpointer I don't think we even need to
clear this here, but it's cheap. For consistency I think we should also
At the time I was thinking about trying to make -1 mean something
useful, but kept being stymied by the lack of per-context tracking.

The advantage is that for a single user, it speeds up the "not in this
batch" check. Multi-user batches have the disadvantage that the bo->index
will ping-pong and they have to keep rechecking (same as before). So I
think it does come out on top, and adding -1 to init should help even
more.

There was also another spot I noticed that used the same hunt for an
exec_bo, brw_batch_references(), that can use the same quick check.
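
Something like this, reusing the pattern from the patch (the real
brw_batch_references signature may differ; this assumes the driver's own
structs and the READ_ONCE macro above):

static bool
brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
   unsigned int index = READ_ONCE(bo->index);

   /* Fast path: the cached slot still points back at this bo. */
   if (index < batch->exec_count && batch->exec_bos[index] == bo)
      return true;

   /* Slow path: may have been shared between multiple active batches. */
   for (index = 0; index < batch->exec_count; index++)
      if (batch->exec_bos[index] == bo)
         return true;

   return false;
}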
-Chris
Daniel Vetter
2017-07-07 09:57:03 UTC
Permalink
Borrow a trick from anv, and use the last known index for the bo to skip
a search of the batch->exec_bo when adding a new relocation. In defence
against the bo being used in multiple batches simultaneously, we check
that this slot exists and points back to us.
Maybe s/-1/UINT_MAX/ if you feel like it, as a bikeshed.
-Daniel
---
src/mesa/drivers/dri/i965/brw_bufmgr.h | 5 +++++
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 14 ++++++++++++--
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 48488bc33b..dd3a37040a 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -76,6 +76,11 @@ struct brw_bo {
uint64_t offset64;
/**
+ * Index of this buffer inside the batch, -1 when not in a batch.
+ */
+ unsigned int index;
+
+ /**
* Boolean of whether the GPU is definitely not accessing the buffer.
*
* This is only valid when reusable, since non-reusable
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 62d2fe8ef3..ca7d6b81b1 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -515,12 +515,20 @@ throttle(struct brw_context *brw)
}
}
+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
static void
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
if (bo != batch->bo) {
- for (int i = 0; i < batch->exec_count; i++) {
- if (batch->exec_bos[i] == bo)
+ unsigned int index = READ_ONCE(bo->index);
+
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return;
+
+ /* May have been shared between multiple active batches */
+ for (index = 0; index < batch->exec_count; index++) {
+ if (batch->exec_bos[index] == bo)
return;
}
@@ -553,6 +561,7 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
validation_entry->rsvd1 = 0;
validation_entry->rsvd2 = 0;
+ bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
batch->exec_count++;
batch->aperture_space += bo->size;
@@ -597,6 +606,7 @@ execbuffer(int fd,
struct brw_bo *bo = batch->exec_bos[i];
bo->idle = false;
+ bo->index = -1;
/* Update brw_bo::offset64 */
if (batch->exec_objects[i].offset != bo->offset64) {
--
2.11.0
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
Daniel Vetter
2017-07-07 10:04:00 UTC
Post by Chris Wilson
If we correctly fill the batch with the right relocation value, and that
matches the expected location of the object, we can then tell the kernel
it can forgo checking each individual relocation by only checking
whether the object moved.
v2: Rebase to apply ahead of I915_EXEC_HANDLE_LUT
---
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 53 +++++++++++++++++++++------
1 file changed, 41 insertions(+), 12 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index ca7d6b81b1..7129209c26 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -517,19 +517,19 @@ throttle(struct brw_context *brw)
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-static void
+static unsigned int
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
if (bo != batch->bo) {
unsigned int index = READ_ONCE(bo->index);
if (index < batch->exec_count && batch->exec_bos[index] == bo)
- return;
+ return index;
/* May have been shared between multiple active batches */
for (index = 0; index < batch->exec_count; index++) {
if (batch->exec_bos[index] == bo)
- return;
+ return index;
}
brw_bo_reference(bo);
@@ -563,8 +563,9 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
- batch->exec_count++;
batch->aperture_space += bo->size;
+
+ return batch->exec_count++;
}
static int
@@ -642,12 +643,25 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
}
if (!brw->screen->no_hw) {
- int flags;
-
+ unsigned int flags;
+
+ /* The requirements for using I915_EXEC_NO_RELOC are:
+ *
+ * The addresses written in the objects must match the corresponding
+ * reloc.presumed_offset which in turn must match the corresponding
+ * execobject.offset.
+ *
+ * Any render targets written to in the batch must be flagged with
+ * EXEC_OBJECT_WRITE.
+ *
+ * To avoid stalling, execobject.offset should match the current
+ * address of that object within the active context.
+ */
+ flags = I915_EXEC_NO_RELOC;
if (brw->gen >= 6 && batch->ring == BLT_RING) {
- flags = I915_EXEC_BLT;
+ flags |= I915_EXEC_BLT;
} else {
- flags = I915_EXEC_RENDER;
+ flags |= I915_EXEC_RENDER;
}
if (batch->needs_sol_reset)
flags |= I915_EXEC_GEN7_SOL_RESET;
@@ -779,16 +793,31 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
assert(_mesa_bitcount(write_domain) <= 1);
- if (target != batch->bo)
- add_exec_bo(batch, target);
+ if (target != batch->bo) {
+ unsigned int index = add_exec_bo(batch, target);
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+ if (write_domain) {
+ exec->flags |= EXEC_OBJECT_WRITE;
+
+ /* PIPECONTROL needs a w/a on gen6 */
+ if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+ struct brw_context *brw = container_of(batch, brw, batch);
+ if (brw->gen == 6)
+ exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+ }
+ }
If we ever do a write to the batch this goes boom I think. Can we move the
if (write_domain) out of the batch check? Means we need to cache the batch
exec entry somewhere. Just having a batch->batch_exec_flags would do it I
think.
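A purely hypothetical sketch of that suggestion (batch_exec_flags is an
invented field, not something in the patch; EXEC_OBJECT_WRITE comes from
i915_drm.h): writes aimed at the batch would be recorded on the batch and
OR'd into its own exec_object2 entry when it is added at flush time.

#include <i915_drm.h>

struct batch_flags_sketch {
   unsigned int batch_exec_flags; /* hypothetical cache of the batch's flags */
};

/* Record that something in this batch writes to the batch bo itself,
 * deferring the EXEC_OBJECT_WRITE flag until the batch's exec entry
 * is filled in at flush time. */
static void note_batch_write(struct batch_flags_sketch *batch)
{
   batch->batch_exec_flags |= EXEC_OBJECT_WRITE;
}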
Post by Chris Wilson
+
+ offset64 = exec->offset;
+ } else {
+ offset64 = target->offset64;
You lost the READ_ONCE for the above two, and since that's now at least
locally defined, we don't even need the comment.
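For context, the locally defined READ_ONCE is just the usual
volatile-cast idiom pinning the field to a single load:

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct bo { unsigned int index; };

/* The volatile cast forces exactly one load of bo->index; without it
 * the compiler is free to re-read the field after the bounds check,
 * and a concurrent writer (another batch reusing this bo) could make
 * the two reads disagree. */
static unsigned int load_index_once(struct bo *bo)
{
   return READ_ONCE(bo->index);
}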
Post by Chris Wilson
+ }
struct drm_i915_gem_relocation_entry *reloc =
&batch->relocs[batch->reloc_count];
batch->reloc_count++;
- /* ensure gcc doesn't reload */
- offset64 = *((volatile uint64_t *)&target->offset64);
reloc->offset = batch_offset;
reloc->delta = target_offset;
reloc->target_handle = target->gem_handle;
--
2.11.0
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
Chris Wilson
2017-07-07 10:55:29 UTC
Quoting Daniel Vetter (2017-07-07 11:04:00)
Post by Daniel Vetter
Post by Chris Wilson
- if (target != batch->bo)
- add_exec_bo(batch, target);
+ if (target != batch->bo) {
+ unsigned int index = add_exec_bo(batch, target);
+ struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+
+ if (write_domain) {
+ exec->flags |= EXEC_OBJECT_WRITE;
+
+ /* PIPECONTROL needs a w/a on gen6 */
+ if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+ struct brw_context *brw = container_of(batch, brw, batch);
+ if (brw->gen == 6)
+ exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+ }
+ }
If we ever do a write to the batch this goes boom I think. Can we move the
if (write_domain) out of the batch check? Means we need to cache the batch
exec entry somewhere. Just having a batch->batch_exec_flags would do it I
think.
You are strictly not allowed to write to the batch; it was overkill in
hindsight, but the kernel will reject such execbuf.
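In other words, a defensive check along these lines (hypothetical, not in
the patch) would encode that rule:

#include <assert.h>
#include <stdint.h>

struct brw_bo;

/* Hypothetical guard for brw_emit_reloc(): relocations that target
 * the batch buffer itself must never carry a write domain, since the
 * kernel rejects an execbuf whose batch object is flagged writable. */
static void check_reloc_target(const struct brw_bo *target,
                               const struct brw_bo *batch_bo,
                               uint32_t write_domain)
{
   assert(target != batch_bo || write_domain == 0);
}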
Post by Daniel Vetter
Post by Chris Wilson
+ offset64 = exec->offset;
+ } else {
+ offset64 = target->offset64;
You lost the READ_ONCE for the above two, and since that's now at least
locally defined, we don't even need the comment.
Where's the second issue? There's no need for a READ_ONCE for offset
here as both are local to the context (and a context is not supposed to
be accessed concurrently, hence the lack of locking around here) and not
shared.
-Chris
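Summarising the NO_RELOC contract being discussed, a sketch of the
invariant (the helper is illustrative, not driver code; struct and flag
names are from i915_drm.h):

#include <stdint.h>
#include <i915_drm.h>

/* With I915_EXEC_NO_RELOC the kernel may skip per-reloc processing
 * whenever execobject.offset is unchanged, so userspace must write a
 * consistent address in three places: the batch dword,
 * reloc.presumed_offset and execobject.offset. */
static uint64_t emit_presumed_address(uint32_t *batch_map,
                                      unsigned int batch_dword,
                                      struct drm_i915_gem_relocation_entry *reloc,
                                      const struct drm_i915_gem_exec_object2 *exec,
                                      uint32_t delta)
{
   uint64_t address = exec->offset + delta;

   reloc->presumed_offset = exec->offset;      /* must match execobject.offset */
   batch_map[batch_dword] = (uint32_t)address; /* low 32 bits; gen8+ also writes the high dword */

   return address;
}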

Chris Wilson
2017-07-07 10:23:36 UTC
Borrow a trick from anv, and use the last known index for the bo to skip
a search of the batch->exec_bo when adding a new relocation. In defence
against the bo being used in multiple batches simultaneously, we check
that this slot exists and points back to us.

v2: Also update brw_batch_references()
v3: Reset bo->index on creation (Daniel)

Signed-off-by: Chris Wilson <***@chris-wilson.co.uk>
Cc: Kenneth Graunke <***@whitecape.org>
Cc: Matt Turner <***@gmail.com>
Cc: Jason Ekstrand <***@intel.com>
Cc: Daniel Vetter <***@ffwll.ch>
---
src/mesa/drivers/dri/i965/brw_bufmgr.c | 1 +
src/mesa/drivers/dri/i965/brw_bufmgr.h | 7 +++++++
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 18 ++++++++++++++++--
3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index da12a13152..4e43a448ae 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -353,6 +353,7 @@ retry:
p_atomic_set(&bo->refcount, 1);
bo->reusable = true;
bo->cache_coherent = bufmgr->has_llc;
+ bo->index = -1;

pthread_mutex_unlock(&bufmgr->lock);

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 4d671b6aae..27a27ca244 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -76,6 +76,13 @@ struct brw_bo {
uint64_t offset64;

/**
+ * Index of this buffer inside the batch, -1 when not in a batch. Note
+ * that a single buffer may be in multiple batches (contexts), the index
+ * only refers to its last use and should not be trusted!
+ */
+ unsigned int index;
+
+ /**
* Boolean of whether the GPU is definitely not accessing the buffer.
*
* This is only valid when reusable, since non-reusable
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 62d2fe8ef3..f76ece8d71 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -515,12 +515,20 @@ throttle(struct brw_context *brw)
}
}

+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
static void
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
if (bo != batch->bo) {
- for (int i = 0; i < batch->exec_count; i++) {
- if (batch->exec_bos[i] == bo)
+ unsigned int index = READ_ONCE(bo->index);
+
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return;
+
+ /* May have been shared between multiple active batches */
+ for (index = 0; index < batch->exec_count; index++) {
+ if (batch->exec_bos[index] == bo)
return;
}

@@ -553,6 +561,7 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
validation_entry->rsvd1 = 0;
validation_entry->rsvd2 = 0;

+ bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
batch->exec_count++;
batch->aperture_space += bo->size;
@@ -597,6 +606,7 @@ execbuffer(int fd,
struct brw_bo *bo = batch->exec_bos[i];

bo->idle = false;
+ bo->index = -1;

/* Update brw_bo::offset64 */
if (batch->exec_objects[i].offset != bo->offset64) {
@@ -742,6 +752,10 @@ brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
bool
brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
+ unsigned int index = READ_ONCE(bo->index);
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return true;
+
for (int i = 0; i < batch->exec_count; i++) {
if (batch->exec_bos[i] == bo)
return true;
--
2.13.2
Daniel Vetter
2017-07-07 10:36:52 UTC
Borrow a trick from anv, and use the last known index for the bo to skip
a search of the batch->exec_bo when adding a new relocation. In defence
against the bo being used in multiple batches simultaneously, we check
that this slot exists and points back to us.
v2: Also update brw_batch_references()
v3: Reset bo->index on creation (Daniel)
---
src/mesa/drivers/dri/i965/brw_bufmgr.c | 1 +
src/mesa/drivers/dri/i965/brw_bufmgr.h | 7 +++++++
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 18 ++++++++++++++++--
3 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index da12a13152..4e43a448ae 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
p_atomic_set(&bo->refcount, 1);
bo->reusable = true;
bo->cache_coherent = bufmgr->has_llc;
+ bo->index = -1;
pthread_mutex_unlock(&bufmgr->lock);
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 4d671b6aae..27a27ca244 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -76,6 +76,13 @@ struct brw_bo {
uint64_t offset64;
/**
+ * Index of this buffer inside the batch, -1 when not in a batch. Note
+ * that a single buffer may be in multiple batches (contexts), the index
+ * only refers to its last use and should not be trusted!
+ */
+ unsigned int index;
+
+ /**
* Boolean of whether the GPU is definitely not accessing the buffer.
*
* This is only valid when reusable, since non-reusable
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 62d2fe8ef3..f76ece8d71 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -515,12 +515,20 @@ throttle(struct brw_context *brw)
}
}
+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
static void
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
if (bo != batch->bo) {
- for (int i = 0; i < batch->exec_count; i++) {
- if (batch->exec_bos[i] == bo)
+ unsigned int index = READ_ONCE(bo->index);
+
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return;
+
+ /* May have been shared between multiple active batches */
+ for (index = 0; index < batch->exec_count; index++) {
+ if (batch->exec_bos[index] == bo)
return;
}
@@ -553,6 +561,7 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
validation_entry->rsvd1 = 0;
validation_entry->rsvd2 = 0;
+ bo->index = batch->exec_count;
batch->exec_bos[batch->exec_count] = bo;
batch->exec_count++;
batch->aperture_space += bo->size;
@@ -597,6 +606,7 @@ execbuffer(int fd,
struct brw_bo *bo = batch->exec_bos[i];
bo->idle = false;
+ bo->index = -1;
/* Update brw_bo::offset64 */
if (batch->exec_objects[i].offset != bo->offset64) {
@@ -742,6 +752,10 @@ brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
bool
brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
+ unsigned int index = READ_ONCE(bo->index);
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return true;
Yeah that's a neat addition.
+
for (int i = 0; i < batch->exec_count; i++) {
if (batch->exec_bos[i] == bo)
return true;
--
2.13.2
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch