Diffstat (limited to 'drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c')
-rw-r--r-- | drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 348 |
1 file changed, 293 insertions, 55 deletions
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 1db59eeb34db..0a42f1807f52 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -6,6 +6,7 @@
 #include <linux/circ_buf.h>
 
 #include "gem/i915_gem_context.h"
+#include "gem/i915_gem_lmem.h"
 #include "gt/gen8_engine_cs.h"
 #include "gt/intel_breadcrumbs.h"
 #include "gt/intel_context.h"
@@ -29,6 +30,7 @@
 #include "intel_guc_submission.h"
 
 #include "i915_drv.h"
+#include "i915_reg.h"
 #include "i915_trace.h"
 
 /**
@@ -65,7 +67,13 @@
  * corresponding G2H returns indicating the scheduling disable operation has
  * completed it is safe to unpin the context. While a disable is in flight it
  * isn't safe to resubmit the context so a fence is used to stall all future
- * requests of that context until the G2H is returned.
+ * requests of that context until the G2H is returned. Because this interaction
+ * with the GuC takes a non-zero amount of time we delay the disabling of
+ * scheduling after the pin count goes to zero by a configurable period of time
+ * (see SCHED_DISABLE_DELAY_MS). The thought is this gives the user a window of
+ * time to resubmit something on the context before doing this costly operation.
+ * This delay is only done if the context isn't closed and the guc_id usage is
+ * less than a threshold (see NUM_SCHED_DISABLE_GUC_IDS_THRESHOLD).
  *
  * Context deregistration:
  * Before a context can be destroyed or if we steal its guc_id we must
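The delayed disable described in the comment above is, at bottom, a standard delayed-work pattern. Below is a minimal sketch of that pattern using only the generic workqueue API; every demo_* name is invented for illustration and none of this is the driver's actual code.

#include <linux/workqueue.h>
#include <linux/jiffies.h>

struct demo_ctx {
	struct delayed_work sched_disable_work;	/* INIT_DELAYED_WORK(..., demo_sched_disable) */
};

static void demo_sched_disable(struct work_struct *wrk)
{
	struct demo_ctx *ctx =
		container_of(wrk, struct demo_ctx, sched_disable_work.work);

	/* the costly H2G schedule-disable for ctx would be issued here */
	(void)ctx;
}

/* pin count hit zero: arm the window instead of disabling right away */
static void demo_last_unpin(struct demo_ctx *ctx, unsigned int delay_ms)
{
	mod_delayed_work(system_unbound_wq, &ctx->sched_disable_work,
			 msecs_to_jiffies(delay_ms));
}

/* resubmission inside the window: a successful cancel means the disable
 * never ran, so the context can be reused without a new schedule-enable */
static void demo_resubmit(struct demo_ctx *ctx)
{
	cancel_delayed_work(&ctx->sched_disable_work);
}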
@@ -163,7 +171,8 @@ guc_create_parallel(struct intel_engine_cs **engines,
 #define SCHED_STATE_PENDING_ENABLE	BIT(5)
 #define SCHED_STATE_REGISTERED		BIT(6)
 #define SCHED_STATE_POLICY_REQUIRED	BIT(7)
-#define SCHED_STATE_BLOCKED_SHIFT	8
+#define SCHED_STATE_CLOSED		BIT(8)
+#define SCHED_STATE_BLOCKED_SHIFT	9
 #define SCHED_STATE_BLOCKED		BIT(SCHED_STATE_BLOCKED_SHIFT)
 #define SCHED_STATE_BLOCKED_MASK	(0xfff << SCHED_STATE_BLOCKED_SHIFT)
 
@@ -173,12 +182,20 @@ static inline void init_sched_state(struct intel_context *ce)
 	ce->guc_state.sched_state &= SCHED_STATE_BLOCKED_MASK;
 }
 
+/*
+ * Kernel contexts can have SCHED_STATE_REGISTERED after suspend.
+ * A context close can race with the submission path, so SCHED_STATE_CLOSED
+ * can be set immediately before we try to register.
+ */
+#define SCHED_STATE_VALID_INIT \
+	(SCHED_STATE_BLOCKED_MASK | \
+	 SCHED_STATE_CLOSED | \
+	 SCHED_STATE_REGISTERED)
+
 __maybe_unused
 static bool sched_state_is_init(struct intel_context *ce)
 {
-	/* Kernel contexts can have SCHED_STATE_REGISTERED after suspend. */
-	return !(ce->guc_state.sched_state &
-		 ~(SCHED_STATE_BLOCKED_MASK | SCHED_STATE_REGISTERED));
+	return !(ce->guc_state.sched_state & ~SCHED_STATE_VALID_INIT);
 }
 
 static inline bool
@@ -319,6 +336,17 @@ static inline void clr_context_policy_required(struct intel_context *ce)
 	ce->guc_state.sched_state &= ~SCHED_STATE_POLICY_REQUIRED;
 }
 
+static inline bool context_close_done(struct intel_context *ce)
+{
+	return ce->guc_state.sched_state & SCHED_STATE_CLOSED;
+}
+
+static inline void set_context_close_done(struct intel_context *ce)
+{
+	lockdep_assert_held(&ce->guc_state.lock);
+	ce->guc_state.sched_state |= SCHED_STATE_CLOSED;
+}
+
 static inline u32 context_blocked(struct intel_context *ce)
 {
 	return (ce->guc_state.sched_state & SCHED_STATE_BLOCKED_MASK) >>
@@ -343,25 +371,6 @@ static inline void decr_context_blocked(struct intel_context *ce)
 	ce->guc_state.sched_state -= SCHED_STATE_BLOCKED;
 }
 
-static inline bool context_has_committed_requests(struct intel_context *ce)
-{
-	return !!ce->guc_state.number_committed_requests;
-}
-
-static inline void incr_context_committed_requests(struct intel_context *ce)
-{
-	lockdep_assert_held(&ce->guc_state.lock);
-	++ce->guc_state.number_committed_requests;
-	GEM_BUG_ON(ce->guc_state.number_committed_requests < 0);
-}
-
-static inline void decr_context_committed_requests(struct intel_context *ce)
-{
-	lockdep_assert_held(&ce->guc_state.lock);
-	--ce->guc_state.number_committed_requests;
-	GEM_BUG_ON(ce->guc_state.number_committed_requests < 0);
-}
-
 static struct intel_context *
 request_to_scheduling_context(struct i915_request *rq)
 {
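One note on the defines above: sched_state is a single u32 in which the low bits are boolean flags (SCHED_STATE_CLOSED now among them) and the bits from SCHED_STATE_BLOCKED_SHIFT upward form a counter, incremented and decremented by adding or subtracting SCHED_STATE_BLOCKED. A standalone sketch of the same packing, with invented DEMO_* names:

#include <linux/bits.h>
#include <linux/types.h>

#define DEMO_ENABLED		BIT(0)
#define DEMO_CLOSED		BIT(1)
#define DEMO_BLOCKED_SHIFT	9
#define DEMO_BLOCKED		BIT(DEMO_BLOCKED_SHIFT)
#define DEMO_BLOCKED_MASK	(0xfff << DEMO_BLOCKED_SHIFT)

/* flags and counter live in one u32, so a single lock protects both */
static inline u32 demo_blocked_count(u32 state)
{
	return (state & DEMO_BLOCKED_MASK) >> DEMO_BLOCKED_SHIFT;
}

static inline u32 demo_incr_blocked(u32 state)
{
	/* adding BIT(shift) bumps the counter without touching the flags */
	return state + DEMO_BLOCKED;
}

static inline u32 demo_decr_blocked(u32 state)
{
	return state - DEMO_BLOCKED;
}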
@@ -1067,6 +1076,12 @@ static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
 
 		xa_unlock(&guc->context_lookup);
 
+		if (test_bit(CONTEXT_GUC_INIT, &ce->flags) &&
+		    (cancel_delayed_work(&ce->guc_state.sched_disable_delay_work))) {
+			/* successful cancel so jump straight to close it */
+			intel_context_sched_disable_unpin(ce);
+		}
+
 		spin_lock(&ce->guc_state.lock);
 
 		/*
@@ -1387,7 +1402,9 @@ static void guc_timestamp_ping(struct work_struct *wrk)
 
 	/*
 	 * Synchronize with gt reset to make sure the worker does not
-	 * corrupt the engine/guc stats.
+	 * corrupt the engine/guc stats. NB: can't actually block waiting
+	 * for a reset to complete as the reset requires flushing out
+	 * this worker thread if started. So waiting would deadlock.
 	 */
 	ret = intel_gt_reset_trylock(gt, &srcu);
 	if (ret)
@@ -1994,6 +2011,9 @@ static int new_guc_id(struct intel_guc *guc, struct intel_context *ce)
 	if (unlikely(ret < 0))
 		return ret;
 
+	if (!intel_context_is_parent(ce))
+		++guc->submission_state.guc_ids_in_use;
+
 	ce->guc_id.id = ret;
 	return 0;
 }
@@ -2003,14 +2023,16 @@ static void __release_guc_id(struct intel_guc *guc, struct intel_context *ce)
 	GEM_BUG_ON(intel_context_is_child(ce));
 
 	if (!context_guc_id_invalid(ce)) {
-		if (intel_context_is_parent(ce))
+		if (intel_context_is_parent(ce)) {
 			bitmap_release_region(guc->submission_state.guc_ids_bitmap,
 					      ce->guc_id.id,
 					      order_base_2(ce->parallel.number_children
 							   + 1));
-		else
+		} else {
+			--guc->submission_state.guc_ids_in_use;
 			ida_simple_remove(&guc->submission_state.guc_ids,
 					  ce->guc_id.id);
+		}
 		clr_ctx_id_mapping(guc, ce->guc_id.id);
 		set_context_guc_id_invalid(ce);
 	}
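The guc_ids_in_use counter threaded through new_guc_id()/__release_guc_id() above tracks only single-LRC ids, the ones handed out by the ida; parent (multi-LRC) ids come from a separate bitmap and are excluded. A hedged sketch of that accounting pattern, using the same ida_simple_* helpers the driver calls; the demo_* names and the threshold field are illustrative:

#include <linux/idr.h>
#include <linux/gfp.h>

struct demo_ids {
	struct ida ida;			/* assumed initialized with ida_init() */
	unsigned int in_use;
	unsigned int pressure_threshold;
};

static int demo_get_id(struct demo_ids *ids, unsigned int max)
{
	/* allocates the lowest free id in [0, max) */
	int id = ida_simple_get(&ids->ida, 0, max, GFP_KERNEL);

	if (id >= 0)
		ids->in_use++;
	return id;
}

static void demo_put_id(struct demo_ids *ids, int id)
{
	ids->in_use--;
	ida_simple_remove(&ids->ida, id);
}

/* mirrors the guc_id_pressure() test: past the threshold, skip the delay */
static bool demo_under_pressure(const struct demo_ids *ids)
{
	return ids->in_use > ids->pressure_threshold;
}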
@@ -2429,6 +2451,10 @@ static int guc_context_policy_init_v70(struct intel_context *ce, bool loop)
 	int ret;
 
 	/* NB: For both of these, zero means disabled. */
+	GEM_BUG_ON(overflows_type(engine->props.timeslice_duration_ms * 1000,
+				  execution_quantum));
+	GEM_BUG_ON(overflows_type(engine->props.preempt_timeout_ms * 1000,
+				  preemption_timeout));
 	execution_quantum = engine->props.timeslice_duration_ms * 1000;
 	preemption_timeout = engine->props.preempt_timeout_ms * 1000;
 
@@ -2462,6 +2488,10 @@ static void guc_context_policy_init_v69(struct intel_engine_cs *engine,
 		desc->policy_flags |= CONTEXT_POLICY_FLAG_PREEMPT_TO_IDLE_V69;
 
 	/* NB: For both of these, zero means disabled. */
+	GEM_BUG_ON(overflows_type(engine->props.timeslice_duration_ms * 1000,
+				  desc->execution_quantum));
+	GEM_BUG_ON(overflows_type(engine->props.preempt_timeout_ms * 1000,
+				  desc->preemption_timeout));
 	desc->execution_quantum = engine->props.timeslice_duration_ms * 1000;
 	desc->preemption_timeout = engine->props.preempt_timeout_ms * 1000;
 }
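The new GEM_BUG_ON(overflows_type(...)) checks above guard the millisecond-to-microsecond conversions feeding the u32 GuC policy fields. The underlying hazard can be stated as a one-line check (illustrative, not the driver's overflows_type() implementation):

#include <linux/limits.h>
#include <linux/types.h>

/* engine->props values are milliseconds; the GuC wants microseconds in a u32 */
static bool demo_ms_to_us_overflows(unsigned long ms)
{
	/* above U32_MAX / 1000 (about 71.6 minutes) the product no longer fits */
	return ms > U32_MAX / 1000;
}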
@@ -2998,41 +3028,104 @@ guc_context_revoke(struct intel_context *ce, struct i915_request *rq,
 	}
 }
 
-static void guc_context_sched_disable(struct intel_context *ce)
+static void do_sched_disable(struct intel_guc *guc, struct intel_context *ce,
+			     unsigned long flags)
+	__releases(ce->guc_state.lock)
 {
-	struct intel_guc *guc = ce_to_guc(ce);
-	unsigned long flags;
 	struct intel_runtime_pm *runtime_pm = &ce->engine->gt->i915->runtime_pm;
 	intel_wakeref_t wakeref;
 	u16 guc_id;
 
+	lockdep_assert_held(&ce->guc_state.lock);
+	guc_id = prep_context_pending_disable(ce);
+
+	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
+
+	with_intel_runtime_pm(runtime_pm, wakeref)
+		__guc_context_sched_disable(guc, ce, guc_id);
+}
+
+static bool bypass_sched_disable(struct intel_guc *guc,
+				 struct intel_context *ce)
+{
+	lockdep_assert_held(&ce->guc_state.lock);
 	GEM_BUG_ON(intel_context_is_child(ce));
 
+	if (submission_disabled(guc) || context_guc_id_invalid(ce) ||
+	    !ctx_id_mapped(guc, ce->guc_id.id)) {
+		clr_context_enabled(ce);
+		return true;
+	}
+
+	return !context_enabled(ce);
+}
+
+static void __delay_sched_disable(struct work_struct *wrk)
+{
+	struct intel_context *ce =
+		container_of(wrk, typeof(*ce), guc_state.sched_disable_delay_work.work);
+	struct intel_guc *guc = ce_to_guc(ce);
+	unsigned long flags;
+
 	spin_lock_irqsave(&ce->guc_state.lock, flags);
 
+	if (bypass_sched_disable(guc, ce)) {
+		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
+		intel_context_sched_disable_unpin(ce);
+	} else {
+		do_sched_disable(guc, ce, flags);
+	}
+}
+
+static bool guc_id_pressure(struct intel_guc *guc, struct intel_context *ce)
+{
 	/*
-	 * We have to check if the context has been disabled by another thread,
-	 * check if submssion has been disabled to seal a race with reset and
-	 * finally check if any more requests have been committed to the
-	 * context ensursing that a request doesn't slip through the
-	 * 'context_pending_disable' fence.
+	 * parent contexts are perma-pinned, if we are unpinning do schedule
+	 * disable immediately.
 	 */
-	if (unlikely(!context_enabled(ce) || submission_disabled(guc) ||
-		     context_has_committed_requests(ce))) {
-		clr_context_enabled(ce);
+	if (intel_context_is_parent(ce))
+		return true;
+
+	/*
+	 * If we are beyond the threshold for avail guc_ids, do schedule disable immediately.
+	 */
+	return guc->submission_state.guc_ids_in_use >
+		guc->submission_state.sched_disable_gucid_threshold;
+}
+
+static void guc_context_sched_disable(struct intel_context *ce)
+{
+	struct intel_guc *guc = ce_to_guc(ce);
+	u64 delay = guc->submission_state.sched_disable_delay_ms;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ce->guc_state.lock, flags);
+
+	if (bypass_sched_disable(guc, ce)) {
+		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
+		intel_context_sched_disable_unpin(ce);
+	} else if (!intel_context_is_closed(ce) && !guc_id_pressure(guc, ce) &&
+		   delay) {
 		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
-		goto unpin;
+		mod_delayed_work(system_unbound_wq,
+				 &ce->guc_state.sched_disable_delay_work,
+				 msecs_to_jiffies(delay));
+	} else {
+		do_sched_disable(guc, ce, flags);
 	}
-	guc_id = prep_context_pending_disable(ce);
+}
 
-	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
+static void guc_context_close(struct intel_context *ce)
+{
+	unsigned long flags;
 
-	with_intel_runtime_pm(runtime_pm, wakeref)
-		__guc_context_sched_disable(guc, ce, guc_id);
+	if (test_bit(CONTEXT_GUC_INIT, &ce->flags) &&
+	    cancel_delayed_work(&ce->guc_state.sched_disable_delay_work))
+		__delay_sched_disable(&ce->guc_state.sched_disable_delay_work.work);
 
-	return;
-unpin:
-	intel_context_sched_disable_unpin(ce);
+	spin_lock_irqsave(&ce->guc_state.lock, flags);
+	set_context_close_done(ce);
+	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
 }
 
 static inline void guc_lrc_desc_unpin(struct intel_context *ce)
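guc_context_close() above relies on the return value of cancel_delayed_work(): true means the work was still queued and never ran, so the canceling thread now owns the disable and runs it synchronously; false means the worker already ran or is currently running. Reusing the demo_ctx sketch from earlier, the handoff looks roughly like this:

static void demo_close(struct demo_ctx *ctx)
{
	/*
	 * A successful cancel transfers ownership of the pending disable
	 * to us; run it synchronously so the context is quiesced by the
	 * time close returns.
	 */
	if (cancel_delayed_work(&ctx->sched_disable_work))
		demo_sched_disable(&ctx->sched_disable_work.work);
}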
@@ -3071,7 +3164,6 @@ static void __guc_context_destroy(struct intel_context *ce)
 		   ce->guc_state.prio_count[GUC_CLIENT_PRIORITY_HIGH] ||
 		   ce->guc_state.prio_count[GUC_CLIENT_PRIORITY_KMD_NORMAL] ||
 		   ce->guc_state.prio_count[GUC_CLIENT_PRIORITY_NORMAL]);
-	GEM_BUG_ON(ce->guc_state.number_committed_requests);
 
 	lrc_fini(ce);
 	intel_context_fini(ce);
@@ -3340,8 +3432,6 @@ static void remove_from_context(struct i915_request *rq)
 
 	guc_prio_fini(rq, ce);
 
-	decr_context_committed_requests(ce);
-
 	spin_unlock_irq(&ce->guc_state.lock);
 
 	atomic_dec(&ce->guc_id.ref);
@@ -3351,6 +3441,8 @@ static void remove_from_context(struct i915_request *rq)
 static const struct intel_context_ops guc_context_ops = {
 	.alloc = guc_context_alloc,
 
+	.close = guc_context_close,
+
 	.pre_pin = guc_context_pre_pin,
 	.pin = guc_context_pin,
 	.unpin = guc_context_unpin,
@@ -3433,6 +3525,10 @@ static void guc_context_init(struct intel_context *ce)
 	rcu_read_unlock();
 
 	ce->guc_state.prio = map_i915_prio_to_guc_prio(prio);
+
+	INIT_DELAYED_WORK(&ce->guc_state.sched_disable_delay_work,
+			  __delay_sched_disable);
+
 	set_bit(CONTEXT_GUC_INIT, &ce->flags);
 }
 
@@ -3471,6 +3567,26 @@ static int guc_request_alloc(struct i915_request *rq)
 		guc_context_init(ce);
 
 	/*
+	 * If the context gets closed while the execbuf is ongoing, the context
+	 * close code will race with the below code to cancel the delayed work.
+	 * If the context close wins the race and cancels the work, it will
+	 * immediately call the sched disable (see guc_context_close), so there
+	 * is a chance we can get past this check while the sched_disable code
+	 * is being executed. To make sure that code completes before we check
+	 * the status further down, we wait for the close process to complete.
+	 * Else, this code path could send a request down thinking that the
+	 * context is still in a schedule-enable mode while the GuC ends up
+	 * dropping the request completely because the disable did go from the
+	 * context_close path right to GuC just prior. In the event the CT is
+	 * full, we could potentially need to wait up to 1.5 seconds.
+	 */
+	if (cancel_delayed_work_sync(&ce->guc_state.sched_disable_delay_work))
+		intel_context_sched_disable_unpin(ce);
+	else if (intel_context_is_closed(ce))
+		if (wait_for(context_close_done(ce), 1500))
+			drm_warn(&guc_to_gt(guc)->i915->drm,
+				 "timed out waiting on context sched close before realloc\n");
+	/*
 	 * Call pin_guc_id here rather than in the pinning step as with
 	 * dma_resv, contexts can be repeatedly pinned / unpinned trashing the
 	 * guc_id and creating horrible race conditions. This is especially bad
@@ -3524,7 +3640,6 @@ out:
 		list_add_tail(&rq->guc_fence_link, &ce->guc_state.fences);
 	}
 
-	incr_context_committed_requests(ce);
 	spin_unlock_irqrestore(&ce->guc_state.lock, flags);
 
 	return 0;
@@ -3600,6 +3715,8 @@ static int guc_virtual_context_alloc(struct intel_context *ce)
 static const struct intel_context_ops virtual_guc_context_ops = {
 	.alloc = guc_virtual_context_alloc,
 
+	.close = guc_context_close,
+
 	.pre_pin = guc_virtual_context_pre_pin,
 	.pin = guc_virtual_context_pin,
 	.unpin = guc_virtual_context_unpin,
@@ -3689,6 +3806,8 @@ static void guc_child_context_destroy(struct kref *kref)
 static const struct intel_context_ops virtual_parent_context_ops = {
 	.alloc = guc_virtual_context_alloc,
 
+	.close = guc_context_close,
+
 	.pre_pin = guc_context_pre_pin,
 	.pin = guc_parent_context_pin,
 	.unpin = guc_parent_context_unpin,
@@ -3995,6 +4114,9 @@ static inline void guc_kernel_context_pin(struct intel_guc *guc,
 	if (context_guc_id_invalid(ce))
 		pin_guc_id(guc, ce);
 
+	if (!test_bit(CONTEXT_GUC_INIT, &ce->flags))
+		guc_context_init(ce);
+
 	try_context_registration(ce, true);
 }
 
@@ -4093,7 +4215,7 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine)
 
 	engine->emit_bb_start = gen8_emit_bb_start;
 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
-		engine->emit_bb_start = gen125_emit_bb_start;
+		engine->emit_bb_start = xehp_emit_bb_start;
 }
 
 static void rcs_submission_override(struct intel_engine_cs *engine)
@@ -4177,6 +4299,98 @@ int intel_guc_submission_setup(struct intel_engine_cs *engine)
 	return 0;
 }
 
+struct scheduling_policy {
+	/* internal data */
+	u32 max_words, num_words;
+	u32 count;
+	/* API data */
+	struct guc_update_scheduling_policy h2g;
+};
+
+static u32 __guc_scheduling_policy_action_size(struct scheduling_policy *policy)
+{
+	u32 *start = (void *)&policy->h2g;
+	u32 *end = policy->h2g.data + policy->num_words;
+	size_t delta = end - start;
+
+	return delta;
+}
+
+static struct scheduling_policy *__guc_scheduling_policy_start_klv(struct scheduling_policy *policy)
+{
+	policy->h2g.header.action = INTEL_GUC_ACTION_UPDATE_SCHEDULING_POLICIES_KLV;
+	policy->max_words = ARRAY_SIZE(policy->h2g.data);
+	policy->num_words = 0;
+	policy->count = 0;
+
+	return policy;
+}
+
+static void __guc_scheduling_policy_add_klv(struct scheduling_policy *policy,
+					    u32 action, u32 *data, u32 len)
+{
+	u32 *klv_ptr = policy->h2g.data + policy->num_words;
+
+	GEM_BUG_ON((policy->num_words + 1 + len) > policy->max_words);
+	*(klv_ptr++) = FIELD_PREP(GUC_KLV_0_KEY, action) |
+		       FIELD_PREP(GUC_KLV_0_LEN, len);
+	memcpy(klv_ptr, data, sizeof(u32) * len);
+	policy->num_words += 1 + len;
+	policy->count++;
+}
+
+static int __guc_action_set_scheduling_policies(struct intel_guc *guc,
+						struct scheduling_policy *policy)
+{
+	int ret;
+
+	ret = intel_guc_send(guc, (u32 *)&policy->h2g,
+			     __guc_scheduling_policy_action_size(policy));
+	if (ret < 0)
+		return ret;
+
+	if (ret != policy->count) {
+		drm_warn(&guc_to_gt(guc)->i915->drm, "GuC global scheduler policy processed %d of %d KLVs!",
+			 ret, policy->count);
+		if (ret > policy->count)
+			return -EPROTO;
+	}
+
+	return 0;
+}
+
+static int guc_init_global_schedule_policy(struct intel_guc *guc)
+{
+	struct scheduling_policy policy;
+	struct intel_gt *gt = guc_to_gt(guc);
+	intel_wakeref_t wakeref;
+	int ret = 0;
+
+	if (GET_UC_VER(guc) < MAKE_UC_VER(70, 3, 0))
+		return 0;
+
+	__guc_scheduling_policy_start_klv(&policy);
+
+	with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref) {
+		u32 yield[] = {
+			GLOBAL_SCHEDULE_POLICY_RC_YIELD_DURATION,
+			GLOBAL_SCHEDULE_POLICY_RC_YIELD_RATIO,
+		};
+
+		__guc_scheduling_policy_add_klv(&policy,
+						GUC_SCHEDULING_POLICIES_KLV_ID_RENDER_COMPUTE_YIELD,
+						yield, ARRAY_SIZE(yield));
+
+		ret = __guc_action_set_scheduling_policies(guc, &policy);
+		if (ret)
+			i915_probe_error(gt->i915,
+					 "Failed to configure global scheduling policies: %pe!\n",
+					 ERR_PTR(ret));
+	}
+
+	return ret;
+}
+
 void intel_guc_submission_enable(struct intel_guc *guc)
 {
 	struct intel_gt *gt = guc_to_gt(guc);
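The H2G message assembled above is a key-length-value (KLV) stream: each entry is one header dword packing the key and payload length, followed by the payload dwords, which is what __guc_scheduling_policy_add_klv() emits. A self-contained sketch of that encoding; the DEMO_KLV_* field masks are stand-ins for the real GUC_KLV_0_KEY/GUC_KLV_0_LEN definitions:

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/types.h>

#define DEMO_KLV_KEY	GENMASK(31, 16)
#define DEMO_KLV_LEN	GENMASK(15, 0)

/* returns the number of dwords written: one header plus len payload dwords */
static u32 demo_klv_emit(u32 *buf, u32 key, const u32 *data, u32 len)
{
	u32 i;

	buf[0] = FIELD_PREP(DEMO_KLV_KEY, key) | FIELD_PREP(DEMO_KLV_LEN, len);
	for (i = 0; i < len; i++)
		buf[1 + i] = data[i];

	return 1 + len;
}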
@@ -4189,6 +4403,7 @@ void intel_guc_submission_enable(struct intel_guc *guc)
 
 	guc_init_lrc_mapping(guc);
 	guc_init_engine_stats(guc);
+	guc_init_global_schedule_policy(guc);
 }
 
 void intel_guc_submission_disable(struct intel_guc *guc)
@@ -4219,6 +4434,26 @@ static bool __guc_submission_selected(struct intel_guc *guc)
 	return i915->params.enable_guc & ENABLE_GUC_SUBMISSION;
 }
 
+int intel_guc_sched_disable_gucid_threshold_max(struct intel_guc *guc)
+{
+	return guc->submission_state.num_guc_ids - NUMBER_MULTI_LRC_GUC_ID(guc);
+}
+
+/*
+ * This default value of 33 millisecs (+1 millisec round up) ensures 30fps or higher
+ * workloads are able to enjoy the latency reduction when delaying the schedule-disable
+ * operation. This matches the 30fps game-render + encode (real world) workload this
+ * knob was tested against.
+ */
+#define SCHED_DISABLE_DELAY_MS	34
+
+/*
+ * A threshold of 75% is a reasonable starting point considering that real world apps
+ * generally don't get anywhere near this.
+ */
+#define NUM_SCHED_DISABLE_GUCIDS_DEFAULT_THRESHOLD(__guc) \
+	(((intel_guc_sched_disable_gucid_threshold_max(__guc)) * 3) / 4)
+
 void intel_guc_submission_init_early(struct intel_guc *guc)
 {
 	xa_init_flags(&guc->context_lookup, XA_FLAGS_LOCK_IRQ);
@@ -4235,7 +4470,10 @@
 	spin_lock_init(&guc->timestamp.lock);
 	INIT_DELAYED_WORK(&guc->timestamp.work, guc_timestamp_ping);
 
+	guc->submission_state.sched_disable_delay_ms = SCHED_DISABLE_DELAY_MS;
 	guc->submission_state.num_guc_ids = GUC_MAX_CONTEXT_ID;
+	guc->submission_state.sched_disable_gucid_threshold =
+		NUM_SCHED_DISABLE_GUCIDS_DEFAULT_THRESHOLD(guc);
 	guc->submission_supported = __guc_submission_supported(guc);
 	guc->submission_selected = __guc_submission_selected(guc);
 }
@@ -4669,7 +4907,7 @@ void intel_guc_submission_print_info(struct intel_guc *guc,
 
 	drm_printf(p, "GuC Number Outstanding Submission G2H: %u\n",
 		   atomic_read(&guc->outstanding_submission_g2h));
-	drm_printf(p, "GuC tasklet count: %u\n\n",
+	drm_printf(p, "GuC tasklet count: %u\n",
 		   atomic_read(&sched_engine->tasklet.count));
 
 	spin_lock_irqsave(&sched_engine->lock, flags);
@@ -4717,7 +4955,7 @@ static inline void guc_log_context(struct drm_printer *p,
 		   atomic_read(&ce->pin_count));
 	drm_printf(p, "\t\tGuC ID Ref Count: %u\n",
 		   atomic_read(&ce->guc_id.ref));
-	drm_printf(p, "\t\tSchedule State: 0x%x\n\n",
+	drm_printf(p, "\t\tSchedule State: 0x%x\n",
 		   ce->guc_state.sched_state);
 }
 
@@ -4746,7 +4984,7 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
 			   READ_ONCE(*ce->parallel.guc.wq_head));
 		drm_printf(p, "\t\tWQI Tail: %u\n",
 			   READ_ONCE(*ce->parallel.guc.wq_tail));
-		drm_printf(p, "\t\tWQI Status: %u\n\n",
+		drm_printf(p, "\t\tWQI Status: %u\n",
 			   READ_ONCE(*ce->parallel.guc.wq_status));
 	}
 
@@ -4754,7 +4992,7 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
 	    emit_bb_start_parent_no_preempt_mid_batch) {
 		u8 i;
 
-		drm_printf(p, "\t\tChildren Go: %u\n\n",
+		drm_printf(p, "\t\tChildren Go: %u\n",
 			   get_children_go_value(ce));
 		for (i = 0; i < ce->parallel.number_children; ++i)
 			drm_printf(p, "\t\tChildren Join: %u\n",
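For a feel of the numbers in the threshold defines above, here is the 75% computation worked with purely hypothetical id counts; the real inputs come from num_guc_ids and NUMBER_MULTI_LRC_GUC_ID():

#include <linux/types.h>

static u32 demo_default_threshold(u32 num_ids, u32 multi_lrc_ids)
{
	u32 max = num_ids - multi_lrc_ids;	/* e.g. 65536 - 4096 = 61440 */

	return max * 3 / 4;			/* e.g. 46080, i.e. 75% */
}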