summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorJonathan Kim <jonathan.kim@amd.com>2020-03-17 15:43:41 -0400
committerAlex Deucher <alexander.deucher@amd.com>2020-04-22 18:11:46 -0400
commitd84a430d9f7b1ce6baedf1305106d0ae706aca76 (patch)
treebbed9c0ce3c43692244ee9033dc685be34c842f2 /drivers
parent9f65693577d976792e6dbad9ad42105b129259c6 (diff)
downloadlinux-stable-d84a430d9f7b1ce6baedf1305106d0ae706aca76.tar.gz
linux-stable-d84a430d9f7b1ce6baedf1305106d0ae706aca76.tar.bz2
linux-stable-d84a430d9f7b1ce6baedf1305106d0ae706aca76.zip
drm/amdgpu: fix race between pstate and remote buffer map
Vega20 arbitrates pstate at hive level and not device level. Last peer to remote buffer unmap could drop P-State while another process is still remote buffer mapped. With this fix, P-States still needs to be disabled for now as SMU bug was discovered on synchronous P2P transfers. This should be fixed in the next FW update. Signed-off-by: Jonathan Kim <Jonathan.Kim@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu.h2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c16
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c68
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h9
6 files changed, 49 insertions, 53 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 215f57765e5e..8a7f794e3ffa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -986,8 +986,6 @@ struct amdgpu_device {
uint64_t unique_id;
uint64_t df_perfmon_config_assign_mask[AMDGPU_MAX_DF_PERFMONS];
- /* device pstate */
- int pstate;
/* enable runtime pm on the device */
bool runpm;
bool in_runpm;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9da5fc805d00..ad74fd88fa76 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2248,7 +2248,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
if (gpu_instance->adev->flags & AMD_IS_APU)
continue;
- r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+ r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
+ AMDGPU_XGMI_PSTATE_MIN);
if (r) {
DRM_ERROR("pstate setting failed (%d).\n", r);
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 6d9252a27916..c6c3fc276fa4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2124,11 +2124,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) &&
(bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)) {
bo_va->is_xgmi = true;
- mutex_lock(&adev->vm_manager.lock_pstate);
/* Power up XGMI if it can be potentially used */
- if (++adev->vm_manager.xgmi_map_counter == 1)
- amdgpu_xgmi_set_pstate(adev, 1);
- mutex_unlock(&adev->vm_manager.lock_pstate);
+ amdgpu_xgmi_set_pstate(adev, AMDGPU_XGMI_PSTATE_MAX_VEGA20);
}
return bo_va;
@@ -2551,12 +2548,8 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
dma_fence_put(bo_va->last_pt_update);
- if (bo && bo_va->is_xgmi) {
- mutex_lock(&adev->vm_manager.lock_pstate);
- if (--adev->vm_manager.xgmi_map_counter == 0)
- amdgpu_xgmi_set_pstate(adev, 0);
- mutex_unlock(&adev->vm_manager.lock_pstate);
- }
+ if (bo && bo_va->is_xgmi)
+ amdgpu_xgmi_set_pstate(adev, AMDGPU_XGMI_PSTATE_MIN);
kfree(bo_va);
}
@@ -3166,9 +3159,6 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev)
idr_init(&adev->vm_manager.pasid_idr);
spin_lock_init(&adev->vm_manager.pasid_lock);
-
- adev->vm_manager.xgmi_map_counter = 0;
- mutex_init(&adev->vm_manager.lock_pstate);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 06fe30e1492d..b13c14d6b820 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -349,10 +349,6 @@ struct amdgpu_vm_manager {
*/
struct idr pasid_idr;
spinlock_t pasid_lock;
-
- /* counter of mapped memory through xgmi */
- uint32_t xgmi_map_counter;
- struct mutex lock_pstate;
};
#define amdgpu_vm_copy_pte(adev, ib, pe, src, count) ((adev)->vm_manager.vm_pte_funcs->copy_pte((ib), (pe), (src), (count)))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 8c3215505e78..54d8a3e7e75c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -373,7 +373,13 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
if (lock)
mutex_lock(&tmp->hive_lock);
- tmp->pstate = -1;
+ tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
+ tmp->hi_req_gpu = NULL;
+ /*
+ * hive pstate on boot is high in vega20 so we have to go to low
+ * pstate on after boot.
+ */
+ tmp->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
mutex_unlock(&xgmi_mutex);
return tmp;
@@ -383,50 +389,51 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
int ret = 0;
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
- struct amdgpu_device *tmp_adev;
- bool update_hive_pstate = true;
- bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20;
+ struct amdgpu_device *request_adev = hive->hi_req_gpu ?
+ hive->hi_req_gpu : adev;
+ bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
+ bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
- if (!hive)
+ /* fw bug so temporarily disable pstate switching */
+ if (!hive || adev->asic_type == CHIP_VEGA20)
return 0;
mutex_lock(&hive->hive_lock);
- if (hive->pstate == pstate) {
- adev->pstate = is_high_pstate ? pstate : adev->pstate;
+ if (is_hi_req)
+ hive->hi_req_count++;
+ else
+ hive->hi_req_count--;
+
+ /*
+ * Vega20 only needs single peer to request pstate high for the hive to
+ * go high but all peers must request pstate low for the hive to go low
+ */
+ if (hive->pstate == pstate ||
+ (!is_hi_req && hive->hi_req_count && !init_low))
goto out;
- }
- dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
+ dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);
- ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate);
+ ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
if (ret) {
- dev_err(adev->dev,
+ dev_err(request_adev->dev,
"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
- adev->gmc.xgmi.node_id,
- adev->gmc.xgmi.hive_id, ret);
+ request_adev->gmc.xgmi.node_id,
+ request_adev->gmc.xgmi.hive_id, ret);
goto out;
}
- /* Update device pstate */
- adev->pstate = pstate;
-
- /*
- * Update the hive pstate only all devices of the hive
- * are in the same pstate
- */
- list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
- if (tmp_adev->pstate != adev->pstate) {
- update_hive_pstate = false;
- break;
- }
- }
- if (update_hive_pstate || is_high_pstate)
+ if (init_low)
+ hive->pstate = hive->hi_req_count ?
+ hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
+ else {
hive->pstate = pstate;
-
+ hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
+ adev : NULL;
+ }
out:
mutex_unlock(&hive->hive_lock);
-
return ret;
}
@@ -507,9 +514,6 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
goto exit;
}
- /* Set default device pstate */
- adev->pstate = -1;
-
top_info = &adev->psp.xgmi_context.top_info;
list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index d5a63904ec33..6999eab16a72 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -25,6 +25,7 @@
#include <drm/task_barrier.h>
#include "amdgpu_psp.h"
+
struct amdgpu_hive_info {
uint64_t hive_id;
struct list_head device_list;
@@ -33,8 +34,14 @@ struct amdgpu_hive_info {
struct kobject *kobj;
struct device_attribute dev_attr;
struct amdgpu_device *adev;
- int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+ int hi_req_count;
+ struct amdgpu_device *hi_req_gpu;
struct task_barrier tb;
+ enum {
+ AMDGPU_XGMI_PSTATE_MIN,
+ AMDGPU_XGMI_PSTATE_MAX_VEGA20,
+ AMDGPU_XGMI_PSTATE_UNKNOWN
+ } pstate;
};
struct amdgpu_pcs_ras_field {