summaryrefslogtreecommitdiffstats
path: root/drivers/misc/habanalabs/common
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/habanalabs/common')
-rw-r--r--drivers/misc/habanalabs/common/command_submission.c2
-rw-r--r--drivers/misc/habanalabs/common/firmware_if.c53
-rw-r--r--drivers/misc/habanalabs/common/habanalabs.h23
-rw-r--r--drivers/misc/habanalabs/common/habanalabs_drv.c7
-rw-r--r--drivers/misc/habanalabs/common/sysfs.c4
5 files changed, 56 insertions, 33 deletions
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index ff8791a651fd..af3c497defb1 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -2017,7 +2017,7 @@ wait_again:
if (completion_value >= target_value) {
*status = CS_WAIT_STATUS_COMPLETED;
} else {
- timeout -= jiffies_to_usecs(completion_rc);
+ timeout = completion_rc;
goto wait_again;
}
} else {
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 832dd5c5bb06..0713b2c12d54 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -362,12 +362,9 @@ static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
}
if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) {
- dev_warn(hdev->dev,
+ dev_err(hdev->dev,
"Device boot warning - security not ready\n");
- /* This is a warning so we don't want it to disable the
- * device
- */
- err_val &= ~CPU_BOOT_ERR0_SECURITY_NOT_RDY;
+ err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) {
@@ -403,7 +400,8 @@ static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
err_exists = true;
}
- if (err_exists)
+ if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) &
+ lower_32_bits(hdev->boot_error_status_mask)))
return -EIO;
return 0;
@@ -661,18 +659,13 @@ int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy)
return rc;
}
-int get_used_pll_index(struct hl_device *hdev, enum pll_index input_pll_index,
+int get_used_pll_index(struct hl_device *hdev, u32 input_pll_index,
enum pll_index *pll_index)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
u8 pll_byte, pll_bit_off;
bool dynamic_pll;
-
- if (input_pll_index >= PLL_MAX) {
- dev_err(hdev->dev, "PLL index %d is out of range\n",
- input_pll_index);
- return -EINVAL;
- }
+ int fw_pll_idx;
dynamic_pll = prop->fw_security_status_valid &&
(prop->fw_app_security_map & CPU_BOOT_DEV_STS0_DYN_PLL_EN);
@@ -680,28 +673,39 @@ int get_used_pll_index(struct hl_device *hdev, enum pll_index input_pll_index,
if (!dynamic_pll) {
/*
* in case we are working with legacy FW (each asic has unique
- * PLL numbering) extract the legacy numbering
+ * PLL numbering) use the driver based index as they are
+ * aligned with fw legacy numbering
*/
- *pll_index = hdev->legacy_pll_map[input_pll_index];
+ *pll_index = input_pll_index;
return 0;
}
+ /* retrieve a FW compatible PLL index based on
+ * ASIC specific user request
+ */
+ fw_pll_idx = hdev->asic_funcs->map_pll_idx_to_fw_idx(input_pll_index);
+ if (fw_pll_idx < 0) {
+ dev_err(hdev->dev, "Invalid PLL index (%u) error %d\n",
+ input_pll_index, fw_pll_idx);
+ return -EINVAL;
+ }
+
/* PLL map is a u8 array */
- pll_byte = prop->cpucp_info.pll_map[input_pll_index >> 3];
- pll_bit_off = input_pll_index & 0x7;
+ pll_byte = prop->cpucp_info.pll_map[fw_pll_idx >> 3];
+ pll_bit_off = fw_pll_idx & 0x7;
if (!(pll_byte & BIT(pll_bit_off))) {
dev_err(hdev->dev, "PLL index %d is not supported\n",
- input_pll_index);
+ fw_pll_idx);
return -EINVAL;
}
- *pll_index = input_pll_index;
+ *pll_index = fw_pll_idx;
return 0;
}
-int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, enum pll_index pll_index,
+int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index,
u16 *pll_freq_arr)
{
struct cpucp_packet pkt;
@@ -844,8 +848,13 @@ int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
if (rc) {
dev_err(hdev->dev, "Failed to read preboot version\n");
detect_cpu_boot_status(hdev, status);
- fw_read_errors(hdev, boot_err0_reg,
- cpu_security_boot_status_reg);
+
+ /* If we read all FF, then something is totally wrong, no point
+ * of reading specific errors
+ */
+ if (status != -1)
+ fw_read_errors(hdev, boot_err0_reg,
+ cpu_security_boot_status_reg);
return -EIO;
}
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 44e89da30b4a..6579f8767abd 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -930,6 +930,9 @@ enum div_select_defs {
* driver is ready to receive asynchronous events. This
* function should be called during the first init and
* after every hard-reset of the device
+ * @get_msi_info: Retrieve asic-specific MSI ID of the f/w async event
+ * @map_pll_idx_to_fw_idx: convert driver specific per asic PLL index to
+ * generic f/w compatible PLL Indexes
*/
struct hl_asic_funcs {
int (*early_init)(struct hl_device *hdev);
@@ -1054,6 +1057,7 @@ struct hl_asic_funcs {
u32 block_id, u32 block_size);
void (*enable_events_from_fw)(struct hl_device *hdev);
void (*get_msi_info)(u32 *table);
+ int (*map_pll_idx_to_fw_idx)(u32 pll_idx);
};
@@ -1950,8 +1954,6 @@ struct hl_mmu_funcs {
* @aggregated_cs_counters: aggregated cs counters among all contexts
* @mmu_priv: device-specific MMU data.
* @mmu_func: device-related MMU functions.
- * @legacy_pll_map: map holding map between dynamic (common) PLL indexes and
- * static (asic specific) PLL indexes.
* @dram_used_mem: current DRAM memory consumption.
* @timeout_jiffies: device CS timeout value.
* @max_power: the max power of the device, as configured by the sysadmin. This
@@ -1960,6 +1962,12 @@ struct hl_mmu_funcs {
* @clock_gating_mask: is clock gating enabled. bitmask that represents the
* different engines. See debugfs-driver-habanalabs for
* details.
+ * @boot_error_status_mask: contains a mask of the device boot error status.
+ * Each bit represents a different error, according to
+ * the defines in hl_boot_if.h. If the bit is cleared,
+ * the error will be ignored by the driver during
+ * device initialization. Mainly used to debug and
+ * workaround firmware bugs
* @in_reset: is device in reset flow.
* @curr_pll_profile: current PLL profile.
* @card_type: Various ASICs have several card types. This indicates the card
@@ -2071,12 +2079,11 @@ struct hl_device {
struct hl_mmu_priv mmu_priv;
struct hl_mmu_funcs mmu_func[MMU_NUM_PGT_LOCATIONS];
- enum pll_index *legacy_pll_map;
-
atomic64_t dram_used_mem;
u64 timeout_jiffies;
u64 max_power;
u64 clock_gating_mask;
+ u64 boot_error_status_mask;
atomic_t in_reset;
enum hl_pll_frequency curr_pll_profile;
enum cpucp_card_types card_type;
@@ -2387,9 +2394,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
struct hl_info_pci_counters *counters);
int hl_fw_cpucp_total_energy_get(struct hl_device *hdev,
u64 *total_energy);
-int get_used_pll_index(struct hl_device *hdev, enum pll_index input_pll_index,
+int get_used_pll_index(struct hl_device *hdev, u32 input_pll_index,
enum pll_index *pll_index);
-int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, enum pll_index pll_index,
+int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index,
u16 *pll_freq_arr);
int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power);
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
@@ -2411,9 +2418,9 @@ int hl_pci_set_outbound_region(struct hl_device *hdev,
int hl_pci_init(struct hl_device *hdev);
void hl_pci_fini(struct hl_device *hdev);
-long hl_get_frequency(struct hl_device *hdev, enum pll_index pll_index,
+long hl_get_frequency(struct hl_device *hdev, u32 pll_index,
bool curr);
-void hl_set_frequency(struct hl_device *hdev, enum pll_index pll_index,
+void hl_set_frequency(struct hl_device *hdev, u32 pll_index,
u64 freq);
int hl_get_temperature(struct hl_device *hdev,
int sensor_index, u32 attr, long *value);
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index 7135f1e03864..64d1530db985 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -30,6 +30,7 @@ static DEFINE_MUTEX(hl_devs_idr_lock);
static int timeout_locked = 30;
static int reset_on_lockup = 1;
static int memory_scrub = 1;
+static ulong boot_error_status_mask = ULONG_MAX;
module_param(timeout_locked, int, 0444);
MODULE_PARM_DESC(timeout_locked,
@@ -43,6 +44,10 @@ module_param(memory_scrub, int, 0444);
MODULE_PARM_DESC(memory_scrub,
"Scrub device memory in various states (0 = no, 1 = yes, default yes)");
+module_param(boot_error_status_mask, ulong, 0444);
+MODULE_PARM_DESC(boot_error_status_mask,
+ "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
+
#define PCI_VENDOR_ID_HABANALABS 0x1da3
#define PCI_IDS_GOYA 0x0001
@@ -319,6 +324,8 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
hdev->major = hl_major;
hdev->reset_on_lockup = reset_on_lockup;
hdev->memory_scrub = memory_scrub;
+ hdev->boot_error_status_mask = boot_error_status_mask;
+
hdev->pldm = 0;
set_driver_behavior_per_device(hdev);
diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c
index 9fa61573a89d..c9f649b31e3a 100644
--- a/drivers/misc/habanalabs/common/sysfs.c
+++ b/drivers/misc/habanalabs/common/sysfs.c
@@ -9,7 +9,7 @@
#include <linux/pci.h>
-long hl_get_frequency(struct hl_device *hdev, enum pll_index pll_index,
+long hl_get_frequency(struct hl_device *hdev, u32 pll_index,
bool curr)
{
struct cpucp_packet pkt;
@@ -44,7 +44,7 @@ long hl_get_frequency(struct hl_device *hdev, enum pll_index pll_index,
return (long) result;
}
-void hl_set_frequency(struct hl_device *hdev, enum pll_index pll_index,
+void hl_set_frequency(struct hl_device *hdev, u32 pll_index,
u64 freq)
{
struct cpucp_packet pkt;