From e5f4e68eed85fa8495d78cd966eecc2b27bb9e53 Mon Sep 17 00:00:00 2001 From: Doug Smythies Date: Mon, 3 Apr 2023 14:11:38 -0700 Subject: tools/power turbostat: Fix added raw MSR output When using --Summary mode, added MSRs in raw mode always print zeros. Print the actual register contents. Example, with patch: note the added column: --add msr0x64f,u32,package,raw,REASON Where: 0x64F is MSR_CORE_PERF_LIMIT_REASONS Busy% Bzy_MHz PkgTmp PkgWatt CorWatt REASON 0.00 4800 35 1.42 0.76 0x00000000 0.00 4801 34 1.42 0.76 0x00000000 80.08 4531 66 108.17 107.52 0x08000000 98.69 4530 66 133.21 132.54 0x08000000 99.28 4505 66 128.26 127.60 0x0c000400 99.65 4486 68 124.91 124.25 0x0c000400 99.63 4483 68 124.90 124.25 0x0c000400 79.34 4481 41 99.80 99.13 0x0c000000 0.00 4801 41 1.40 0.73 0x0c000000 Where, for the test processor (i5-10600K): PKG Limit #1: 125.000 Watts, 8.000000 sec MSR bit 26 = log; bit 10 = status PKG Limit #2: 136.000 Watts, 0.002441 sec MSR bit 27 = log; bit 11 = status Example, without patch: Busy% Bzy_MHz PkgTmp PkgWatt CorWatt REASON 0.01 4800 35 1.43 0.77 0x00000000 0.00 4801 35 1.39 0.73 0x00000000 83.49 4531 66 112.71 112.06 0x00000000 98.69 4530 68 133.35 132.69 0x00000000 99.31 4500 67 127.96 127.30 0x00000000 99.63 4483 69 124.91 124.25 0x00000000 99.61 4481 69 124.90 124.25 0x00000000 99.61 4481 71 124.92 124.25 0x00000000 59.35 4479 42 75.03 74.37 0x00000000 0.00 4800 42 1.39 0.73 0x00000000 0.00 4801 42 1.42 0.76 0x00000000 c000000 [lenb: simplified patch to apply only to package scope] Signed-off-by: Doug Smythies Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7a334377f92b..fca7913f6c84 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2444,9 +2444,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - if (mp->format == FORMAT_RAW) - continue; - average.packages.counter[i] += p->counter[i]; + if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0)) + average.packages.counter[i] = p->counter[i]; + else + average.packages.counter[i] += p->counter[i]; } return 0; } -- cgit v1.2.3 From 3ac1d14d0583a2de75d49a5234d767e2590384dd Mon Sep 17 00:00:00 2001 From: Wyes Karny Date: Tue, 3 Oct 2023 05:07:51 +0000 Subject: tools/power turbostat: Increase the limit for fd opened When running turbostat, a system with 512 cpus reaches the limit for maximum number of file descriptors that can be opened. To solve this problem, the limit is raised to 2^15, which is a large enough number. Below data is collected from AMD server systems while running turbostat: |-----------+-------------------------------| | # of cpus | # of opened fds for turbostat | |-----------+-------------------------------| | 128 | 260 | |-----------+-------------------------------| | 192 | 388 | |-----------+-------------------------------| | 512 | 1028 | |-----------+-------------------------------| So, the new max limit would be sufficient up to 2^14 cpus (but this also depends on how many counters are enabled). Reviewed-by: Doug Smythies Tested-by: Doug Smythies Signed-off-by: Wyes Karny Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fca7913f6c84..2550a0e35914 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -53,6 +53,8 @@ #define NAME_BYTES 20 #define PATH_BYTES 128 +#define MAX_NOFILE 0x8000 + enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; @@ -6705,6 +6707,22 @@ void cmdline(int argc, char **argv) } } +void set_rlimit(void) +{ + struct rlimit limit; + + if (getrlimit(RLIMIT_NOFILE, &limit) < 0) + err(1, "Failed to get rlimit"); + + if (limit.rlim_max < MAX_NOFILE) + limit.rlim_max = MAX_NOFILE; + if (limit.rlim_cur < MAX_NOFILE) + limit.rlim_cur = MAX_NOFILE; + + if (setrlimit(RLIMIT_NOFILE, &limit) < 0) + err(1, "Failed to set rlimit"); +} + int main(int argc, char **argv) { int fd, ret; @@ -6730,6 +6748,9 @@ skip_cgroup_setting: probe_sysfs(); + if (!getuid()) + set_rlimit(); + turbostat_init(); msr_sum_record(); -- cgit v1.2.3 From 0b13410b52c4636aacb6964a4253a797c0fa0d16 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Sat, 7 Oct 2023 13:46:22 +0800 Subject: tools/power turbostat: Fix Bzy_MHz documentation typo The code calculates Bzy_MHz by multiplying TSC_delta * APERF_delta/MPERF_delta The man page erroneously showed that TSC_delta was divided. Signed-off-by: Peng Liu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 8f08c3fd498d..1ba6340d3b3d 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -370,7 +370,7 @@ below the processor's base frequency. Busy% = MPERF_delta/TSC_delta -Bzy_MHz = TSC_delta/APERF_delta/MPERF_delta/measurement_interval +Bzy_MHz = TSC_delta*APERF_delta/MPERF_delta/measurement_interval Note that these calculations depend on TSC_delta, so they are not reliable during intervals when TSC_MHz is not running at the base frequency. -- cgit v1.2.3 From 227ed18f456a68bbb69807294a9089208663a6d3 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Sun, 22 Oct 2023 13:52:21 +0800 Subject: tools/power turbostat: Do not print negative LPI residency turbostat prints the abnormal SYS%LPI across suspend-to-idle: SYS%LPI = 114479815993277.50 This is reproduced by: Run a freeze cycle, e.g. "sleepgraph -m freeze -rtcwake 15". Then do a reboot. After boot up, launch the suspend-idle-idle and check the SYS%LPI field. The slp_so residence counter is in LPIT table, and BIOS does not clears this register across reset. The PMC expects the OS to calculate the LPI residency based on the delta. However, there is an firmware issue that the LPIT gets cleared to 0 during the second suspend to idle after the reboot, which brings negative delta value. [lenb: updated to print "neg" upon this BIOS failure] Reported-by: Todd Brandt Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2550a0e35914..c23703dd54aa 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -991,8 +991,8 @@ struct pkg_data { unsigned long long pc8; unsigned long long pc9; unsigned long long pc10; - unsigned long long cpu_lpi; - unsigned long long sys_lpi; + long long cpu_lpi; + long long sys_lpi; unsigned long long pkg_wtd_core_c0; unsigned long long pkg_any_core_c0; unsigned long long pkg_any_gfxe_c0; @@ -1978,12 +1978,22 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); - if (DO_BIC(BIC_CPU_LPI)) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float); - if (DO_BIC(BIC_SYS_LPI)) - outp += - sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float); + if (DO_BIC(BIC_CPU_LPI)) { + if (p->cpu_lpi >= 0) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + 100.0 * p->cpu_lpi / 1000000.0 / interval_float); + else + outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); + } + if (DO_BIC(BIC_SYS_LPI)) { + if (p->sys_lpi >= 0) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + 100.0 * p->sys_lpi / 1000000.0 / interval_float); + else + outp += sprintf(outp, "%s(neg)", (printed++ ? delim : "")); + } if (DO_BIC(BIC_PkgWatt)) outp += @@ -3832,7 +3842,8 @@ void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); - fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); + fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, + topo.allowed_cpus); } void set_max_cpu_num(void) @@ -6145,6 +6156,7 @@ void topology_update(void) topo.allowed_packages = 0; for_all_cpus(update_topo, ODD_COUNTERS); } + void setup_all_buffers(bool startup) { topology_probe(startup); -- cgit v1.2.3 From bb6181fa6bc942aac3f7f2fa8e3831952a2ef118 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 20 Dec 2023 13:11:05 -0500 Subject: tools/power turbostat: Expand probe_intel_uncore_frequency() Print current frequency along with the current (and initial) limits Probe and print uncore config also for machines using the new cluster API Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 84 ++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 21 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c23703dd54aa..bbd2e0edadfa 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4581,20 +4581,15 @@ static void dump_sysfs_file(char *path) static void probe_intel_uncore_frequency(void) { int i, j; - char path[128]; + char path[256]; if (!genuine_intel) return; - if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK)) - return; - - /* Cluster level sysfs not supported yet. */ - if (!access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK)) - return; + if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) + goto probe_cluster; - if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) - BIC_PRESENT(BIC_UNCORE_MHZ); + BIC_PRESENT(BIC_UNCORE_MHZ); if (quiet) return; @@ -4602,26 +4597,73 @@ static void probe_intel_uncore_frequency(void) for (i = 0; i < topo.num_packages; ++i) { for (j = 0; j < topo.num_die; ++j) { int k, l; + char path_base[128]; + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i, + j); - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/min_freq_khz", - i, j); + sprintf(path, "%s/min_freq_khz", path_base); k = read_sysfs_int(path); - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/max_freq_khz", - i, j); + sprintf(path, "%s/max_freq_khz", path_base); l = read_sysfs_int(path); - fprintf(outf, "Uncore Frequency pkg%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000); + fprintf(outf, "Uncore Frequency package%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000); - sprintf(path, - "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_min_freq_khz", - i, j); + sprintf(path, "%s/initial_min_freq_khz", path_base); k = read_sysfs_int(path); - sprintf(path, - "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/initial_max_freq_khz", - i, j); + sprintf(path, "%s/initial_max_freq_khz", path_base); l = read_sysfs_int(path); - fprintf(outf, "(%d - %d MHz)\n", k / 1000, l / 1000); + fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + k = read_sysfs_int(path); + fprintf(outf, " %d MHz\n", k / 1000); } } + return; + +probe_cluster: + if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK)) + return; + + if (quiet) + return; + + for (i = 0;; ++i) { + int k, l; + char path_base[128]; + int package_id, domain_id, cluster_id; + + sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i); + + if (access(path_base, R_OK)) + break; + + sprintf(path, "%s/package_id", path_base); + package_id = read_sysfs_int(path); + + sprintf(path, "%s/domain_id", path_base); + domain_id = read_sysfs_int(path); + + sprintf(path, "%s/fabric_cluster_id", path_base); + cluster_id = read_sysfs_int(path); + + sprintf(path, "%s/min_freq_khz", path_base); + k = read_sysfs_int(path); + sprintf(path, "%s/max_freq_khz", path_base); + l = read_sysfs_int(path); + fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id, + cluster_id, k / 1000, l / 1000); + + sprintf(path, "%s/initial_min_freq_khz", path_base); + k = read_sysfs_int(path); + sprintf(path, "%s/initial_max_freq_khz", path_base); + l = read_sysfs_int(path); + fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000); + + sprintf(path, "%s/current_freq_khz", path_base); + k = read_sysfs_int(path); + fprintf(outf, " %d MHz\n", k / 1000); + } } static void probe_graphics(void) -- cgit v1.2.3 From fb5ceca046efc84f69fcf9779a013f8a0e63bbff Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 12 Jan 2024 13:48:14 +0100 Subject: tools/power turbostat: Print ucode revision only if valid If the MSR read were to fail, turbostat would print "microcode 0x0" Signed-off-by: Patryk Wlazlyn Reviewed-by: Len Brown Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bbd2e0edadfa..a4a40a6e1b95 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5679,6 +5679,7 @@ void process_cpuid() unsigned int eax, ebx, ecx, edx; unsigned int fms, family, model, stepping, ecx_flags, edx_flags; unsigned long long ucode_patch = 0; + bool ucode_patch_valid = false; eax = ebx = ecx = edx = 0; @@ -5708,6 +5709,8 @@ void process_cpuid() if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) warnx("get_msr(UCODE)"); + else + ucode_patch_valid = true; /* * check max extended function levels of CPUID. @@ -5718,9 +5721,12 @@ void process_cpuid() __cpuid(0x80000000, max_extended_level, ebx, ecx, edx); if (!quiet) { - fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n", - family, model, stepping, family, model, stepping, - (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)", + family, model, stepping, family, model, stepping); + if (ucode_patch_valid) + fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fputc('\n', outf); + fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level); fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n", ecx_flags & (1 << 0) ? "SSE3" : "-", -- cgit v1.2.3 From ccb2280ec2f9e805d70f57a3a1c5deff0d532cb3 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 25 Oct 2023 01:59:13 -0400 Subject: x86/kvm: Use separate percpu variable to track the enabling of asyncpf Refer to commit fd10cde9294f ("KVM paravirt: Add async PF initialization to PV guest") and commit 344d9588a9df ("KVM: Add PV MSR to enable asynchronous page faults delivery"). It turns out that at the time when asyncpf was introduced, the purpose was defining the shared PV data 'struct kvm_vcpu_pv_apf_data' with the size of 64 bytes. However, it made a mistake and defined the size to 68 bytes, which failed to make fit in a cache line and made the code inconsistent with the documentation. Below justification quoted from Sean[*] KVM (the host side) has *never* read kvm_vcpu_pv_apf_data.enabled, and the documentation clearly states that enabling is based solely on the bit in the synthetic MSR. So rather than update the documentation, fix the goof by removing the enabled filed and use the separate percpu variable instread. KVM-as-a-host obviously doesn't enforce anything or consume the size, and changing the header will only affect guests that are rebuilt against the new header, so there's no chance of ABI breakage between KVM and its guests. The only possible breakage is if some other hypervisor is emulating KVM's async #PF (LOL) and relies on the guest to set kvm_vcpu_pv_apf_data.enabled. But (a) I highly doubt such a hypervisor exists, (b) that would arguably be a violation of KVM's "spec", and (c) the worst case scenario is that the guest would simply lose async #PF functionality. [*] https://lore.kernel.org/all/ZS7ERnnRqs8Fl0ZF@google.com/T/#u Suggested-by: Sean Christopherson Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20231025055914.1201792-2-xiaoyao.li@intel.com [sean: use true/false instead of 1/0 for booleans] Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/x86/msr.rst | 1 - arch/x86/include/uapi/asm/kvm_para.h | 1 - arch/x86/kernel/kvm.c | 11 ++++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Documentation/virt/kvm/x86/msr.rst b/Documentation/virt/kvm/x86/msr.rst index 9315fc385fb0..f6d70f99a1a7 100644 --- a/Documentation/virt/kvm/x86/msr.rst +++ b/Documentation/virt/kvm/x86/msr.rst @@ -204,7 +204,6 @@ data: __u32 token; __u8 pad[56]; - __u32 enabled; }; Bits 5-4 of the MSR are reserved and should be zero. Bit 0 is set to 1 diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 6e64b27b2c1e..605899594ebb 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -142,7 +142,6 @@ struct kvm_vcpu_pv_apf_data { __u32 token; __u8 pad[56]; - __u32 enabled; }; #define KVM_PV_EOI_BIT 0 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index dfe9945b9bec..e34f985c9a6f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -65,6 +65,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); +static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; @@ -244,7 +245,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void) { u32 flags = 0; - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { flags = __this_cpu_read(apf_reason.flags); __this_cpu_write(apf_reason.flags, 0); } @@ -295,7 +296,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) inc_irq_stat(irq_hv_callback_count); - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { token = __this_cpu_read(apf_reason.token); kvm_async_pf_task_wake(token); __this_cpu_write(apf_reason.token, 0); @@ -362,7 +363,7 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); - __this_cpu_write(apf_reason.enabled, 1); + __this_cpu_write(async_pf_enabled, true); pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } @@ -383,11 +384,11 @@ static void kvm_guest_cpu_init(void) static void kvm_pv_disable_apf(void) { - if (!__this_cpu_read(apf_reason.enabled)) + if (!__this_cpu_read(async_pf_enabled)) return; wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); - __this_cpu_write(apf_reason.enabled, 0); + __this_cpu_write(async_pf_enabled, false); pr_debug("disable async PF for cpu %d\n", smp_processor_id()); } -- cgit v1.2.3 From df01f0a1165c35e95b5f52c7ba25c19020352ff9 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 25 Oct 2023 01:59:14 -0400 Subject: KVM: x86: Improve documentation of MSR_KVM_ASYNC_PF_EN Fix some incorrect statement of MSR_KVM_ASYNC_PF_EN documentation and state clearly the token in 'struct kvm_vcpu_pv_apf_data' of 'page ready' event is matchted with the token in CR2 in 'page not present' event. Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20231025055914.1201792-3-xiaoyao.li@intel.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/x86/msr.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/virt/kvm/x86/msr.rst b/Documentation/virt/kvm/x86/msr.rst index f6d70f99a1a7..3aecf2a70e7b 100644 --- a/Documentation/virt/kvm/x86/msr.rst +++ b/Documentation/virt/kvm/x86/msr.rst @@ -193,8 +193,8 @@ data: Asynchronous page fault (APF) control MSR. Bits 63-6 hold 64-byte aligned physical address of a 64 byte memory area - which must be in guest RAM and must be zeroed. This memory is expected - to hold a copy of the following structure:: + which must be in guest RAM. This memory is expected to hold the + following structure:: struct kvm_vcpu_pv_apf_data { /* Used for 'page not present' events delivered via #PF */ @@ -231,14 +231,14 @@ data: as regular page fault, guest must reset 'flags' to '0' before it does something that can generate normal page fault. - Bytes 5-7 of 64 byte memory location ('token') will be written to by the + Bytes 4-7 of 64 byte memory location ('token') will be written to by the hypervisor at the time of APF 'page ready' event injection. The content - of these bytes is a token which was previously delivered as 'page not - present' event. The event indicates the page in now available. Guest is - supposed to write '0' to 'token' when it is done handling 'page ready' - event and to write 1' to MSR_KVM_ASYNC_PF_ACK after clearing the location; - writing to the MSR forces KVM to re-scan its queue and deliver the next - pending notification. + of these bytes is a token which was previously delivered in CR2 as + 'page not present' event. The event indicates the page is now available. + Guest is supposed to write '0' to 'token' when it is done handling + 'page ready' event and to write '1' to MSR_KVM_ASYNC_PF_ACK after + clearing the location; writing to the MSR forces KVM to re-scan its + queue and deliver the next pending notification. Note, MSR_KVM_ASYNC_PF_INT MSR specifying the interrupt vector for 'page ready' APF delivery needs to be written to before enabling APF mechanism -- cgit v1.2.3 From cc4ce37bed85989daf8f38bf19ea591f3b36fb0c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Jan 2024 15:56:06 -0800 Subject: KVM: SVM: Set sev->asid in sev_asid_new() instead of overloading the return Explicitly set sev->asid in sev_asid_new() when a new ASID is successfully allocated, and return '0' to indicate success instead of overloading the return value to multiplex the ASID with error codes. There is exactly one caller of sev_asid_new(), and sev_asid_free() already consumes sev->asid, i.e. returning the ASID isn't necessary for flexibility, nor does it provide symmetry between related APIs. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240131235609.4161407-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f760106c31f8..7c000088bca6 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -179,7 +179,8 @@ again: mutex_unlock(&sev_bitmap_lock); - return asid; + sev->asid = asid; + return 0; e_uncharge: sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); @@ -246,7 +247,7 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - int asid, ret; + int ret; if (kvm->created_vcpus) return -EINVAL; @@ -257,10 +258,9 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) sev->active = true; sev->es_active = argp->id == KVM_SEV_ES_INIT; - asid = sev_asid_new(sev); - if (asid < 0) + ret = sev_asid_new(sev); + if (ret) goto e_no_asid; - sev->asid = asid; ret = sev_platform_init(&argp->error); if (ret) -- cgit v1.2.3 From 466eec4a22a76c462781bf6d45cb02cbedf21a61 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Jan 2024 15:56:07 -0800 Subject: KVM: SVM: Use unsigned integers when dealing with ASIDs Convert all local ASID variables and parameters throughout the SEV code from signed integers to unsigned integers. As ASIDs are fundamentally unsigned values, and the global min/max variables are appropriately unsigned integers, too. Functionally, this is a glorified nop as KVM guarantees min_sev_asid is non-zero, and no CPU supports -1u as the _only_ asid, i.e. the signed vs. unsigned goof won't cause problems in practice. Opportunistically use sev_get_asid() in sev_flush_encrypted_page() instead of open coding an equivalent. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240131235609.4161407-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 18 ++++++++++-------- arch/x86/kvm/trace.h | 10 +++++----- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7c000088bca6..eeef43c795d8 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -84,9 +84,10 @@ struct enc_region { }; /* Called with the sev_bitmap_lock held, or on shutdown */ -static int sev_flush_asids(int min_asid, int max_asid) +static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) { - int ret, asid, error = 0; + int ret, error = 0; + unsigned int asid; /* Check if there are any ASIDs to reclaim before performing a flush */ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); @@ -116,7 +117,7 @@ static inline bool is_mirroring_enc_context(struct kvm *kvm) } /* Must be called with the sev_bitmap_lock held */ -static bool __sev_recycle_asids(int min_asid, int max_asid) +static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid) { if (sev_flush_asids(min_asid, max_asid)) return false; @@ -143,8 +144,9 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) static int sev_asid_new(struct kvm_sev_info *sev) { - int asid, min_asid, max_asid, ret; + unsigned int asid, min_asid, max_asid; bool retry = true; + int ret; WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); @@ -188,7 +190,7 @@ e_uncharge: return ret; } -static int sev_get_asid(struct kvm *kvm) +static unsigned int sev_get_asid(struct kvm *kvm) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -284,8 +286,8 @@ e_no_asid: static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) { + unsigned int asid = sev_get_asid(kvm); struct sev_data_activate activate; - int asid = sev_get_asid(kvm); int ret; /* activate ASID on the given handle */ @@ -2312,7 +2314,7 @@ int sev_cpu_init(struct svm_cpu_data *sd) */ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) { - int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid; + unsigned int asid = sev_get_asid(vcpu->kvm); /* * Note! The address must be a kernel address, as regular page walk @@ -2630,7 +2632,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) void pre_sev_run(struct vcpu_svm *svm, int cpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - int asid = sev_get_asid(svm->vcpu.kvm); + unsigned int asid = sev_get_asid(svm->vcpu.kvm); /* Assign the asid allocated with this SEV guest */ svm->asid = asid; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 83843379813e..b82e6ed4f024 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -732,13 +732,13 @@ TRACE_EVENT(kvm_nested_intr_vmexit, * Tracepoint for nested #vmexit because of interrupt pending */ TRACE_EVENT(kvm_invlpga, - TP_PROTO(__u64 rip, int asid, u64 address), + TP_PROTO(__u64 rip, unsigned int asid, u64 address), TP_ARGS(rip, asid, address), TP_STRUCT__entry( - __field( __u64, rip ) - __field( int, asid ) - __field( __u64, address ) + __field( __u64, rip ) + __field( unsigned int, asid ) + __field( __u64, address ) ), TP_fast_assign( @@ -747,7 +747,7 @@ TRACE_EVENT(kvm_invlpga, __entry->address = address; ), - TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", + TP_printk("rip: 0x%016llx asid: %u address: 0x%016llx", __entry->rip, __entry->asid, __entry->address) ); -- cgit v1.2.3 From 0aa6b90ef9d75b4bd7b6d106d85f2a3437697f91 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Wed, 31 Jan 2024 15:56:08 -0800 Subject: KVM: SVM: Add support for allowing zero SEV ASIDs Some BIOSes allow the end user to set the minimum SEV ASID value (CPUID 0x8000001F_EDX) to be greater than the maximum number of encrypted guests, or maximum SEV ASID value (CPUID 0x8000001F_ECX) in order to dedicate all the SEV ASIDs to SEV-ES or SEV-SNP. The SEV support, as coded, does not handle the case where the minimum SEV ASID value can be greater than the maximum SEV ASID value. As a result, the following confusing message is issued: [ 30.715724] kvm_amd: SEV enabled (ASIDs 1007 - 1006) Fix the support to properly handle this case. Fixes: 916391a2d1dc ("KVM: SVM: Add support for SEV-ES capability in KVM") Suggested-by: Sean Christopherson Signed-off-by: Ashish Kalra Cc: stable@vger.kernel.org Acked-by: Tom Lendacky Link: https://lore.kernel.org/r/20240104190520.62510-1-Ashish.Kalra@amd.com Link: https://lore.kernel.org/r/20240131235609.4161407-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index eeef43c795d8..5f8312edee36 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -144,10 +144,21 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) static int sev_asid_new(struct kvm_sev_info *sev) { - unsigned int asid, min_asid, max_asid; + /* + * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. + * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. + * Note: min ASID can end up larger than the max if basic SEV support is + * effectively disabled by disallowing use of ASIDs for SEV guests. + */ + unsigned int min_asid = sev->es_active ? 1 : min_sev_asid; + unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; + unsigned int asid; bool retry = true; int ret; + if (min_asid > max_asid) + return -ENOTTY; + WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); ret = sev_misc_cg_try_charge(sev); @@ -159,12 +170,6 @@ static int sev_asid_new(struct kvm_sev_info *sev) mutex_lock(&sev_bitmap_lock); - /* - * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. - * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. - */ - min_asid = sev->es_active ? 1 : min_sev_asid; - max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; again: asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); if (asid > max_asid) { @@ -2234,8 +2239,10 @@ void __init sev_hardware_setup(void) goto out; } - sev_asid_count = max_sev_asid - min_sev_asid + 1; - WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); + if (min_sev_asid <= max_sev_asid) { + sev_asid_count = max_sev_asid - min_sev_asid + 1; + WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); + } sev_supported = true; /* SEV-ES support requested? */ @@ -2266,7 +2273,9 @@ void __init sev_hardware_setup(void) out: if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", - sev_supported ? "enabled" : "disabled", + sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : + "unusable" : + "disabled", min_sev_asid, max_sev_asid); if (boot_cpu_has(X86_FEATURE_SEV_ES)) pr_info("SEV-ES %s (ASIDs %u - %u)\n", -- cgit v1.2.3 From fdd58834d132046149699b88a27a0db26829f4fb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Jan 2024 15:56:09 -0800 Subject: KVM: SVM: Return -EINVAL instead of -EBUSY on attempt to re-init SEV/SEV-ES Return -EINVAL instead of -EBUSY if userspace attempts KVM_SEV{,ES}_INIT on a VM that already has SEV active. Returning -EBUSY is nonsencial as it's impossible to deactivate SEV without destroying the VM, i.e. the VM isn't "busy" in any sane sense of the word, and the odds of any userspace wanting exactly -EBUSY on a userspace bug are minuscule. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240131235609.4161407-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5f8312edee36..f06f9e51ad9d 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -259,9 +259,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (kvm->created_vcpus) return -EINVAL; - ret = -EBUSY; if (unlikely(sev->active)) - return ret; + return -EINVAL; sev->active = true; sev->es_active = argp->id == KVM_SEV_ES_INIT; -- cgit v1.2.3 From 7013482ff5945aee7390c55ababcb390de1a4aad Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 11 Feb 2024 20:33:41 -0800 Subject: 9p/trans_fd: remove Excess kernel-doc comment Remove the "@req" kernel-doc description since there is not 'req' member in the struct p9_conn. Fixes one kernel-doc warning: trans_fd.c:133: warning: Excess struct member 'req' description in 'p9_conn' Signed-off-by: Randy Dunlap Cc: Eric Van Hensbergen Cc: Latchesar Ionkov Cc: Dominique Martinet Cc: v9fs@lists.linux.dev Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: netdev@vger.kernel.org Reviewed-by: Simon Horman Message-ID: <20240212043341.4631-1-rdunlap@infradead.org> Signed-off-by: Dominique Martinet --- net/9p/trans_fd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 1a3948b8c493..196060dc6138 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -95,7 +95,6 @@ struct p9_poll_wait { * @unsent_req_list: accounting for requests that haven't been sent * @rreq: read request * @wreq: write request - * @req: current request being processed (if any) * @tmp_buf: temporary buffer to read in header * @rc: temporary fcall for reading current frame * @wpos: write position for current frame -- cgit v1.2.3 From be3193e58ec210b2a72fb1134c2a0695088a911d Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Tue, 9 Jan 2024 12:39:03 +0900 Subject: 9p: Fix read/write debug statements to report server reply Previous conversion to iov missed these debug statements which would now always print the requested size instead of the actual server reply. Write also added a loop in a much older commit but we didn't report these, while reads do report each iteration -- it's more coherent to keep reporting all requests to server so move that at the same time. Fixes: 7f02464739da ("9p: convert to advancing variant of iov_iter_get_pages_alloc()") Signed-off-by: Dominique Martinet Message-ID: <20240109-9p-rw-trace-v1-1-327178114257@codewreck.org> --- net/9p/client.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index e265a0ca6bdd..f7e90b4769bb 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1583,7 +1583,7 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, received = rsize; } - p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", received); if (non_zc) { int n = copy_to_iter(dataptr, received, to); @@ -1609,9 +1609,6 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) int total = 0; *err = 0; - p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %zd\n", - fid->fid, offset, iov_iter_count(from)); - while (iov_iter_count(from)) { int count = iov_iter_count(from); int rsize = fid->iounit; @@ -1623,6 +1620,9 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) if (count < rsize) rsize = count; + p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %d (/%d)\n", + fid->fid, offset, rsize, count); + /* Don't bother zerocopy for small IO (< 1024) */ if (clnt->trans_mod->zc_request && rsize > 1024) { req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0, @@ -1650,7 +1650,7 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) written = rsize; } - p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); + p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", written); p9_req_put(clnt, req); iov_iter_revert(from, count - written - iov_iter_count(from)); -- cgit v1.2.3 From 2a0505cdd8c8b12670f4b5a6eb5c996c0861c2d5 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:46:17 +0000 Subject: 9p: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1, so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete to avoid confusion for users. Here we can just remove all its users, which has no functional change. Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz/ [1] Signed-off-by: Chengming Zhou Message-ID: <20240224134617.829016-1-chengming.zhou@linux.dev> Signed-off-by: Dominique Martinet --- fs/9p/v9fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 61dbe52bb3a3..281a1ed03a04 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -637,7 +637,7 @@ static int v9fs_init_inode_cache(void) v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", sizeof(struct v9fs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), v9fs_inode_init_once); if (!v9fs_inode_cache) return -ENOMEM; -- cgit v1.2.3 From 92e82cf632e85474e1e19a6f1cae813e87b07965 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 28 Feb 2024 11:18:35 +0100 Subject: KVM: x86: Introduce __kvm_get_hypervisor_cpuid() helper Similar to kvm_find_kvm_cpuid_features()/__kvm_find_kvm_cpuid_features(), introduce a helper to search for the specific hypervisor signature in any struct kvm_cpuid_entry2 array, not only in vcpu->arch.cpuid_entries. No functional change intended. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240228101837.93642-2-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index adba49afb5fe..0e1e3e5df6dd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -189,15 +189,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 return 0; } -static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, - const char *sig) +static struct kvm_hypervisor_cpuid __kvm_get_hypervisor_cpuid(struct kvm_cpuid_entry2 *entries, + int nent, const char *sig) { struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; u32 base; for_each_possible_hypervisor_cpuid_base(base) { - entry = kvm_find_cpuid_entry(vcpu, base); + entry = cpuid_entry2_find(entries, nent, base, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (entry) { u32 signature[3]; @@ -217,6 +217,13 @@ static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcp return cpuid; } +static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, + const char *sig) +{ + return __kvm_get_hypervisor_cpuid(vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent, sig); +} + static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, int nent) { -- cgit v1.2.3 From 4736d85f0d18ad0469439a0ebc7ccb0cd94bd754 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 28 Feb 2024 11:18:36 +0100 Subject: KVM: x86: Use actual kvm_cpuid.base for clearing KVM_FEATURE_PV_UNHALT Commit ee3a5f9e3d9b ("KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries") moved tweaking of the supplied CPUID data earlier in kvm_set_cpuid() but __kvm_update_cpuid_runtime() actually uses 'vcpu->arch.kvm_cpuid' (though __kvm_find_kvm_cpuid_features()) which gets set later in kvm_set_cpuid(). In some cases, e.g. when kvm_set_cpuid() is called for the first time and 'vcpu->arch.kvm_cpuid' is clear, __kvm_find_kvm_cpuid_features() fails to find KVM PV feature entry and the logic which clears KVM_FEATURE_PV_UNHALT after enabling KVM_X86_DISABLE_EXITS_HLT does not work. The logic, introduced by the commit ee3a5f9e3d9b ("KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries") must stay: the supplied CPUID data is tweaked by KVM first (__kvm_update_cpuid_runtime()) and checked later (kvm_check_cpuid()) and the actual data (vcpu->arch.cpuid_*, vcpu->arch.kvm_cpuid, vcpu->arch.xen.cpuid,..) is only updated on success. Switch to searching for KVM_SIGNATURE in the supplied CPUID data to discover KVM PV feature entry instead of using stale 'vcpu->arch.kvm_cpuid'. While on it, drop pointless "&& (best->eax & (1 << KVM_FEATURE_PV_UNHALT)" check when clearing KVM_FEATURE_PV_UNHALT bit. Fixes: ee3a5f9e3d9b ("KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries") Reported-and-tested-by: Li RongQing Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240228101837.93642-3-vkuznets@redhat.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0e1e3e5df6dd..bfc0bfcb2bc6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -224,22 +224,22 @@ static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcp vcpu->arch.cpuid_nent, sig); } -static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, - struct kvm_cpuid_entry2 *entries, int nent) +static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_cpuid_entry2 *entries, + int nent, u32 kvm_cpuid_base) { - u32 base = vcpu->arch.kvm_cpuid.base; - - if (!base) - return NULL; - - return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, + return cpuid_entry2_find(entries, nent, kvm_cpuid_base | KVM_CPUID_FEATURES, KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) { - return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent); + u32 base = vcpu->arch.kvm_cpuid.base; + + if (!base) + return NULL; + + return __kvm_find_kvm_cpuid_features(vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent, base); } void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) @@ -273,6 +273,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e int nent) { struct kvm_cpuid_entry2 *best; + struct kvm_hypervisor_cpuid kvm_cpuid; best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { @@ -299,10 +300,12 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent); - if (kvm_hlt_in_guest(vcpu->kvm) && best && - (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) - best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + kvm_cpuid = __kvm_get_hypervisor_cpuid(entries, nent, KVM_SIGNATURE); + if (kvm_cpuid.base) { + best = __kvm_find_kvm_cpuid_features(entries, nent, kvm_cpuid.base); + if (kvm_hlt_in_guest(vcpu->kvm) && best) + best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + } if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); -- cgit v1.2.3 From c2585047c8e185b070ad5c7bd887ef59cee3941f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 28 Feb 2024 11:18:37 +0100 Subject: KVM: selftests: Check that PV_UNHALT is cleared when HLT exiting is disabled KVM_FEATURE_PV_UNHALT is expected to get cleared from KVM PV feature CPUID data when KVM_X86_DISABLE_EXITS_HLT is enabled. Add the corresponding test to kvm_pv_test. Note, the newly added code doesn't actually test KVM_FEATURE_PV_UNHALT and KVM_X86_DISABLE_EXITS_HLT features. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20240228101837.93642-4-vkuznets@redhat.com [sean: add and use vcpu_cpuid_has()] Signed-off-by: Sean Christopherson --- .../selftests/kvm/include/x86_64/processor.h | 9 +++++ tools/testing/selftests/kvm/x86_64/kvm_pv_test.c | 39 ++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index a84863503fcb..8b5c804562f9 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -995,6 +995,15 @@ static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); } +static inline bool vcpu_cpuid_has(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature) +{ + struct kvm_cpuid_entry2 *entry; + + entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index); + return *((&entry->eax) + feature.reg) & BIT(feature.bit); +} + void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr); void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function); diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c index 9e2879af7c20..40cc59f4e650 100644 --- a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -133,6 +133,43 @@ static void enter_guest(struct kvm_vcpu *vcpu) } } +static void test_pv_unhalt(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_cpuid_entry2 *ent; + u32 kvm_sig_old; + + pr_info("testing KVM_FEATURE_PV_UNHALT\n"); + + TEST_REQUIRE(KVM_CAP_X86_DISABLE_EXITS); + + /* KVM_PV_UNHALT test */ + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_KVM_PV_UNHALT); + + TEST_ASSERT(vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "Enabling X86_FEATURE_KVM_PV_UNHALT had no effect"); + + /* Make sure KVM clears vcpu->arch.kvm_cpuid */ + ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); + kvm_sig_old = ent->ebx; + ent->ebx = 0xdeadbeef; + vcpu_set_cpuid(vcpu); + + vm_enable_cap(vm, KVM_CAP_X86_DISABLE_EXITS, KVM_X86_DISABLE_EXITS_HLT); + ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); + ent->ebx = kvm_sig_old; + vcpu_set_cpuid(vcpu); + + TEST_ASSERT(!vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "KVM_FEATURE_PV_UNHALT is set with KVM_CAP_X86_DISABLE_EXITS"); + + /* FIXME: actually test KVM_FEATURE_PV_UNHALT feature */ + + kvm_vm_free(vm); +} + int main(void) { struct kvm_vcpu *vcpu; @@ -151,4 +188,6 @@ int main(void) enter_guest(vcpu); kvm_vm_free(vm); + + test_pv_unhalt(); } -- cgit v1.2.3 From f193957b0fbbba397c8bddedf158b3bf7e4850fc Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 7 Mar 2024 11:02:27 +0000 Subject: ASoC: wm_adsp: Fix missing mutex_lock in wm_adsp_write_ctl() wm_adsp_write_ctl() must hold the pwr_lock mutex when calling cs_dsp_get_ctl(). This was previously partially fixed by commit 781118bc2fc1 ("ASoC: wm_adsp: Fix missing locking in wm_adsp_[read|write]_ctl()") but this only put locking around the call to cs_dsp_coeff_write_ctrl(), missing the call to cs_dsp_get_ctl(). Signed-off-by: Richard Fitzgerald Fixes: 781118bc2fc1 ("ASoC: wm_adsp: Fix missing locking in wm_adsp_[read|write]_ctl()") Link: https://msgid.link/r/20240307110227.41421-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/wm_adsp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index 36ea0dcdc7ab..9cb9068c0462 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -683,11 +683,12 @@ static void wm_adsp_control_remove(struct cs_dsp_coeff_ctl *cs_ctl) int wm_adsp_write_ctl(struct wm_adsp *dsp, const char *name, int type, unsigned int alg, void *buf, size_t len) { - struct cs_dsp_coeff_ctl *cs_ctl = cs_dsp_get_ctl(&dsp->cs_dsp, name, type, alg); + struct cs_dsp_coeff_ctl *cs_ctl; struct wm_coeff_ctl *ctl; int ret; mutex_lock(&dsp->cs_dsp.pwr_lock); + cs_ctl = cs_dsp_get_ctl(&dsp->cs_dsp, name, type, alg); ret = cs_dsp_coeff_write_ctrl(cs_ctl, 0, buf, len); mutex_unlock(&dsp->cs_dsp.pwr_lock); -- cgit v1.2.3 From 95f37eb52e18879a1b16e51b972d992b39e50a81 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:35 +0200 Subject: ARM: OMAP2+: fix bogus MMC GPIO labels on Nokia N8x0 The GPIO bank width is 32 on OMAP2, so all labels are incorrect. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-2-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/board-n8x0.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index 31755a378c73..3e48f34016c1 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ b/arch/arm/mach-omap2/board-n8x0.c @@ -144,8 +144,7 @@ static struct gpiod_lookup_table nokia8xx_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { /* Slot switch, GPIO 96 */ - GPIO_LOOKUP("gpio-80-111", 16, - "switch", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-96-127", 0, "switch", GPIO_ACTIVE_HIGH), { } }, }; @@ -154,11 +153,9 @@ static struct gpiod_lookup_table nokia810_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { /* Slot index 1, VSD power, GPIO 23 */ - GPIO_LOOKUP_IDX("gpio-16-31", 7, - "vsd", 1, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("gpio-0-31", 23, "vsd", 1, GPIO_ACTIVE_HIGH), /* Slot index 1, VIO power, GPIO 9 */ - GPIO_LOOKUP_IDX("gpio-0-15", 9, - "vio", 1, GPIO_ACTIVE_HIGH), + GPIO_LOOKUP_IDX("gpio-0-31", 9, "vio", 1, GPIO_ACTIVE_HIGH), { } }, }; -- cgit v1.2.3 From 480d44d0820dd5ae043dc97c0b46dabbe53cb1cf Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:36 +0200 Subject: ARM: OMAP2+: fix N810 MMC gpiod table Trying to append a second table for the same dev_id doesn't seem to work. The second table is just silently ignored. As a result eMMC GPIOs are not present. Fix by using separate tables for N800 and N810. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-3-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/board-n8x0.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index 3e48f34016c1..c933a91751e4 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ b/arch/arm/mach-omap2/board-n8x0.c @@ -140,7 +140,7 @@ static int slot1_cover_open; static int slot2_cover_open; static struct device *mmc_device; -static struct gpiod_lookup_table nokia8xx_mmc_gpio_table = { +static struct gpiod_lookup_table nokia800_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { /* Slot switch, GPIO 96 */ @@ -152,6 +152,8 @@ static struct gpiod_lookup_table nokia8xx_mmc_gpio_table = { static struct gpiod_lookup_table nokia810_mmc_gpio_table = { .dev_id = "mmci-omap.0", .table = { + /* Slot switch, GPIO 96 */ + GPIO_LOOKUP("gpio-96-127", 0, "switch", GPIO_ACTIVE_HIGH), /* Slot index 1, VSD power, GPIO 23 */ GPIO_LOOKUP_IDX("gpio-0-31", 23, "vsd", 1, GPIO_ACTIVE_HIGH), /* Slot index 1, VIO power, GPIO 9 */ @@ -412,8 +414,6 @@ static struct omap_mmc_platform_data *mmc_data[OMAP24XX_NR_MMC]; static void __init n8x0_mmc_init(void) { - gpiod_add_lookup_table(&nokia8xx_mmc_gpio_table); - if (board_is_n810()) { mmc1_data.slots[0].name = "external"; @@ -426,6 +426,8 @@ static void __init n8x0_mmc_init(void) mmc1_data.slots[1].name = "internal"; mmc1_data.slots[1].ban_openended = 1; gpiod_add_lookup_table(&nokia810_mmc_gpio_table); + } else { + gpiod_add_lookup_table(&nokia800_mmc_gpio_table); } mmc1_data.nr_slots = 2; -- cgit v1.2.3 From d4debbcbffa45c3de5df0040af2eea74a9e794a3 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:37 +0200 Subject: mmc: omap: fix broken slot switch lookup The lookup is done before host->dev is initialized. It will always just fail silently, and the MMC behaviour is totally unpredictable as the switch is left in an undefined state. Fix that. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-4-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- drivers/mmc/host/omap.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index 9fb8995b43a1..aa40e1a9dc29 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1384,13 +1384,6 @@ static int mmc_omap_probe(struct platform_device *pdev) if (IS_ERR(host->virt_base)) return PTR_ERR(host->virt_base); - host->slot_switch = gpiod_get_optional(host->dev, "switch", - GPIOD_OUT_LOW); - if (IS_ERR(host->slot_switch)) - return dev_err_probe(host->dev, PTR_ERR(host->slot_switch), - "error looking up slot switch GPIO\n"); - - INIT_WORK(&host->slot_release_work, mmc_omap_slot_release_work); INIT_WORK(&host->send_stop_work, mmc_omap_send_stop_work); @@ -1409,6 +1402,12 @@ static int mmc_omap_probe(struct platform_device *pdev) host->dev = &pdev->dev; platform_set_drvdata(pdev, host); + host->slot_switch = gpiod_get_optional(host->dev, "switch", + GPIOD_OUT_LOW); + if (IS_ERR(host->slot_switch)) + return dev_err_probe(host->dev, PTR_ERR(host->slot_switch), + "error looking up slot switch GPIO\n"); + host->id = pdev->id; host->irq = irq; host->phys_base = res->start; -- cgit v1.2.3 From f6862c7f156d04f81c38467e1c304b7e9517e810 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:38 +0200 Subject: mmc: omap: fix deferred probe After a deferred probe, GPIO descriptor lookup will fail with EBUSY. Fix by using managed descriptors. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-5-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- drivers/mmc/host/omap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index aa40e1a9dc29..50408771ae01 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1259,18 +1259,18 @@ static int mmc_omap_new_slot(struct mmc_omap_host *host, int id) slot->pdata = &host->pdata->slots[id]; /* Check for some optional GPIO controls */ - slot->vsd = gpiod_get_index_optional(host->dev, "vsd", - id, GPIOD_OUT_LOW); + slot->vsd = devm_gpiod_get_index_optional(host->dev, "vsd", + id, GPIOD_OUT_LOW); if (IS_ERR(slot->vsd)) return dev_err_probe(host->dev, PTR_ERR(slot->vsd), "error looking up VSD GPIO\n"); - slot->vio = gpiod_get_index_optional(host->dev, "vio", - id, GPIOD_OUT_LOW); + slot->vio = devm_gpiod_get_index_optional(host->dev, "vio", + id, GPIOD_OUT_LOW); if (IS_ERR(slot->vio)) return dev_err_probe(host->dev, PTR_ERR(slot->vio), "error looking up VIO GPIO\n"); - slot->cover = gpiod_get_index_optional(host->dev, "cover", - id, GPIOD_IN); + slot->cover = devm_gpiod_get_index_optional(host->dev, "cover", + id, GPIOD_IN); if (IS_ERR(slot->cover)) return dev_err_probe(host->dev, PTR_ERR(slot->cover), "error looking up cover switch GPIO\n"); @@ -1402,8 +1402,8 @@ static int mmc_omap_probe(struct platform_device *pdev) host->dev = &pdev->dev; platform_set_drvdata(pdev, host); - host->slot_switch = gpiod_get_optional(host->dev, "switch", - GPIOD_OUT_LOW); + host->slot_switch = devm_gpiod_get_optional(host->dev, "switch", + GPIOD_OUT_LOW); if (IS_ERR(host->slot_switch)) return dev_err_probe(host->dev, PTR_ERR(host->slot_switch), "error looking up slot switch GPIO\n"); -- cgit v1.2.3 From 894ad61b85d6ba8efd4274aa8719d9ff1c89ea54 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:14:39 +0200 Subject: mmc: omap: restore original power up/down steps Commit e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") moved Nokia N810 MMC power up/down from the board file into the MMC driver. The change removed some delays, and ordering without a valid reason. Restore power up/down to match the original code. This matters only on N810 where the 2nd GPIO is in use. Other boards will see an additional delay but that should be a lesser concern than omitting delays altogether. Fixes: e519f0bb64ef ("ARM/mmc: Convert old mmci-omap to GPIO descriptors") Signed-off-by: Aaro Koskinen Message-ID: <20240223181439.1099750-6-aaro.koskinen@iki.fi> Reviewed-by: Linus Walleij Acked-by: Ulf Hansson Signed-off-by: Tony Lindgren --- drivers/mmc/host/omap.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index 50408771ae01..13fa8588e38c 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1119,10 +1119,25 @@ static void mmc_omap_set_power(struct mmc_omap_slot *slot, int power_on, host = slot->host; - if (slot->vsd) - gpiod_set_value(slot->vsd, power_on); - if (slot->vio) - gpiod_set_value(slot->vio, power_on); + if (power_on) { + if (slot->vsd) { + gpiod_set_value(slot->vsd, power_on); + msleep(1); + } + if (slot->vio) { + gpiod_set_value(slot->vio, power_on); + msleep(1); + } + } else { + if (slot->vio) { + gpiod_set_value(slot->vio, power_on); + msleep(50); + } + if (slot->vsd) { + gpiod_set_value(slot->vsd, power_on); + msleep(50); + } + } if (slot->pdata->set_power != NULL) slot->pdata->set_power(mmc_dev(slot->mmc), slot->id, power_on, -- cgit v1.2.3 From 4421405e3634a3189b541cf1e34598e44260720d Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 23 Feb 2024 20:16:56 +0200 Subject: ARM: OMAP2+: fix USB regression on Nokia N8x0 GPIO chip labels are wrong for OMAP2, so the USB does not work. Fix. Fixes: 8e0285ab95a9 ("ARM/musb: omap2: Remove global GPIO numbers from TUSB6010") Signed-off-by: Aaro Koskinen Reviewed-by: Linus Walleij Message-ID: <20240223181656.1099845-1-aaro.koskinen@iki.fi> Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/board-n8x0.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index c933a91751e4..ff2a4a4d8220 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ b/arch/arm/mach-omap2/board-n8x0.c @@ -79,10 +79,8 @@ static struct musb_hdrc_platform_data tusb_data = { static struct gpiod_lookup_table tusb_gpio_table = { .dev_id = "musb-tusb", .table = { - GPIO_LOOKUP("gpio-0-15", 0, "enable", - GPIO_ACTIVE_HIGH), - GPIO_LOOKUP("gpio-48-63", 10, "int", - GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-0-31", 0, "enable", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-32-63", 26, "int", GPIO_ACTIVE_HIGH), { } }, }; -- cgit v1.2.3 From fcf3f7e2fc8a53a6140beee46ec782a4c88e4744 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 8 Mar 2024 17:37:26 +0800 Subject: raid1: fix use-after-free for original bio in raid1_write_request() r1_bio->bios[] is used to record new bios that will be issued to underlying disks, however, in raid1_write_request(), r1_bio->bios[] will set to the original bio temporarily. Meanwhile, if blocked rdev is set, free_r1bio() will be called causing that all r1_bio->bios[] to be freed: raid1_write_request() r1_bio = alloc_r1bio(mddev, bio); -> r1_bio->bios[] is NULL for (i = 0; i < disks; i++) -> for each rdev in conf // first rdev is normal r1_bio->bios[0] = bio; -> set to original bio // second rdev is blocked if (test_bit(Blocked, &rdev->flags)) break if (blocked_rdev) free_r1bio() put_all_bios() bio_put(r1_bio->bios[0]) -> original bio is freed Test scripts: mdadm -CR /dev/md0 -l1 -n4 /dev/sd[abcd] --assume-clean fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 \ -iodepth=128 -name=test -direct=1 echo blocked > /sys/block/md0/md/rd2/state Test result: BUG bio-264 (Not tainted): Object already free ----------------------------------------------------------------------------- Allocated in mempool_alloc_slab+0x24/0x50 age=1 cpu=1 pid=869 kmem_cache_alloc+0x324/0x480 mempool_alloc_slab+0x24/0x50 mempool_alloc+0x6e/0x220 bio_alloc_bioset+0x1af/0x4d0 blkdev_direct_IO+0x164/0x8a0 blkdev_write_iter+0x309/0x440 aio_write+0x139/0x2f0 io_submit_one+0x5ca/0xb70 __do_sys_io_submit+0x86/0x270 __x64_sys_io_submit+0x22/0x30 do_syscall_64+0xb1/0x210 entry_SYSCALL_64_after_hwframe+0x6c/0x74 Freed in mempool_free_slab+0x1f/0x30 age=1 cpu=1 pid=869 kmem_cache_free+0x28c/0x550 mempool_free_slab+0x1f/0x30 mempool_free+0x40/0x100 bio_free+0x59/0x80 bio_put+0xf0/0x220 free_r1bio+0x74/0xb0 raid1_make_request+0xadf/0x1150 md_handle_request+0xc7/0x3b0 md_submit_bio+0x76/0x130 __submit_bio+0xd8/0x1d0 submit_bio_noacct_nocheck+0x1eb/0x5c0 submit_bio_noacct+0x169/0xd40 submit_bio+0xee/0x1d0 blkdev_direct_IO+0x322/0x8a0 blkdev_write_iter+0x309/0x440 aio_write+0x139/0x2f0 Since that bios for underlying disks are not allocated yet, fix this problem by using mempool_free() directly to free the r1_bio. Fixes: 992db13a4aee ("md/raid1: free the r1bio before waiting for blocked rdev") Cc: stable@vger.kernel.org # v6.6+ Reported-by: Coly Li Signed-off-by: Yu Kuai Tested-by: Coly Li Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240308093726.1047420-1-yukuai1@huaweicloud.com --- drivers/md/raid1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index be8ac24f50b6..7b8a71ca66dd 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1558,7 +1558,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); - free_r1bio(r1_bio); + mempool_free(r1_bio, &conf->r1bio_pool); allow_barrier(conf, bio->bi_iter.bi_sector); if (bio->bi_opf & REQ_NOWAIT) { -- cgit v1.2.3 From 59097a2a5ecadb0f025232c665fd11c8ae1e1f58 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Sat, 2 Mar 2024 03:22:49 +0100 Subject: interconnect: qcom: x1e80100: Remove inexistent ACV_PERF BCM Booting the kernel on X1E results in a message like: [ 2.561524] qnoc-x1e80100 interconnect-0: ACV_PERF could not find RPMh address And indeed, taking a look at cmd-db, no such BCM exists. Remove it. Fixes: 9f196772841e ("interconnect: qcom: Add X1E80100 interconnect provider driver") Signed-off-by: Konrad Dybcio Reviewed-by: Mike Tipton Link: https://lore.kernel.org/r/20240302-topic-faux_bcm_x1e-v1-1-c40fab7c4bc5@linaro.org Signed-off-by: Georgi Djakov --- drivers/interconnect/qcom/x1e80100.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/drivers/interconnect/qcom/x1e80100.c b/drivers/interconnect/qcom/x1e80100.c index cbaf4f9c41be..06f0a6d6cbbc 100644 --- a/drivers/interconnect/qcom/x1e80100.c +++ b/drivers/interconnect/qcom/x1e80100.c @@ -116,15 +116,6 @@ static struct qcom_icc_node xm_sdc2 = { .links = { X1E80100_SLAVE_A2NOC_SNOC }, }; -static struct qcom_icc_node ddr_perf_mode_master = { - .name = "ddr_perf_mode_master", - .id = X1E80100_MASTER_DDR_PERF_MODE, - .channels = 1, - .buswidth = 4, - .num_links = 1, - .links = { X1E80100_SLAVE_DDR_PERF_MODE }, -}; - static struct qcom_icc_node qup0_core_master = { .name = "qup0_core_master", .id = X1E80100_MASTER_QUP_CORE_0, @@ -832,14 +823,6 @@ static struct qcom_icc_node qns_a2noc_snoc = { .links = { X1E80100_MASTER_A2NOC_SNOC }, }; -static struct qcom_icc_node ddr_perf_mode_slave = { - .name = "ddr_perf_mode_slave", - .id = X1E80100_SLAVE_DDR_PERF_MODE, - .channels = 1, - .buswidth = 4, - .num_links = 0, -}; - static struct qcom_icc_node qup0_core_slave = { .name = "qup0_core_slave", .id = X1E80100_SLAVE_QUP_CORE_0, @@ -1591,12 +1574,6 @@ static struct qcom_icc_bcm bcm_acv = { .nodes = { &ebi }, }; -static struct qcom_icc_bcm bcm_acv_perf = { - .name = "ACV_PERF", - .num_nodes = 1, - .nodes = { &ddr_perf_mode_slave }, -}; - static struct qcom_icc_bcm bcm_ce0 = { .name = "CE0", .num_nodes = 1, @@ -1863,18 +1840,15 @@ static const struct qcom_icc_desc x1e80100_aggre2_noc = { }; static struct qcom_icc_bcm * const clk_virt_bcms[] = { - &bcm_acv_perf, &bcm_qup0, &bcm_qup1, &bcm_qup2, }; static struct qcom_icc_node * const clk_virt_nodes[] = { - [MASTER_DDR_PERF_MODE] = &ddr_perf_mode_master, [MASTER_QUP_CORE_0] = &qup0_core_master, [MASTER_QUP_CORE_1] = &qup1_core_master, [MASTER_QUP_CORE_2] = &qup2_core_master, - [SLAVE_DDR_PERF_MODE] = &ddr_perf_mode_slave, [SLAVE_QUP_CORE_0] = &qup0_core_slave, [SLAVE_QUP_CORE_1] = &qup1_core_slave, [SLAVE_QUP_CORE_2] = &qup2_core_slave, -- cgit v1.2.3 From de1bf25b6d771abdb52d43546cf57ad775fb68a1 Mon Sep 17 00:00:00 2001 From: Mike Tipton Date: Tue, 5 Mar 2024 14:56:52 -0800 Subject: interconnect: Don't access req_list while it's being manipulated The icc_lock mutex was split into separate icc_lock and icc_bw_lock mutexes in [1] to avoid lockdep splats. However, this didn't adequately protect access to icc_node::req_list. The icc_set_bw() function will eventually iterate over req_list while only holding icc_bw_lock, but req_list can be modified while only holding icc_lock. This causes races between icc_set_bw(), of_icc_get(), and icc_put(). Example A: CPU0 CPU1 ---- ---- icc_set_bw(path_a) mutex_lock(&icc_bw_lock); icc_put(path_b) mutex_lock(&icc_lock); aggregate_requests() hlist_for_each_entry(r, ... hlist_del(... Example B: CPU0 CPU1 ---- ---- icc_set_bw(path_a) mutex_lock(&icc_bw_lock); path_b = of_icc_get() of_icc_get_by_index() mutex_lock(&icc_lock); path_find() path_init() aggregate_requests() hlist_for_each_entry(r, ... hlist_add_head(... Fix this by ensuring icc_bw_lock is always held before manipulating icc_node::req_list. The additional places icc_bw_lock is held don't perform any memory allocations, so we should still be safe from the original lockdep splats that motivated the separate locks. [1] commit af42269c3523 ("interconnect: Fix locking for runpm vs reclaim") Signed-off-by: Mike Tipton Fixes: af42269c3523 ("interconnect: Fix locking for runpm vs reclaim") Reviewed-by: Rob Clark Link: https://lore.kernel.org/r/20240305225652.22872-1-quic_mdtipton@quicinc.com Signed-off-by: Georgi Djakov --- drivers/interconnect/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c index 50bac2d79d9b..68edb07d4443 100644 --- a/drivers/interconnect/core.c +++ b/drivers/interconnect/core.c @@ -176,6 +176,8 @@ static struct icc_path *path_init(struct device *dev, struct icc_node *dst, path->num_nodes = num_nodes; + mutex_lock(&icc_bw_lock); + for (i = num_nodes - 1; i >= 0; i--) { node->provider->users++; hlist_add_head(&path->reqs[i].req_node, &node->req_list); @@ -186,6 +188,8 @@ static struct icc_path *path_init(struct device *dev, struct icc_node *dst, node = node->reverse; } + mutex_unlock(&icc_bw_lock); + return path; } @@ -792,12 +796,16 @@ void icc_put(struct icc_path *path) pr_err("%s: error (%d)\n", __func__, ret); mutex_lock(&icc_lock); + mutex_lock(&icc_bw_lock); + for (i = 0; i < path->num_nodes; i++) { node = path->reqs[i].node; hlist_del(&path->reqs[i].req_node); if (!WARN_ON(!node->provider->users)) node->provider->users--; } + + mutex_unlock(&icc_bw_lock); mutex_unlock(&icc_lock); kfree_const(path->name); -- cgit v1.2.3 From f3c80061c0d35c60709088ccb019305796d3f6ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 9 Feb 2024 13:37:33 -0500 Subject: KVM: SEV: fix compat ABI for KVM_MEMORY_ENCRYPT_OP The data structs for KVM_MEMORY_ENCRYPT_OP have different sizes for 32- and 64-bit userspace, but they do not make any attempt to convert from one ABI to the other when 32-bit userspace is running on 64-bit kernels. This configuration never worked, and SEV is only for 64-bit kernels so we're not breaking ABI on 32-bit kernels. Fix this by adding the appropriate padding; no functional change intended for 64-bit userspace. Reviewed-by: Michael Roth Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index ad29984d5e39..ef11aa4cab42 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -694,6 +694,7 @@ enum sev_cmd_id { struct kvm_sev_cmd { __u32 id; + __u32 pad0; __u64 data; __u32 error; __u32 sev_fd; @@ -704,28 +705,35 @@ struct kvm_sev_launch_start { __u32 policy; __u64 dh_uaddr; __u32 dh_len; + __u32 pad0; __u64 session_uaddr; __u32 session_len; + __u32 pad1; }; struct kvm_sev_launch_update_data { __u64 uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_launch_secret { __u64 hdr_uaddr; __u32 hdr_len; + __u32 pad0; __u64 guest_uaddr; __u32 guest_len; + __u32 pad1; __u64 trans_uaddr; __u32 trans_len; + __u32 pad2; }; struct kvm_sev_launch_measure { __u64 uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_guest_status { @@ -738,33 +746,43 @@ struct kvm_sev_dbg { __u64 src_uaddr; __u64 dst_uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_attestation_report { __u8 mnonce[16]; __u64 uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_send_start { __u32 policy; + __u32 pad0; __u64 pdh_cert_uaddr; __u32 pdh_cert_len; + __u32 pad1; __u64 plat_certs_uaddr; __u32 plat_certs_len; + __u32 pad2; __u64 amd_certs_uaddr; __u32 amd_certs_len; + __u32 pad3; __u64 session_uaddr; __u32 session_len; + __u32 pad4; }; struct kvm_sev_send_update_data { __u64 hdr_uaddr; __u32 hdr_len; + __u32 pad0; __u64 guest_uaddr; __u32 guest_len; + __u32 pad1; __u64 trans_uaddr; __u32 trans_len; + __u32 pad2; }; struct kvm_sev_receive_start { @@ -772,17 +790,22 @@ struct kvm_sev_receive_start { __u32 policy; __u64 pdh_uaddr; __u32 pdh_len; + __u32 pad0; __u64 session_uaddr; __u32 session_len; + __u32 pad1; }; struct kvm_sev_receive_update_data { __u64 hdr_uaddr; __u32 hdr_len; + __u32 pad0; __u64 guest_uaddr; __u32 guest_len; + __u32 pad1; __u64 trans_uaddr; __u32 trans_len; + __u32 pad2; }; #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -- cgit v1.2.3 From 19cebbab995b2c99e927d6c5f3f24ded0740efd5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 9 Feb 2024 13:37:35 -0500 Subject: Documentation: kvm/sev: separate description of firmware The description of firmware is included part under the "SEV Key Management" header, part under the KVM_SEV_INIT ioctl. Put these two bits together and and rename "SEV Key Management" to what it actually is, namely a description of the KVM_MEMORY_ENCRYPT_OP API. Reviewed-by: Michael Roth Signed-off-by: Paolo Bonzini --- .../virt/kvm/x86/amd-memory-encryption.rst | 29 +++++++++++++--------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 995780088eb2..4f2eb441c718 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -46,14 +46,8 @@ SEV hardware uses ASIDs to associate a memory encryption key with a VM. Hence, the ASID for the SEV-enabled guests must be from 1 to a maximum value defined in the CPUID 0x8000001f[ecx] field. -SEV Key Management -================== - -The SEV guest key management is handled by a separate processor called the AMD -Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure -key management interface to perform common hypervisor activities such as -encrypting bootstrap code, snapshot, migrating and debugging the guest. For more -information, see the SEV Key Management spec [api-spec]_ +The KVM_MEMORY_ENCRYPT_OP ioctl +=============================== The main ioctl to access SEV is KVM_MEMORY_ENCRYPT_OP. If the argument to KVM_MEMORY_ENCRYPT_OP is NULL, the ioctl returns 0 if SEV is enabled @@ -87,10 +81,6 @@ guests, such as launching, running, snapshotting, migrating and decommissioning. The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform context. In a typical workflow, this command should be the first command issued. -The firmware can be initialized either by using its own non-volatile storage or -the OS can manage the NV storage for the firmware using the module parameter -``init_ex_path``. If the file specified by ``init_ex_path`` does not exist or -is invalid, the OS will create or override the file with output from PSP. Returns: 0 on success, -negative on error @@ -434,6 +424,21 @@ issued by the hypervisor to make the guest ready for execution. Returns: 0 on success, -negative on error +Firmware Management +=================== + +The SEV guest key management is handled by a separate processor called the AMD +Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure +key management interface to perform common hypervisor activities such as +encrypting bootstrap code, snapshot, migrating and debugging the guest. For more +information, see the SEV Key Management spec [api-spec]_ + +The AMD-SP firmware can be initialized either by using its own non-volatile +storage or the OS can manage the NV storage for the firmware using +parameter ``init_ex_path`` of the ``ccp`` module. If the file specified +by ``init_ex_path`` does not exist or is invalid, the OS will create or +override the file with PSP non-volatile storage. + References ========== -- cgit v1.2.3 From c20722c412f1cc1879e994fe169278bd0f322ad9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 Mar 2024 12:11:41 -0400 Subject: Documentation: kvm/sev: clarify usage of KVM_MEMORY_ENCRYPT_OP Explain that it operates on the VM file descriptor, and also clarify how detection of SEV operates on old kernels predating commit 2da1ed62d55c ("KVM: SVM: document KVM_MEM_ENCRYPT_OP, let userspace detect if SEV is available"). Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/x86/amd-memory-encryption.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 4f2eb441c718..84335d119ff1 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -49,12 +49,13 @@ defined in the CPUID 0x8000001f[ecx] field. The KVM_MEMORY_ENCRYPT_OP ioctl =============================== -The main ioctl to access SEV is KVM_MEMORY_ENCRYPT_OP. If the argument -to KVM_MEMORY_ENCRYPT_OP is NULL, the ioctl returns 0 if SEV is enabled -and ``ENOTTY`` if it is disabled (on some older versions of Linux, -the ioctl runs normally even with a NULL argument, and therefore will -likely return ``EFAULT``). If non-NULL, the argument to KVM_MEMORY_ENCRYPT_OP -must be a struct kvm_sev_cmd:: +The main ioctl to access SEV is KVM_MEMORY_ENCRYPT_OP, which operates on +the VM file descriptor. If the argument to KVM_MEMORY_ENCRYPT_OP is NULL, +the ioctl returns 0 if SEV is enabled and ``ENOTTY`` if it is disabled +(on some older versions of Linux, the ioctl tries to run normally even +with a NULL argument, and therefore will likely return ``EFAULT`` instead +of zero if SEV is enabled). If non-NULL, the argument to +KVM_MEMORY_ENCRYPT_OP must be a struct kvm_sev_cmd:: struct kvm_sev_cmd { __u32 id; -- cgit v1.2.3 From e249884e106b81c34f8050d23935ffc12623843f Mon Sep 17 00:00:00 2001 From: Erni Sri Satya Vennela Date: Thu, 21 Mar 2024 01:22:05 -0700 Subject: x86/hyperv: Cosmetic changes for hv_apic.c Fix issues reported by checkpatch.pl script for hv_apic.c file - Alignment should match open parenthesis - Remove unnecessary parenthesis No functional changes intended. Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/1711009325-21894-1-git-send-email-ernis@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711009325-21894-1-git-send-email-ernis@linux.microsoft.com> --- arch/x86/hyperv/hv_apic.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5fc45543e955..0569f579338b 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -105,7 +105,7 @@ static bool cpu_is_self(int cpu) * IPI implementation on Hyper-V. */ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { struct hv_send_ipi_ex *ipi_arg; unsigned long flags; @@ -132,8 +132,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask, - exclude_self ? cpu_is_self : NULL); + nr_bank = cpumask_to_vpset_skip(&ipi_arg->vp_set, mask, + exclude_self ? cpu_is_self : NULL); /* * 'nr_bank <= 0' means some CPUs in cpumask can't be @@ -147,7 +147,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, } status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, - ipi_arg, NULL); + ipi_arg, NULL); ipi_mask_ex_done: local_irq_restore(flags); @@ -155,7 +155,7 @@ ipi_mask_ex_done: } static bool __send_ipi_mask(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { int cur_cpu, vcpu, this_cpu = smp_processor_id(); struct hv_send_ipi ipi_arg; @@ -181,7 +181,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, return false; } - if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) return false; /* @@ -218,7 +218,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, } status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, - ipi_arg.cpu_mask); + ipi_arg.cpu_mask); return hv_result_success(status); do_ex_hypercall: @@ -241,7 +241,7 @@ static bool __send_ipi_one(int cpu, int vector) return false; } - if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) return false; if (vp >= 64) -- cgit v1.2.3 From 1f1dc442c57ec61c08d21d47e4c5b4f16446fe00 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 22 Mar 2024 14:10:26 -0700 Subject: mshyperv: Introduce hv_numa_node_to_pxm_info() Factor out logic for converting numa node to hv_proximity_domain_info into a helper function. Change hv_proximity_domain_info to a struct to improve readability. While at it, rename hv_add_logical_processor_* structs to the correct hv_input_/hv_output_ prefix, and remove the flags field which is not present in the ABI. Signed-off-by: Nuno Das Neves Reviewed-by: Wei Liu Link: https://lore.kernel.org/r/1711141826-9458-1-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711141826-9458-1-git-send-email-nunodasneves@linux.microsoft.com> --- arch/x86/hyperv/hv_proc.c | 22 ++++------------------ include/asm-generic/hyperv-tlfs.h | 19 +++++++------------ include/asm-generic/mshyperv.h | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 30 deletions(-) diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c index 68a0843d4750..3fa1f2ee7b0d 100644 --- a/arch/x86/hyperv/hv_proc.c +++ b/arch/x86/hyperv/hv_proc.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -116,12 +115,11 @@ free_buf: int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) { - struct hv_add_logical_processor_in *input; - struct hv_add_logical_processor_out *output; + struct hv_input_add_logical_processor *input; + struct hv_output_add_logical_processor *output; u64 status; unsigned long flags; int ret = HV_STATUS_SUCCESS; - int pxm = node_to_pxm(node); /* * When adding a logical processor, the hypervisor may return @@ -137,11 +135,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) input->lp_index = lp_index; input->apic_id = apic_id; - input->flags = 0; - input->proximity_domain_info.domain_id = pxm; - input->proximity_domain_info.flags.reserved = 0; - input->proximity_domain_info.flags.proximity_info_valid = 1; - input->proximity_domain_info.flags.proximity_preferred = 1; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR, input, output); local_irq_restore(flags); @@ -166,7 +160,6 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) u64 status; unsigned long irq_flags; int ret = HV_STATUS_SUCCESS; - int pxm = node_to_pxm(node); /* Root VPs don't seem to need pages deposited */ if (partition_id != hv_current_partition_id) { @@ -185,14 +178,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) input->vp_index = vp_index; input->flags = flags; input->subnode_type = HvSubnodeAny; - if (node != NUMA_NO_NODE) { - input->proximity_domain_info.domain_id = pxm; - input->proximity_domain_info.flags.reserved = 0; - input->proximity_domain_info.flags.proximity_info_valid = 1; - input->proximity_domain_info.flags.proximity_preferred = 1; - } else { - input->proximity_domain_info.as_uint64 = 0; - } + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); local_irq_restore(irq_flags); diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index fdac4a1714ec..69f3f68dc249 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -512,13 +512,9 @@ struct hv_proximity_domain_flags { u32 proximity_info_valid : 1; } __packed; -/* Not a union in windows but useful for zeroing */ -union hv_proximity_domain_info { - struct { - u32 domain_id; - struct hv_proximity_domain_flags flags; - }; - u64 as_uint64; +struct hv_proximity_domain_info { + u32 domain_id; + struct hv_proximity_domain_flags flags; } __packed; struct hv_lp_startup_status { @@ -532,14 +528,13 @@ struct hv_lp_startup_status { } __packed; /* HvAddLogicalProcessor hypercall */ -struct hv_add_logical_processor_in { +struct hv_input_add_logical_processor { u32 lp_index; u32 apic_id; - union hv_proximity_domain_info proximity_domain_info; - u64 flags; + struct hv_proximity_domain_info proximity_domain_info; } __packed; -struct hv_add_logical_processor_out { +struct hv_output_add_logical_processor { struct hv_lp_startup_status startup_status; } __packed; @@ -560,7 +555,7 @@ struct hv_create_vp { u8 padding[3]; u8 subnode_type; u64 subnode_id; - union hv_proximity_domain_info proximity_domain_info; + struct hv_proximity_domain_info proximity_domain_info; u64 flags; } __packed; diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 430f0ae0dde2..cfb0b51a0998 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +68,19 @@ extern u64 hv_do_fast_hypercall8(u16 control, u64 input8); bool hv_isolation_type_snp(void); bool hv_isolation_type_tdx(void); +static inline struct hv_proximity_domain_info hv_numa_node_to_pxm_info(int node) +{ + struct hv_proximity_domain_info pxm_info = {}; + + if (node != NUMA_NO_NODE) { + pxm_info.domain_id = node_to_pxm(node); + pxm_info.flags.proximity_info_valid = 1; + pxm_info.flags.proximity_preferred = 1; + } + + return pxm_info; +} + /* Helper functions that provide a consistent pattern for checking Hyper-V hypercall status. */ static inline int hv_result(u64 status) { -- cgit v1.2.3 From 5448d9282af57c2c89a3033f1d56b31689d09b73 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 7 Mar 2024 08:19:51 +0000 Subject: KVM: selftests: Fix spelling mistake "trigged" -> "triggered" There are spelling mistakes in __GUEST_ASSERT messages. Fix them. Signed-off-by: Colin Ian King Acked-by: Oliver Upton Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20240307081951.1954830-1-colin.i.king@gmail.com --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 2 +- tools/testing/selftests/kvm/riscv/arch_timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index ddba2c2fb5de..16ac74d07d68 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -136,7 +136,7 @@ static void guest_run_stage(struct test_vcpu_shared_data *shared_data, irq_iter = READ_ONCE(shared_data->nr_iter); __GUEST_ASSERT(config_iter + 1 == irq_iter, "config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n" - " Guest timer interrupt was not trigged within the specified\n" + " Guest timer interrupt was not triggered within the specified\n" " interval, try to increase the error margin by [-e] option.\n", config_iter + 1, irq_iter); } diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c index e22848f747c0..0f9cabd99fd4 100644 --- a/tools/testing/selftests/kvm/riscv/arch_timer.c +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c @@ -60,7 +60,7 @@ static void guest_run(struct test_vcpu_shared_data *shared_data) irq_iter = READ_ONCE(shared_data->nr_iter); __GUEST_ASSERT(config_iter + 1 == irq_iter, "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" - " Guest timer interrupt was not trigged within the specified\n" + " Guest timer interrupt was not triggered within the specified\n" " interval, try to increase the error margin by [-e] option.\n", config_iter + 1, irq_iter); } -- cgit v1.2.3 From 7fd99b7ab57057f219bb6cb2a1ff0b88a2b84420 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 15 Mar 2024 09:29:14 +0000 Subject: RISC-V: KVM: Remove second semicolon There is a statement with two semicolons. Remove the second one, it is redundant. Signed-off-by: Colin Ian King Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20240315092914.2431214-1-colin.i.king@gmail.com --- arch/riscv/kvm/vcpu_onereg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index f4a6124d25c9..994adc26db4b 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -986,7 +986,7 @@ static int copy_isa_ext_reg_indices(const struct kvm_vcpu *vcpu, static inline unsigned long num_isa_ext_regs(const struct kvm_vcpu *vcpu) { - return copy_isa_ext_reg_indices(vcpu, NULL);; + return copy_isa_ext_reg_indices(vcpu, NULL); } static int copy_sbi_ext_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -- cgit v1.2.3 From d8dd9f113e16bef3b29c9dcceb584a6f144f55e4 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 21 Mar 2024 14:20:40 +0530 Subject: RISC-V: KVM: Fix APLIC setipnum_le/be write emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The writes to setipnum_le/be register for APLIC in MSI-mode have special consideration for level-triggered interrupts as-per the section "4.9.2 Special consideration for level-sensitive interrupt sources" of the RISC-V AIA specification. Particularly, the below text from the RISC-V AIA specification defines the behaviour of writes to setipnum_le/be register for level-triggered interrupts: "A second option is for the interrupt service routine to write the APLIC’s source identity number for the interrupt to the domain’s setipnum register just before exiting. This will cause the interrupt’s pending bit to be set to one again if the source is still asserting an interrupt, but not if the source is not asserting an interrupt." Fix setipnum_le/be write emulation for in-kernel APLIC by implementing the above behaviour in aplic_write_pending() function. Cc: stable@vger.kernel.org Fixes: 74967aa208e2 ("RISC-V: KVM: Add in-kernel emulation of AIA APLIC") Signed-off-by: Anup Patel Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20240321085041.1955293-2-apatel@ventanamicro.com --- arch/riscv/kvm/aia_aplic.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c index 39e72aa016a4..5e842b92dc46 100644 --- a/arch/riscv/kvm/aia_aplic.c +++ b/arch/riscv/kvm/aia_aplic.c @@ -137,11 +137,21 @@ static void aplic_write_pending(struct aplic *aplic, u32 irq, bool pending) raw_spin_lock_irqsave(&irqd->lock, flags); sm = irqd->sourcecfg & APLIC_SOURCECFG_SM_MASK; - if (!pending && - ((sm == APLIC_SOURCECFG_SM_LEVEL_HIGH) || - (sm == APLIC_SOURCECFG_SM_LEVEL_LOW))) + if (sm == APLIC_SOURCECFG_SM_INACTIVE) goto skip_write_pending; + if (sm == APLIC_SOURCECFG_SM_LEVEL_HIGH || + sm == APLIC_SOURCECFG_SM_LEVEL_LOW) { + if (!pending) + goto skip_write_pending; + if ((irqd->state & APLIC_IRQ_STATE_INPUT) && + sm == APLIC_SOURCECFG_SM_LEVEL_LOW) + goto skip_write_pending; + if (!(irqd->state & APLIC_IRQ_STATE_INPUT) && + sm == APLIC_SOURCECFG_SM_LEVEL_HIGH) + goto skip_write_pending; + } + if (pending) irqd->state |= APLIC_IRQ_STATE_PENDING; else -- cgit v1.2.3 From 1a4bd2b128fb5ca62e4d1c5ca298d3d06b9c1e8e Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Mon, 11 Mar 2024 12:07:00 +0100 Subject: firmware: arm_ffa: Fix the partition ID check in ffa_notification_info_get() FFA_NOTIFICATION_INFO_GET retrieves information about pending notifications. Notifications can be either global or per VCPU. Global notifications are reported with the partition ID only in the list of endpoints with pending notifications. ffa_notification_info_get() incorrectly expect no ID at all for global notifications. Fix this by checking for ID = 1 instead of ID = 0. Fixes: 3522be48d82b ("firmware: arm_ffa: Implement the NOTIFICATION_INFO_GET interface") Signed-off-by: Jens Wiklander Reviewed-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20240311110700.2367142-1-jens.wiklander@linaro.org Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c index f2556a8e9401..9bc2e10381af 100644 --- a/drivers/firmware/arm_ffa/driver.c +++ b/drivers/firmware/arm_ffa/driver.c @@ -790,7 +790,7 @@ static void ffa_notification_info_get(void) part_id = packed_id_list[ids_processed++]; - if (!ids_count[list]) { /* Global Notification */ + if (ids_count[list] == 1) { /* Global Notification */ __do_sched_recv_cb(part_id, 0, false); continue; } -- cgit v1.2.3 From 17f243adf1653bdbaeec767e3e74c9ad089f470b Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Mon, 11 Mar 2024 10:04:12 +0100 Subject: firmware: arm_scmi: Fix wrong fastchannel initialization Fastchannels are initialized with an incorrect index(POWERCAP_PAI_GET) in: commit 2441caa84aac ("firmware: arm_scmi: Populate fast channel rate_limit") Fix this and provide a correct index(POWERCAP_FC_PAI) Fixes: 2441caa84aac ("firmware: arm_scmi: Populate fast channel rate_limit") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202403100744.7Op3PI8L-lkp@intel.com/ Signed-off-by: Pierre Gondois Link: https://lore.kernel.org/r/20240311090413.1710725-1-pierre.gondois@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/powercap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/powercap.c b/drivers/firmware/arm_scmi/powercap.c index ea9201e7044c..1fa79bba492e 100644 --- a/drivers/firmware/arm_scmi/powercap.c +++ b/drivers/firmware/arm_scmi/powercap.c @@ -736,7 +736,7 @@ static void scmi_powercap_domain_init_fc(const struct scmi_protocol_handle *ph, ph->hops->fastchannel_init(ph, POWERCAP_DESCRIBE_FASTCHANNEL, POWERCAP_PAI_GET, 4, domain, &fc[POWERCAP_FC_PAI].get_addr, NULL, - &fc[POWERCAP_PAI_GET].rate_limit); + &fc[POWERCAP_FC_PAI].rate_limit); *p_fc = fc; } -- cgit v1.2.3 From b70c7996d4ffb2e02895132e8a79a37cee66504f Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Fri, 15 Mar 2024 14:03:24 +0000 Subject: firmware: arm_scmi: Make raw debugfs entries non-seekable SCMI raw debugfs entries are used to inject and snoop messages out of the SCMI core and, as such, the underlying virtual files have no reason to support seeking. Modify the related file_operations descriptors to be non-seekable. Fixes: 3c3d818a9317 ("firmware: arm_scmi: Add core raw transmission support") Signed-off-by: Cristian Marussi Link: https://lore.kernel.org/r/20240315140324.231830-1-cristian.marussi@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/raw_mode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/raw_mode.c b/drivers/firmware/arm_scmi/raw_mode.c index 350573518503..130d13e9cd6b 100644 --- a/drivers/firmware/arm_scmi/raw_mode.c +++ b/drivers/firmware/arm_scmi/raw_mode.c @@ -921,7 +921,7 @@ static int scmi_dbg_raw_mode_open(struct inode *inode, struct file *filp) rd->raw = raw; filp->private_data = rd; - return 0; + return nonseekable_open(inode, filp); } static int scmi_dbg_raw_mode_release(struct inode *inode, struct file *filp) @@ -950,6 +950,7 @@ static const struct file_operations scmi_dbg_raw_mode_reset_fops = { .open = scmi_dbg_raw_mode_open, .release = scmi_dbg_raw_mode_release, .write = scmi_dbg_raw_mode_reset_write, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -959,6 +960,7 @@ static const struct file_operations scmi_dbg_raw_mode_message_fops = { .read = scmi_dbg_raw_mode_message_read, .write = scmi_dbg_raw_mode_message_write, .poll = scmi_dbg_raw_mode_message_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -975,6 +977,7 @@ static const struct file_operations scmi_dbg_raw_mode_message_async_fops = { .read = scmi_dbg_raw_mode_message_read, .write = scmi_dbg_raw_mode_message_async_write, .poll = scmi_dbg_raw_mode_message_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -998,6 +1001,7 @@ static const struct file_operations scmi_dbg_raw_mode_notification_fops = { .release = scmi_dbg_raw_mode_release, .read = scmi_test_dbg_raw_mode_notif_read, .poll = scmi_test_dbg_raw_mode_notif_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; @@ -1021,6 +1025,7 @@ static const struct file_operations scmi_dbg_raw_mode_errors_fops = { .release = scmi_dbg_raw_mode_release, .read = scmi_test_dbg_raw_mode_errors_read, .poll = scmi_test_dbg_raw_mode_errors_poll, + .llseek = no_llseek, .owner = THIS_MODULE, }; -- cgit v1.2.3 From 8aebf68dfd4b482f7fb1f82864fdda08020f62b4 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 30 Jan 2024 15:59:17 -0600 Subject: MAINTAINERS: Add TPM DT bindings to TPM maintainers Bindings for a given device class generally go to the respective subsystem maintainers. Add the TPM bindings to the TPM maintainers entry. Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20240130215917.2473250-1-robh@kernel.org Signed-off-by: Rob Herring --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index aa3b947fb080..08ea59b78df0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22384,6 +22384,7 @@ S: Maintained W: https://kernsec.org/wiki/index.php/Linux_Kernel_Integrity Q: https://patchwork.kernel.org/project/linux-integrity/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git +F: Documentation/devicetree/bindings/tpm/ F: drivers/char/tpm/ TPS546D24 DRIVER -- cgit v1.2.3 From 3d9b8e6db9bda4463bbf2a97411f0716215254da Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 25 Mar 2024 10:11:39 +0100 Subject: docs: dt-bindings: add missing address/size-cells to example Complete the example of recommended order of properties by adding missing address/size-cells. They are not necessary to illustrate the style, but lack of them us bit really correct DTS code which might confuse readers. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Tudor Ambarus Link: https://lore.kernel.org/r/20240325091139.18602-1-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/dts-coding-style.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/dts-coding-style.rst b/Documentation/devicetree/bindings/dts-coding-style.rst index a9bdd2b59dca..8a68331075a0 100644 --- a/Documentation/devicetree/bindings/dts-coding-style.rst +++ b/Documentation/devicetree/bindings/dts-coding-style.rst @@ -144,6 +144,8 @@ Example:: #dma-cells = <1>; clocks = <&clock_controller 0>, <&clock_controller 1>; clock-names = "bus", "host"; + #address-cells = <1>; + #size-cells = <1>; vendor,custom-property = <2>; status = "disabled"; -- cgit v1.2.3 From fb9f8125ed9d9b8e11f309a7dbfbe7b40de48fba Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:07:58 +0200 Subject: ASoC: SOF: Add dsp_max_burst_size_in_ms member to snd_sof_pcm_stream The dsp_max_burst_size_in_ms can be used to save the length of the maximum burst size in ms the host DMA will use. Platform code can place constraint using this to avoid user space requesting too small ALSA buffer which will result xruns. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/sof-audio.h | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/sof/sof-audio.h b/sound/soc/sof/sof-audio.h index 9ea2ac5adac7..04e5cb2c70a7 100644 --- a/sound/soc/sof/sof-audio.h +++ b/sound/soc/sof/sof-audio.h @@ -322,6 +322,7 @@ struct snd_sof_pcm_stream { struct work_struct period_elapsed_work; struct snd_soc_dapm_widget_list *list; /* list of connected DAPM widgets */ bool d0i3_compatible; /* DSP can be in D0I3 when this pcm is opened */ + unsigned int dsp_max_burst_size_in_ms; /* The maximum size of the host DMA burst in ms */ /* * flag to indicate that the DSP pipelines should be kept * active or not while suspending the stream -- cgit v1.2.3 From 842bb8b62cc6f3546d61eb63115b32ebc6dd4a87 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:07:59 +0200 Subject: ASoC: SOF: ipc4-topology: Save the DMA maximum burst size for PCMs When setting up the pcm widget, save the DSP buffer size (in ms) for platform code to place a constraint on playback. On playback the DMA will fill the buffer on start and if the period size is smaller it will immediately overrun. On capture the DMA will move data in 1ms bursts. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-3-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-topology.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index da4a83afb87a..bb4cf6dd1e18 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -412,8 +412,9 @@ static int sof_ipc4_widget_setup_pcm(struct snd_sof_widget *swidget) struct sof_ipc4_available_audio_format *available_fmt; struct snd_soc_component *scomp = swidget->scomp; struct sof_ipc4_copier *ipc4_copier; + struct snd_sof_pcm *spcm; int node_type = 0; - int ret; + int ret, dir; ipc4_copier = kzalloc(sizeof(*ipc4_copier), GFP_KERNEL); if (!ipc4_copier) @@ -447,6 +448,25 @@ static int sof_ipc4_widget_setup_pcm(struct snd_sof_widget *swidget) } dev_dbg(scomp->dev, "host copier '%s' node_type %u\n", swidget->widget->name, node_type); + spcm = snd_sof_find_spcm_comp(scomp, swidget->comp_id, &dir); + if (!spcm) + goto skip_gtw_cfg; + + if (dir == SNDRV_PCM_STREAM_PLAYBACK) { + struct snd_sof_pcm_stream *sps = &spcm->stream[dir]; + + sof_update_ipc_object(scomp, &sps->dsp_max_burst_size_in_ms, + SOF_COPIER_DEEP_BUFFER_TOKENS, + swidget->tuples, + swidget->num_tuples, sizeof(u32), 1); + /* Set default DMA buffer size if it is not specified in topology */ + if (!sps->dsp_max_burst_size_in_ms) + sps->dsp_max_burst_size_in_ms = SOF_IPC4_MIN_DMA_BUFFER_SIZE; + } else { + /* Capture data is copied from DSP to host in 1ms bursts */ + spcm->stream[dir].dsp_max_burst_size_in_ms = 1; + } + skip_gtw_cfg: ipc4_copier->gtw_attr = kzalloc(sizeof(*ipc4_copier->gtw_attr), GFP_KERNEL); if (!ipc4_copier->gtw_attr) { -- cgit v1.2.3 From fe76d2e75a6da97edd2b4ec5cfb9efd541be087a Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:00 +0200 Subject: ASoC: SOF: Intel: hda-pcm: Use dsp_max_burst_size_in_ms to place constraint If the PCM have the dsp_max_burst_size_in_ms set then place a constraint to limit the minimum buffer time to avoid xruns caused by DMA bursts spinning on the ALSA buffer. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-4-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-pcm.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sound/soc/sof/intel/hda-pcm.c b/sound/soc/sof/intel/hda-pcm.c index 18f07364d219..69fefcd1abc5 100644 --- a/sound/soc/sof/intel/hda-pcm.c +++ b/sound/soc/sof/intel/hda-pcm.c @@ -259,6 +259,27 @@ int hda_dsp_pcm_open(struct snd_sof_dev *sdev, snd_pcm_hw_constraint_mask64(substream->runtime, SNDRV_PCM_HW_PARAM_FORMAT, SNDRV_PCM_FMTBIT_S16 | SNDRV_PCM_FMTBIT_S32); + /* + * The dsp_max_burst_size_in_ms is the length of the maximum burst size + * of the host DMA in the ALSA buffer. + * + * On playback start the DMA will transfer dsp_max_burst_size_in_ms + * amount of data in one initial burst to fill up the host DMA buffer. + * Consequent DMA burst sizes are shorter and their length can vary. + * To make sure that userspace allocate large enough ALSA buffer we need + * to place a constraint on the buffer time. + * + * On capture the DMA will transfer 1ms chunks. + * + * Exact dsp_max_burst_size_in_ms constraint is racy, so set the + * constraint to a minimum of 2x dsp_max_burst_size_in_ms. + */ + if (spcm->stream[direction].dsp_max_burst_size_in_ms) + snd_pcm_hw_constraint_minmax(substream->runtime, + SNDRV_PCM_HW_PARAM_BUFFER_TIME, + spcm->stream[direction].dsp_max_burst_size_in_ms * USEC_PER_MSEC * 2, + UINT_MAX); + /* binding pcm substream to hda stream */ substream->runtime->private_data = &dsp_stream->hstream; return 0; -- cgit v1.2.3 From 67b182bea08a8d1092b91b57aefdfe420fce1634 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:01 +0200 Subject: ASoC: SOF: Intel: hda: Implement get_stream_position (Linear Link Position) When the Linear Link Position is not available in firmware SRAM window we use the host accessible position registers to read it. The address of the PPLCLLPL/U registers depend on the number of streams (playback+capture). At probe time the pplc_addr is calculated for each stream and we can use it to read the LLP without the need of address re-calculation. Set the get_stream_position callback in sof_hda_common_ops for all platforms: The callback is used for IPC4 delay calculations only but the register is a generic HDA register, not tied to any specific IPC version. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Rander Wang Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-5-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-common-ops.c | 2 ++ sound/soc/sof/intel/hda-stream.c | 32 ++++++++++++++++++++++++++++++++ sound/soc/sof/intel/hda.h | 3 +++ 3 files changed, 37 insertions(+) diff --git a/sound/soc/sof/intel/hda-common-ops.c b/sound/soc/sof/intel/hda-common-ops.c index 2b385cddc385..80a69599a8c3 100644 --- a/sound/soc/sof/intel/hda-common-ops.c +++ b/sound/soc/sof/intel/hda-common-ops.c @@ -57,6 +57,8 @@ struct snd_sof_dsp_ops sof_hda_common_ops = { .pcm_pointer = hda_dsp_pcm_pointer, .pcm_ack = hda_dsp_pcm_ack, + .get_stream_position = hda_dsp_get_stream_llp, + /* firmware loading */ .load_firmware = snd_sof_load_firmware_raw, diff --git a/sound/soc/sof/intel/hda-stream.c b/sound/soc/sof/intel/hda-stream.c index b387b1a69d7e..48ea187f7230 100644 --- a/sound/soc/sof/intel/hda-stream.c +++ b/sound/soc/sof/intel/hda-stream.c @@ -1063,3 +1063,35 @@ snd_pcm_uframes_t hda_dsp_stream_get_position(struct hdac_stream *hstream, return pos; } + +/** + * hda_dsp_get_stream_llp - Retrieve the LLP (Linear Link Position) of the stream + * @sdev: SOF device + * @component: ASoC component + * @substream: PCM substream + * + * Returns the raw Linear Link Position value + */ +u64 hda_dsp_get_stream_llp(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + struct hdac_stream *hstream = substream->runtime->private_data; + struct hdac_ext_stream *hext_stream = stream_to_hdac_ext_stream(hstream); + u32 llp_l, llp_u; + + /* + * The pplc_addr have been calculated during probe in + * hda_dsp_stream_init(): + * pplc_addr = sdev->bar[HDA_DSP_PP_BAR] + + * SOF_HDA_PPLC_BASE + + * SOF_HDA_PPLC_MULTI * total_stream + + * SOF_HDA_PPLC_INTERVAL * stream_index + * + * Use this pre-calculated address to avoid repeated re-calculation. + */ + llp_l = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPL); + llp_u = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPU); + + return ((u64)llp_u << 32) | llp_l; +} diff --git a/sound/soc/sof/intel/hda.h b/sound/soc/sof/intel/hda.h index b36eb7c78913..9d26cad785fe 100644 --- a/sound/soc/sof/intel/hda.h +++ b/sound/soc/sof/intel/hda.h @@ -662,6 +662,9 @@ bool hda_dsp_check_stream_irq(struct snd_sof_dev *sdev); snd_pcm_uframes_t hda_dsp_stream_get_position(struct hdac_stream *hstream, int direction, bool can_sleep); +u64 hda_dsp_get_stream_llp(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream); struct hdac_ext_stream * hda_dsp_stream_get(struct snd_sof_dev *sdev, int direction, u32 flags); -- cgit v1.2.3 From 4374f698d7d9f849b66f3fa8f7a64f0bc1a53d7f Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:02 +0200 Subject: ASoC: SOF: Intel: mtl/lnl: Use the generic get_stream_position callback Drop the MTL mtl_dsp_get_stream_hda_link_position() function and related defines since it can only work on platforms which have 19 streams because of the use of 0x948 as base offset for the LLP registers. The generic hda_dsp_get_stream_hda_link_position() takes the number of streams into consideration when reading the LLP registers for the stream and can handle different HDA configurations. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Rander Wang Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-6-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/lnl.c | 2 -- sound/soc/sof/intel/mtl.c | 14 -------------- sound/soc/sof/intel/mtl.h | 10 ---------- 3 files changed, 26 deletions(-) diff --git a/sound/soc/sof/intel/lnl.c b/sound/soc/sof/intel/lnl.c index 7ae017a00184..d1c73d407e68 100644 --- a/sound/soc/sof/intel/lnl.c +++ b/sound/soc/sof/intel/lnl.c @@ -134,8 +134,6 @@ int sof_lnl_ops_init(struct snd_sof_dev *sdev) sof_lnl_ops.runtime_resume = lnl_hda_dsp_runtime_resume; } - sof_lnl_ops.get_stream_position = mtl_dsp_get_stream_hda_link_position; - /* dsp core get/put */ sof_lnl_ops.core_get = mtl_dsp_core_get; sof_lnl_ops.core_put = mtl_dsp_core_put; diff --git a/sound/soc/sof/intel/mtl.c b/sound/soc/sof/intel/mtl.c index df05dc77b8d5..060c34988e90 100644 --- a/sound/soc/sof/intel/mtl.c +++ b/sound/soc/sof/intel/mtl.c @@ -626,18 +626,6 @@ static int mtl_dsp_disable_interrupts(struct snd_sof_dev *sdev) return mtl_enable_interrupts(sdev, false); } -u64 mtl_dsp_get_stream_hda_link_position(struct snd_sof_dev *sdev, - struct snd_soc_component *component, - struct snd_pcm_substream *substream) -{ - struct hdac_stream *hstream = substream->runtime->private_data; - u32 llp_l, llp_u; - - llp_l = snd_sof_dsp_read(sdev, HDA_DSP_HDA_BAR, MTL_PPLCLLPL(hstream->index)); - llp_u = snd_sof_dsp_read(sdev, HDA_DSP_HDA_BAR, MTL_PPLCLLPU(hstream->index)); - return ((u64)llp_u << 32) | llp_l; -} - int mtl_dsp_core_get(struct snd_sof_dev *sdev, int core) { const struct sof_ipc_pm_ops *pm_ops = sdev->ipc->ops->pm; @@ -707,8 +695,6 @@ int sof_mtl_ops_init(struct snd_sof_dev *sdev) sof_mtl_ops.core_get = mtl_dsp_core_get; sof_mtl_ops.core_put = mtl_dsp_core_put; - sof_mtl_ops.get_stream_position = mtl_dsp_get_stream_hda_link_position; - sdev->private = kzalloc(sizeof(struct sof_ipc4_fw_data), GFP_KERNEL); if (!sdev->private) return -ENOMEM; diff --git a/sound/soc/sof/intel/mtl.h b/sound/soc/sof/intel/mtl.h index cc5a1f46fd09..ea8c1b83f712 100644 --- a/sound/soc/sof/intel/mtl.h +++ b/sound/soc/sof/intel/mtl.h @@ -6,12 +6,6 @@ * Copyright(c) 2020-2022 Intel Corporation. All rights reserved. */ -/* HDA Registers */ -#define MTL_PPLCLLPL_BASE 0x948 -#define MTL_PPLCLLPU_STRIDE 0x10 -#define MTL_PPLCLLPL(x) (MTL_PPLCLLPL_BASE + (x) * MTL_PPLCLLPU_STRIDE) -#define MTL_PPLCLLPU(x) (MTL_PPLCLLPL_BASE + 0x4 + (x) * MTL_PPLCLLPU_STRIDE) - /* DSP Registers */ #define MTL_HFDSSCS 0x1000 #define MTL_HFDSSCS_SPA_MASK BIT(16) @@ -103,9 +97,5 @@ int mtl_dsp_ipc_get_window_offset(struct snd_sof_dev *sdev, u32 id); void mtl_ipc_dump(struct snd_sof_dev *sdev); -u64 mtl_dsp_get_stream_hda_link_position(struct snd_sof_dev *sdev, - struct snd_soc_component *component, - struct snd_pcm_substream *substream); - int mtl_dsp_core_get(struct snd_sof_dev *sdev, int core); int mtl_dsp_core_put(struct snd_sof_dev *sdev, int core); -- cgit v1.2.3 From ce2faa9a180c1984225689b6b1cb26045f8b7470 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:03 +0200 Subject: ASoC: SOF: Introduce a new callback pair to be used for PCM delay reporting For delay calculation we need two information: Number of bytes transferred between the DSP and host memory (ALSA buffer) Number of frames transferred between the DSP and external device (link/codec/DMIC/etc). The reason for the different units (bytes vs frames) on host and dai side is that the format on the dai side is decided by the firmware and might not be the same as on the host side, thus the expectation is that the counter reflects the number of frames. The kernel know the host side format and in there we have access to the DMA position which is in bytes. In a simplified way, the DSP caused delay is the difference between the two counters. The existing get_stream_position callback is defined to retrieve the frame counter on the DAI side but it's name is too generic to be intuitive and makes it hard to define a callback for the host side. This patch introduces a new set of callbacks to replace the get_stream_position and define the host side equivalent: get_dai_frame_counter get_host_byte_counter Subsequent patches will remove the old callback. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-7-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ops.h | 24 ++++++++++++++++++++++++ sound/soc/sof/sof-priv.h | 21 +++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/sound/soc/sof/ops.h b/sound/soc/sof/ops.h index 6cf21e829e07..d83cd771015c 100644 --- a/sound/soc/sof/ops.h +++ b/sound/soc/sof/ops.h @@ -533,6 +533,30 @@ static inline u64 snd_sof_pcm_get_stream_position(struct snd_sof_dev *sdev, return 0; } +static inline u64 +snd_sof_pcm_get_dai_frame_counter(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + if (sof_ops(sdev) && sof_ops(sdev)->get_dai_frame_counter) + return sof_ops(sdev)->get_dai_frame_counter(sdev, component, + substream); + + return 0; +} + +static inline u64 +snd_sof_pcm_get_host_byte_counter(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + if (sof_ops(sdev) && sof_ops(sdev)->get_host_byte_counter) + return sof_ops(sdev)->get_host_byte_counter(sdev, component, + substream); + + return 0; +} + /* machine driver */ static inline int snd_sof_machine_register(struct snd_sof_dev *sdev, void *pdata) diff --git a/sound/soc/sof/sof-priv.h b/sound/soc/sof/sof-priv.h index d453a4ce3b21..91043f177dfa 100644 --- a/sound/soc/sof/sof-priv.h +++ b/sound/soc/sof/sof-priv.h @@ -270,6 +270,27 @@ struct snd_sof_dsp_ops { struct snd_soc_component *component, struct snd_pcm_substream *substream); /* optional */ + /* + * optional callback to retrieve the number of frames left/arrived from/to + * the DSP on the DAI side (link/codec/DMIC/etc). + * + * The callback is used when the firmware does not provide this information + * via the shared SRAM window and it can be retrieved by host. + */ + u64 (*get_dai_frame_counter)(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream); /* optional */ + + /* + * Optional callback to retrieve the number of bytes left/arrived from/to + * the DSP on the host side (bytes between host ALSA buffer and DSP). + * + * The callback is needed for ALSA delay reporting. + */ + u64 (*get_host_byte_counter)(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream); /* optional */ + /* host read DSP stream data */ int (*ipc_msg_data)(struct snd_sof_dev *sdev, struct snd_sof_pcm_stream *sps, -- cgit v1.2.3 From fd6f6a0632bc891673490bf4a92304172251825c Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:04 +0200 Subject: ASoC: SOF: Intel: Set the dai/host get frame/byte counter callbacks Add implementation for reading the LDP (Linear DMA Position) to be used as get_host_byte_counter(). The LDP is counting the number of bytes moved between the DSP and host memory. Set the get_dai_frame_counter to hda_dsp_get_stream_llp, which is counting the frames on the link side of the DSP. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-8-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-common-ops.c | 2 ++ sound/soc/sof/intel/hda-stream.c | 31 +++++++++++++++++++++++++++++++ sound/soc/sof/intel/hda.h | 3 +++ 3 files changed, 36 insertions(+) diff --git a/sound/soc/sof/intel/hda-common-ops.c b/sound/soc/sof/intel/hda-common-ops.c index 80a69599a8c3..4d7ea18604ee 100644 --- a/sound/soc/sof/intel/hda-common-ops.c +++ b/sound/soc/sof/intel/hda-common-ops.c @@ -58,6 +58,8 @@ struct snd_sof_dsp_ops sof_hda_common_ops = { .pcm_ack = hda_dsp_pcm_ack, .get_stream_position = hda_dsp_get_stream_llp, + .get_dai_frame_counter = hda_dsp_get_stream_llp, + .get_host_byte_counter = hda_dsp_get_stream_ldp, /* firmware loading */ .load_firmware = snd_sof_load_firmware_raw, diff --git a/sound/soc/sof/intel/hda-stream.c b/sound/soc/sof/intel/hda-stream.c index 48ea187f7230..8504a4f27b60 100644 --- a/sound/soc/sof/intel/hda-stream.c +++ b/sound/soc/sof/intel/hda-stream.c @@ -1095,3 +1095,34 @@ u64 hda_dsp_get_stream_llp(struct snd_sof_dev *sdev, return ((u64)llp_u << 32) | llp_l; } + +/** + * hda_dsp_get_stream_ldp - Retrieve the LDP (Linear DMA Position) of the stream + * @sdev: SOF device + * @component: ASoC component + * @substream: PCM substream + * + * Returns the raw Linear Link Position value + */ +u64 hda_dsp_get_stream_ldp(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + struct hdac_stream *hstream = substream->runtime->private_data; + struct hdac_ext_stream *hext_stream = stream_to_hdac_ext_stream(hstream); + u32 ldp_l, ldp_u; + + /* + * The pphc_addr have been calculated during probe in + * hda_dsp_stream_init(): + * pphc_addr = sdev->bar[HDA_DSP_PP_BAR] + + * SOF_HDA_PPHC_BASE + + * SOF_HDA_PPHC_INTERVAL * stream_index + * + * Use this pre-calculated address to avoid repeated re-calculation. + */ + ldp_l = readl(hext_stream->pphc_addr + AZX_REG_PPHCLDPL); + ldp_u = readl(hext_stream->pphc_addr + AZX_REG_PPHCLDPU); + + return ((u64)ldp_u << 32) | ldp_l; +} diff --git a/sound/soc/sof/intel/hda.h b/sound/soc/sof/intel/hda.h index 9d26cad785fe..81a1d4606d3c 100644 --- a/sound/soc/sof/intel/hda.h +++ b/sound/soc/sof/intel/hda.h @@ -665,6 +665,9 @@ snd_pcm_uframes_t hda_dsp_stream_get_position(struct hdac_stream *hstream, u64 hda_dsp_get_stream_llp(struct snd_sof_dev *sdev, struct snd_soc_component *component, struct snd_pcm_substream *substream); +u64 hda_dsp_get_stream_ldp(struct snd_sof_dev *sdev, + struct snd_soc_component *component, + struct snd_pcm_substream *substream); struct hdac_ext_stream * hda_dsp_stream_get(struct snd_sof_dev *sdev, int direction, u32 flags); -- cgit v1.2.3 From 37679a1bd372c8308a3faccf3438c9df642565b3 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:05 +0200 Subject: ASoC: SOF: ipc4-pcm: Use the snd_sof_pcm_get_dai_frame_counter() for pcm_delay Switch to the new callback to retrieve the DAI (link) frame counter. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-9-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 0f332c8cdbe6..d0795f77cc15 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -897,11 +897,12 @@ static snd_pcm_sframes_t sof_ipc4_pcm_delay(struct snd_soc_component *component, } /* - * HDaudio links don't support the LLP counter reported by firmware - * the link position is read directly from hardware registers. + * If the LLP counter is not reported by firmware in the SRAM window + * then read the dai (link) position via host accessible means if + * available. */ if (!time_info->llp_offset) { - tmp_ptr = snd_sof_pcm_get_stream_position(sdev, component, substream); + tmp_ptr = snd_sof_pcm_get_dai_frame_counter(sdev, component, substream); if (!tmp_ptr) return 0; } else { -- cgit v1.2.3 From 4ab6c38c664442c1fc9911eb3c5c6953d3dbcca5 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:06 +0200 Subject: ASoC: SOF: Intel: hda-common-ops: Do not set the get_stream_position callback The get_stream_position has been replaced by get_dai_frame_counter, it should not be set to allow it to be dropped from core code. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-10-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-common-ops.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/sof/intel/hda-common-ops.c b/sound/soc/sof/intel/hda-common-ops.c index 4d7ea18604ee..d71bb66b9991 100644 --- a/sound/soc/sof/intel/hda-common-ops.c +++ b/sound/soc/sof/intel/hda-common-ops.c @@ -57,7 +57,6 @@ struct snd_sof_dsp_ops sof_hda_common_ops = { .pcm_pointer = hda_dsp_pcm_pointer, .pcm_ack = hda_dsp_pcm_ack, - .get_stream_position = hda_dsp_get_stream_llp, .get_dai_frame_counter = hda_dsp_get_stream_llp, .get_host_byte_counter = hda_dsp_get_stream_ldp, -- cgit v1.2.3 From 07007b8ac42cffc23043d00e56b0f67a75dc4b22 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:07 +0200 Subject: ASoC: SOF: Remove the get_stream_position callback The get_stream_position has been replaced by get_dai_frame_counter and all related code can be dropped form the core. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-11-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ops.h | 10 ---------- sound/soc/sof/sof-priv.h | 9 --------- 2 files changed, 19 deletions(-) diff --git a/sound/soc/sof/ops.h b/sound/soc/sof/ops.h index d83cd771015c..3cd748e13460 100644 --- a/sound/soc/sof/ops.h +++ b/sound/soc/sof/ops.h @@ -523,16 +523,6 @@ static inline int snd_sof_pcm_platform_ack(struct snd_sof_dev *sdev, return 0; } -static inline u64 snd_sof_pcm_get_stream_position(struct snd_sof_dev *sdev, - struct snd_soc_component *component, - struct snd_pcm_substream *substream) -{ - if (sof_ops(sdev) && sof_ops(sdev)->get_stream_position) - return sof_ops(sdev)->get_stream_position(sdev, component, substream); - - return 0; -} - static inline u64 snd_sof_pcm_get_dai_frame_counter(struct snd_sof_dev *sdev, struct snd_soc_component *component, diff --git a/sound/soc/sof/sof-priv.h b/sound/soc/sof/sof-priv.h index 91043f177dfa..d3c436f82604 100644 --- a/sound/soc/sof/sof-priv.h +++ b/sound/soc/sof/sof-priv.h @@ -261,15 +261,6 @@ struct snd_sof_dsp_ops { /* pcm ack */ int (*pcm_ack)(struct snd_sof_dev *sdev, struct snd_pcm_substream *substream); /* optional */ - /* - * optional callback to retrieve the link DMA position for the substream - * when the position is not reported in the shared SRAM windows but - * instead from a host-accessible hardware counter. - */ - u64 (*get_stream_position)(struct snd_sof_dev *sdev, - struct snd_soc_component *component, - struct snd_pcm_substream *substream); /* optional */ - /* * optional callback to retrieve the number of frames left/arrived from/to * the DSP on the DAI side (link/codec/DMIC/etc). -- cgit v1.2.3 From 31d2874d083ba6cc2a4f4b26dab73c3be1c92658 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:08 +0200 Subject: ASoC: SOF: ipc4-pcm: Move struct sof_ipc4_timestamp_info definition locally The sof_ipc4_timestamp_info is only used by ipc4-pcm.c internally, it should not be in a generic header implying that it might be used elsewhere. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-12-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 14 ++++++++++++++ sound/soc/sof/ipc4-priv.h | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index d0795f77cc15..2d7295221884 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -15,6 +15,20 @@ #include "ipc4-topology.h" #include "ipc4-fw-reg.h" +/** + * struct sof_ipc4_timestamp_info - IPC4 timestamp info + * @host_copier: the host copier of the pcm stream + * @dai_copier: the dai copier of the pcm stream + * @stream_start_offset: reported by fw in memory window + * @llp_offset: llp offset in memory window + */ +struct sof_ipc4_timestamp_info { + struct sof_ipc4_copier *host_copier; + struct sof_ipc4_copier *dai_copier; + u64 stream_start_offset; + u32 llp_offset; +}; + static int sof_ipc4_set_multi_pipeline_state(struct snd_sof_dev *sdev, u32 state, struct ipc4_pipeline_set_state_data *trigger_list) { diff --git a/sound/soc/sof/ipc4-priv.h b/sound/soc/sof/ipc4-priv.h index f3b908b093f9..afed618a15f0 100644 --- a/sound/soc/sof/ipc4-priv.h +++ b/sound/soc/sof/ipc4-priv.h @@ -92,20 +92,6 @@ struct sof_ipc4_fw_data { struct mutex pipeline_state_mutex; /* protect pipeline triggers, ref counts and states */ }; -/** - * struct sof_ipc4_timestamp_info - IPC4 timestamp info - * @host_copier: the host copier of the pcm stream - * @dai_copier: the dai copier of the pcm stream - * @stream_start_offset: reported by fw in memory window - * @llp_offset: llp offset in memory window - */ -struct sof_ipc4_timestamp_info { - struct sof_ipc4_copier *host_copier; - struct sof_ipc4_copier *dai_copier; - u64 stream_start_offset; - u32 llp_offset; -}; - extern const struct sof_ipc_fw_loader_ops ipc4_loader_ops; extern const struct sof_ipc_tplg_ops ipc4_tplg_ops; extern const struct sof_ipc_tplg_control_ops tplg_ipc4_control_ops; -- cgit v1.2.3 From 55ca6ca227bfc5a8d0a0c2c5d6e239777226a604 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:09 +0200 Subject: ASoC: SOF: ipc4-pcm: Combine the SOF_IPC4_PIPE_PAUSED cases in pcm_trigger The SNDRV_PCM_TRIGGER_PAUSE_PUSH does not need to be a separate case, it can be handled along with STOP and SUSPEND Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-13-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 2d7295221884..4e41b16d3205 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -478,14 +478,12 @@ static int sof_ipc4_pcm_trigger(struct snd_soc_component *component, /* determine the pipeline state */ switch (cmd) { - case SNDRV_PCM_TRIGGER_PAUSE_PUSH: - state = SOF_IPC4_PIPE_PAUSED; - break; case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: case SNDRV_PCM_TRIGGER_RESUME: case SNDRV_PCM_TRIGGER_START: state = SOF_IPC4_PIPE_RUNNING; break; + case SNDRV_PCM_TRIGGER_PAUSE_PUSH: case SNDRV_PCM_TRIGGER_SUSPEND: case SNDRV_PCM_TRIGGER_STOP: state = SOF_IPC4_PIPE_PAUSED; -- cgit v1.2.3 From 3ce3bc36d91510389955b47e36ea4c4e94fcbdd3 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:10 +0200 Subject: ASoC: SOF: ipc4-pcm: Invalidate the stream_start_offset in PAUSED state When the final state is SOF_IPC4_PIPE_PAUSED, it is possible that the stream will be restarted (resume or start) in which case we need to update the offset from the firmware. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-14-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 4e41b16d3205..905dbc4852b1 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -437,8 +437,19 @@ static int sof_ipc4_trigger_pipelines(struct snd_soc_component *component, } /* return if this is the final state */ - if (state == SOF_IPC4_PIPE_PAUSED) + if (state == SOF_IPC4_PIPE_PAUSED) { + struct sof_ipc4_timestamp_info *time_info; + + /* + * Invalidate the stream_start_offset to make sure that it is + * going to be updated if the stream resumes + */ + time_info = spcm->stream[substream->stream].private; + if (time_info) + time_info->stream_start_offset = SOF_IPC4_INVALID_STREAM_POSITION; + goto free; + } skip_pause_transition: /* else set the RUNNING/RESET state in the DSP */ ret = sof_ipc4_set_multi_pipeline_state(sdev, state, trigger_list); -- cgit v1.2.3 From 77165bd955d55114028b06787a530b8f9220e4b0 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:11 +0200 Subject: ASoC: SOF: sof-pcm: Add pointer callback to sof_ipc_pcm_ops The IPC specific pointer callback can be used when additional or custom handling is needed during the pointer calculation, like executing a delay calculation at the same time to minimize drift between the reported pointer and the calculated delay. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-15-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/pcm.c | 8 ++++++++ sound/soc/sof/sof-audio.h | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/pcm.c b/sound/soc/sof/pcm.c index 33d576b17647..f03cee94bce6 100644 --- a/sound/soc/sof/pcm.c +++ b/sound/soc/sof/pcm.c @@ -388,13 +388,21 @@ static snd_pcm_uframes_t sof_pcm_pointer(struct snd_soc_component *component, { struct snd_soc_pcm_runtime *rtd = snd_soc_substream_to_rtd(substream); struct snd_sof_dev *sdev = snd_soc_component_get_drvdata(component); + const struct sof_ipc_pcm_ops *pcm_ops = sof_ipc_get_ops(sdev, pcm); struct snd_sof_pcm *spcm; snd_pcm_uframes_t host, dai; + int ret = -EOPNOTSUPP; /* nothing to do for BE */ if (rtd->dai_link->no_pcm) return 0; + if (pcm_ops && pcm_ops->pointer) + ret = pcm_ops->pointer(component, substream, &host); + + if (ret != -EOPNOTSUPP) + return ret ? ret : host; + /* use dsp ops pointer callback directly if set */ if (sof_ops(sdev)->pcm_pointer) return sof_ops(sdev)->pcm_pointer(sdev, substream); diff --git a/sound/soc/sof/sof-audio.h b/sound/soc/sof/sof-audio.h index 04e5cb2c70a7..86bbb531e142 100644 --- a/sound/soc/sof/sof-audio.h +++ b/sound/soc/sof/sof-audio.h @@ -103,7 +103,10 @@ struct snd_sof_dai_config_data { * additional memory in the SOF PCM stream structure * @pcm_free: Function pointer for PCM free that can be used for freeing any * additional memory in the SOF PCM stream structure - * @delay: Function pointer for pcm delay calculation + * @pointer: Function pointer for pcm pointer + * Note: the @pointer callback may return -EOPNOTSUPP which should be + * handled in a same way as if the callback is not provided + * @delay: Function pointer for pcm delay reporting * @reset_hw_params_during_stop: Flag indicating whether the hw_params should be reset during the * STOP pcm trigger * @ipc_first_on_start: Send IPC before invoking platform trigger during @@ -124,6 +127,9 @@ struct sof_ipc_pcm_ops { int (*dai_link_fixup)(struct snd_soc_pcm_runtime *rtd, struct snd_pcm_hw_params *params); int (*pcm_setup)(struct snd_sof_dev *sdev, struct snd_sof_pcm *spcm); void (*pcm_free)(struct snd_sof_dev *sdev, struct snd_sof_pcm *spcm); + int (*pointer)(struct snd_soc_component *component, + struct snd_pcm_substream *substream, + snd_pcm_uframes_t *pointer); snd_pcm_sframes_t (*delay)(struct snd_soc_component *component, struct snd_pcm_substream *substream); bool reset_hw_params_during_stop; -- cgit v1.2.3 From 0ea06680dfcb4464ac6c05968433d060efb44345 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:12 +0200 Subject: ASoC: SOF: ipc4-pcm: Correct the delay calculation This patch improves the delay calculation by relying on the LLP (Linear Link Position) on the DAI side and the LDP (Linear Data Pointer) on the host side. The LDP provides the same DMA position as LPIB, but with a linear count instead of a position in the ALSA ring buffer. The LDP values are provided in bytes and must be converted to frames. The difference in units means that the host counter will wrap earlier than the LLP. We need to wrap the LLP at the same boundary as the host counter. The ASoC framework relies on separate pointer and delay callback. Measurement errors can be reduced by processing all the counter values in the pointer callback. The delay value is stored, and will be reported to higher levels in the delay callback. For playback, the firmware provides a stream_start offset to handle mixing/pause usages, where the DAI might have started earlier than the PCM device. The delay calculation must be special-cased when the link counter has not reached the start offset value, i.e. no valid audio has left the DSP. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-16-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 159 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 127 insertions(+), 32 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 905dbc4852b1..e915f9f87a6c 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -19,14 +19,22 @@ * struct sof_ipc4_timestamp_info - IPC4 timestamp info * @host_copier: the host copier of the pcm stream * @dai_copier: the dai copier of the pcm stream - * @stream_start_offset: reported by fw in memory window + * @stream_start_offset: reported by fw in memory window (converted to frames) + * @stream_end_offset: reported by fw in memory window (converted to frames) * @llp_offset: llp offset in memory window + * @boundary: wrap boundary should be used for the LLP frame counter + * @delay: Calculated and stored in pointer callback. The stored value is + * returned in the delay callback. */ struct sof_ipc4_timestamp_info { struct sof_ipc4_copier *host_copier; struct sof_ipc4_copier *dai_copier; u64 stream_start_offset; + u64 stream_end_offset; u32 llp_offset; + + u64 boundary; + snd_pcm_sframes_t delay; }; static int sof_ipc4_set_multi_pipeline_state(struct snd_sof_dev *sdev, u32 state, @@ -726,6 +734,10 @@ static int sof_ipc4_pcm_setup(struct snd_sof_dev *sdev, struct snd_sof_pcm *spcm if (abi_version < SOF_IPC4_FW_REGS_ABI_VER) support_info = false; + /* For delay reporting the get_host_byte_counter callback is needed */ + if (!sof_ops(sdev) || !sof_ops(sdev)->get_host_byte_counter) + support_info = false; + for_each_pcm_streams(stream) { pipeline_list = &spcm->stream[stream].pipeline_list; @@ -858,7 +870,6 @@ static int sof_ipc4_get_stream_start_offset(struct snd_sof_dev *sdev, struct sof_ipc4_copier *host_copier = time_info->host_copier; struct sof_ipc4_copier *dai_copier = time_info->dai_copier; struct sof_ipc4_pipeline_registers ppl_reg; - u64 stream_start_position; u32 dai_sample_size; u32 ch, node_index; u32 offset; @@ -875,38 +886,51 @@ static int sof_ipc4_get_stream_start_offset(struct snd_sof_dev *sdev, if (ppl_reg.stream_start_offset == SOF_IPC4_INVALID_STREAM_POSITION) return -EINVAL; - stream_start_position = ppl_reg.stream_start_offset; ch = dai_copier->data.out_format.fmt_cfg; ch = SOF_IPC4_AUDIO_FORMAT_CFG_CHANNELS_COUNT(ch); dai_sample_size = (dai_copier->data.out_format.bit_depth >> 3) * ch; - /* convert offset to sample count */ - do_div(stream_start_position, dai_sample_size); - time_info->stream_start_offset = stream_start_position; + + /* convert offsets to frame count */ + time_info->stream_start_offset = ppl_reg.stream_start_offset; + do_div(time_info->stream_start_offset, dai_sample_size); + time_info->stream_end_offset = ppl_reg.stream_end_offset; + do_div(time_info->stream_end_offset, dai_sample_size); + + /* + * Calculate the wrap boundary need to be used for delay calculation + * The host counter is in bytes, it will wrap earlier than the frames + * based link counter. + */ + time_info->boundary = div64_u64(~((u64)0), + frames_to_bytes(substream->runtime, 1)); + /* Initialize the delay value to 0 (no delay) */ + time_info->delay = 0; return 0; } -static snd_pcm_sframes_t sof_ipc4_pcm_delay(struct snd_soc_component *component, - struct snd_pcm_substream *substream) +static int sof_ipc4_pcm_pointer(struct snd_soc_component *component, + struct snd_pcm_substream *substream, + snd_pcm_uframes_t *pointer) { struct snd_sof_dev *sdev = snd_soc_component_get_drvdata(component); struct snd_soc_pcm_runtime *rtd = snd_soc_substream_to_rtd(substream); struct sof_ipc4_timestamp_info *time_info; struct sof_ipc4_llp_reading_slot llp; - snd_pcm_uframes_t head_ptr, tail_ptr; + snd_pcm_uframes_t head_cnt, tail_cnt; struct snd_sof_pcm_stream *stream; + u64 dai_cnt, host_cnt, host_ptr; struct snd_sof_pcm *spcm; - u64 tmp_ptr; int ret; spcm = snd_sof_find_spcm_dai(component, rtd); if (!spcm) - return 0; + return -EOPNOTSUPP; stream = &spcm->stream[substream->stream]; time_info = stream->private; if (!time_info) - return 0; + return -EOPNOTSUPP; /* * stream_start_offset is updated to memory window by FW based on @@ -916,46 +940,116 @@ static snd_pcm_sframes_t sof_ipc4_pcm_delay(struct snd_soc_component *component, if (time_info->stream_start_offset == SOF_IPC4_INVALID_STREAM_POSITION) { ret = sof_ipc4_get_stream_start_offset(sdev, substream, stream, time_info); if (ret < 0) - return 0; + return -EOPNOTSUPP; } + /* For delay calculation we need the host counter */ + host_cnt = snd_sof_pcm_get_host_byte_counter(sdev, component, substream); + host_ptr = host_cnt; + + /* convert the host_cnt to frames */ + host_cnt = div64_u64(host_cnt, frames_to_bytes(substream->runtime, 1)); + /* * If the LLP counter is not reported by firmware in the SRAM window - * then read the dai (link) position via host accessible means if + * then read the dai (link) counter via host accessible means if * available. */ if (!time_info->llp_offset) { - tmp_ptr = snd_sof_pcm_get_dai_frame_counter(sdev, component, substream); - if (!tmp_ptr) - return 0; + dai_cnt = snd_sof_pcm_get_dai_frame_counter(sdev, component, substream); + if (!dai_cnt) + return -EOPNOTSUPP; } else { sof_mailbox_read(sdev, time_info->llp_offset, &llp, sizeof(llp)); - tmp_ptr = ((u64)llp.reading.llp_u << 32) | llp.reading.llp_l; + dai_cnt = ((u64)llp.reading.llp_u << 32) | llp.reading.llp_l; } + dai_cnt += time_info->stream_end_offset; - /* In two cases dai dma position is not accurate + /* In two cases dai dma counter is not accurate * (1) dai pipeline is started before host pipeline - * (2) multiple streams mixed into one. Each stream has the same dai dma position + * (2) multiple streams mixed into one. Each stream has the same dai dma + * counter + * + * Firmware calculates correct stream_start_offset for all cases + * including above two. + * Driver subtracts stream_start_offset from dai dma counter to get + * accurate one + */ + + /* + * On stream start the dai counter might not yet have reached the + * stream_start_offset value which means that no frames have left the + * DSP yet from the audio stream (on playback, capture streams have + * offset of 0 as we start capturing right away). + * In this case we need to adjust the distance between the counters by + * increasing the host counter by (offset - dai_counter). + * Otherwise the dai_counter needs to be adjusted to reflect the number + * of valid frames passed on the DAI side. * - * Firmware calculates correct stream_start_offset for all cases including above two. - * Driver subtracts stream_start_offset from dai dma position to get accurate one + * The delay is the difference between the counters on the two + * sides of the DSP. */ - tmp_ptr -= time_info->stream_start_offset; + if (dai_cnt < time_info->stream_start_offset) { + host_cnt += time_info->stream_start_offset - dai_cnt; + dai_cnt = 0; + } else { + dai_cnt -= time_info->stream_start_offset; + } + + /* Wrap the dai counter at the boundary where the host counter wraps */ + div64_u64_rem(dai_cnt, time_info->boundary, &dai_cnt); - /* Calculate the delay taking into account that both pointer can wrap */ - div64_u64_rem(tmp_ptr, substream->runtime->boundary, &tmp_ptr); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { - head_ptr = substream->runtime->status->hw_ptr; - tail_ptr = tmp_ptr; + head_cnt = host_cnt; + tail_cnt = dai_cnt; } else { - head_ptr = tmp_ptr; - tail_ptr = substream->runtime->status->hw_ptr; + head_cnt = dai_cnt; + tail_cnt = host_cnt; + } + + if (head_cnt < tail_cnt) { + time_info->delay = time_info->boundary - tail_cnt + head_cnt; + goto out; } - if (head_ptr < tail_ptr) - return substream->runtime->boundary - tail_ptr + head_ptr; + time_info->delay = head_cnt - tail_cnt; + +out: + /* + * Convert the host byte counter to PCM pointer which wraps in buffer + * and it is in frames + */ + div64_u64_rem(host_ptr, snd_pcm_lib_buffer_bytes(substream), &host_ptr); + *pointer = bytes_to_frames(substream->runtime, host_ptr); + + return 0; +} + +static snd_pcm_sframes_t sof_ipc4_pcm_delay(struct snd_soc_component *component, + struct snd_pcm_substream *substream) +{ + struct snd_soc_pcm_runtime *rtd = snd_soc_substream_to_rtd(substream); + struct sof_ipc4_timestamp_info *time_info; + struct snd_sof_pcm_stream *stream; + struct snd_sof_pcm *spcm; + + spcm = snd_sof_find_spcm_dai(component, rtd); + if (!spcm) + return 0; + + stream = &spcm->stream[substream->stream]; + time_info = stream->private; + /* + * Report the stored delay value calculated in the pointer callback. + * In the unlikely event that the calculation was skipped/aborted, the + * default 0 delay returned. + */ + if (time_info) + return time_info->delay; + + /* No delay information available, report 0 as delay */ + return 0; - return head_ptr - tail_ptr; } const struct sof_ipc_pcm_ops ipc4_pcm_ops = { @@ -965,6 +1059,7 @@ const struct sof_ipc_pcm_ops ipc4_pcm_ops = { .dai_link_fixup = sof_ipc4_pcm_dai_link_fixup, .pcm_setup = sof_ipc4_pcm_setup, .pcm_free = sof_ipc4_pcm_free, + .pointer = sof_ipc4_pcm_pointer, .delay = sof_ipc4_pcm_delay, .ipc_first_on_start = true, .platform_stop_during_hw_free = true, -- cgit v1.2.3 From f9eeb6bb13fb5d7af1ea5b74a10b1f8ead962540 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:13 +0200 Subject: ALSA: hda: Add pplcllpl/u members to hdac_ext_stream The pplcllpl/u can be used to save the Link Connection Linear Link Position register value to be used for compensation of the LLP register value in case the counter is not reset (after pause/resume or stop/start without closing the stream). The LLP can be used along with PPHCLDP to calculate delay caused by the DSP processing for HDA links. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-17-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- include/sound/hdaudio_ext.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/sound/hdaudio_ext.h b/include/sound/hdaudio_ext.h index a8bebac1e4b2..957295364a5e 100644 --- a/include/sound/hdaudio_ext.h +++ b/include/sound/hdaudio_ext.h @@ -56,6 +56,9 @@ struct hdac_ext_stream { u32 pphcldpl; u32 pphcldpu; + u32 pplcllpl; + u32 pplcllpu; + bool decoupled:1; bool link_locked:1; bool link_prepared; -- cgit v1.2.3 From 1abc2642588e06f6180b3fbb21968cf5d0ba9e5f Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:14 +0200 Subject: ASoC: SOF: Intel: hda: Compensate LLP in case it is not reset During pause/reset or stop/start the LLP counter is not reset, which will result broken delay reporting. Read the LLP value on STOP/PAUSE trigger and use it in LLP reading to normalize the LLP from the register. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-18-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-dai-ops.c | 11 +++++++++++ sound/soc/sof/intel/hda-pcm.c | 8 ++++++++ sound/soc/sof/intel/hda-stream.c | 9 ++++++++- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/intel/hda-dai-ops.c b/sound/soc/sof/intel/hda-dai-ops.c index c50ca9e72d37..b073720b4cf4 100644 --- a/sound/soc/sof/intel/hda-dai-ops.c +++ b/sound/soc/sof/intel/hda-dai-ops.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -362,6 +363,16 @@ static int hda_trigger(struct snd_sof_dev *sdev, struct snd_soc_dai *cpu_dai, case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_PAUSE_PUSH: snd_hdac_ext_stream_clear(hext_stream); + + /* + * Save the LLP registers in case the stream is + * restarting due PAUSE_RELEASE, or START without a pcm + * close/open since in this case the LLP register is not reset + * to 0 and the delay calculation will return with invalid + * results. + */ + hext_stream->pplcllpl = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPL); + hext_stream->pplcllpu = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPU); break; default: dev_err(sdev->dev, "unknown trigger command %d\n", cmd); diff --git a/sound/soc/sof/intel/hda-pcm.c b/sound/soc/sof/intel/hda-pcm.c index 69fefcd1abc5..d7b446f3f973 100644 --- a/sound/soc/sof/intel/hda-pcm.c +++ b/sound/soc/sof/intel/hda-pcm.c @@ -282,6 +282,14 @@ int hda_dsp_pcm_open(struct snd_sof_dev *sdev, /* binding pcm substream to hda stream */ substream->runtime->private_data = &dsp_stream->hstream; + + /* + * Reset the llp cache values (they are used for LLP compensation in + * case the counter is not reset) + */ + dsp_stream->pplcllpl = 0; + dsp_stream->pplcllpu = 0; + return 0; } diff --git a/sound/soc/sof/intel/hda-stream.c b/sound/soc/sof/intel/hda-stream.c index 8504a4f27b60..0c189d3b19c1 100644 --- a/sound/soc/sof/intel/hda-stream.c +++ b/sound/soc/sof/intel/hda-stream.c @@ -1064,6 +1064,8 @@ snd_pcm_uframes_t hda_dsp_stream_get_position(struct hdac_stream *hstream, return pos; } +#define merge_u64(u32_u, u32_l) (((u64)(u32_u) << 32) | (u32_l)) + /** * hda_dsp_get_stream_llp - Retrieve the LLP (Linear Link Position) of the stream * @sdev: SOF device @@ -1093,7 +1095,12 @@ u64 hda_dsp_get_stream_llp(struct snd_sof_dev *sdev, llp_l = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPL); llp_u = readl(hext_stream->pplc_addr + AZX_REG_PPLCLLPU); - return ((u64)llp_u << 32) | llp_l; + /* Compensate the LLP counter with the saved offset */ + if (hext_stream->pplcllpl || hext_stream->pplcllpu) + return merge_u64(llp_u, llp_l) - + merge_u64(hext_stream->pplcllpu, hext_stream->pplcllpl); + + return merge_u64(llp_u, llp_l); } /** -- cgit v1.2.3 From c61115b37ff964d63191dbf4a058f481daabdf57 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 22 Mar 2024 13:25:04 +0200 Subject: ASoC: SOF: Intel: hda-dsp: Skip IMR boot on ACE platforms in case of S3 suspend SoCs with ACE architecture are tailored to use s2idle instead deep (S3) suspend state and the IMR content is lost when the system is forced to enter even to S3. When waking up from S3 state the IMR boot will fail as the content is lost. Set the skip_imr_boot flag to make sure that we don't try IMR in this case. Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Link: https://msgid.link/r/20240322112504.4192-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-dsp.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/sound/soc/sof/intel/hda-dsp.c b/sound/soc/sof/intel/hda-dsp.c index 31ffa1a8f2ac..ef5c915db8ff 100644 --- a/sound/soc/sof/intel/hda-dsp.c +++ b/sound/soc/sof/intel/hda-dsp.c @@ -681,17 +681,27 @@ static int hda_suspend(struct snd_sof_dev *sdev, bool runtime_suspend) struct sof_intel_hda_dev *hda = sdev->pdata->hw_pdata; const struct sof_intel_dsp_desc *chip = hda->desc; struct hdac_bus *bus = sof_to_bus(sdev); + bool imr_lost = false; int ret, j; /* - * The memory used for IMR boot loses its content in deeper than S3 state - * We must not try IMR boot on next power up (as it will fail). - * + * The memory used for IMR boot loses its content in deeper than S3 + * state on CAVS platforms. + * On ACE platforms due to the system architecture the IMR content is + * lost at S3 state already, they are tailored for s2idle use. + * We must not try IMR boot on next power up in these cases as it will + * fail. + */ + if (sdev->system_suspend_target > SOF_SUSPEND_S3 || + (chip->hw_ip_version >= SOF_INTEL_ACE_1_0 && + sdev->system_suspend_target == SOF_SUSPEND_S3)) + imr_lost = true; + + /* * In case of firmware crash or boot failure set the skip_imr_boot to true * as well in order to try to re-load the firmware to do a 'cold' boot. */ - if (sdev->system_suspend_target > SOF_SUSPEND_S3 || - sdev->fw_state == SOF_FW_CRASHED || + if (imr_lost || sdev->fw_state == SOF_FW_CRASHED || sdev->fw_state == SOF_FW_BOOT_FAILED) hda->skip_imr_boot = true; -- cgit v1.2.3 From e2d7ad717a6b0880843dbc60855a5b97ad0395f8 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Mon, 25 Mar 2024 14:44:50 +0000 Subject: ASoC: cs-amp-lib: Check for no firmware controls when writing calibration When a wmfw file has not been loaded the firmware control descriptions necessary to write a stored calibration are not present. In this case print a more descriptive error message. The message is logged at info level because it is not fatal, and does not necessarily imply that anything is broken. Signed-off-by: Simon Trimmer Signed-off-by: Richard Fitzgerald Link: https://msgid.link/r/20240325144450.293630-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs-amp-lib.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/soc/codecs/cs-amp-lib.c b/sound/soc/codecs/cs-amp-lib.c index 01ef4db5407d..287ac01a3873 100644 --- a/sound/soc/codecs/cs-amp-lib.c +++ b/sound/soc/codecs/cs-amp-lib.c @@ -56,6 +56,11 @@ static int _cs_amp_write_cal_coeffs(struct cs_dsp *dsp, dev_dbg(dsp->dev, "Calibration: Ambient=%#x, Status=%#x, CalR=%d\n", data->calAmbient, data->calStatus, data->calR); + if (list_empty(&dsp->ctl_list)) { + dev_info(dsp->dev, "Calibration disabled due to missing firmware controls\n"); + return -ENOENT; + } + ret = cs_amp_write_cal_coeff(dsp, controls, controls->ambient, data->calAmbient); if (ret) return ret; -- cgit v1.2.3 From 708181c50b7763c689ecaba5db8075c2d03719c4 Mon Sep 17 00:00:00 2001 From: Rander Wang Date: Fri, 22 Mar 2024 13:27:03 +0200 Subject: ASoC: SOF: mtrace: rework mtrace timestamp setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original timestamp is built base on windows epoch time which is not fit for Linux system and difficult to be used for kernel debugging. This patch adopts syslog timestamp so that we can simply use dmesg to check the timestamp between fw and kernel. Signed-off-by: Rander Wang Reviewed-by: Péter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Kai Vehmanen Signed-off-by: Peter Ujfalusi Link: https://msgid.link/r/20240322112703.4549-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-mtrace.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sound/soc/sof/ipc4-mtrace.c b/sound/soc/sof/ipc4-mtrace.c index 9f1e33ee8826..0e04bea9432d 100644 --- a/sound/soc/sof/ipc4-mtrace.c +++ b/sound/soc/sof/ipc4-mtrace.c @@ -4,6 +4,7 @@ #include #include +#include #include #include "sof-priv.h" #include "ipc4-priv.h" @@ -412,7 +413,6 @@ static int ipc4_mtrace_enable(struct snd_sof_dev *sdev) const struct sof_ipc_ops *iops = sdev->ipc->ops; struct sof_ipc4_msg msg; u64 system_time; - ktime_t kt; int ret; if (priv->mtrace_state != SOF_MTRACE_DISABLED) @@ -424,9 +424,12 @@ static int ipc4_mtrace_enable(struct snd_sof_dev *sdev) msg.primary |= SOF_IPC4_MOD_INSTANCE(SOF_IPC4_MOD_INIT_BASEFW_INSTANCE_ID); msg.extension = SOF_IPC4_MOD_EXT_MSG_PARAM_ID(SOF_IPC4_FW_PARAM_SYSTEM_TIME); - /* The system time is in usec, UTC, epoch is 1601-01-01 00:00:00 */ - kt = ktime_add_us(ktime_get_real(), FW_EPOCH_DELTA * USEC_PER_SEC); - system_time = ktime_to_us(kt); + /* + * local_clock() is used to align with dmesg, so both kernel and firmware logs have + * the same base and a minor delta due to the IPC. system time is in us format but + * local_clock() returns the time in ns, so convert to ns. + */ + system_time = div64_u64(local_clock(), NSEC_PER_USEC); msg.data_size = sizeof(system_time); msg.data_ptr = &system_time; ret = iops->set_get_data(sdev, &msg, msg.data_size, true); -- cgit v1.2.3 From a469158eaf8f4b10263b417856d923dfa38ae96d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Apitzsch?= Date: Mon, 25 Mar 2024 19:05:09 +0100 Subject: regulator: tps65132: Add of_match table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add of_match table for "ti,tps65132" compatible string. This fixes automatic driver loading when using device-tree, and if built as a module like major linux distributions do. Signed-off-by: André Apitzsch Link: https://msgid.link/r/20240325-of_tps65132-v1-1-86a5f7ef4ede@apitzsch.eu Signed-off-by: Mark Brown --- drivers/regulator/tps65132-regulator.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/regulator/tps65132-regulator.c b/drivers/regulator/tps65132-regulator.c index a06f5f2d7932..9c2f0dd42613 100644 --- a/drivers/regulator/tps65132-regulator.c +++ b/drivers/regulator/tps65132-regulator.c @@ -267,10 +267,17 @@ static const struct i2c_device_id tps65132_id[] = { }; MODULE_DEVICE_TABLE(i2c, tps65132_id); +static const struct of_device_id __maybe_unused tps65132_of_match[] = { + { .compatible = "ti,tps65132" }, + {}, +}; +MODULE_DEVICE_TABLE(of, tps65132_of_match); + static struct i2c_driver tps65132_i2c_driver = { .driver = { .name = "tps65132", .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .of_match_table = of_match_ptr(tps65132_of_match), }, .probe = tps65132_probe, .id_table = tps65132_id, -- cgit v1.2.3 From 8e936e98718f005c986be0bfa1ee6b355acf96be Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 21 Mar 2024 14:20:41 +0530 Subject: RISC-V: KVM: Fix APLIC in_clrip[x] read emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reads to APLIC in_clrip[x] registers returns rectified input values of the interrupt sources. A rectified input value of an interrupt source is defined by the section "4.5.2 Source configurations (sourcecfg[1]–sourcecfg[1023])" of the RISC-V AIA specification as: rectified input value = (incoming wire value) XOR (source is inverted) Update the riscv_aplic_input() implementation to match the above. Cc: stable@vger.kernel.org Fixes: 74967aa208e2 ("RISC-V: KVM: Add in-kernel emulation of AIA APLIC") Signed-off-by: Anup Patel Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20240321085041.1955293-3-apatel@ventanamicro.com --- arch/riscv/kvm/aia_aplic.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c index 5e842b92dc46..b467ba5ed910 100644 --- a/arch/riscv/kvm/aia_aplic.c +++ b/arch/riscv/kvm/aia_aplic.c @@ -197,16 +197,31 @@ static void aplic_write_enabled(struct aplic *aplic, u32 irq, bool enabled) static bool aplic_read_input(struct aplic *aplic, u32 irq) { - bool ret; - unsigned long flags; + u32 sourcecfg, sm, raw_input, irq_inverted; struct aplic_irq *irqd; + unsigned long flags; + bool ret = false; if (!irq || aplic->nr_irqs <= irq) return false; irqd = &aplic->irqs[irq]; raw_spin_lock_irqsave(&irqd->lock, flags); - ret = (irqd->state & APLIC_IRQ_STATE_INPUT) ? true : false; + + sourcecfg = irqd->sourcecfg; + if (sourcecfg & APLIC_SOURCECFG_D) + goto skip; + + sm = sourcecfg & APLIC_SOURCECFG_SM_MASK; + if (sm == APLIC_SOURCECFG_SM_INACTIVE) + goto skip; + + raw_input = (irqd->state & APLIC_IRQ_STATE_INPUT) ? 1 : 0; + irq_inverted = (sm == APLIC_SOURCECFG_SM_LEVEL_LOW || + sm == APLIC_SOURCECFG_SM_EDGE_FALL) ? 1 : 0; + ret = !!(raw_input ^ irq_inverted); + +skip: raw_spin_unlock_irqrestore(&irqd->lock, flags); return ret; -- cgit v1.2.3 From e89c928bedd77d181edc2df01cb6672184775140 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 5 Mar 2024 18:48:39 +0000 Subject: KVM: arm64: Fix host-programmed guest events in nVHE Programming PMU events in the host that count during guest execution is a feature supported by perf, e.g. perf stat -e cpu_cycles:G ./lkvm run While this works for VHE, the guest/host event bitmaps are not carried through to the hypervisor in the nVHE configuration. Make kvm_pmu_update_vcpu_events() conditional on whether or not _hardware_ supports PMUv3 rather than if the vCPU as vPMU enabled. Cc: stable@vger.kernel.org Fixes: 84d751a019a9 ("KVM: arm64: Pass pmu events to hyp via vcpu") Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240305184840.636212-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- include/kvm/arm_pmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index eb4c369a79eb..35d4ca4f6122 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -86,7 +86,7 @@ void kvm_vcpu_pmu_resync_el0(void); */ #define kvm_pmu_update_vcpu_events(vcpu) \ do { \ - if (!has_vhe() && kvm_vcpu_has_pmu(vcpu)) \ + if (!has_vhe() && kvm_arm_support_pmu_v3()) \ vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) -- cgit v1.2.3 From f5fe0adeed6019df495497a64cb57d563ead2296 Mon Sep 17 00:00:00 2001 From: Wujie Duan Date: Mon, 18 Mar 2024 17:47:35 +0800 Subject: KVM: arm64: Fix out-of-IPA space translation fault handling Commit 11e5ea5242e3 ("KVM: arm64: Use helpers to classify exception types reported via ESR") tried to abstract the translation fault check when handling an out-of IPA space condition, but incorrectly replaced it with a permission fault check. Restore the previous translation fault check. Fixes: 11e5ea5242e3 ("KVM: arm64: Use helpers to classify exception types reported via ESR") Acked-by: Ard Biesheuvel Reviewed-by: Marc Zyngier Signed-off-by: Wujie Duan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/kvmarm/864jd3269g.wl-maz@kernel.org/ Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 18680771cdb0..dc04bc767865 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1637,7 +1637,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); - if (esr_fsc_is_permission_fault(esr)) { + if (esr_fsc_is_translation_fault(esr)) { /* Beyond sanitised PARange (which is the IPA limit) */ if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { kvm_inject_size_fault(vcpu); -- cgit v1.2.3 From 29b0075ed61cda250449f556fbe007a5c469440c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 14 Mar 2024 10:51:15 -0700 Subject: KVM: selftests: Fix __GUEST_ASSERT() format warnings in ARM's arch timer test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use %x instead of %lx when printing uint32_t variables to fix format warnings in ARM's arch timer test. aarch64/arch_timer.c: In function ‘guest_run_stage’: aarch64/arch_timer.c:138:33: warning: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 6 has type ‘uint32_t’ {aka ‘unsigned int’} [-Wformat=] 138 | "config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n" | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ...... 141 | config_iter + 1, irq_iter); | ~~~~~~~~~~~~~~~ | | | uint32_t {aka unsigned int} Fixes: d1dafd065a23 ("KVM: arm64: selftests: Enable tuning of error margin in arch_timer test") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20240314175116.2366301-1-seanjc@google.com Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index ddba2c2fb5de..93100b3f1312 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -135,7 +135,7 @@ static void guest_run_stage(struct test_vcpu_shared_data *shared_data, irq_iter = READ_ONCE(shared_data->nr_iter); __GUEST_ASSERT(config_iter + 1 == irq_iter, - "config_iter + 1 = 0x%lx, irq_iter = 0x%lx.\n" + "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" " Guest timer interrupt was not trigged within the specified\n" " interval, try to increase the error margin by [-e] option.\n", config_iter + 1, irq_iter); -- cgit v1.2.3 From 502892bbd2021fbe20f0d702b6b0bae281a742fa Mon Sep 17 00:00:00 2001 From: Irui Wang Date: Wed, 6 Mar 2024 10:12:23 +0800 Subject: media: mediatek: vcodec: Handle VP9 superframe bitstream with 8 sub-frames The VP9 bitstream uses superframes, which each contain 8 sub-frames, enable accessing the last superframe by increasing the range of the index vaidation as the maximum number of superframes is 8 and not 7, so that the last sub-frame can be decoded normally with the stateful VP9 decoder. Signed-off-by: Irui Wang Signed-off-by: Sebastian Fricke Signed-off-by: Hans Verkuil --- .../media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c index 55355fa70090..039082f600c8 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_if.c @@ -16,6 +16,7 @@ #include "../vdec_drv_base.h" #include "../vdec_vpu_if.h" +#define VP9_MAX_SUPER_FRAMES_NUM 8 #define VP9_SUPER_FRAME_BS_SZ 64 #define MAX_VP9_DPB_SIZE 9 @@ -133,11 +134,11 @@ struct vp9_sf_ref_fb { */ struct vdec_vp9_vsi { unsigned char sf_bs_buf[VP9_SUPER_FRAME_BS_SZ]; - struct vp9_sf_ref_fb sf_ref_fb[VP9_MAX_FRM_BUF_NUM-1]; + struct vp9_sf_ref_fb sf_ref_fb[VP9_MAX_SUPER_FRAMES_NUM]; int sf_next_ref_fb_idx; unsigned int sf_frm_cnt; - unsigned int sf_frm_offset[VP9_MAX_FRM_BUF_NUM-1]; - unsigned int sf_frm_sz[VP9_MAX_FRM_BUF_NUM-1]; + unsigned int sf_frm_offset[VP9_MAX_SUPER_FRAMES_NUM]; + unsigned int sf_frm_sz[VP9_MAX_SUPER_FRAMES_NUM]; unsigned int sf_frm_idx; unsigned int sf_init; struct vdec_fb fb; @@ -526,7 +527,7 @@ static void vp9_swap_frm_bufs(struct vdec_vp9_inst *inst) /* if this super frame and it is not last sub-frame, get next fb for * sub-frame decode */ - if (vsi->sf_frm_cnt > 0 && vsi->sf_frm_idx != vsi->sf_frm_cnt - 1) + if (vsi->sf_frm_cnt > 0 && vsi->sf_frm_idx != vsi->sf_frm_cnt) vsi->sf_next_ref_fb_idx = vp9_get_sf_ref_fb(inst); } @@ -735,7 +736,7 @@ static void get_free_fb(struct vdec_vp9_inst *inst, struct vdec_fb **out_fb) static int validate_vsi_array_indexes(struct vdec_vp9_inst *inst, struct vdec_vp9_vsi *vsi) { - if (vsi->sf_frm_idx >= VP9_MAX_FRM_BUF_NUM - 1) { + if (vsi->sf_frm_idx > VP9_MAX_SUPER_FRAMES_NUM) { mtk_vdec_err(inst->ctx, "Invalid vsi->sf_frm_idx=%u.", vsi->sf_frm_idx); return -EIO; } -- cgit v1.2.3 From 97c75ee5de060d271d80109b0c47cb6008439e5b Mon Sep 17 00:00:00 2001 From: Nicolas Dufresne Date: Mon, 26 Feb 2024 16:19:52 -0500 Subject: media: mediatek: vcodec: Fix oops when HEVC init fails The stateless HEVC decoder saves the instance pointer in the context regardless if the initialization worked or not. This caused a use after free, when the pointer is freed in case of a failure in the deinit function. Only store the instance pointer when the initialization was successful, to solve this issue. Hardware name: Acer Tomato (rev3 - 4) board (DT) pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : vcodec_vpu_send_msg+0x4c/0x190 [mtk_vcodec_dec] lr : vcodec_send_ap_ipi+0x78/0x170 [mtk_vcodec_dec] sp : ffff80008750bc20 x29: ffff80008750bc20 x28: ffff1299f6d70000 x27: 0000000000000000 x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000 x23: ffff80008750bc98 x22: 000000000000a003 x21: ffffd45c4cfae000 x20: 0000000000000010 x19: ffff1299fd668310 x18: 000000000000001a x17: 000000040044ffff x16: ffffd45cb15dc648 x15: 0000000000000000 x14: ffff1299c08da1c0 x13: ffffd45cb1f87a10 x12: ffffd45cb2f5fe80 x11: 0000000000000001 x10: 0000000000001b30 x9 : ffffd45c4d12b488 x8 : 1fffe25339380d81 x7 : 0000000000000001 x6 : ffff1299c9c06c00 x5 : 0000000000000132 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000000000000010 x1 : ffff80008750bc98 x0 : 0000000000000000 Call trace: vcodec_vpu_send_msg+0x4c/0x190 [mtk_vcodec_dec] vcodec_send_ap_ipi+0x78/0x170 [mtk_vcodec_dec] vpu_dec_deinit+0x1c/0x30 [mtk_vcodec_dec] vdec_hevc_slice_deinit+0x30/0x98 [mtk_vcodec_dec] vdec_if_deinit+0x38/0x68 [mtk_vcodec_dec] mtk_vcodec_dec_release+0x20/0x40 [mtk_vcodec_dec] fops_vcodec_release+0x64/0x118 [mtk_vcodec_dec] v4l2_release+0x7c/0x100 __fput+0x80/0x2d8 __fput_sync+0x58/0x70 __arm64_sys_close+0x40/0x90 invoke_syscall+0x50/0x128 el0_svc_common.constprop.0+0x48/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x38/0xd8 el0t_64_sync_handler+0xc0/0xc8 el0t_64_sync+0x1a8/0x1b0 Code: d503201f f9401660 b900127f b900227f (f9400400) Signed-off-by: Nicolas Dufresne Fixes: 2674486aac7d ("media: mediatek: vcodec: support stateless hevc decoder") Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Sebastian Fricke Signed-off-by: Hans Verkuil --- .../platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c index 06ed47df693b..21836dd6ef85 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_hevc_req_multi_if.c @@ -869,7 +869,6 @@ static int vdec_hevc_slice_init(struct mtk_vcodec_dec_ctx *ctx) inst->vpu.codec_type = ctx->current_codec; inst->vpu.capture_type = ctx->capture_fourcc; - ctx->drv_handle = inst; err = vpu_dec_init(&inst->vpu); if (err) { mtk_vdec_err(ctx, "vdec_hevc init err=%d", err); @@ -898,6 +897,7 @@ static int vdec_hevc_slice_init(struct mtk_vcodec_dec_ctx *ctx) mtk_vdec_debug(ctx, "lat hevc instance >> %p, codec_type = 0x%x", inst, inst->vpu.codec_type); + ctx->drv_handle = inst; return 0; error_free_inst: kfree(inst); -- cgit v1.2.3 From 6467cda18c9f9b5f2f9a0aa1e2861c653e41f382 Mon Sep 17 00:00:00 2001 From: Yunfei Dong Date: Thu, 14 Mar 2024 20:30:08 +0800 Subject: media: mediatek: vcodec: adding lock to protect decoder context list Add a lock for the ctx_list, to avoid accessing a NULL pointer within the 'vpu_dec_ipi_handler' function when the ctx_list has been deleted due to an unexpected behavior on the SCP IP block. Hardware name: Google juniper sku16 board (DT) pstate: 20400005 (nzCv daif +PAN -UAO -TCO BTYPE=--) pc : vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec] lr : scp_ipi_handler+0xd0/0x194 [mtk_scp] sp : ffffffc0131dbbd0 x29: ffffffc0131dbbd0 x28: 0000000000000000 x27: ffffff9bb277f348 x26: ffffff9bb242ad00 x25: ffffffd2d440d3b8 x24: ffffffd2a13ff1d4 x23: ffffff9bb7fe85a0 x22: ffffffc0133fbdb0 x21: 0000000000000010 x20: ffffff9b050ea328 x19: ffffffc0131dbc08 x18: 0000000000001000 x17: 0000000000000000 x16: ffffffd2d461c6e0 x15: 0000000000000242 x14: 000000000000018f x13: 000000000000004d x12: 0000000000000000 x11: 0000000000000001 x10: fffffffffffffff0 x9 : ffffff9bb6e793a8 x8 : 0000000000000000 x7 : 0000000000000000 x6 : 000000000000003f x5 : 0000000000000040 x4 : fffffffffffffff0 x3 : 0000000000000020 x2 : ffffff9bb6e79080 x1 : 0000000000000010 x0 : ffffffc0131dbc08 Call trace: vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec (HASH:6c3f 2)] scp_ipi_handler+0xd0/0x194 [mtk_scp (HASH:7046 3)] mt8183_scp_irq_handler+0x44/0x88 [mtk_scp (HASH:7046 3)] scp_irq_handler+0x48/0x90 [mtk_scp (HASH:7046 3)] irq_thread_fn+0x38/0x94 irq_thread+0x100/0x1c0 kthread+0x140/0x1fc ret_from_fork+0x10/0x30 Code: 54000088 f94ca50a eb14015f 54000060 (f9400108) ---[ end trace ace43ce36cbd5c93 ]--- Kernel panic - not syncing: Oops: Fatal exception SMP: stopping secondary CPUs Kernel Offset: 0x12c4000000 from 0xffffffc010000000 PHYS_OFFSET: 0xffffffe580000000 CPU features: 0x08240002,2188200c Memory Limit: none Fixes: 655b86e52eac ("media: mediatek: vcodec: Fix possible invalid memory access for decoder") Signed-off-by: Yunfei Dong Reviewed-by: Nicolas Dufresne Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Sebastian Fricke Signed-off-by: Hans Verkuil --- drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c | 4 ++-- drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c | 5 +++++ drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h | 2 ++ drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c | 2 ++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c index 4c34344dc7dc..62cafe25fed9 100644 --- a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c +++ b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c @@ -50,12 +50,12 @@ static void mtk_vcodec_vpu_reset_dec_handler(void *priv) dev_err(&dev->plat_dev->dev, "Watchdog timeout!!"); - mutex_lock(&dev->dev_mutex); + mutex_lock(&dev->dev_ctx_lock); list_for_each_entry(ctx, &dev->ctx_list, list) { ctx->state = MTK_STATE_ABORT; mtk_v4l2_vdec_dbg(0, ctx, "[%d] Change to state MTK_STATE_ABORT", ctx->id); } - mutex_unlock(&dev->dev_mutex); + mutex_unlock(&dev->dev_ctx_lock); } static void mtk_vcodec_vpu_reset_enc_handler(void *priv) diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c index f47c98faf068..2073781ccadb 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c @@ -268,7 +268,9 @@ static int fops_vcodec_open(struct file *file) ctx->dev->vdec_pdata->init_vdec_params(ctx); + mutex_lock(&dev->dev_ctx_lock); list_add(&ctx->list, &dev->ctx_list); + mutex_unlock(&dev->dev_ctx_lock); mtk_vcodec_dbgfs_create(ctx); mutex_unlock(&dev->dev_mutex); @@ -311,7 +313,9 @@ static int fops_vcodec_release(struct file *file) v4l2_ctrl_handler_free(&ctx->ctrl_hdl); mtk_vcodec_dbgfs_remove(dev, ctx->id); + mutex_lock(&dev->dev_ctx_lock); list_del_init(&ctx->list); + mutex_unlock(&dev->dev_ctx_lock); kfree(ctx); mutex_unlock(&dev->dev_mutex); return 0; @@ -404,6 +408,7 @@ static int mtk_vcodec_probe(struct platform_device *pdev) for (i = 0; i < MTK_VDEC_HW_MAX; i++) mutex_init(&dev->dec_mutex[i]); mutex_init(&dev->dev_mutex); + mutex_init(&dev->dev_ctx_lock); spin_lock_init(&dev->irqlock); snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), "%s", diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h index 849b89dd205c..85b2c0d3d8bc 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h +++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h @@ -241,6 +241,7 @@ struct mtk_vcodec_dec_ctx { * * @dec_mutex: decoder hardware lock * @dev_mutex: video_device lock + * @dev_ctx_lock: the lock of context list * @decode_workqueue: decode work queue * * @irqlock: protect data access by irq handler and work thread @@ -282,6 +283,7 @@ struct mtk_vcodec_dec_dev { /* decoder hardware mutex lock */ struct mutex dec_mutex[MTK_VDEC_HW_MAX]; struct mutex dev_mutex; + struct mutex dev_ctx_lock; struct workqueue_struct *decode_workqueue; spinlock_t irqlock; diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c index 82e57ae983d5..da6be556727b 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c @@ -77,12 +77,14 @@ static bool vpu_dec_check_ap_inst(struct mtk_vcodec_dec_dev *dec_dev, struct vde struct mtk_vcodec_dec_ctx *ctx; int ret = false; + mutex_lock(&dec_dev->dev_ctx_lock); list_for_each_entry(ctx, &dec_dev->ctx_list, list) { if (!IS_ERR_OR_NULL(ctx) && ctx->vpu_inst == vpu) { ret = true; break; } } + mutex_unlock(&dec_dev->dev_ctx_lock); return ret; } -- cgit v1.2.3 From afaaf3a0f647a24a7bf6a2145d8ade37baaf75ad Mon Sep 17 00:00:00 2001 From: Yunfei Dong Date: Thu, 14 Mar 2024 20:30:09 +0800 Subject: media: mediatek: vcodec: adding lock to protect encoder context list Add a lock for the ctx_list, to avoid accessing a NULL pointer within the 'vpu_enc_ipi_handler' function when the ctx_list has been deleted due to an unexpected behavior on the SCP IP block. Fixes: 1972e32431ed ("media: mediatek: vcodec: Fix possible invalid memory access for encoder") Signed-off-by: Yunfei Dong Reviewed-by: Nicolas Dufresne Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Sebastian Fricke Signed-off-by: Hans Verkuil --- drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c | 4 ++-- drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c | 5 +++++ drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h | 2 ++ drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c | 2 ++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c index 62cafe25fed9..d7027d600208 100644 --- a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c +++ b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c @@ -65,12 +65,12 @@ static void mtk_vcodec_vpu_reset_enc_handler(void *priv) dev_err(&dev->plat_dev->dev, "Watchdog timeout!!"); - mutex_lock(&dev->dev_mutex); + mutex_lock(&dev->dev_ctx_lock); list_for_each_entry(ctx, &dev->ctx_list, list) { ctx->state = MTK_STATE_ABORT; mtk_v4l2_vdec_dbg(0, ctx, "[%d] Change to state MTK_STATE_ABORT", ctx->id); } - mutex_unlock(&dev->dev_mutex); + mutex_unlock(&dev->dev_ctx_lock); } static const struct mtk_vcodec_fw_ops mtk_vcodec_vpu_msg = { diff --git a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c index 6319f24bc714..3cb8a1622222 100644 --- a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c +++ b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.c @@ -177,7 +177,9 @@ static int fops_vcodec_open(struct file *file) mtk_v4l2_venc_dbg(2, ctx, "Create instance [%d]@%p m2m_ctx=%p ", ctx->id, ctx, ctx->m2m_ctx); + mutex_lock(&dev->dev_ctx_lock); list_add(&ctx->list, &dev->ctx_list); + mutex_unlock(&dev->dev_ctx_lock); mutex_unlock(&dev->dev_mutex); mtk_v4l2_venc_dbg(0, ctx, "%s encoder [%d]", dev_name(&dev->plat_dev->dev), @@ -212,7 +214,9 @@ static int fops_vcodec_release(struct file *file) v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->ctrl_hdl); + mutex_lock(&dev->dev_ctx_lock); list_del_init(&ctx->list); + mutex_unlock(&dev->dev_ctx_lock); kfree(ctx); mutex_unlock(&dev->dev_mutex); return 0; @@ -294,6 +298,7 @@ static int mtk_vcodec_probe(struct platform_device *pdev) mutex_init(&dev->enc_mutex); mutex_init(&dev->dev_mutex); + mutex_init(&dev->dev_ctx_lock); spin_lock_init(&dev->irqlock); snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), "%s", diff --git a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h index a042f607ed8d..0bd85d0fb379 100644 --- a/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h +++ b/drivers/media/platform/mediatek/vcodec/encoder/mtk_vcodec_enc_drv.h @@ -178,6 +178,7 @@ struct mtk_vcodec_enc_ctx { * * @enc_mutex: encoder hardware lock. * @dev_mutex: video_device lock + * @dev_ctx_lock: the lock of context list * @encode_workqueue: encode work queue * * @enc_irq: h264 encoder irq resource @@ -205,6 +206,7 @@ struct mtk_vcodec_enc_dev { /* encoder hardware mutex lock */ struct mutex enc_mutex; struct mutex dev_mutex; + struct mutex dev_ctx_lock; struct workqueue_struct *encode_workqueue; int enc_irq; diff --git a/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c b/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c index 84ad1cc6ad17..51bb7ee141b9 100644 --- a/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c +++ b/drivers/media/platform/mediatek/vcodec/encoder/venc_vpu_if.c @@ -47,12 +47,14 @@ static bool vpu_enc_check_ap_inst(struct mtk_vcodec_enc_dev *enc_dev, struct ven struct mtk_vcodec_enc_ctx *ctx; int ret = false; + mutex_lock(&enc_dev->dev_ctx_lock); list_for_each_entry(ctx, &enc_dev->ctx_list, list) { if (!IS_ERR_OR_NULL(ctx) && ctx->vpu_inst == vpu) { ret = true; break; } } + mutex_unlock(&enc_dev->dev_ctx_lock); return ret; } -- cgit v1.2.3 From d353c3c34af08cfd4eaafc8c55f664eacec274ee Mon Sep 17 00:00:00 2001 From: Yunfei Dong Date: Wed, 6 Mar 2024 20:19:02 +0800 Subject: media: mediatek: vcodec: support 36 bits physical address The physical address on the MT8188 platform is larger than 32 bits, change the type from unsigned int to dma_addr_t to be able to access the high bits of the address. Signed-off-by: Yunfei Dong Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Sebastian Fricke Signed-off-by: Hans Verkuil --- drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c | 2 +- .../media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c index 19407f9bc773..987b3d71b662 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp8_if.c @@ -449,7 +449,7 @@ static int vdec_vp8_decode(void *h_vdec, struct mtk_vcodec_mem *bs, inst->frm_cnt, y_fb_dma, c_fb_dma, fb); inst->cur_fb = fb; - dec->bs_dma = (unsigned long)bs->dma_addr; + dec->bs_dma = (uint64_t)bs->dma_addr; dec->bs_sz = bs->size; dec->cur_y_fb_dma = y_fb_dma; dec->cur_c_fb_dma = c_fb_dma; diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c index cf48d09b78d7..eea709d93820 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c @@ -1074,7 +1074,7 @@ static int vdec_vp9_slice_setup_tile_buffer(struct vdec_vp9_slice_instance *inst unsigned int mi_row; unsigned int mi_col; unsigned int offset; - unsigned int pa; + dma_addr_t pa; unsigned int size; struct vdec_vp9_slice_tiles *tiles; unsigned char *pos; @@ -1109,7 +1109,7 @@ static int vdec_vp9_slice_setup_tile_buffer(struct vdec_vp9_slice_instance *inst pos = va + offset; end = va + bs->size; /* truncated */ - pa = (unsigned int)bs->dma_addr + offset; + pa = bs->dma_addr + offset; tb = instance->tile.va; for (i = 0; i < rows; i++) { for (j = 0; j < cols; j++) { -- cgit v1.2.3 From 56ebbd19c2989f7450341f581e2724a149d0f08e Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 26 Mar 2024 10:54:34 +0000 Subject: ASoC: cs42l43: Correct extraction of data pointer in suspend/resume The current code is pulling the wrong pointer causing it to disable the wrong IRQ. Correct the code to pull the correct cs42l43 core data pointer. Fixes: 64353af49fec ("ASoC: cs42l43: Add system suspend ops to disable IRQ") Signed-off-by: Charles Keepax Link: https://msgid.link/r/20240326105434.852907-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/cs42l43.c b/sound/soc/codecs/cs42l43.c index 860d5cda67bf..94685449f0f4 100644 --- a/sound/soc/codecs/cs42l43.c +++ b/sound/soc/codecs/cs42l43.c @@ -2364,7 +2364,8 @@ static int cs42l43_codec_runtime_resume(struct device *dev) static int cs42l43_codec_suspend(struct device *dev) { - struct cs42l43 *cs42l43 = dev_get_drvdata(dev); + struct cs42l43_codec *priv = dev_get_drvdata(dev); + struct cs42l43 *cs42l43 = priv->core; disable_irq(cs42l43->irq); @@ -2373,7 +2374,8 @@ static int cs42l43_codec_suspend(struct device *dev) static int cs42l43_codec_suspend_noirq(struct device *dev) { - struct cs42l43 *cs42l43 = dev_get_drvdata(dev); + struct cs42l43_codec *priv = dev_get_drvdata(dev); + struct cs42l43 *cs42l43 = priv->core; enable_irq(cs42l43->irq); @@ -2382,7 +2384,8 @@ static int cs42l43_codec_suspend_noirq(struct device *dev) static int cs42l43_codec_resume(struct device *dev) { - struct cs42l43 *cs42l43 = dev_get_drvdata(dev); + struct cs42l43_codec *priv = dev_get_drvdata(dev); + struct cs42l43 *cs42l43 = priv->core; enable_irq(cs42l43->irq); @@ -2391,7 +2394,8 @@ static int cs42l43_codec_resume(struct device *dev) static int cs42l43_codec_resume_noirq(struct device *dev) { - struct cs42l43 *cs42l43 = dev_get_drvdata(dev); + struct cs42l43_codec *priv = dev_get_drvdata(dev); + struct cs42l43 *cs42l43 = priv->core; disable_irq(cs42l43->irq); -- cgit v1.2.3 From 674bc0168e6b68070c75df22e97ab63b6eb60d89 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Fri, 1 Mar 2024 12:18:32 -0800 Subject: riscv: mm: Fix prototype to avoid discarding const __flush_tlb_range() does not modify the provided cpumask, so its cmask parameter can be pointer-to-const. This avoids the unsafe cast of cpu_online_mask. Fixes: 54d7431af73e ("riscv: Add support for BATCHED_UNMAP_TLB_FLUSH") Signed-off-by: Samuel Holland Reviewed-by: Andrew Jones Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240301201837.2826172-1-samuel.holland@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/tlbflush.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 893566e004b7..07d743f87b3f 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -99,7 +99,7 @@ static void __ipi_flush_tlb_range_asid(void *info) local_flush_tlb_range_asid(d->start, d->size, d->stride, d->asid); } -static void __flush_tlb_range(struct cpumask *cmask, unsigned long asid, +static void __flush_tlb_range(const struct cpumask *cmask, unsigned long asid, unsigned long start, unsigned long size, unsigned long stride) { @@ -200,7 +200,7 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - __flush_tlb_range((struct cpumask *)cpu_online_mask, FLUSH_TLB_NO_ASID, + __flush_tlb_range(cpu_online_mask, FLUSH_TLB_NO_ASID, start, end - start, PAGE_SIZE); } -- cgit v1.2.3 From ea558de7238bb12c3435c47f0631e9d17bf4a09f Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Sat, 16 Mar 2024 12:38:29 +0100 Subject: i40e: Enforce software interrupt during busy-poll exit As for ice bug fixed by commit b7306b42beaf ("ice: manage interrupts during poll exit") followed by commit 23be7075b318 ("ice: fix software generating extra interrupts") I'm seeing the similar issue also with i40e driver. In certain situation when busy-loop is enabled together with adaptive coalescing, the driver occasionally misses that there are outstanding descriptors to clean when exiting busy poll. Try to catch the remaining work by triggering a software interrupt when exiting busy poll. No extra interrupts will be generated when busy polling is not used. The issue was found when running sockperf ping-pong tcp test with adaptive coalescing and busy poll enabled (50 as value busy_pool and busy_read sysctl knobs) and results in huge latency spikes with more than 100000us. The fix is inspired from the ice driver and do the following: 1) During napi poll exit in case of busy-poll (napo_complete_done() returns false) this is recorded to q_vector that we were in busy loop. 2) Extends i40e_buildreg_itr() to be able to add an enforced software interrupt into built value 2) In i40e_update_enable_itr() enforces a software interrupt trigger if we are exiting busy poll to catch any pending clean-ups 3) Reuses unused 3rd ITR (interrupt throttle) index and set it to 20K interrupts per second to limit the number of these sw interrupts. Test results ============ Prior: [root@dell-per640-07 net]# sockperf ping-pong -i 10.9.9.1 --tcp -m 1000 --mps=max -t 120 sockperf: == version #3.10-no.git == sockperf[CLIENT] send on:sockperf: using recvfrom() to block on socket(s) [ 0] IP = 10.9.9.1 PORT = 11111 # TCP sockperf: Warmup stage (sending a few dummy messages)... sockperf: Starting test... sockperf: Test end (interrupted by timer) sockperf: Test ended sockperf: [Total Run] RunTime=119.999 sec; Warm up time=400 msec; SentMessages=2438563; ReceivedMessages=2438562 sockperf: ========= Printing statistics for Server No: 0 sockperf: [Valid Duration] RunTime=119.549 sec; SentMessages=2429473; ReceivedMessages=2429473 sockperf: ====> avg-latency=24.571 (std-dev=93.297, mean-ad=4.904, median-ad=1.510, siqr=1.063, cv=3.797, std-error=0.060, 99.0% ci=[24.417, 24.725]) sockperf: # dropped messages = 0; # duplicated messages = 0; # out-of-order messages = 0 sockperf: Summary: Latency is 24.571 usec sockperf: Total 2429473 observations; each percentile contains 24294.73 observations sockperf: ---> observation = 103294.331 sockperf: ---> percentile 99.999 = 45.633 sockperf: ---> percentile 99.990 = 37.013 sockperf: ---> percentile 99.900 = 35.910 sockperf: ---> percentile 99.000 = 33.390 sockperf: ---> percentile 90.000 = 28.626 sockperf: ---> percentile 75.000 = 27.741 sockperf: ---> percentile 50.000 = 26.743 sockperf: ---> percentile 25.000 = 25.614 sockperf: ---> observation = 12.220 After: [root@dell-per640-07 net]# sockperf ping-pong -i 10.9.9.1 --tcp -m 1000 --mps=max -t 120 sockperf: == version #3.10-no.git == sockperf[CLIENT] send on:sockperf: using recvfrom() to block on socket(s) [ 0] IP = 10.9.9.1 PORT = 11111 # TCP sockperf: Warmup stage (sending a few dummy messages)... sockperf: Starting test... sockperf: Test end (interrupted by timer) sockperf: Test ended sockperf: [Total Run] RunTime=119.999 sec; Warm up time=400 msec; SentMessages=2400055; ReceivedMessages=2400054 sockperf: ========= Printing statistics for Server No: 0 sockperf: [Valid Duration] RunTime=119.549 sec; SentMessages=2391186; ReceivedMessages=2391186 sockperf: ====> avg-latency=24.965 (std-dev=5.934, mean-ad=4.642, median-ad=1.485, siqr=1.067, cv=0.238, std-error=0.004, 99.0% ci=[24.955, 24.975]) sockperf: # dropped messages = 0; # duplicated messages = 0; # out-of-order messages = 0 sockperf: Summary: Latency is 24.965 usec sockperf: Total 2391186 observations; each percentile contains 23911.86 observations sockperf: ---> observation = 195.841 sockperf: ---> percentile 99.999 = 45.026 sockperf: ---> percentile 99.990 = 39.009 sockperf: ---> percentile 99.900 = 35.922 sockperf: ---> percentile 99.000 = 33.482 sockperf: ---> percentile 90.000 = 28.902 sockperf: ---> percentile 75.000 = 27.821 sockperf: ---> percentile 50.000 = 26.860 sockperf: ---> percentile 25.000 = 25.685 sockperf: ---> observation = 12.277 Fixes: 0bcd952feec7 ("ethernet/intel: consolidate NAPI and NAPI exit") Reported-by: Hugo Ferreira Reviewed-by: Michal Schmidt Signed-off-by: Ivan Vecera Reviewed-by: Jesse Brandeburg Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++ drivers/net/ethernet/intel/i40e/i40e_register.h | 3 + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 82 ++++++++++++++++++------- drivers/net/ethernet/intel/i40e/i40e_txrx.h | 1 + 5 files changed, 72 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index ba24f3fa92c3..2fbabcdb5bb5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -955,6 +955,7 @@ struct i40e_q_vector { struct rcu_head rcu; /* to avoid race with update stats on free */ char name[I40E_INT_NAME_STR_LEN]; bool arm_wb_state; + bool in_busy_poll; int irq_num; /* IRQ assigned to this q_vector */ } ____cacheline_internodealigned_in_smp; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f86578857e8a..6576a0081093 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3911,6 +3911,12 @@ static void i40e_vsi_configure_msix(struct i40e_vsi *vsi) q_vector->tx.target_itr >> 1); q_vector->tx.current_itr = q_vector->tx.target_itr; + /* Set ITR for software interrupts triggered after exiting + * busy-loop polling. + */ + wr32(hw, I40E_PFINT_ITRN(I40E_SW_ITR, vector - 1), + I40E_ITR_20K); + wr32(hw, I40E_PFINT_RATEN(vector - 1), i40e_intrl_usec_to_reg(vsi->int_rate_limit)); diff --git a/drivers/net/ethernet/intel/i40e/i40e_register.h b/drivers/net/ethernet/intel/i40e/i40e_register.h index 14ab642cafdb..432afbb64201 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_register.h +++ b/drivers/net/ethernet/intel/i40e/i40e_register.h @@ -333,8 +333,11 @@ #define I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT 3 #define I40E_PFINT_DYN_CTLN_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) #define I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT 5 +#define I40E_PFINT_DYN_CTLN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT) #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24 #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT) +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT 25 +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT) #define I40E_PFINT_ICR0 0x00038780 /* Reset: CORER */ #define I40E_PFINT_ICR0_INTEVENT_SHIFT 0 #define I40E_PFINT_ICR0_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_INTEVENT_SHIFT) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 0d7177083708..1a12b732818e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2630,7 +2630,22 @@ process_next: return failure ? budget : (int)total_rx_packets; } -static inline u32 i40e_buildreg_itr(const int type, u16 itr) +/** + * i40e_buildreg_itr - build a value for writing to I40E_PFINT_DYN_CTLN register + * @itr_idx: interrupt throttling index + * @interval: interrupt throttling interval value in usecs + * @force_swint: force software interrupt + * + * The function builds a value for I40E_PFINT_DYN_CTLN register that + * is used to update interrupt throttling interval for specified ITR index + * and optionally enforces a software interrupt. If the @itr_idx is equal + * to I40E_ITR_NONE then no interval change is applied and only @force_swint + * parameter is taken into account. If the interval change and enforced + * software interrupt are not requested then the built value just enables + * appropriate vector interrupt. + **/ +static u32 i40e_buildreg_itr(enum i40e_dyn_idx itr_idx, u16 interval, + bool force_swint) { u32 val; @@ -2644,23 +2659,33 @@ static inline u32 i40e_buildreg_itr(const int type, u16 itr) * an event in the PBA anyway so we need to rely on the automask * to hold pending events for us until the interrupt is re-enabled * - * The itr value is reported in microseconds, and the register - * value is recorded in 2 microsecond units. For this reason we - * only need to shift by the interval shift - 1 instead of the - * full value. + * We have to shift the given value as it is reported in microseconds + * and the register value is recorded in 2 microsecond units. */ - itr &= I40E_ITR_MASK; + interval >>= 1; + /* 1. Enable vector interrupt + * 2. Update the interval for the specified ITR index + * (I40E_ITR_NONE in the register is used to indicate that + * no interval update is requested) + */ val = I40E_PFINT_DYN_CTLN_INTENA_MASK | - (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) | - (itr << (I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT - 1)); + FIELD_PREP(I40E_PFINT_DYN_CTLN_ITR_INDX_MASK, itr_idx) | + FIELD_PREP(I40E_PFINT_DYN_CTLN_INTERVAL_MASK, interval); + + /* 3. Enforce software interrupt trigger if requested + * (These software interrupts rate is limited by ITR2 that is + * set to 20K interrupts per second) + */ + if (force_swint) + val |= I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK | + I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK | + FIELD_PREP(I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK, + I40E_SW_ITR); return val; } -/* a small macro to shorten up some long lines */ -#define INTREG I40E_PFINT_DYN_CTLN - /* The act of updating the ITR will cause it to immediately trigger. In order * to prevent this from throwing off adaptive update statistics we defer the * update so that it can only happen so often. So after either Tx or Rx are @@ -2679,8 +2704,10 @@ static inline u32 i40e_buildreg_itr(const int type, u16 itr) static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector) { + enum i40e_dyn_idx itr_idx = I40E_ITR_NONE; struct i40e_hw *hw = &vsi->back->hw; - u32 intval; + u16 interval = 0; + u32 itr_val; /* If we don't have MSIX, then we only need to re-enable icr0 */ if (!test_bit(I40E_FLAG_MSIX_ENA, vsi->back->flags)) { @@ -2702,8 +2729,8 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, */ if (q_vector->rx.target_itr < q_vector->rx.current_itr) { /* Rx ITR needs to be reduced, this is highest priority */ - intval = i40e_buildreg_itr(I40E_RX_ITR, - q_vector->rx.target_itr); + itr_idx = I40E_RX_ITR; + interval = q_vector->rx.target_itr; q_vector->rx.current_itr = q_vector->rx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) || @@ -2712,25 +2739,36 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, /* Tx ITR needs to be reduced, this is second priority * Tx ITR needs to be increased more than Rx, fourth priority */ - intval = i40e_buildreg_itr(I40E_TX_ITR, - q_vector->tx.target_itr); + itr_idx = I40E_TX_ITR; + interval = q_vector->tx.target_itr; q_vector->tx.current_itr = q_vector->tx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) { /* Rx ITR needs to be increased, third priority */ - intval = i40e_buildreg_itr(I40E_RX_ITR, - q_vector->rx.target_itr); + itr_idx = I40E_RX_ITR; + interval = q_vector->rx.target_itr; q_vector->rx.current_itr = q_vector->rx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else { /* No ITR update, lowest priority */ - intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); if (q_vector->itr_countdown) q_vector->itr_countdown--; } - if (!test_bit(__I40E_VSI_DOWN, vsi->state)) - wr32(hw, INTREG(q_vector->reg_idx), intval); + /* Do not update interrupt control register if VSI is down */ + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + return; + + /* Update ITR interval if necessary and enforce software interrupt + * if we are exiting busy poll. + */ + if (q_vector->in_busy_poll) { + itr_val = i40e_buildreg_itr(itr_idx, interval, true); + q_vector->in_busy_poll = false; + } else { + itr_val = i40e_buildreg_itr(itr_idx, interval, false); + } + wr32(hw, I40E_PFINT_DYN_CTLN(q_vector->reg_idx), itr_val); } /** @@ -2845,6 +2883,8 @@ tx_only: */ if (likely(napi_complete_done(napi, work_done))) i40e_update_enable_itr(vsi, q_vector); + else + q_vector->in_busy_poll = true; return min(work_done, budget - 1); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index abf15067eb5d..2cdc7de6301c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -68,6 +68,7 @@ enum i40e_dyn_idx { /* these are indexes into ITRN registers */ #define I40E_RX_ITR I40E_IDX_ITR0 #define I40E_TX_ITR I40E_IDX_ITR1 +#define I40E_SW_ITR I40E_IDX_ITR2 /* Supported RSS offloads */ #define I40E_DEFAULT_RSS_HENA ( \ -- cgit v1.2.3 From d080a08b06b6266cc3e0e86c5acfd80db937cb6b Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Mon, 11 Mar 2024 19:19:13 -0700 Subject: riscv: Fix spurious errors from __get/put_kernel_nofault These macros did not initialize __kr_err, so they could fail even if the access did not fault. Cc: stable@vger.kernel.org Fixes: d464118cdc41 ("riscv: implement __get_kernel_nofault and __put_user_nofault") Signed-off-by: Samuel Holland Reviewed-by: Alexandre Ghiti Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240312022030.320789-1-samuel.holland@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index ec0cab9fbddd..72ec1d9bd3f3 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -319,7 +319,7 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ - long __kr_err; \ + long __kr_err = 0; \ \ __get_user_nocheck(*((type *)(dst)), (type *)(src), __kr_err); \ if (unlikely(__kr_err)) \ @@ -328,7 +328,7 @@ do { \ #define __put_kernel_nofault(dst, src, type, err_label) \ do { \ - long __kr_err; \ + long __kr_err = 0; \ \ __put_user_nocheck(*((type *)(src)), (type *)(dst), __kr_err); \ if (unlikely(__kr_err)) \ -- cgit v1.2.3 From eb58c598ce45b7e787568fe27016260417c3d807 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Wed, 13 Mar 2024 10:44:00 +0100 Subject: i40e: fix i40e_count_filters() to count only active/new filters The bug usually affects untrusted VFs, because they are limited to 18 MACs, it affects them badly, not letting to create MAC all filters. Not stable to reproduce, it happens when VF user creates MAC filters when other MACVLAN operations are happened in parallel. But consequence is that VF can't receive desired traffic. Fix counter to be bumped only for new or active filters. Fixes: 621650cabee5 ("i40e: Refactoring VF MAC filters counting to make more reliable") Signed-off-by: Aleksandr Loktionov Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Paul Menzel Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6576a0081093..48b9ddb2b1b3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1253,8 +1253,11 @@ int i40e_count_filters(struct i40e_vsi *vsi) int bkt; int cnt = 0; - hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) - ++cnt; + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + if (f->state == I40E_FILTER_NEW || + f->state == I40E_FILTER_ACTIVE) + ++cnt; + } return cnt; } -- cgit v1.2.3 From f37c4eac99c258111d414d31b740437e1925b8e8 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Wed, 13 Mar 2024 10:56:39 +0100 Subject: i40e: fix vf may be used uninitialized in this function warning To fix the regression introduced by commit 52424f974bc5, which causes servers hang in very hard to reproduce conditions with resets races. Using two sources for the information is the root cause. In this function before the fix bumping v didn't mean bumping vf pointer. But the code used this variables interchangeably, so stale vf could point to different/not intended vf. Remove redundant "v" variable and iterate via single VF pointer across whole function instead to guarantee VF pointer validity. Fixes: 52424f974bc5 ("i40e: Fix VF hang when reset is triggered on another VF") Signed-off-by: Aleksandr Loktionov Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Przemek Kitszel Reviewed-by: Paul Menzel Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 34 ++++++++++------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 83a34e98bdc7..4c13f78af675 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -1624,8 +1624,8 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) { struct i40e_hw *hw = &pf->hw; struct i40e_vf *vf; - int i, v; u32 reg; + int i; /* If we don't have any VFs, then there is nothing to reset */ if (!pf->num_alloc_vfs) @@ -1636,11 +1636,10 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) return false; /* Begin reset on all VFs at once */ - for (v = 0; v < pf->num_alloc_vfs; v++) { - vf = &pf->vf[v]; + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* If VF is being reset no need to trigger reset again */ if (!test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) - i40e_trigger_vf_reset(&pf->vf[v], flr); + i40e_trigger_vf_reset(vf, flr); } /* HW requires some time to make sure it can flush the FIFO for a VF @@ -1649,14 +1648,13 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) * the VFs using a simple iterator that increments once that VF has * finished resetting. */ - for (i = 0, v = 0; i < 10 && v < pf->num_alloc_vfs; i++) { + for (i = 0, vf = &pf->vf[0]; i < 10 && vf < &pf->vf[pf->num_alloc_vfs]; ++i) { usleep_range(10000, 20000); /* Check each VF in sequence, beginning with the VF to fail * the previous check. */ - while (v < pf->num_alloc_vfs) { - vf = &pf->vf[v]; + while (vf < &pf->vf[pf->num_alloc_vfs]) { if (!test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) { reg = rd32(hw, I40E_VPGEN_VFRSTAT(vf->vf_id)); if (!(reg & I40E_VPGEN_VFRSTAT_VFRD_MASK)) @@ -1666,7 +1664,7 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) /* If the current VF has finished resetting, move on * to the next VF in sequence. */ - v++; + ++vf; } } @@ -1676,39 +1674,39 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) /* Display a warning if at least one VF didn't manage to reset in * time, but continue on with the operation. */ - if (v < pf->num_alloc_vfs) + if (vf < &pf->vf[pf->num_alloc_vfs]) dev_err(&pf->pdev->dev, "VF reset check timeout on VF %d\n", - pf->vf[v].vf_id); + vf->vf_id); usleep_range(10000, 20000); /* Begin disabling all the rings associated with VFs, but do not wait * between each VF. */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* On initial reset, we don't have any queues to disable */ - if (pf->vf[v].lan_vsi_idx == 0) + if (vf->lan_vsi_idx == 0) continue; /* If VF is reset in another thread just continue */ if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) continue; - i40e_vsi_stop_rings_no_wait(pf->vsi[pf->vf[v].lan_vsi_idx]); + i40e_vsi_stop_rings_no_wait(pf->vsi[vf->lan_vsi_idx]); } /* Now that we've notified HW to disable all of the VF rings, wait * until they finish. */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* On initial reset, we don't have any queues to disable */ - if (pf->vf[v].lan_vsi_idx == 0) + if (vf->lan_vsi_idx == 0) continue; /* If VF is reset in another thread just continue */ if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) continue; - i40e_vsi_wait_queues_disabled(pf->vsi[pf->vf[v].lan_vsi_idx]); + i40e_vsi_wait_queues_disabled(pf->vsi[vf->lan_vsi_idx]); } /* Hw may need up to 50ms to finish disabling the RX queues. We @@ -1717,12 +1715,12 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) mdelay(50); /* Finish the reset on each VF */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* If VF is reset in another thread just continue */ if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) continue; - i40e_cleanup_reset_vf(&pf->vf[v]); + i40e_cleanup_reset_vf(vf); } i40e_flush(hw); -- cgit v1.2.3 From b7c59b038c656214f56432867056997c2e0fc268 Mon Sep 17 00:00:00 2001 From: Yuquan Wang Date: Mon, 18 Mar 2024 10:29:28 +0800 Subject: cxl/mem: Fix for the index of Clear Event Record Handle The dev_dbg info for Clear Event Records mailbox command would report the handle of the next record to clear not the current one. This was because the index 'i' had incremented before printing the current handle value. Fixes: 6ebe28f9ec72 ("cxl/mem: Read, trace, and clear events on driver load") Signed-off-by: Yuquan Wang Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Reviewed-by: Fan Ni Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 9adda4795eb7..50146161887d 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -915,7 +915,7 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, payload->handles[i++] = gen->hdr.handle; dev_dbg(mds->cxlds.dev, "Event log '%d': Clearing %u\n", log, - le16_to_cpu(payload->handles[i])); + le16_to_cpu(payload->handles[i - 1])); if (i == max_handles) { payload->nr_recs = i; -- cgit v1.2.3 From 5c88a9ccd4c431d58b532e4158b6999a8350062c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 19 Mar 2024 11:15:08 -0700 Subject: cxl/core/regs: Fix usage of map->reg_type in cxl_decode_regblock() before assigned In the error path, map->reg_type is being used for kernel warning before its value is setup. Found by code inspection. Exposure to user is wrong reg_type being emitted via kernel log. Use a local var for reg_type and retrieve value for usage. Fixes: 6c7f4f1e51c2 ("cxl/core/regs: Make cxl_map_{component, device}_regs() device generic") Reviewed-by: Dan Williams Reviewed-by: Davidlohr Bueso Signed-off-by: Dave Jiang --- drivers/cxl/core/regs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 372786f80955..3c42f984eeaf 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -271,6 +271,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, CXL); static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, struct cxl_register_map *map) { + u8 reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); int bar = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo); u64 offset = ((u64)reg_hi << 32) | (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK); @@ -278,11 +279,11 @@ static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, if (offset > pci_resource_len(pdev, bar)) { dev_warn(&pdev->dev, "BAR%d: %pr: too small (offset: %pa, type: %d)\n", bar, - &pdev->resource[bar], &offset, map->reg_type); + &pdev->resource[bar], &offset, reg_type); return false; } - map->reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); + map->reg_type = reg_type; map->resource = pci_resource_start(pdev, bar) + offset; map->max_size = pci_resource_len(pdev, bar) - offset; return true; -- cgit v1.2.3 From 0462c56c290a99a7f03e817ae5b843116dfb575c Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 25 Mar 2024 16:21:25 +0100 Subject: driver core: Introduce device_link_wait_removal() The commit 80dd33cf72d1 ("drivers: base: Fix device link removal") introduces a workqueue to release the consumer and supplier devices used in the devlink. In the job queued, devices are release and in turn, when all the references to these devices are dropped, the release function of the device itself is called. Nothing is present to provide some synchronisation with this workqueue in order to ensure that all ongoing releasing operations are done and so, some other operations can be started safely. For instance, in the following sequence: 1) of_platform_depopulate() 2) of_overlay_remove() During the step 1, devices are released and related devlinks are removed (jobs pushed in the workqueue). During the step 2, OF nodes are destroyed but, without any synchronisation with devlink removal jobs, of_overlay_remove() can raise warnings related to missing of_node_put(): ERROR: memory leak, expected refcount 1 instead of 2 Indeed, the missing of_node_put() call is going to be done, too late, from the workqueue job execution. Introduce device_link_wait_removal() to offer a way to synchronize operations waiting for the end of devlink removals (i.e. end of workqueue jobs). Also, as a flushing operation is done on the workqueue, the workqueue used is moved from a system-wide workqueue to a local one. Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Tested-by: Luca Ceresoli Reviewed-by: Nuno Sa Reviewed-by: Saravana Kannan Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240325152140.198219-2-herve.codina@bootlin.com Signed-off-by: Rob Herring --- drivers/base/core.c | 26 +++++++++++++++++++++++--- include/linux/device.h | 1 + 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index b93f3c5716ae..5f4e03336e68 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -44,6 +44,7 @@ static bool fw_devlink_is_permissive(void); static void __fw_devlink_link_to_consumers(struct device *dev); static bool fw_devlink_drv_reg_done; static bool fw_devlink_best_effort; +static struct workqueue_struct *device_link_wq; /** * __fwnode_link_add - Create a link between two fwnode_handles. @@ -533,12 +534,26 @@ static void devlink_dev_release(struct device *dev) /* * It may take a while to complete this work because of the SRCU * synchronization in device_link_release_fn() and if the consumer or - * supplier devices get deleted when it runs, so put it into the "long" - * workqueue. + * supplier devices get deleted when it runs, so put it into the + * dedicated workqueue. */ - queue_work(system_long_wq, &link->rm_work); + queue_work(device_link_wq, &link->rm_work); } +/** + * device_link_wait_removal - Wait for ongoing devlink removal jobs to terminate + */ +void device_link_wait_removal(void) +{ + /* + * devlink removal jobs are queued in the dedicated work queue. + * To be sure that all removal jobs are terminated, ensure that any + * scheduled work has run to completion. + */ + flush_workqueue(device_link_wq); +} +EXPORT_SYMBOL_GPL(device_link_wait_removal); + static struct class devlink_class = { .name = "devlink", .dev_groups = devlink_groups, @@ -4164,9 +4179,14 @@ int __init devices_init(void) sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj); if (!sysfs_dev_char_kobj) goto char_kobj_err; + device_link_wq = alloc_workqueue("device_link_wq", 0, 0); + if (!device_link_wq) + goto wq_err; return 0; + wq_err: + kobject_put(sysfs_dev_char_kobj); char_kobj_err: kobject_put(sysfs_dev_block_kobj); block_kobj_err: diff --git a/include/linux/device.h b/include/linux/device.h index 97c4b046c09d..b9f5464f44ed 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1247,6 +1247,7 @@ void device_link_del(struct device_link *link); void device_link_remove(void *consumer, struct device *supplier); void device_links_supplier_sync_state_pause(void); void device_links_supplier_sync_state_resume(void); +void device_link_wait_removal(void); /* Create alias, so I can be autoloaded. */ #define MODULE_ALIAS_CHARDEV(major,minor) \ -- cgit v1.2.3 From 8917e7385346bd6584890ed362985c219fe6ae84 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 25 Mar 2024 16:21:26 +0100 Subject: of: dynamic: Synchronize of_changeset_destroy() with the devlink removals In the following sequence: 1) of_platform_depopulate() 2) of_overlay_remove() During the step 1, devices are destroyed and devlinks are removed. During the step 2, OF nodes are destroyed but __of_changeset_entry_destroy() can raise warnings related to missing of_node_put(): ERROR: memory leak, expected refcount 1 instead of 2 ... Indeed, during the devlink removals performed at step 1, the removal itself releasing the device (and the attached of_node) is done by a job queued in a workqueue and so, it is done asynchronously with respect to function calls. When the warning is present, of_node_put() will be called but wrongly too late from the workqueue job. In order to be sure that any ongoing devlink removals are done before the of_node destruction, synchronize the of_changeset_destroy() with the devlink removals. Fixes: 80dd33cf72d1 ("drivers: base: Fix device link removal") Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Reviewed-by: Saravana Kannan Tested-by: Luca Ceresoli Reviewed-by: Nuno Sa Link: https://lore.kernel.org/r/20240325152140.198219-3-herve.codina@bootlin.com Signed-off-by: Rob Herring --- drivers/of/dynamic.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c index 3bf27052832f..4d57a4e34105 100644 --- a/drivers/of/dynamic.c +++ b/drivers/of/dynamic.c @@ -9,6 +9,7 @@ #define pr_fmt(fmt) "OF: " fmt +#include #include #include #include @@ -667,6 +668,17 @@ void of_changeset_destroy(struct of_changeset *ocs) { struct of_changeset_entry *ce, *cen; + /* + * When a device is deleted, the device links to/from it are also queued + * for deletion. Until these device links are freed, the devices + * themselves aren't freed. If the device being deleted is due to an + * overlay change, this device might be holding a reference to a device + * node that will be freed. So, wait until all already pending device + * links are deleted before freeing a device node. This ensures we don't + * free any device node that has a non-zero reference count. + */ + device_link_wait_removal(); + list_for_each_entry_safe_reverse(ce, cen, &ocs->entries, node) __of_changeset_entry_destroy(ce); } -- cgit v1.2.3 From ad14f7ca9f0d9fdf73d1fd61aaf8248d46ffc849 Mon Sep 17 00:00:00 2001 From: Vladimir Isaev Date: Wed, 13 Mar 2024 10:35:46 +0300 Subject: riscv: hwprobe: do not produce frtace relocation Such relocation causes crash of android linker similar to one described in commit e05d57dcb8c7 ("riscv: Fixup __vdso_gettimeofday broke dynamic ftrace"). Looks like this relocation is added by CONFIG_DYNAMIC_FTRACE which is disabled in the default android kernel. Before: readelf -rW arch/riscv/kernel/vdso/vdso.so: Relocation section '.rela.dyn' at offset 0xd00 contains 1 entry: Offset Info Type 0000000000000d20 0000000000000003 R_RISCV_RELATIVE objdump: 0000000000000c86 <__vdso_riscv_hwprobe@@LINUX_4.15>: c86: 0001 nop c88: 0001 nop c8a: 0001 nop c8c: 0001 nop c8e: e211 bnez a2,c92 <__vdso_riscv_hwprobe... After: readelf -rW arch/riscv/kernel/vdso/vdso.so: There are no relocations in this file. objdump: 0000000000000c86 <__vdso_riscv_hwprobe@@LINUX_4.15>: c86: e211 bnez a2,c8a <__vdso_riscv_hwprobe... c88: c6b9 beqz a3,cd6 <__vdso_riscv_hwprobe... c8a: e739 bnez a4,cd8 <__vdso_riscv_hwprobe... c8c: ffffd797 auipc a5,0xffffd Also disable SCS since it also should not be available in vdso. Fixes: aa5af0aa90ba ("RISC-V: Add hwprobe vDSO function and data") Signed-off-by: Roman Artemev Signed-off-by: Vladimir Isaev Reviewed-by: Alexandre Ghiti Reviewed-by: Guo Ren Link: https://lore.kernel.org/r/20240313085843.17661-1-vladimir.isaev@syntacore.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vdso/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 9b517fe1b8a8..272c431ac5b9 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -37,6 +37,7 @@ endif # Disable -pg to prevent insert call site CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) +CFLAGS_REMOVE_hwprobe.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) # Disable profiling and instrumentation for VDSO code GCOV_PROFILE := n -- cgit v1.2.3 From 4b0bf9a0127029054c2fa18ba5b3f3ddc45f54ed Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2023 21:58:07 +0900 Subject: riscv: compat_vdso: install compat_vdso.so.dbg to /lib/modules/*/vdso/ 'make vdso_install' installs debug vdso files to /lib/modules/*/vdso/. Only for the compat vdso on riscv, the installation destination differs; compat_vdso.so.dbg is installed to /lib/module/*/compat_vdso/. To follow the standard install destination and simplify the vdso_install logic, change the install destination to standard /lib/modules/*/vdso/. Signed-off-by: Masahiro Yamada Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231117125807.1058477-1-masahiroy@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 252d63942f34..5b3115a19852 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -151,7 +151,7 @@ endif endif vdso-install-y += arch/riscv/kernel/vdso/vdso.so.dbg -vdso-install-$(CONFIG_COMPAT) += arch/riscv/kernel/compat_vdso/compat_vdso.so.dbg:../compat_vdso/compat_vdso.so +vdso-install-$(CONFIG_COMPAT) += arch/riscv/kernel/compat_vdso/compat_vdso.so.dbg ifneq ($(CONFIG_XIP_KERNEL),y) ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_ARCH_CANAAN),yy) -- cgit v1.2.3 From ea6873118493019474abbf57d5a800da365734df Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Tue, 12 Mar 2024 01:20:53 +0000 Subject: drivers/perf: riscv: Disable PERF_SAMPLE_BRANCH_* while not supported RISC-V perf driver does not yet support branch sampling. Although the specification is in the works [0], it is best to disable such events until support is available, otherwise we will get unexpected results. Due to this reason, two riscv bpf testcases get_branch_snapshot and perf_branches/perf_branches_hw fail. Link: https://github.com/riscv/riscv-control-transfer-records [0] Fixes: f5bfa23f576f ("RISC-V: Add a perf core library for pmu drivers") Signed-off-by: Pu Lehui Reviewed-by: Atish Patra Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240312012053.1178140-1-pulehui@huaweicloud.com Signed-off-by: Palmer Dabbelt --- drivers/perf/riscv_pmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c index c78a6fd6c57f..b4efdddb2ad9 100644 --- a/drivers/perf/riscv_pmu.c +++ b/drivers/perf/riscv_pmu.c @@ -313,6 +313,10 @@ static int riscv_pmu_event_init(struct perf_event *event) u64 event_config = 0; uint64_t cmask; + /* driver does not support branch stack sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + hwc->flags = 0; mapped_event = rvpmu->event_map(event, &event_config); if (mapped_event < 0) { -- cgit v1.2.3 From 653650c468be211752aa56eae79af1ae67c5e70c Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Tue, 26 Mar 2024 15:37:13 +0000 Subject: riscv: Mark __se_sys_* functions __used Clang doesn't think ___se_sys_* functions used even though they are aliased to __se_sys_*, resulting in -Wunused-function warnings when building rv32. For example: mm/oom_kill.c:1195:1: warning: unused function '___se_sys_process_mrelease' [-Wunused-function] 1195 | SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/syscalls.h:221:36: note: expanded from macro 'SYSCALL_DEFINE2' 221 | #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/syscalls.h:231:2: note: expanded from macro 'SYSCALL_DEFINEx' 231 | __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/riscv/include/asm/syscall_wrapper.h:81:2: note: expanded from macro '__SYSCALL_DEFINEx' 81 | __SYSCALL_SE_DEFINEx(x, sys, name, __VA_ARGS__) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/riscv/include/asm/syscall_wrapper.h:40:14: note: expanded from macro '__SYSCALL_SE_DEFINEx' 40 | static long ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)) | ^~~~~~~~~~~~~~~~~~~~ :30:1: note: expanded from here 30 | ___se_sys_process_mrelease | ^~~~~~~~~~~~~~~~~~~~~~~~~~ 1 warning generated. Mark the functions __used explicitly to fix the Clang warnings. Fixes: a9ad73295cc1 ("riscv: Fix syscall wrapper for >word-size arguments") Reported-by: Linux Kernel Functional Testing Tested-by: Linux Kernel Functional Testing Signed-off-by: Sami Tolvanen Reviewed-by: Alexandre Ghiti Tested-by: Conor Dooley Link: https://lore.kernel.org/r/20240326153712.1839482-2-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/syscall_wrapper.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/syscall_wrapper.h b/arch/riscv/include/asm/syscall_wrapper.h index 980094c2e976..ac80216549ff 100644 --- a/arch/riscv/include/asm/syscall_wrapper.h +++ b/arch/riscv/include/asm/syscall_wrapper.h @@ -36,7 +36,8 @@ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *); ulong) \ __attribute__((alias(__stringify(___se_##prefix##name)))); \ __diag_pop(); \ - static long noinline ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static long noinline ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + __used; \ static long ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)) #define SC_RISCV_REGS_TO_ARGS(x, ...) \ -- cgit v1.2.3 From ddd65e19c60140673ea9f7249af0a672f1820623 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 23 Mar 2024 17:11:19 +0100 Subject: block: handle BLK_OPEN_RESTRICT_WRITES correctly Last kernel release we introduce CONFIG_BLK_DEV_WRITE_MOUNTED. By default this option is set. When it is set the long-standing behavior of being able to write to mounted block devices is enabled. But in order to guard against unintended corruption by writing to the block device buffer cache CONFIG_BLK_DEV_WRITE_MOUNTED can be turned off. In that case it isn't possible to write to mounted block devices anymore. A filesystem may open its block devices with BLK_OPEN_RESTRICT_WRITES which disallows concurrent BLK_OPEN_WRITE access. When we still had the bdev handle around we could recognize BLK_OPEN_RESTRICT_WRITES because the mode was passed around. Since we managed to get rid of the bdev handle we changed that logic to recognize BLK_OPEN_RESTRICT_WRITES based on whether the file was opened writable and writes to that block device are blocked. That logic doesn't work because we do allow BLK_OPEN_RESTRICT_WRITES to be specified without BLK_OPEN_WRITE. Fix the detection logic and use an FMODE_* bit. We could've also abused O_EXCL as an indicator that BLK_OPEN_RESTRICT_WRITES has been requested. For userspace open paths O_EXCL will never be retained but for internal opens where we open files that are never installed into a file descriptor table this is fine. But it would be a gamble that this doesn't cause bugs. Note that BLK_OPEN_RESTRICT_WRITES is an internal only flag that cannot directly be raised by userspace. It is implicitly raised during mounting. Passes xftests and blktests with CONFIG_BLK_DEV_WRITE_MOUNTED set and unset. Link: https://lore.kernel.org/r/ZfyyEwu9Uq5Pgb94@casper.infradead.org Link: https://lore.kernel.org/r/20240323-zielbereich-mittragen-6fdf14876c3e@brauner Fixes: 321de651fa56 ("block: don't rely on BLK_OPEN_RESTRICT_WRITES when yielding write access") Reviewed-by: Yu Kuai Reviewed-by: Jan Kara Reported-by: Matthew Wilcox Signed-off-by: Christian Brauner --- block/bdev.c | 14 +++++++------- include/linux/fs.h | 2 ++ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 7a5f611c3d2e..ab9cae0e05f1 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -821,13 +821,11 @@ static void bdev_yield_write_access(struct file *bdev_file) return; bdev = file_bdev(bdev_file); - /* Yield exclusive or shared write access. */ - if (bdev_file->f_mode & FMODE_WRITE) { - if (bdev_writes_blocked(bdev)) - bdev_unblock_writes(bdev); - else - bdev->bd_writers--; - } + + if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED) + bdev_unblock_writes(bdev); + else if (bdev_file->f_mode & FMODE_WRITE) + bdev->bd_writers--; } /** @@ -907,6 +905,8 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; if (bdev_nowait(bdev)) bdev_file->f_mode |= FMODE_NOWAIT; + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_file->f_mode |= FMODE_WRITE_RESTRICTED; bdev_file->f_mapping = bdev->bd_inode->i_mapping; bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); bdev_file->private_data = holder; diff --git a/include/linux/fs.h b/include/linux/fs.h index 00fc429b0af0..8dfd53b52744 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -121,6 +121,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_PWRITE ((__force fmode_t)0x10) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)0x20) +/* File writes are restricted (block device specific) */ +#define FMODE_WRITE_RESTRICTED ((__force fmode_t)0x40) /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)0x200) /* 64bit hashes as llseek() offset (for directories) */ -- cgit v1.2.3 From 3ff56e285de5a375fbfab3c3f1af81bbd23db36d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 23 Mar 2024 17:11:20 +0100 Subject: block: count BLK_OPEN_RESTRICT_WRITES openers The original changes in v6.8 do allow for a block device to be reopened with BLK_OPEN_RESTRICT_WRITES provided the same holder is used as per bdev_may_open(). I think this has a bug. The first opener @f1 of that block device will set bdev->bd_writers to -1. The second opener @f2 using the same holder will pass the check in bdev_may_open() that bdev->bd_writers must not be greater than zero. The first opener @f1 now closes the block device and in bdev_release() will end up calling bdev_yield_write_access() which calls bdev_writes_blocked() and sets bdev->bd_writers to 0 again. Now @f2 holds a file to that block device which was opened with exclusive write access but bdev->bd_writers has been reset to 0. So now @f3 comes along and succeeds in opening the block device with BLK_OPEN_WRITE betraying @f2's request to have exclusive write access. This isn't a practical issue yet because afaict there's no codepath inside the kernel that reopenes the same block device with BLK_OPEN_RESTRICT_WRITES but it will be if there is. Fix this by counting the number of BLK_OPEN_RESTRICT_WRITES openers. So we only allow writes again once all BLK_OPEN_RESTRICT_WRITES openers are done. Link: https://lore.kernel.org/r/20240323-abtauchen-klauen-c2953810082d@brauner Fixes: ed5cc702d311 ("block: Add config option to not allow writing to mounted devices") Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/bdev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index ab9cae0e05f1..a1946a902df3 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -776,17 +776,17 @@ void blkdev_put_no_open(struct block_device *bdev) static bool bdev_writes_blocked(struct block_device *bdev) { - return bdev->bd_writers == -1; + return bdev->bd_writers < 0; } static void bdev_block_writes(struct block_device *bdev) { - bdev->bd_writers = -1; + bdev->bd_writers--; } static void bdev_unblock_writes(struct block_device *bdev) { - bdev->bd_writers = 0; + bdev->bd_writers++; } static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode) -- cgit v1.2.3 From 22650a99821dda3d05f1c334ea90330b4982de56 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 26 Mar 2024 13:47:22 +0100 Subject: fs,block: yield devices early Currently a device is only really released once the umount returns to userspace due to how file closing works. That ultimately could cause an old umount assumption to be violated that concurrent umount and mount don't fail. So an exclusively held device with a temporary holder should be yielded before the filesystem is gone. Add a helper that allows callers to do that. This also allows us to remove the two holder ops that Linus wasn't excited about. Link: https://lore.kernel.org/r/20240326-vfs-bdev-end_holder-v1-1-20af85202918@kernel.org Fixes: f3a608827d1f ("bdev: open block device as files") # mainline only Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner --- block/bdev.c | 64 +++++++++++++++++++++++++++++++++++------ drivers/mtd/devices/block2mtd.c | 2 +- fs/bcachefs/super-io.c | 2 +- fs/cramfs/inode.c | 2 +- fs/ext4/super.c | 8 +++--- fs/f2fs/super.c | 2 +- fs/jfs/jfs_logmgr.c | 4 +-- fs/reiserfs/journal.c | 2 +- fs/romfs/super.c | 2 +- fs/super.c | 24 ++-------------- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_super.c | 6 ++-- include/linux/blkdev.h | 11 +------ 13 files changed, 76 insertions(+), 55 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index a1946a902df3..b8e32d933a63 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -583,9 +583,6 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder, mutex_unlock(&bdev->bd_holder_lock); bd_clear_claiming(whole, holder); mutex_unlock(&bdev_lock); - - if (hops && hops->get_holder) - hops->get_holder(holder); } /** @@ -608,7 +605,6 @@ EXPORT_SYMBOL(bd_abort_claiming); static void bd_end_claim(struct block_device *bdev, void *holder) { struct block_device *whole = bdev_whole(bdev); - const struct blk_holder_ops *hops = bdev->bd_holder_ops; bool unblock = false; /* @@ -631,9 +627,6 @@ static void bd_end_claim(struct block_device *bdev, void *holder) whole->bd_holder = NULL; mutex_unlock(&bdev_lock); - if (hops && hops->put_holder) - hops->put_holder(holder); - /* * If this was the last claim, remove holder link and unblock evpoll if * it was a write holder. @@ -813,6 +806,11 @@ static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) bdev->bd_writers++; } +static inline bool bdev_unclaimed(const struct file *bdev_file) +{ + return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host); +} + static void bdev_yield_write_access(struct file *bdev_file) { struct block_device *bdev; @@ -820,6 +818,9 @@ static void bdev_yield_write_access(struct file *bdev_file) if (bdev_allow_write_mounted) return; + if (bdev_unclaimed(bdev_file)) + return; + bdev = file_bdev(bdev_file); if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED) @@ -1012,6 +1013,20 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, } EXPORT_SYMBOL(bdev_file_open_by_path); +static inline void bd_yield_claim(struct file *bdev_file) +{ + struct block_device *bdev = file_bdev(bdev_file); + void *holder = bdev_file->private_data; + + lockdep_assert_held(&bdev->bd_disk->open_mutex); + + if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder))) + return; + + if (!bdev_unclaimed(bdev_file)) + bd_end_claim(bdev, holder); +} + void bdev_release(struct file *bdev_file) { struct block_device *bdev = file_bdev(bdev_file); @@ -1036,7 +1051,7 @@ void bdev_release(struct file *bdev_file) bdev_yield_write_access(bdev_file); if (holder) - bd_end_claim(bdev, holder); + bd_yield_claim(bdev_file); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE @@ -1056,6 +1071,39 @@ put_no_open: blkdev_put_no_open(bdev); } +/** + * bdev_fput - yield claim to the block device and put the file + * @bdev_file: open block device + * + * Yield claim on the block device and put the file. Ensure that the + * block device can be reclaimed before the file is closed which is a + * deferred operation. + */ +void bdev_fput(struct file *bdev_file) +{ + if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops)) + return; + + if (bdev_file->private_data) { + struct block_device *bdev = file_bdev(bdev_file); + struct gendisk *disk = bdev->bd_disk; + + mutex_lock(&disk->open_mutex); + bdev_yield_write_access(bdev_file); + bd_yield_claim(bdev_file); + /* + * Tell release we already gave up our hold on the + * device and if write restrictions are available that + * we already gave up write access to the device. + */ + bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host); + mutex_unlock(&disk->open_mutex); + } + + fput(bdev_file); +} +EXPORT_SYMBOL(bdev_fput); + /** * lookup_bdev() - Look up a struct block_device by name. * @pathname: Name of the block device in the filesystem. diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index 97a00ec9a4d4..caacdc0a3819 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -209,7 +209,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) if (dev->bdev_file) { invalidate_mapping_pages(dev->bdev_file->f_mapping, 0, -1); - fput(dev->bdev_file); + bdev_fput(dev->bdev_file); } kfree(dev); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ad28e370b640..cb7b4de11a49 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -143,7 +143,7 @@ void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); if (!IS_ERR_OR_NULL(sb->s_bdev_file)) - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); kfree(sb->holder); kfree(sb->sb_name); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 39e75131fd5a..9901057a15ba 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb) sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); } kfree(sbi); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cfb8449c731f..044135796f2b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5668,7 +5668,7 @@ failed_mount: brelse(sbi->s_sbh); if (sbi->s_journal_bdev_file) { invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); - fput(sbi->s_journal_bdev_file); + bdev_fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); @@ -5913,7 +5913,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, out_bh: brelse(bh); out_bdev: - fput(bdev_file); + bdev_fput(bdev_file); return ERR_PTR(errno); } @@ -5952,7 +5952,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, out_journal: jbd2_journal_destroy(journal); out_bdev: - fput(bdev_file); + bdev_fput(bdev_file); return ERR_PTR(errno); } @@ -7327,7 +7327,7 @@ static void ext4_kill_sb(struct super_block *sb) kill_block_super(sb); if (bdev_file) - fput(bdev_file); + bdev_fput(bdev_file); } static struct file_system_type ext4_fs_type = { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a6867f26f141..a4bc26dfdb1a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1558,7 +1558,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) for (i = 0; i < sbi->s_ndevs; i++) { if (i > 0) - fput(FDEV(i).bdev_file); + bdev_fput(FDEV(i).bdev_file); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 73389c68e251..9609349e92e5 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1141,7 +1141,7 @@ journal_found: lbmLogShutdown(log); close: /* close external log device */ - fput(bdev_file); + bdev_fput(bdev_file); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1485,7 +1485,7 @@ int lmLogClose(struct super_block *sb) bdev_file = log->bdev_file; rc = lmLogShutdown(log); - fput(bdev_file); + bdev_fput(bdev_file); kfree(log); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 6474529c4253..e539ccd39e1e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2589,7 +2589,7 @@ static void journal_list_init(struct super_block *sb) static void release_journal_dev(struct reiserfs_journal *journal) { if (journal->j_bdev_file) { - fput(journal->j_bdev_file); + bdev_fput(journal->j_bdev_file); journal->j_bdev_file = NULL; } } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 2be227532f39..2cbb92462074 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -594,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb) #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { sync_blockdev(sb->s_bdev); - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); } #endif } diff --git a/fs/super.c b/fs/super.c index 71d9779c42b1..69ce6c600968 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1515,29 +1515,11 @@ static int fs_bdev_thaw(struct block_device *bdev) return error; } -static void fs_bdev_super_get(void *data) -{ - struct super_block *sb = data; - - spin_lock(&sb_lock); - sb->s_count++; - spin_unlock(&sb_lock); -} - -static void fs_bdev_super_put(void *data) -{ - struct super_block *sb = data; - - put_super(sb); -} - const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, .freeze = fs_bdev_freeze, .thaw = fs_bdev_thaw, - .get_holder = fs_bdev_super_get, - .put_holder = fs_bdev_super_put, }; EXPORT_SYMBOL_GPL(fs_holder_ops); @@ -1562,7 +1544,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - fput(bdev_file); + bdev_fput(bdev_file); return -EACCES; } @@ -1573,7 +1555,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - fput(bdev_file); + bdev_fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); @@ -1693,7 +1675,7 @@ void kill_block_super(struct super_block *sb) generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); } } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1a18c381127e..f0fa02264eda 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2030,7 +2030,7 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) - fput(btp->bt_bdev_file); + bdev_fput(btp->bt_bdev_file); kfree(btp); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c21f10ab0f5d..bce020374c5e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -485,7 +485,7 @@ xfs_open_devices( mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ if (logdev_file) - fput(logdev_file); + bdev_fput(logdev_file); } return 0; @@ -497,10 +497,10 @@ xfs_open_devices( xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: if (rtdev_file) - fput(rtdev_file); + bdev_fput(rtdev_file); out_close_logdev: if (logdev_file) - fput(logdev_file); + bdev_fput(logdev_file); return error; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be..172c91879999 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1505,16 +1505,6 @@ struct blk_holder_ops { * Thaw the file system mounted on the block device. */ int (*thaw)(struct block_device *bdev); - - /* - * If needed, get a reference to the holder. - */ - void (*get_holder)(void *holder); - - /* - * Release the holder. - */ - void (*put_holder)(void *holder); }; /* @@ -1585,6 +1575,7 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev) int bdev_freeze(struct block_device *bdev); int bdev_thaw(struct block_device *bdev); +void bdev_fput(struct file *bdev_file); struct io_comp_batch { struct request *req_list; -- cgit v1.2.3 From 13dddf9319808badd2c1f5d7007b4e82838a648e Mon Sep 17 00:00:00 2001 From: Victor Isaev Date: Fri, 15 Dec 2023 23:27:20 -0500 Subject: RISC-V: Update AT_VECTOR_SIZE_ARCH for new AT_MINSIGSTKSZ "riscv: signal: Report signal frame size to userspace via auxv" (e92f469) has added new constant AT_MINSIGSTKSZ but failed to increment the size of auxv, keeping AT_VECTOR_SIZE_ARCH at 9. This fix correctly increments AT_VECTOR_SIZE_ARCH to 10, following the approach in the commit 94b07c1 ("arm64: signal: Report signal frame size to userspace via auxv"). Link: https://lore.kernel.org/r/73883406.20231215232720@torrio.net Link: https://lore.kernel.org/all/20240102133617.3649-1-victor@torrio.net/ Reported-by: Ivan Komarov Closes: https://lore.kernel.org/linux-riscv/CY3Z02NYV1C4.11BLB9PLVW9G1@fedora/ Fixes: e92f469b0771 ("riscv: signal: Report signal frame size to userspace via auxv") Signed-off-by: Victor Isaev Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/auxvec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/uapi/asm/auxvec.h b/arch/riscv/include/uapi/asm/auxvec.h index 10aaa83db89e..95050ebe9ad0 100644 --- a/arch/riscv/include/uapi/asm/auxvec.h +++ b/arch/riscv/include/uapi/asm/auxvec.h @@ -34,7 +34,7 @@ #define AT_L3_CACHEGEOMETRY 47 /* entries in ARCH_DLINFO */ -#define AT_VECTOR_SIZE_ARCH 9 +#define AT_VECTOR_SIZE_ARCH 10 #define AT_MINSIGSTKSZ 51 #endif /* _UAPI_ASM_RISCV_AUXVEC_H */ -- cgit v1.2.3 From 7115ff4a8bfed3b9294bad2e111744e6abeadf1a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2023 21:58:43 +0900 Subject: riscv: compat_vdso: align VDSOAS build log Add one more space after "VDSOAS" for better alignment in the build log. [Before] LDS arch/riscv/kernel/compat_vdso/compat_vdso.lds VDSOAS arch/riscv/kernel/compat_vdso/rt_sigreturn.o VDSOAS arch/riscv/kernel/compat_vdso/getcpu.o VDSOAS arch/riscv/kernel/compat_vdso/flush_icache.o VDSOAS arch/riscv/kernel/compat_vdso/note.o VDSOLD arch/riscv/kernel/compat_vdso/compat_vdso.so.dbg VDSOSYM include/generated/compat_vdso-offsets.h [After] LDS arch/riscv/kernel/compat_vdso/compat_vdso.lds VDSOAS arch/riscv/kernel/compat_vdso/rt_sigreturn.o VDSOAS arch/riscv/kernel/compat_vdso/getcpu.o VDSOAS arch/riscv/kernel/compat_vdso/flush_icache.o VDSOAS arch/riscv/kernel/compat_vdso/note.o VDSOLD arch/riscv/kernel/compat_vdso/compat_vdso.so.dbg VDSOSYM include/generated/compat_vdso-offsets.h Signed-off-by: Masahiro Yamada Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231117125843.1058553-1-masahiroy@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/compat_vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/compat_vdso/Makefile b/arch/riscv/kernel/compat_vdso/Makefile index 62fa393b2eb2..3df4cb788c1f 100644 --- a/arch/riscv/kernel/compat_vdso/Makefile +++ b/arch/riscv/kernel/compat_vdso/Makefile @@ -74,5 +74,5 @@ quiet_cmd_compat_vdsold = VDSOLD $@ rm $@.tmp # actual build commands -quiet_cmd_compat_vdsoas = VDSOAS $@ +quiet_cmd_compat_vdsoas = VDSOAS $@ cmd_compat_vdsoas = $(COMPAT_CC) $(a_flags) $(COMPAT_CC_FLAGS) -c -o $@ $< -- cgit v1.2.3 From 0ffe1ae7026dd129d86318388ed62ba61f085730 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 22 Nov 2023 00:06:37 +0800 Subject: riscv: mm: implement pgprot_nx commit cca98e9f8b5e ("mm: enforce that vmap can't map pages executable") enforces the W^X protection by not allowing remapping existing pages as executable. Add riscv bits so that riscv can benefit the same protection. Signed-off-by: Jisheng Zhang Reviewed-by: Samuel Holland Tested-by: Samuel Holland Reviewed-by: Christoph Hellwig Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231121160637.3856-1-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 97fcde30e247..9f8ea0e33eb1 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -593,6 +593,12 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, return ptep_test_and_clear_young(vma, address, ptep); } +#define pgprot_nx pgprot_nx +static inline pgprot_t pgprot_nx(pgprot_t _prot) +{ + return __pgprot(pgprot_val(_prot) & ~_PAGE_EXEC); +} + #define pgprot_noncached pgprot_noncached static inline pgprot_t pgprot_noncached(pgprot_t _prot) { -- cgit v1.2.3 From ad91c1d77fd0a489706b7b784a70e464a4a03490 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 26 Mar 2024 18:46:30 +0100 Subject: dt-bindings: ufs: qcom: document SC8180X UFS Document already upstreamed and used Qualcomm SC8180x UFS host controller to fix dtbs_check warnings like: sc8180x-primus.dtb: ufshc@1d84000: compatible:0: 'qcom,sc8180x-ufshc' is not one of ['qcom,msm8994-ufshc', ... ] sc8180x-primus.dtb: ufshc@1d84000: Unevaluated properties are not allowed ('compatible' was unexpected) Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240326174632.209745-1-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/ufs/qcom,ufs.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml index 10c146424baa..1ab3d16917ac 100644 --- a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml +++ b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml @@ -28,6 +28,7 @@ properties: - qcom,msm8998-ufshc - qcom,sa8775p-ufshc - qcom,sc7280-ufshc + - qcom,sc8180x-ufshc - qcom,sc8280xp-ufshc - qcom,sdm845-ufshc - qcom,sm6115-ufshc @@ -120,6 +121,7 @@ allOf: - qcom,msm8998-ufshc - qcom,sa8775p-ufshc - qcom,sc7280-ufshc + - qcom,sc8180x-ufshc - qcom,sc8280xp-ufshc - qcom,sm8250-ufshc - qcom,sm8350-ufshc -- cgit v1.2.3 From 7fb5aafc0a702c4c0bb22410d1e67a732e320511 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 26 Mar 2024 18:46:31 +0100 Subject: dt-bindings: ufs: qcom: document SC7180 UFS Document already upstreamed and used Qualcomm SC7180 UFS host controller to fix dtbs_check warnings like: sc7180-idp.dtb: ufshc@1d84000: compatible:0: 'qcom,sc7180-ufshc' is not one of ... sc7180-idp.dtb: ufshc@1d84000: clocks: [[39, 99], [39, 7], [39, 98], [39, 107], [36, 0], [39, 106], [39, 105]] is too short sc7180-idp.dtb: ufshc@1d84000: clock-names: ['core_clk', 'bus_aggr_clk', 'iface_clk', 'core_clk_unipro', ...] is too short Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240326174632.209745-2-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- .../devicetree/bindings/ufs/qcom,ufs.yaml | 34 +++++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml index 1ab3d16917ac..7e6d442545ad 100644 --- a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml +++ b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml @@ -27,6 +27,7 @@ properties: - qcom,msm8996-ufshc - qcom,msm8998-ufshc - qcom,sa8775p-ufshc + - qcom,sc7180-ufshc - qcom,sc7280-ufshc - qcom,sc8180x-ufshc - qcom,sc8280xp-ufshc @@ -43,11 +44,11 @@ properties: - const: jedec,ufs-2.0 clocks: - minItems: 8 + minItems: 7 maxItems: 11 clock-names: - minItems: 8 + minItems: 7 maxItems: 11 dma-coherent: true @@ -113,6 +114,31 @@ required: allOf: - $ref: ufs-common.yaml + - if: + properties: + compatible: + contains: + enum: + - qcom,sc7180-ufshc + then: + properties: + clocks: + minItems: 7 + maxItems: 7 + clock-names: + items: + - const: core_clk + - const: bus_aggr_clk + - const: iface_clk + - const: core_clk_unipro + - const: ref_clk + - const: tx_lane0_sync_clk + - const: rx_lane0_sync_clk + reg: + maxItems: 1 + reg-names: + maxItems: 1 + - if: properties: compatible: @@ -250,7 +276,7 @@ allOf: reg: maxItems: 1 clocks: - minItems: 8 + minItems: 7 maxItems: 8 else: properties: @@ -258,7 +284,7 @@ allOf: minItems: 1 maxItems: 2 clocks: - minItems: 8 + minItems: 7 maxItems: 11 unevaluatedProperties: false -- cgit v1.2.3 From b5237d0bdb3cb164b7792cc4f1ff2ecafbfac661 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 26 Mar 2024 18:46:32 +0100 Subject: dt-bindings: ufs: qcom: document SM6125 UFS Document already upstreamed and used Qualcomm SM6125 UFS host controller to fix dtbs_check warnings like: sm6125-xiaomi-laurel-sprout.dtb: ufs@4804000: compatible:0: 'qcom,sm6125-ufshc' is not one of ['qcom,msm8994-ufshc', ... sm6125-xiaomi-laurel-sprout.dtb: ufs@4804000: Unevaluated properties are not allowed ('compatible' was unexpected) Signed-off-by: Krzysztof Kozlowski Acked-by: Krzysztof Kozlowski Reviewed-by: Krzysztof Kozlowski Reviewed-by: Martin Botka Link: https://lore.kernel.org/r/20240326174632.209745-3-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/ufs/qcom,ufs.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml index 7e6d442545ad..cd3680dc002f 100644 --- a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml +++ b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml @@ -33,6 +33,7 @@ properties: - qcom,sc8280xp-ufshc - qcom,sdm845-ufshc - qcom,sm6115-ufshc + - qcom,sm6125-ufshc - qcom,sm6350-ufshc - qcom,sm8150-ufshc - qcom,sm8250-ufshc @@ -243,6 +244,7 @@ allOf: contains: enum: - qcom,sm6115-ufshc + - qcom,sm6125-ufshc then: properties: clocks: -- cgit v1.2.3 From 4af565de9f8c74b9f6035924ce0d40adec211246 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Wed, 27 Mar 2024 16:16:53 +0530 Subject: ASoC: amd: acp: fix for acp pdm configuration check ACP PDM configuration has to be verified for all combinations. Remove FLAG_AMD_LEGACY_ONLY_DMIC check. Fixes: 3a94c8ad0aae ("ASoC: amd: acp: add code for scanning acp pdm controller") Signed-off-by: Vijendar Mukunda Link: https://msgid.link/r/20240327104657.3537664-2-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-pci.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sound/soc/amd/acp/acp-pci.c b/sound/soc/amd/acp/acp-pci.c index 8c8b1dcac628..440b91d4f261 100644 --- a/sound/soc/amd/acp/acp-pci.c +++ b/sound/soc/amd/acp/acp-pci.c @@ -133,11 +133,9 @@ static int acp_pci_probe(struct pci_dev *pci, const struct pci_device_id *pci_id } } - if (flag == FLAG_AMD_LEGACY_ONLY_DMIC) { - ret = check_acp_pdm(pci, chip); - if (ret < 0) - goto skip_pdev_creation; - } + ret = check_acp_pdm(pci, chip); + if (ret < 0) + goto skip_pdev_creation; chip->flag = flag; memset(&pdevinfo, 0, sizeof(pdevinfo)); -- cgit v1.2.3 From 00bb549d7d63a21532e76e4a334d7807a54d9f31 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 27 Mar 2024 11:44:06 +0000 Subject: regmap: maple: Fix cache corruption in regcache_maple_drop() When keeping the upper end of a cache block entry, the entry[] array must be indexed by the offset from the base register of the block, i.e. max - mas.index. The code was indexing entry[] by only the register address, leading to an out-of-bounds access that copied some part of the kernel memory over the cache contents. This bug was not detected by the regmap KUnit test because it only tests with a block of registers starting at 0, so mas.index == 0. Signed-off-by: Richard Fitzgerald Fixes: f033c26de5a5 ("regmap: Add maple tree based register cache") Link: https://msgid.link/r/20240327114406.976986-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/base/regmap/regcache-maple.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regcache-maple.c b/drivers/base/regmap/regcache-maple.c index 41edd6a430eb..c1776127a572 100644 --- a/drivers/base/regmap/regcache-maple.c +++ b/drivers/base/regmap/regcache-maple.c @@ -145,7 +145,7 @@ static int regcache_maple_drop(struct regmap *map, unsigned int min, upper_index = max + 1; upper_last = mas.last; - upper = kmemdup(&entry[max + 1], + upper = kmemdup(&entry[max - mas.index + 1], ((mas.last - max) * sizeof(unsigned long)), map->alloc_flags); -- cgit v1.2.3 From 3cf5abf2860bc538620fc3dfca06f403964b787b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 26 Mar 2024 14:21:30 +0530 Subject: MAINTAINERS: Drop Gustavo Pimentel as PCI DWC Maintainer Gustavo Pimentel seems to have left Synopsys, so his email is bouncing. And there is no indication from him expressing willingless to continue contributing to the driver. Drop him from the MAINTAINERS entry and add a CREDITS entry. Link: https://lore.kernel.org/r/20240326085130.12487-1-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam [bhelgaas: add CREDITS entry] Signed-off-by: Bjorn Helgaas --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index c55c5a0ee4ff..0107047f807b 100644 --- a/CREDITS +++ b/CREDITS @@ -3146,6 +3146,10 @@ S: Triftstra=DFe 55 S: 13353 Berlin S: Germany +N: Gustavo Pimental +E: gustavo.pimentel@synopsys.com +D: PCI driver for Synopsys DesignWare + N: Emanuel Pirker E: epirker@edu.uni-klu.ac.at D: AIC5800 IEEE 1394, RAW I/O on 1394 diff --git a/MAINTAINERS b/MAINTAINERS index aa3b947fb080..f61540722269 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16966,7 +16966,6 @@ F: drivers/pci/controller/dwc/pci-exynos.c PCI DRIVER FOR SYNOPSYS DESIGNWARE M: Jingoo Han -M: Gustavo Pimentel M: Manivannan Sadhasivam L: linux-pci@vger.kernel.org S: Maintained -- cgit v1.2.3 From 6dbdd4de0362c37e54e8b049781402e5a409e7d0 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Thu, 4 Jan 2024 16:16:52 +0200 Subject: e1000e: Workaround for sporadic MDI error on Meteor Lake systems On some Meteor Lake systems accessing the PHY via the MDIO interface may result in an MDI error. This issue happens sporadically and in most cases a second access to the PHY via the MDIO interface results in success. As a workaround, introduce a retry counter which is set to 3 on Meteor Lake systems. The driver will only return an error if 3 consecutive PHY access attempts fail. The retry mechanism is disabled in specific flows, where MDI errors are expected. Fixes: cc23f4f0b6b9 ("e1000e: Add support for Meteor Lake") Suggested-by: Nikolay Mushayev Co-developed-by: Nir Efrati Signed-off-by: Nir Efrati Signed-off-by: Vitaly Lifshits Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/hw.h | 2 + drivers/net/ethernet/intel/e1000e/ich8lan.c | 33 +++++ drivers/net/ethernet/intel/e1000e/phy.c | 182 +++++++++++++++++----------- drivers/net/ethernet/intel/e1000e/phy.h | 2 + 4 files changed, 150 insertions(+), 69 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/hw.h b/drivers/net/ethernet/intel/e1000e/hw.h index 1fef6bb5a5fb..4b6e7536170a 100644 --- a/drivers/net/ethernet/intel/e1000e/hw.h +++ b/drivers/net/ethernet/intel/e1000e/hw.h @@ -628,6 +628,7 @@ struct e1000_phy_info { u32 id; u32 reset_delay_us; /* in usec */ u32 revision; + u32 retry_count; enum e1000_media_type media_type; @@ -644,6 +645,7 @@ struct e1000_phy_info { bool polarity_correction; bool speed_downgraded; bool autoneg_wait_to_complete; + bool retry_enabled; }; struct e1000_nvm_info { diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 19e450a5bd31..d8e97669f31b 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -222,11 +222,18 @@ out: if (hw->mac.type >= e1000_pch_lpt) { /* Only unforce SMBus if ME is not active */ if (!(er32(FWSM) & E1000_ICH_FWSM_FW_VALID)) { + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* Unforce SMBus mode in PHY */ e1e_rphy_locked(hw, CV_SMB_CTRL, &phy_reg); phy_reg &= ~CV_SMB_CTRL_FORCE_SMBUS; e1e_wphy_locked(hw, CV_SMB_CTRL, phy_reg); + e1000e_enable_phy_retry(hw); + /* Unforce SMBus mode in MAC */ mac_reg = er32(CTRL_EXT); mac_reg &= ~E1000_CTRL_EXT_FORCE_SMBUS; @@ -310,6 +317,11 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) goto out; } + /* There is no guarantee that the PHY is accessible at this time + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* The MAC-PHY interconnect may be in SMBus mode. If the PHY is * inaccessible and resetting the PHY is not blocked, toggle the * LANPHYPC Value bit to force the interconnect to PCIe mode. @@ -380,6 +392,8 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) break; } + e1000e_enable_phy_retry(hw); + hw->phy.ops.release(hw); if (!ret_val) { @@ -449,6 +463,11 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw) phy->id = e1000_phy_unknown; + if (hw->mac.type == e1000_pch_mtp) { + phy->retry_count = 2; + e1000e_enable_phy_retry(hw); + } + ret_val = e1000_init_phy_workarounds_pchlan(hw); if (ret_val) return ret_val; @@ -1146,6 +1165,11 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) if (ret_val) goto out; + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* Force SMBus mode in PHY */ ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); if (ret_val) @@ -1153,6 +1177,8 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); + e1000e_enable_phy_retry(hw); + /* Force SMBus mode in MAC */ mac_reg = er32(CTRL_EXT); mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; @@ -1313,6 +1339,11 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force) /* Toggle LANPHYPC Value bit */ e1000_toggle_lanphypc_pch_lpt(hw); + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* Unforce SMBus mode in PHY */ ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); if (ret_val) { @@ -1333,6 +1364,8 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force) phy_reg &= ~CV_SMB_CTRL_FORCE_SMBUS; e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); + e1000e_enable_phy_retry(hw); + /* Unforce SMBus mode in MAC */ mac_reg = er32(CTRL_EXT); mac_reg &= ~E1000_CTRL_EXT_FORCE_SMBUS; diff --git a/drivers/net/ethernet/intel/e1000e/phy.c b/drivers/net/ethernet/intel/e1000e/phy.c index 5e329156d1ba..93544f1cc2a5 100644 --- a/drivers/net/ethernet/intel/e1000e/phy.c +++ b/drivers/net/ethernet/intel/e1000e/phy.c @@ -107,6 +107,16 @@ s32 e1000e_phy_reset_dsp(struct e1000_hw *hw) return e1e_wphy(hw, M88E1000_PHY_GEN_CONTROL, 0); } +void e1000e_disable_phy_retry(struct e1000_hw *hw) +{ + hw->phy.retry_enabled = false; +} + +void e1000e_enable_phy_retry(struct e1000_hw *hw) +{ + hw->phy.retry_enabled = true; +} + /** * e1000e_read_phy_reg_mdic - Read MDI control register * @hw: pointer to the HW structure @@ -118,55 +128,73 @@ s32 e1000e_phy_reset_dsp(struct e1000_hw *hw) **/ s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data) { + u32 i, mdic = 0, retry_counter, retry_max; struct e1000_phy_info *phy = &hw->phy; - u32 i, mdic = 0; + bool success; if (offset > MAX_PHY_REG_ADDRESS) { e_dbg("PHY Address %d is out of range\n", offset); return -E1000_ERR_PARAM; } + retry_max = phy->retry_enabled ? phy->retry_count : 0; + /* Set up Op-code, Phy Address, and register offset in the MDI * Control register. The MAC will take care of interfacing with the * PHY to retrieve the desired data. */ - mdic = ((offset << E1000_MDIC_REG_SHIFT) | - (phy->addr << E1000_MDIC_PHY_SHIFT) | - (E1000_MDIC_OP_READ)); + for (retry_counter = 0; retry_counter <= retry_max; retry_counter++) { + success = true; - ew32(MDIC, mdic); + mdic = ((offset << E1000_MDIC_REG_SHIFT) | + (phy->addr << E1000_MDIC_PHY_SHIFT) | + (E1000_MDIC_OP_READ)); - /* Poll the ready bit to see if the MDI read completed - * Increasing the time out as testing showed failures with - * the lower time out - */ - for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { - udelay(50); - mdic = er32(MDIC); - if (mdic & E1000_MDIC_READY) - break; - } - if (!(mdic & E1000_MDIC_READY)) { - e_dbg("MDI Read PHY Reg Address %d did not complete\n", offset); - return -E1000_ERR_PHY; - } - if (mdic & E1000_MDIC_ERROR) { - e_dbg("MDI Read PHY Reg Address %d Error\n", offset); - return -E1000_ERR_PHY; - } - if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { - e_dbg("MDI Read offset error - requested %d, returned %d\n", - offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); - return -E1000_ERR_PHY; + ew32(MDIC, mdic); + + /* Poll the ready bit to see if the MDI read completed + * Increasing the time out as testing showed failures with + * the lower time out + */ + for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { + usleep_range(50, 60); + mdic = er32(MDIC); + if (mdic & E1000_MDIC_READY) + break; + } + if (!(mdic & E1000_MDIC_READY)) { + e_dbg("MDI Read PHY Reg Address %d did not complete\n", + offset); + success = false; + } + if (mdic & E1000_MDIC_ERROR) { + e_dbg("MDI Read PHY Reg Address %d Error\n", offset); + success = false; + } + if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { + e_dbg("MDI Read offset error - requested %d, returned %d\n", + offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); + success = false; + } + + /* Allow some time after each MDIC transaction to avoid + * reading duplicate data in the next MDIC transaction. + */ + if (hw->mac.type == e1000_pch2lan) + usleep_range(100, 150); + + if (success) { + *data = (u16)mdic; + return 0; + } + + if (retry_counter != retry_max) { + e_dbg("Perform retry on PHY transaction...\n"); + mdelay(10); + } } - *data = (u16)mdic; - /* Allow some time after each MDIC transaction to avoid - * reading duplicate data in the next MDIC transaction. - */ - if (hw->mac.type == e1000_pch2lan) - udelay(100); - return 0; + return -E1000_ERR_PHY; } /** @@ -179,56 +207,72 @@ s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data) **/ s32 e1000e_write_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 data) { + u32 i, mdic = 0, retry_counter, retry_max; struct e1000_phy_info *phy = &hw->phy; - u32 i, mdic = 0; + bool success; if (offset > MAX_PHY_REG_ADDRESS) { e_dbg("PHY Address %d is out of range\n", offset); return -E1000_ERR_PARAM; } + retry_max = phy->retry_enabled ? phy->retry_count : 0; + /* Set up Op-code, Phy Address, and register offset in the MDI * Control register. The MAC will take care of interfacing with the * PHY to retrieve the desired data. */ - mdic = (((u32)data) | - (offset << E1000_MDIC_REG_SHIFT) | - (phy->addr << E1000_MDIC_PHY_SHIFT) | - (E1000_MDIC_OP_WRITE)); + for (retry_counter = 0; retry_counter <= retry_max; retry_counter++) { + success = true; - ew32(MDIC, mdic); + mdic = (((u32)data) | + (offset << E1000_MDIC_REG_SHIFT) | + (phy->addr << E1000_MDIC_PHY_SHIFT) | + (E1000_MDIC_OP_WRITE)); - /* Poll the ready bit to see if the MDI read completed - * Increasing the time out as testing showed failures with - * the lower time out - */ - for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { - udelay(50); - mdic = er32(MDIC); - if (mdic & E1000_MDIC_READY) - break; - } - if (!(mdic & E1000_MDIC_READY)) { - e_dbg("MDI Write PHY Reg Address %d did not complete\n", offset); - return -E1000_ERR_PHY; - } - if (mdic & E1000_MDIC_ERROR) { - e_dbg("MDI Write PHY Red Address %d Error\n", offset); - return -E1000_ERR_PHY; - } - if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { - e_dbg("MDI Write offset error - requested %d, returned %d\n", - offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); - return -E1000_ERR_PHY; - } + ew32(MDIC, mdic); - /* Allow some time after each MDIC transaction to avoid - * reading duplicate data in the next MDIC transaction. - */ - if (hw->mac.type == e1000_pch2lan) - udelay(100); + /* Poll the ready bit to see if the MDI read completed + * Increasing the time out as testing showed failures with + * the lower time out + */ + for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { + usleep_range(50, 60); + mdic = er32(MDIC); + if (mdic & E1000_MDIC_READY) + break; + } + if (!(mdic & E1000_MDIC_READY)) { + e_dbg("MDI Write PHY Reg Address %d did not complete\n", + offset); + success = false; + } + if (mdic & E1000_MDIC_ERROR) { + e_dbg("MDI Write PHY Reg Address %d Error\n", offset); + success = false; + } + if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { + e_dbg("MDI Write offset error - requested %d, returned %d\n", + offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); + success = false; + } - return 0; + /* Allow some time after each MDIC transaction to avoid + * reading duplicate data in the next MDIC transaction. + */ + if (hw->mac.type == e1000_pch2lan) + usleep_range(100, 150); + + if (success) + return 0; + + if (retry_counter != retry_max) { + e_dbg("Perform retry on PHY transaction...\n"); + mdelay(10); + } + } + + return -E1000_ERR_PHY; } /** diff --git a/drivers/net/ethernet/intel/e1000e/phy.h b/drivers/net/ethernet/intel/e1000e/phy.h index c48777d09523..049bb325b4b1 100644 --- a/drivers/net/ethernet/intel/e1000e/phy.h +++ b/drivers/net/ethernet/intel/e1000e/phy.h @@ -51,6 +51,8 @@ s32 e1000e_read_phy_reg_bm2(struct e1000_hw *hw, u32 offset, u16 *data); s32 e1000e_write_phy_reg_bm2(struct e1000_hw *hw, u32 offset, u16 data); void e1000_power_up_phy_copper(struct e1000_hw *hw); void e1000_power_down_phy_copper(struct e1000_hw *hw); +void e1000e_disable_phy_retry(struct e1000_hw *hw); +void e1000e_enable_phy_retry(struct e1000_hw *hw); s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data); s32 e1000e_write_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 data); s32 e1000_read_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 *data); -- cgit v1.2.3 From 861e8086029e003305750b4126ecd6617465f5c7 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Sun, 3 Mar 2024 12:51:32 +0200 Subject: e1000e: move force SMBUS from enable ulp function to avoid PHY loss issue Forcing SMBUS inside the ULP enabling flow leads to sporadic PHY loss on some systems. It is suspected to be caused by initiating PHY transactions before the interface settles. Separating this configuration from the ULP enabling flow and moving it to the shutdown function allows enough time for the interface to settle and avoids adding a delay. Fixes: 6607c99e7034 ("e1000e: i219 - fix to enable both ULP and EEE in Sx state") Co-developed-by: Dima Ruinskiy Signed-off-by: Dima Ruinskiy Signed-off-by: Vitaly Lifshits Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 19 ------------------- drivers/net/ethernet/intel/e1000e/netdev.c | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index d8e97669f31b..f9e94be36e97 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1165,25 +1165,6 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) if (ret_val) goto out; - /* Switching PHY interface always returns MDI error - * so disable retry mechanism to avoid wasting time - */ - e1000e_disable_phy_retry(hw); - - /* Force SMBus mode in PHY */ - ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); - if (ret_val) - goto release; - phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; - e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); - - e1000e_enable_phy_retry(hw); - - /* Force SMBus mode in MAC */ - mac_reg = er32(CTRL_EXT); - mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; - ew32(CTRL_EXT, mac_reg); - /* Si workaround for ULP entry flow on i127/rev6 h/w. Enable * LPLU and disable Gig speed when entering ULP */ diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index cc8c531ec3df..3692fce20195 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6623,6 +6623,7 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) struct e1000_hw *hw = &adapter->hw; u32 ctrl, ctrl_ext, rctl, status, wufc; int retval = 0; + u16 smb_ctrl; /* Runtime suspend should only enable wakeup for link changes */ if (runtime) @@ -6696,6 +6697,23 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) if (retval) return retval; } + + /* Force SMBUS to allow WOL */ + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + + e1e_rphy(hw, CV_SMB_CTRL, &smb_ctrl); + smb_ctrl |= CV_SMB_CTRL_FORCE_SMBUS; + e1e_wphy(hw, CV_SMB_CTRL, smb_ctrl); + + e1000e_enable_phy_retry(hw); + + /* Force SMBus mode in MAC */ + ctrl_ext = er32(CTRL_EXT); + ctrl_ext |= E1000_CTRL_EXT_FORCE_SMBUS; + ew32(CTRL_EXT, ctrl_ext); } /* Ensure that the appropriate bits are set in LPI_CTRL -- cgit v1.2.3 From a1aa5390cc912934fee76ce80af5f940452fa987 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Wed, 27 Mar 2024 19:52:49 +0300 Subject: of: module: prevent NULL pointer dereference in vsnprintf() In of_modalias(), we can get passed the str and len parameters which would cause a kernel oops in vsnprintf() since it only allows passing a NULL ptr when the length is also 0. Also, we need to filter out the negative values of the len parameter as these will result in a really huge buffer since snprintf() takes size_t parameter while ours is ssize_t... Found by Linux Verification Center (linuxtesting.org) with the Svace static analysis tool. Signed-off-by: Sergey Shtylyov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1d211023-3923-685b-20f0-f3f90ea56e1f@omp.ru Signed-off-by: Rob Herring --- drivers/of/module.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/of/module.c b/drivers/of/module.c index 0e8aa974f0f2..f58e624953a2 100644 --- a/drivers/of/module.c +++ b/drivers/of/module.c @@ -16,6 +16,14 @@ ssize_t of_modalias(const struct device_node *np, char *str, ssize_t len) ssize_t csize; ssize_t tsize; + /* + * Prevent a kernel oops in vsnprintf() -- it only allows passing a + * NULL ptr when the length is also 0. Also filter out the negative + * lengths... + */ + if ((len > 0 && !str) || len < 0) + return -EINVAL; + /* Name & Type */ /* %p eats all alphanum characters, so %c must be used here */ csize = snprintf(str, len, "of:N%pOFn%c%s", np, 'T', -- cgit v1.2.3 From daf6c4681a74034d5723e2fb761e0d7f3a1ca18f Mon Sep 17 00:00:00 2001 From: Christoffer Sandberg Date: Thu, 28 Mar 2024 11:27:57 +0100 Subject: ALSA: hda/realtek - Fix inactive headset mic jack This patch adds the existing fixup to certain TF platforms implementing the ALC274 codec with a headset jack. It fixes/activates the inactive microphone of the headset. Signed-off-by: Christoffer Sandberg Signed-off-by: Werner Sembach Cc: Message-ID: <20240328102757.50310-1-wse@tuxedocomputers.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a17c36a36aa5..c31e9be257a9 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10403,6 +10403,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1d05, 0x1147, "TongFang GMxTGxx", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x1d05, 0x115c, "TongFang GMxTGxx", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x1d05, 0x121b, "TongFang GMxAGxx", ALC269_FIXUP_NO_SHUTUP), + SND_PCI_QUIRK(0x1d05, 0x1387, "TongFang GMxIXxx", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC), -- cgit v1.2.3 From 2d0401ee38d43ab0e4cdd02dfc9d402befb2b5c8 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Thu, 28 Mar 2024 12:13:55 +0000 Subject: ALSA: hda: cs35l56: Add ACPI device match tables Adding the ACPI HIDs to the match table triggers the cs35l56-hda modules to be loaded on boot so that Serial Multi Instantiate can add the devices to the bus and begin the driver init sequence. Signed-off-by: Simon Trimmer Fixes: 73cfbfa9caea ("ALSA: hda/cs35l56: Add driver for Cirrus Logic CS35L56 amplifier") Message-ID: <20240328121355.18972-1-simont@opensource.cirrus.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l56_hda_i2c.c | 13 +++++++++++-- sound/pci/hda/cs35l56_hda_spi.c | 13 +++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/sound/pci/hda/cs35l56_hda_i2c.c b/sound/pci/hda/cs35l56_hda_i2c.c index 13beee807308..40f2f97944d5 100644 --- a/sound/pci/hda/cs35l56_hda_i2c.c +++ b/sound/pci/hda/cs35l56_hda_i2c.c @@ -56,10 +56,19 @@ static const struct i2c_device_id cs35l56_hda_i2c_id[] = { {} }; +static const struct acpi_device_id cs35l56_acpi_hda_match[] = { + { "CSC3554", 0 }, + { "CSC3556", 0 }, + { "CSC3557", 0 }, + {} +}; +MODULE_DEVICE_TABLE(acpi, cs35l56_acpi_hda_match); + static struct i2c_driver cs35l56_hda_i2c_driver = { .driver = { - .name = "cs35l56-hda", - .pm = &cs35l56_hda_pm_ops, + .name = "cs35l56-hda", + .acpi_match_table = cs35l56_acpi_hda_match, + .pm = &cs35l56_hda_pm_ops, }, .id_table = cs35l56_hda_i2c_id, .probe = cs35l56_hda_i2c_probe, diff --git a/sound/pci/hda/cs35l56_hda_spi.c b/sound/pci/hda/cs35l56_hda_spi.c index a3b2fa76663d..7f02155fe61e 100644 --- a/sound/pci/hda/cs35l56_hda_spi.c +++ b/sound/pci/hda/cs35l56_hda_spi.c @@ -56,10 +56,19 @@ static const struct spi_device_id cs35l56_hda_spi_id[] = { {} }; +static const struct acpi_device_id cs35l56_acpi_hda_match[] = { + { "CSC3554", 0 }, + { "CSC3556", 0 }, + { "CSC3557", 0 }, + {} +}; +MODULE_DEVICE_TABLE(acpi, cs35l56_acpi_hda_match); + static struct spi_driver cs35l56_hda_spi_driver = { .driver = { - .name = "cs35l56-hda", - .pm = &cs35l56_hda_pm_ops, + .name = "cs35l56-hda", + .acpi_match_table = cs35l56_acpi_hda_match, + .pm = &cs35l56_hda_pm_ops, }, .id_table = cs35l56_hda_spi_id, .probe = cs35l56_hda_spi_probe, -- cgit v1.2.3 From cd25e15e57e68a6b18dc9323047fe9c68b99290b Mon Sep 17 00:00:00 2001 From: Joakim Sindholt Date: Mon, 18 Mar 2024 12:22:31 +0100 Subject: fs/9p: only translate RWX permissions for plain 9P2000 Garbage in plain 9P2000's perm bits is allowed through, which causes it to be able to set (among others) the suid bit. This was presumably not the intent since the unix extended bits are handled explicitly and conditionally on .u. Signed-off-by: Joakim Sindholt Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b01b1bbf2493..9612fdb563a3 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -83,7 +83,7 @@ static int p9mode2perm(struct v9fs_session_info *v9ses, int res; int mode = stat->mode; - res = mode & S_IALLUGO; + res = mode & 0777; /* S_IRWXUGO */ if (v9fs_proto_dotu(v9ses)) { if ((mode & P9_DMSETUID) == P9_DMSETUID) res |= S_ISUID; -- cgit v1.2.3 From 87de39e70503e04ddb58965520b15eb9efa7eef3 Mon Sep 17 00:00:00 2001 From: Joakim Sindholt Date: Mon, 18 Mar 2024 12:22:33 +0100 Subject: fs/9p: translate O_TRUNC into OTRUNC This one hits both 9P2000 and .u as it appears v9fs has never translated the O_TRUNC flag. Signed-off-by: Joakim Sindholt Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 9612fdb563a3..c5b4d3631c47 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -178,6 +178,9 @@ int v9fs_uflags2omode(int uflags, int extended) break; } + if (uflags & O_TRUNC) + ret |= P9_OTRUNC; + if (extended) { if (uflags & O_EXCL) ret |= P9_OEXCL; -- cgit v1.2.3 From 4e5d208cc9bd5fbc95d536fa223b4b14c37b8ca8 Mon Sep 17 00:00:00 2001 From: Joakim Sindholt Date: Mon, 18 Mar 2024 12:22:34 +0100 Subject: fs/9p: fix the cache always being enabled on files with qid flags I'm not sure why this check was ever here. After updating to 6.6 I suddenly found caching had been turned on by default and neither cache=none nor the new directio would turn it off. After walking through the new code very manually I realized that it's because the caching has to be, in effect, turned off explicitly by setting P9L_DIRECT and whenever a file has a flag, in my case QTAPPEND, it doesn't get set. Setting aside QTDIR which seems to ignore the new fid->mode entirely, the rest of these either should be subject to the same cache rules as every other QTFILE or perhaps very explicitly not cached in the case of QTAUTH. Signed-off-by: Joakim Sindholt Signed-off-by: Eric Van Hensbergen --- fs/9p/fid.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 29281b7c3887..0d6138bee2a3 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -49,9 +49,6 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry) static inline void v9fs_fid_add_modes(struct p9_fid *fid, unsigned int s_flags, unsigned int s_cache, unsigned int f_flags) { - if (fid->qid.type != P9_QTFILE) - return; - if ((!s_cache) || ((fid->qid.version == 0) && !(s_flags & V9FS_IGNORE_QV)) || (s_flags & V9FS_DIRECT_IO) || (f_flags & O_DIRECT)) { -- cgit v1.2.3 From 2bd02f5a0bac4bb13e0da18652dc75ba0e4958ec Mon Sep 17 00:00:00 2001 From: Christian Hewitt Date: Fri, 22 Mar 2024 16:45:25 +0000 Subject: drm/panfrost: fix power transition timeout warnings Increase the timeout value to prevent system logs on Amlogic boards flooding with power transition warnings: [ 13.047638] panfrost ffe40000.gpu: shader power transition timeout [ 13.048674] panfrost ffe40000.gpu: l2 power transition timeout [ 13.937324] panfrost ffe40000.gpu: shader power transition timeout [ 13.938351] panfrost ffe40000.gpu: l2 power transition timeout ... [39829.506904] panfrost ffe40000.gpu: shader power transition timeout [39829.507938] panfrost ffe40000.gpu: l2 power transition timeout [39949.508369] panfrost ffe40000.gpu: shader power transition timeout [39949.509405] panfrost ffe40000.gpu: l2 power transition timeout The 2000 value has been found through trial and error testing with devices using G52 and G31 GPUs. Fixes: 22aa1a209018 ("drm/panfrost: Really power off GPU cores in panfrost_gpu_power_off()") Signed-off-by: Christian Hewitt Reviewed-by: Steven Price Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Steven Price Link: https://patchwork.freedesktop.org/patch/msgid/20240322164525.2617508-1-christianshewitt@gmail.com --- drivers/gpu/drm/panfrost/panfrost_gpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c index 9063ce254642..fd8e44992184 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gpu.c +++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c @@ -441,19 +441,19 @@ void panfrost_gpu_power_off(struct panfrost_device *pfdev) gpu_write(pfdev, SHADER_PWROFF_LO, pfdev->features.shader_present); ret = readl_relaxed_poll_timeout(pfdev->iomem + SHADER_PWRTRANS_LO, - val, !val, 1, 1000); + val, !val, 1, 2000); if (ret) dev_err(pfdev->dev, "shader power transition timeout"); gpu_write(pfdev, TILER_PWROFF_LO, pfdev->features.tiler_present); ret = readl_relaxed_poll_timeout(pfdev->iomem + TILER_PWRTRANS_LO, - val, !val, 1, 1000); + val, !val, 1, 2000); if (ret) dev_err(pfdev->dev, "tiler power transition timeout"); gpu_write(pfdev, L2_PWROFF_LO, pfdev->features.l2_present); ret = readl_poll_timeout(pfdev->iomem + L2_PWRTRANS_LO, - val, !val, 0, 1000); + val, !val, 0, 2000); if (ret) dev_err(pfdev->dev, "l2 power transition timeout"); } -- cgit v1.2.3 From c60ebc58f2a82d27006cfc30af406bfd2ec204cc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 15 Mar 2024 09:09:30 +0000 Subject: drm/nouveau/gr/gf100: Remove second semicolon There is a statement with two semicolons. Remove the second one, it is redundant. Signed-off-by: Colin Ian King Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240315090930.2429958-1-colin.i.king@gmail.com --- drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c index 986e8d547c94..060c74a80eb1 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c @@ -420,7 +420,7 @@ gf100_gr_chan_new(struct nvkm_gr *base, struct nvkm_chan *fifoch, return ret; } else { ret = nvkm_memory_map(gr->attrib_cb, 0, chan->vmm, chan->attrib_cb, - &args, sizeof(args));; + &args, sizeof(args)); if (ret) return ret; } -- cgit v1.2.3 From be141849ec00ef39935bf169c0f194ac70bf85ce Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 28 Mar 2024 12:43:16 +1000 Subject: nouveau/uvmm: fix addr/range calcs for remap operations dEQP-VK.sparse_resources.image_rebind.2d_array.r64i.128_128_8 was causing a remap operation like the below. op_remap: prev: 0000003fffed0000 00000000000f0000 00000000a5abd18a 0000000000000000 op_remap: next: op_remap: unmap: 0000003fffed0000 0000000000100000 0 op_map: map: 0000003ffffc0000 0000000000010000 000000005b1ba33c 00000000000e0000 This was resulting in an unmap operation from 0x3fffed0000+0xf0000, 0x100000 which was corrupting the pagetables and oopsing the kernel. Fixes the prev + unmap range calcs to use start/end and map back to addr/range. Signed-off-by: Dave Airlie Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI") Cc: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240328024317.2041851-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nouveau_uvmm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c index 0a0a11dc9ec0..ee02cd833c5e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c @@ -812,15 +812,15 @@ op_remap(struct drm_gpuva_op_remap *r, struct drm_gpuva_op_unmap *u = r->unmap; struct nouveau_uvma *uvma = uvma_from_va(u->va); u64 addr = uvma->va.va.addr; - u64 range = uvma->va.va.range; + u64 end = uvma->va.va.addr + uvma->va.va.range; if (r->prev) addr = r->prev->va.addr + r->prev->va.range; if (r->next) - range = r->next->va.addr - addr; + end = r->next->va.addr; - op_unmap_range(u, addr, range); + op_unmap_range(u, addr, end - addr); } static int -- cgit v1.2.3 From a4ec240f6b7c21cf846d10017c3ce423a0eae92c Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 22 Mar 2024 14:48:01 -0700 Subject: drm/prime: Unbreak virtgpu dma-buf export virtgpu "vram" GEM objects do not implement obj->get_sg_table(). But they also don't use drm_gem_map_dma_buf(). In fact they may not even have guest visible pages. But it is perfectly fine to export and share with other virtual devices. Reported-by: Dominik Behr Fixes: 207395da5a97 ("drm/prime: reject DMA-BUF attach when get_sg_table is missing") Signed-off-by: Rob Clark Reviewed-by: Simon Ser Signed-off-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20240322214801.319975-1-robdclark@gmail.com --- drivers/gpu/drm/drm_prime.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c index 7352bde299d5..03bd3c7bd0dc 100644 --- a/drivers/gpu/drm/drm_prime.c +++ b/drivers/gpu/drm/drm_prime.c @@ -582,7 +582,12 @@ int drm_gem_map_attach(struct dma_buf *dma_buf, { struct drm_gem_object *obj = dma_buf->priv; - if (!obj->funcs->get_sg_table) + /* + * drm_gem_map_dma_buf() requires obj->get_sg_table(), but drivers + * that implement their own ->map_dma_buf() do not. + */ + if (dma_buf->ops->map_dma_buf == drm_gem_map_dma_buf && + !obj->funcs->get_sg_table) return -ENOSYS; return drm_gem_pin(obj); -- cgit v1.2.3 From 310a5caa4e861616a27a83c3e8bda17d65026fa8 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:12 -0500 Subject: ASoC: rt5682-sdw: fix locking sequence The disable_irq_lock protects the 'disable_irq' value, we need to lock before testing it. Fixes: 02fb23d72720 ("ASoC: rt5682-sdw: fix for JD event handling in ClockStop Mode0") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Chao Song Link: https://msgid.link/r/20240325221817.206465-2-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682-sdw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt5682-sdw.c b/sound/soc/codecs/rt5682-sdw.c index e67c2e19cb1a..1fdbef5fd6cb 100644 --- a/sound/soc/codecs/rt5682-sdw.c +++ b/sound/soc/codecs/rt5682-sdw.c @@ -763,12 +763,12 @@ static int __maybe_unused rt5682_dev_resume(struct device *dev) return 0; if (!slave->unattach_request) { + mutex_lock(&rt5682->disable_irq_lock); if (rt5682->disable_irq == true) { - mutex_lock(&rt5682->disable_irq_lock); sdw_write_no_pm(slave, SDW_SCP_INTMASK1, SDW_SCP_INT1_IMPL_DEF); rt5682->disable_irq = false; - mutex_unlock(&rt5682->disable_irq_lock); } + mutex_unlock(&rt5682->disable_irq_lock); goto regmap_sync; } -- cgit v1.2.3 From ee287771644394d071e6a331951ee8079b64f9a7 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:13 -0500 Subject: ASoC: rt711-sdca: fix locking sequence The disable_irq_lock protects the 'disable_irq' value, we need to lock before testing it. Fixes: 23adeb7056ac ("ASoC: rt711-sdca: fix for JD event handling in ClockStop Mode0") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Chao Song Link: https://msgid.link/r/20240325221817.206465-3-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt711-sdca-sdw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt711-sdca-sdw.c b/sound/soc/codecs/rt711-sdca-sdw.c index 935e597022d3..b8471b2d8f4f 100644 --- a/sound/soc/codecs/rt711-sdca-sdw.c +++ b/sound/soc/codecs/rt711-sdca-sdw.c @@ -438,13 +438,13 @@ static int __maybe_unused rt711_sdca_dev_resume(struct device *dev) return 0; if (!slave->unattach_request) { + mutex_lock(&rt711->disable_irq_lock); if (rt711->disable_irq == true) { - mutex_lock(&rt711->disable_irq_lock); sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK1, SDW_SCP_SDCA_INTMASK_SDCA_0); sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK2, SDW_SCP_SDCA_INTMASK_SDCA_8); rt711->disable_irq = false; - mutex_unlock(&rt711->disable_irq_lock); } + mutex_unlock(&rt711->disable_irq_lock); goto regmap_sync; } -- cgit v1.2.3 From aae86cfd8790bcc7693a5a0894df58de5cb5128c Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:14 -0500 Subject: ASoC: rt711-sdw: fix locking sequence The disable_irq_lock protects the 'disable_irq' value, we need to lock before testing it. Fixes: b69de265bd0e ("ASoC: rt711: fix for JD event handling in ClockStop Mode0") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Chao Song Link: https://msgid.link/r/20240325221817.206465-4-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt711-sdw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt711-sdw.c b/sound/soc/codecs/rt711-sdw.c index 3f5773310ae8..988451f24a75 100644 --- a/sound/soc/codecs/rt711-sdw.c +++ b/sound/soc/codecs/rt711-sdw.c @@ -536,12 +536,12 @@ static int __maybe_unused rt711_dev_resume(struct device *dev) return 0; if (!slave->unattach_request) { + mutex_lock(&rt711->disable_irq_lock); if (rt711->disable_irq == true) { - mutex_lock(&rt711->disable_irq_lock); sdw_write_no_pm(slave, SDW_SCP_INTMASK1, SDW_SCP_INT1_IMPL_DEF); rt711->disable_irq = false; - mutex_unlock(&rt711->disable_irq_lock); } + mutex_unlock(&rt711->disable_irq_lock); goto regmap_sync; } -- cgit v1.2.3 From c8b2e5c1b959d100990e4f0cbad38e7d047bb97c Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:15 -0500 Subject: ASoC: rt712-sdca-sdw: fix locking sequence The disable_irq_lock protects the 'disable_irq' value, we need to lock before testing it. Fixes: 7a8735c1551e ("ASoC: rt712-sdca: fix for JD event handling in ClockStop Mode0") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Chao Song Link: https://msgid.link/r/20240325221817.206465-5-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt712-sdca-sdw.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt712-sdca-sdw.c b/sound/soc/codecs/rt712-sdca-sdw.c index 01ac555cd79b..36d0dd532b8d 100644 --- a/sound/soc/codecs/rt712-sdca-sdw.c +++ b/sound/soc/codecs/rt712-sdca-sdw.c @@ -438,13 +438,14 @@ static int __maybe_unused rt712_sdca_dev_resume(struct device *dev) return 0; if (!slave->unattach_request) { + mutex_lock(&rt712->disable_irq_lock); if (rt712->disable_irq == true) { - mutex_lock(&rt712->disable_irq_lock); + sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK1, SDW_SCP_SDCA_INTMASK_SDCA_0); sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK2, SDW_SCP_SDCA_INTMASK_SDCA_8); rt712->disable_irq = false; - mutex_unlock(&rt712->disable_irq_lock); } + mutex_unlock(&rt712->disable_irq_lock); goto regmap_sync; } -- cgit v1.2.3 From adb354bbc231b23d3a05163ce35c1d598512ff64 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:16 -0500 Subject: ASoC: rt722-sdca-sdw: fix locking sequence The disable_irq_lock protects the 'disable_irq' value, we need to lock before testing it. Fixes: a0b7c59ac1a9 ("ASoC: rt722-sdca: fix for JD event handling in ClockStop Mode0") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Chao Song Link: https://msgid.link/r/20240325221817.206465-6-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt722-sdca-sdw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/rt722-sdca-sdw.c b/sound/soc/codecs/rt722-sdca-sdw.c index eb76f4c675b6..65d584c1886e 100644 --- a/sound/soc/codecs/rt722-sdca-sdw.c +++ b/sound/soc/codecs/rt722-sdca-sdw.c @@ -467,13 +467,13 @@ static int __maybe_unused rt722_sdca_dev_resume(struct device *dev) return 0; if (!slave->unattach_request) { + mutex_lock(&rt722->disable_irq_lock); if (rt722->disable_irq == true) { - mutex_lock(&rt722->disable_irq_lock); sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK1, SDW_SCP_SDCA_INTMASK_SDCA_6); sdw_write_no_pm(slave, SDW_SCP_SDCA_INTMASK2, SDW_SCP_SDCA_INTMASK_SDCA_8); rt722->disable_irq = false; - mutex_unlock(&rt722->disable_irq_lock); } + mutex_unlock(&rt722->disable_irq_lock); goto regmap_sync; } -- cgit v1.2.3 From f892e66fcabc6161cd38c0fc86e769208174b840 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 25 Mar 2024 17:18:17 -0500 Subject: ASoC: rt-sdw*: add __func__ to all error logs The drivers for Realtek SoundWire codecs use similar logs, which is problematic to analyze problems reported by CI tools, e.g. "Failed to get private value: 752001 => 0000 ret=-5". It's not uncommon to have several Realtek devices on the same platform, having the same log thrown makes support difficult. This patch adds __func__ to all error logs which didn't already include it. No functionality change, only error logs are modified. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Link: https://msgid.link/r/20240325221817.206465-7-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt1316-sdw.c | 8 +++---- sound/soc/codecs/rt1318-sdw.c | 8 +++---- sound/soc/codecs/rt5682-sdw.c | 12 +++++----- sound/soc/codecs/rt700.c | 16 ++++++------- sound/soc/codecs/rt711-sdca-sdw.c | 2 +- sound/soc/codecs/rt711-sdca.c | 18 +++++++-------- sound/soc/codecs/rt711-sdw.c | 4 ++-- sound/soc/codecs/rt711.c | 16 ++++++------- sound/soc/codecs/rt712-sdca-dmic.c | 24 +++++++++++--------- sound/soc/codecs/rt712-sdca-sdw.c | 2 +- sound/soc/codecs/rt712-sdca.c | 20 ++++++++--------- sound/soc/codecs/rt715-sdca-sdw.c | 2 +- sound/soc/codecs/rt715-sdca.c | 46 +++++++++++++++++++------------------- sound/soc/codecs/rt715-sdw.c | 4 ++-- sound/soc/codecs/rt715.c | 24 ++++++++++---------- sound/soc/codecs/rt722-sdca.c | 21 ++++++++--------- 16 files changed, 115 insertions(+), 112 deletions(-) diff --git a/sound/soc/codecs/rt1316-sdw.c b/sound/soc/codecs/rt1316-sdw.c index 47511f70119a..0b3bf920bcab 100644 --- a/sound/soc/codecs/rt1316-sdw.c +++ b/sound/soc/codecs/rt1316-sdw.c @@ -537,7 +537,7 @@ static int rt1316_sdw_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt1316->sdw_slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -577,12 +577,12 @@ static int rt1316_sdw_parse_dt(struct rt1316_sdw_priv *rt1316, struct device *de if (rt1316->bq_params_cnt) { rt1316->bq_params = devm_kzalloc(dev, rt1316->bq_params_cnt, GFP_KERNEL); if (!rt1316->bq_params) { - dev_err(dev, "Could not allocate bq_params memory\n"); + dev_err(dev, "%s: Could not allocate bq_params memory\n", __func__); ret = -ENOMEM; } else { ret = device_property_read_u8_array(dev, "realtek,bq-params", rt1316->bq_params, rt1316->bq_params_cnt); if (ret < 0) - dev_err(dev, "Could not read list of realtek,bq-params\n"); + dev_err(dev, "%s: Could not read list of realtek,bq-params\n", __func__); } } @@ -759,7 +759,7 @@ static int __maybe_unused rt1316_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT1316_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt1318-sdw.c b/sound/soc/codecs/rt1318-sdw.c index ff364bde4a08..462c9a4b1be5 100644 --- a/sound/soc/codecs/rt1318-sdw.c +++ b/sound/soc/codecs/rt1318-sdw.c @@ -606,7 +606,7 @@ static int rt1318_sdw_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt1318->sdw_slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -631,8 +631,8 @@ static int rt1318_sdw_hw_params(struct snd_pcm_substream *substream, sampling_rate = RT1318_SDCA_RATE_192000HZ; break; default: - dev_err(component->dev, "Rate %d is not supported\n", - params_rate(params)); + dev_err(component->dev, "%s: Rate %d is not supported\n", + __func__, params_rate(params)); return -EINVAL; } @@ -835,7 +835,7 @@ static int __maybe_unused rt1318_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT1318_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); return -ETIMEDOUT; } diff --git a/sound/soc/codecs/rt5682-sdw.c b/sound/soc/codecs/rt5682-sdw.c index 1fdbef5fd6cb..f9ee42c13dba 100644 --- a/sound/soc/codecs/rt5682-sdw.c +++ b/sound/soc/codecs/rt5682-sdw.c @@ -132,7 +132,7 @@ static int rt5682_sdw_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt5682->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -315,8 +315,8 @@ static int rt5682_sdw_init(struct device *dev, struct regmap *regmap, &rt5682_sdw_indirect_regmap); if (IS_ERR(rt5682->regmap)) { ret = PTR_ERR(rt5682->regmap); - dev_err(dev, "Failed to allocate register map: %d\n", - ret); + dev_err(dev, "%s: Failed to allocate register map: %d\n", + __func__, ret); return ret; } @@ -400,7 +400,7 @@ static int rt5682_io_init(struct device *dev, struct sdw_slave *slave) } if (val != DEVICE_ID) { - dev_err(dev, "Device with ID register %x is not rt5682\n", val); + dev_err(dev, "%s: Device with ID register %x is not rt5682\n", __func__, val); ret = -ENODEV; goto err_nodev; } @@ -648,7 +648,7 @@ static int rt5682_bus_config(struct sdw_slave *slave, ret = rt5682_clock_config(&slave->dev); if (ret < 0) - dev_err(&slave->dev, "Invalid clk config"); + dev_err(&slave->dev, "%s: Invalid clk config", __func__); return ret; } @@ -775,7 +775,7 @@ static int __maybe_unused rt5682_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT5682_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt700.c b/sound/soc/codecs/rt700.c index 0ebf344a1b60..434b926f96c8 100644 --- a/sound/soc/codecs/rt700.c +++ b/sound/soc/codecs/rt700.c @@ -37,8 +37,8 @@ static int rt700_index_write(struct regmap *regmap, ret = regmap_write(regmap, addr, value); if (ret < 0) - pr_err("Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + pr_err("%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -52,8 +52,8 @@ static int rt700_index_read(struct regmap *regmap, *value = 0; ret = regmap_read(regmap, addr, value); if (ret < 0) - pr_err("Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + pr_err("%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -930,14 +930,14 @@ static int rt700_pcm_hw_params(struct snd_pcm_substream *substream, port_config.num += 2; break; default: - dev_err(component->dev, "Invalid DAI id %d\n", dai->id); + dev_err(component->dev, "%s: Invalid DAI id %d\n", __func__, dai->id); return -EINVAL; } retval = sdw_stream_add_slave(rt700->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -945,8 +945,8 @@ static int rt700_pcm_hw_params(struct snd_pcm_substream *substream, /* bit 3:0 Number of Channel */ val |= (params_channels(params) - 1); } else { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } diff --git a/sound/soc/codecs/rt711-sdca-sdw.c b/sound/soc/codecs/rt711-sdca-sdw.c index b8471b2d8f4f..2636c2eea4bc 100644 --- a/sound/soc/codecs/rt711-sdca-sdw.c +++ b/sound/soc/codecs/rt711-sdca-sdw.c @@ -451,7 +451,7 @@ static int __maybe_unused rt711_sdca_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT711_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt711-sdca.c b/sound/soc/codecs/rt711-sdca.c index 447154cb6010..1e8dbfc3ecd9 100644 --- a/sound/soc/codecs/rt711-sdca.c +++ b/sound/soc/codecs/rt711-sdca.c @@ -36,8 +36,8 @@ static int rt711_sdca_index_write(struct rt711_sdca_priv *rt711, ret = regmap_write(regmap, addr, value); if (ret < 0) dev_err(&rt711->slave->dev, - "Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + "%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -52,8 +52,8 @@ static int rt711_sdca_index_read(struct rt711_sdca_priv *rt711, ret = regmap_read(regmap, addr, value); if (ret < 0) dev_err(&rt711->slave->dev, - "Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + "%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -1293,13 +1293,13 @@ static int rt711_sdca_pcm_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt711->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } if (params_channels(params) > 16) { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } @@ -1318,8 +1318,8 @@ static int rt711_sdca_pcm_hw_params(struct snd_pcm_substream *substream, sampling_rate = RT711_SDCA_RATE_192000HZ; break; default: - dev_err(component->dev, "Rate %d is not supported\n", - params_rate(params)); + dev_err(component->dev, "%s: Rate %d is not supported\n", + __func__, params_rate(params)); return -EINVAL; } diff --git a/sound/soc/codecs/rt711-sdw.c b/sound/soc/codecs/rt711-sdw.c index 988451f24a75..0d3b43dd22e6 100644 --- a/sound/soc/codecs/rt711-sdw.c +++ b/sound/soc/codecs/rt711-sdw.c @@ -408,7 +408,7 @@ static int rt711_bus_config(struct sdw_slave *slave, ret = rt711_clock_config(&slave->dev); if (ret < 0) - dev_err(&slave->dev, "Invalid clk config"); + dev_err(&slave->dev, "%s: Invalid clk config", __func__); return ret; } @@ -548,7 +548,7 @@ static int __maybe_unused rt711_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT711_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); return -ETIMEDOUT; } diff --git a/sound/soc/codecs/rt711.c b/sound/soc/codecs/rt711.c index 66eaed13b0d6..5446f9506a16 100644 --- a/sound/soc/codecs/rt711.c +++ b/sound/soc/codecs/rt711.c @@ -37,8 +37,8 @@ static int rt711_index_write(struct regmap *regmap, ret = regmap_write(regmap, addr, value); if (ret < 0) - pr_err("Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + pr_err("%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -52,8 +52,8 @@ static int rt711_index_read(struct regmap *regmap, *value = 0; ret = regmap_read(regmap, addr, value); if (ret < 0) - pr_err("Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + pr_err("%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -428,7 +428,7 @@ static void rt711_jack_init(struct rt711_priv *rt711) RT711_HP_JD_FINAL_RESULT_CTL_JD12); break; default: - dev_warn(rt711->component->dev, "Wrong JD source\n"); + dev_warn(rt711->component->dev, "%s: Wrong JD source\n", __func__); break; } @@ -1020,7 +1020,7 @@ static int rt711_pcm_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt711->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -1028,8 +1028,8 @@ static int rt711_pcm_hw_params(struct snd_pcm_substream *substream, /* bit 3:0 Number of Channel */ val |= (params_channels(params) - 1); } else { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } diff --git a/sound/soc/codecs/rt712-sdca-dmic.c b/sound/soc/codecs/rt712-sdca-dmic.c index 0926b26619bd..012b79e72cf6 100644 --- a/sound/soc/codecs/rt712-sdca-dmic.c +++ b/sound/soc/codecs/rt712-sdca-dmic.c @@ -139,8 +139,8 @@ static int rt712_sdca_dmic_index_write(struct rt712_sdca_dmic_priv *rt712, ret = regmap_write(regmap, addr, value); if (ret < 0) dev_err(&rt712->slave->dev, - "Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + "%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -155,8 +155,8 @@ static int rt712_sdca_dmic_index_read(struct rt712_sdca_dmic_priv *rt712, ret = regmap_read(regmap, addr, value); if (ret < 0) dev_err(&rt712->slave->dev, - "Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + "%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -317,7 +317,8 @@ static int rt712_sdca_dmic_set_gain_put(struct snd_kcontrol *kcontrol, for (i = 0; i < p->count; i++) { err = regmap_write(rt712->mbq_regmap, p->reg_base + i, gain_val[i]); if (err < 0) - dev_err(&rt712->slave->dev, "0x%08x can't be set\n", p->reg_base + i); + dev_err(&rt712->slave->dev, "%s: 0x%08x can't be set\n", + __func__, p->reg_base + i); } return changed; @@ -667,13 +668,13 @@ static int rt712_sdca_dmic_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt712->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } if (params_channels(params) > 4) { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } @@ -698,8 +699,8 @@ static int rt712_sdca_dmic_hw_params(struct snd_pcm_substream *substream, sampling_rate = RT712_SDCA_RATE_192000HZ; break; default: - dev_err(component->dev, "Rate %d is not supported\n", - params_rate(params)); + dev_err(component->dev, "%s: Rate %d is not supported\n", + __func__, params_rate(params)); return -EINVAL; } @@ -923,7 +924,8 @@ static int __maybe_unused rt712_sdca_dmic_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT712_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", + __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt712-sdca-sdw.c b/sound/soc/codecs/rt712-sdca-sdw.c index 36d0dd532b8d..4e9ab3ef135b 100644 --- a/sound/soc/codecs/rt712-sdca-sdw.c +++ b/sound/soc/codecs/rt712-sdca-sdw.c @@ -452,7 +452,7 @@ static int __maybe_unused rt712_sdca_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT712_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt712-sdca.c b/sound/soc/codecs/rt712-sdca.c index 6954fbe7ec5f..b503de9fda80 100644 --- a/sound/soc/codecs/rt712-sdca.c +++ b/sound/soc/codecs/rt712-sdca.c @@ -34,8 +34,8 @@ static int rt712_sdca_index_write(struct rt712_sdca_priv *rt712, ret = regmap_write(regmap, addr, value); if (ret < 0) dev_err(&rt712->slave->dev, - "Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + "%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -50,8 +50,8 @@ static int rt712_sdca_index_read(struct rt712_sdca_priv *rt712, ret = regmap_read(regmap, addr, value); if (ret < 0) dev_err(&rt712->slave->dev, - "Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + "%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -1060,13 +1060,13 @@ static int rt712_sdca_pcm_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt712->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } if (params_channels(params) > 16) { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } @@ -1085,8 +1085,8 @@ static int rt712_sdca_pcm_hw_params(struct snd_pcm_substream *substream, sampling_rate = RT712_SDCA_RATE_192000HZ; break; default: - dev_err(component->dev, "Rate %d is not supported\n", - params_rate(params)); + dev_err(component->dev, "%s: Rate %d is not supported\n", + __func__, params_rate(params)); return -EINVAL; } @@ -1106,7 +1106,7 @@ static int rt712_sdca_pcm_hw_params(struct snd_pcm_substream *substream, sampling_rate); break; default: - dev_err(component->dev, "Wrong DAI id\n"); + dev_err(component->dev, "%s: Wrong DAI id\n", __func__); return -EINVAL; } diff --git a/sound/soc/codecs/rt715-sdca-sdw.c b/sound/soc/codecs/rt715-sdca-sdw.c index ab54a67a27eb..ee450126106f 100644 --- a/sound/soc/codecs/rt715-sdca-sdw.c +++ b/sound/soc/codecs/rt715-sdca-sdw.c @@ -237,7 +237,7 @@ static int __maybe_unused rt715_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->enumeration_complete, msecs_to_jiffies(RT715_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Enumeration not complete, timed out\n"); + dev_err(&slave->dev, "%s: Enumeration not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt715-sdca.c b/sound/soc/codecs/rt715-sdca.c index 4533eedd7e18..3fb7b9adb61d 100644 --- a/sound/soc/codecs/rt715-sdca.c +++ b/sound/soc/codecs/rt715-sdca.c @@ -41,8 +41,8 @@ static int rt715_sdca_index_write(struct rt715_sdca_priv *rt715, ret = regmap_write(regmap, addr, value); if (ret < 0) dev_err(&rt715->slave->dev, - "Failed to set private value: %08x <= %04x %d\n", - addr, value, ret); + "%s: Failed to set private value: %08x <= %04x %d\n", + __func__, addr, value, ret); return ret; } @@ -59,8 +59,8 @@ static int rt715_sdca_index_read(struct rt715_sdca_priv *rt715, ret = regmap_read(regmap, addr, value); if (ret < 0) dev_err(&rt715->slave->dev, - "Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + "%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -152,8 +152,8 @@ static int rt715_sdca_set_amp_gain_put(struct snd_kcontrol *kcontrol, mc->shift); ret = regmap_write(rt715->mbq_regmap, mc->reg + i, gain_val); if (ret != 0) { - dev_err(component->dev, "Failed to write 0x%x=0x%x\n", - mc->reg + i, gain_val); + dev_err(component->dev, "%s: Failed to write 0x%x=0x%x\n", + __func__, mc->reg + i, gain_val); return ret; } } @@ -188,8 +188,8 @@ static int rt715_sdca_set_amp_gain_4ch_put(struct snd_kcontrol *kcontrol, ret = regmap_write(rt715->mbq_regmap, reg_base + i, gain_val); if (ret != 0) { - dev_err(component->dev, "Failed to write 0x%x=0x%x\n", - reg_base + i, gain_val); + dev_err(component->dev, "%s: Failed to write 0x%x=0x%x\n", + __func__, reg_base + i, gain_val); return ret; } } @@ -224,8 +224,8 @@ static int rt715_sdca_set_amp_gain_8ch_put(struct snd_kcontrol *kcontrol, reg = i < 7 ? reg_base + i : (reg_base - 1) | BIT(15); ret = regmap_write(rt715->mbq_regmap, reg, gain_val); if (ret != 0) { - dev_err(component->dev, "Failed to write 0x%x=0x%x\n", - reg, gain_val); + dev_err(component->dev, "%s: Failed to write 0x%x=0x%x\n", + __func__, reg, gain_val); return ret; } } @@ -246,8 +246,8 @@ static int rt715_sdca_set_amp_gain_get(struct snd_kcontrol *kcontrol, for (i = 0; i < 2; i++) { ret = regmap_read(rt715->mbq_regmap, mc->reg + i, &val); if (ret < 0) { - dev_err(component->dev, "Failed to read 0x%x, ret=%d\n", - mc->reg + i, ret); + dev_err(component->dev, "%s: Failed to read 0x%x, ret=%d\n", + __func__, mc->reg + i, ret); return ret; } ucontrol->value.integer.value[i] = rt715_sdca_get_gain(val, mc->shift); @@ -271,8 +271,8 @@ static int rt715_sdca_set_amp_gain_4ch_get(struct snd_kcontrol *kcontrol, for (i = 0; i < 4; i++) { ret = regmap_read(rt715->mbq_regmap, reg_base + i, &val); if (ret < 0) { - dev_err(component->dev, "Failed to read 0x%x, ret=%d\n", - reg_base + i, ret); + dev_err(component->dev, "%s: Failed to read 0x%x, ret=%d\n", + __func__, reg_base + i, ret); return ret; } ucontrol->value.integer.value[i] = rt715_sdca_get_gain(val, gain_sft); @@ -297,8 +297,8 @@ static int rt715_sdca_set_amp_gain_8ch_get(struct snd_kcontrol *kcontrol, for (i = 0; i < 8; i += 2) { ret = regmap_read(rt715->mbq_regmap, reg_base + i, &val_l); if (ret < 0) { - dev_err(component->dev, "Failed to read 0x%x, ret=%d\n", - reg_base + i, ret); + dev_err(component->dev, "%s: Failed to read 0x%x, ret=%d\n", + __func__, reg_base + i, ret); return ret; } ucontrol->value.integer.value[i] = (val_l >> gain_sft) / 10; @@ -306,8 +306,8 @@ static int rt715_sdca_set_amp_gain_8ch_get(struct snd_kcontrol *kcontrol, reg = (i == 6) ? (reg_base - 1) | BIT(15) : reg_base + 1 + i; ret = regmap_read(rt715->mbq_regmap, reg, &val_r); if (ret < 0) { - dev_err(component->dev, "Failed to read 0x%x, ret=%d\n", - reg, ret); + dev_err(component->dev, "%s: Failed to read 0x%x, ret=%d\n", + __func__, reg, ret); return ret; } ucontrol->value.integer.value[i + 1] = (val_r >> gain_sft) / 10; @@ -834,15 +834,15 @@ static int rt715_sdca_pcm_hw_params(struct snd_pcm_substream *substream, 0xaf00); break; default: - dev_err(component->dev, "Invalid DAI id %d\n", dai->id); + dev_err(component->dev, "%s: Invalid DAI id %d\n", __func__, dai->id); return -EINVAL; } retval = sdw_stream_add_slave(rt715->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(component->dev, "Unable to configure port, retval:%d\n", - retval); + dev_err(component->dev, "%s: Unable to configure port, retval:%d\n", + __func__, retval); return retval; } @@ -893,8 +893,8 @@ static int rt715_sdca_pcm_hw_params(struct snd_pcm_substream *substream, val = 0xf; break; default: - dev_err(component->dev, "Unsupported sample rate %d\n", - params_rate(params)); + dev_err(component->dev, "%s: Unsupported sample rate %d\n", + __func__, params_rate(params)); return -EINVAL; } diff --git a/sound/soc/codecs/rt715-sdw.c b/sound/soc/codecs/rt715-sdw.c index 21f37babd148..7e13868ff99f 100644 --- a/sound/soc/codecs/rt715-sdw.c +++ b/sound/soc/codecs/rt715-sdw.c @@ -482,7 +482,7 @@ static int rt715_bus_config(struct sdw_slave *slave, ret = rt715_clock_config(&slave->dev); if (ret < 0) - dev_err(&slave->dev, "Invalid clk config"); + dev_err(&slave->dev, "%s: Invalid clk config", __func__); return 0; } @@ -554,7 +554,7 @@ static int __maybe_unused rt715_dev_resume(struct device *dev) time = wait_for_completion_timeout(&slave->initialization_complete, msecs_to_jiffies(RT715_PROBE_TIMEOUT)); if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); + dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); sdw_show_ping_status(slave->bus, true); return -ETIMEDOUT; diff --git a/sound/soc/codecs/rt715.c b/sound/soc/codecs/rt715.c index 9f732a5abd53..299c9b12377c 100644 --- a/sound/soc/codecs/rt715.c +++ b/sound/soc/codecs/rt715.c @@ -40,8 +40,8 @@ static int rt715_index_write(struct regmap *regmap, unsigned int reg, ret = regmap_write(regmap, addr, value); if (ret < 0) { - pr_err("Failed to set private value: %08x <= %04x %d\n", - addr, value, ret); + pr_err("%s: Failed to set private value: %08x <= %04x %d\n", + __func__, addr, value, ret); } return ret; @@ -55,8 +55,8 @@ static int rt715_index_write_nid(struct regmap *regmap, ret = regmap_write(regmap, addr, value); if (ret < 0) - pr_err("Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + pr_err("%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -70,8 +70,8 @@ static int rt715_index_read_nid(struct regmap *regmap, *value = 0; ret = regmap_read(regmap, addr, value); if (ret < 0) - pr_err("Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + pr_err("%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -862,14 +862,14 @@ static int rt715_pcm_hw_params(struct snd_pcm_substream *substream, rt715_index_write(rt715->regmap, RT715_SDW_INPUT_SEL, 0xa000); break; default: - dev_err(component->dev, "Invalid DAI id %d\n", dai->id); + dev_err(component->dev, "%s: Invalid DAI id %d\n", __func__, dai->id); return -EINVAL; } retval = sdw_stream_add_slave(rt715->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } @@ -883,8 +883,8 @@ static int rt715_pcm_hw_params(struct snd_pcm_substream *substream, val |= 0x0 << 8; break; default: - dev_err(component->dev, "Unsupported sample rate %d\n", - params_rate(params)); + dev_err(component->dev, "%s: Unsupported sample rate %d\n", + __func__, params_rate(params)); return -EINVAL; } @@ -892,8 +892,8 @@ static int rt715_pcm_hw_params(struct snd_pcm_substream *substream, /* bit 3:0 Number of Channel */ val |= (params_channels(params) - 1); } else { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } diff --git a/sound/soc/codecs/rt722-sdca.c b/sound/soc/codecs/rt722-sdca.c index 0e1c65a20392..e0ea3a23f7cc 100644 --- a/sound/soc/codecs/rt722-sdca.c +++ b/sound/soc/codecs/rt722-sdca.c @@ -35,8 +35,8 @@ int rt722_sdca_index_write(struct rt722_sdca_priv *rt722, ret = regmap_write(regmap, addr, value); if (ret < 0) dev_err(&rt722->slave->dev, - "Failed to set private value: %06x <= %04x ret=%d\n", - addr, value, ret); + "%s: Failed to set private value: %06x <= %04x ret=%d\n", + __func__, addr, value, ret); return ret; } @@ -51,8 +51,8 @@ int rt722_sdca_index_read(struct rt722_sdca_priv *rt722, ret = regmap_read(regmap, addr, value); if (ret < 0) dev_err(&rt722->slave->dev, - "Failed to get private value: %06x => %04x ret=%d\n", - addr, *value, ret); + "%s: Failed to get private value: %06x => %04x ret=%d\n", + __func__, addr, *value, ret); return ret; } @@ -663,7 +663,8 @@ static int rt722_sdca_dmic_set_gain_put(struct snd_kcontrol *kcontrol, for (i = 0; i < p->count; i++) { err = regmap_write(rt722->mbq_regmap, p->reg_base + i, gain_val[i]); if (err < 0) - dev_err(&rt722->slave->dev, "%#08x can't be set\n", p->reg_base + i); + dev_err(&rt722->slave->dev, "%s: %#08x can't be set\n", + __func__, p->reg_base + i); } return changed; @@ -1211,13 +1212,13 @@ static int rt722_sdca_pcm_hw_params(struct snd_pcm_substream *substream, retval = sdw_stream_add_slave(rt722->slave, &stream_config, &port_config, 1, sdw_stream); if (retval) { - dev_err(dai->dev, "Unable to configure port\n"); + dev_err(dai->dev, "%s: Unable to configure port\n", __func__); return retval; } if (params_channels(params) > 16) { - dev_err(component->dev, "Unsupported channels %d\n", - params_channels(params)); + dev_err(component->dev, "%s: Unsupported channels %d\n", + __func__, params_channels(params)); return -EINVAL; } @@ -1236,8 +1237,8 @@ static int rt722_sdca_pcm_hw_params(struct snd_pcm_substream *substream, sampling_rate = RT722_SDCA_RATE_192000HZ; break; default: - dev_err(component->dev, "Rate %d is not supported\n", - params_rate(params)); + dev_err(component->dev, "%s: Rate %d is not supported\n", + __func__, params_rate(params)); return -EINVAL; } -- cgit v1.2.3 From 7a84602297d36617dbdadeba55a2567031e5165b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 19 Mar 2024 12:34:45 -0400 Subject: 9p: explicitly deny setlease attempts 9p is a remote network protocol, and it doesn't support asynchronous notifications from the server. Ensure that we don't hand out any leases since we can't guarantee they'll be broken when a file's contents change. Signed-off-by: Jeff Layton Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index abdbbaee5184..348cc90bf9c5 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -520,6 +520,7 @@ const struct file_operations v9fs_file_operations = { .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync, + .setlease = simple_nosetlease, }; const struct file_operations v9fs_file_operations_dotl = { @@ -534,4 +535,5 @@ const struct file_operations v9fs_file_operations_dotl = { .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, + .setlease = simple_nosetlease, }; -- cgit v1.2.3 From fc563aa900659a850e2ada4af26b9d7a3de6c591 Mon Sep 17 00:00:00 2001 From: Stephen Lee Date: Mon, 25 Mar 2024 18:01:31 -0700 Subject: ASoC: ops: Fix wraparound for mask in snd_soc_get_volsw In snd_soc_info_volsw(), mask is generated by figuring out the index of the most significant bit set in max and converting the index to a bitmask through bit shift 1. Unintended wraparound occurs when max is an integer value with msb bit set. Since the bit shift value 1 is treated as an integer type, the left shift operation will wraparound and set mask to 0 instead of all 1's. In order to fix this, we type cast 1 as `1ULL` to prevent the wraparound. Fixes: 7077148fb50a ("ASoC: core: Split ops out of soc-core.c") Signed-off-by: Stephen Lee Link: https://msgid.link/r/20240326010131.6211-1-slee08177@gmail.com Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 2d25748ca706..b27e89ff6a16 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -263,7 +263,7 @@ int snd_soc_get_volsw(struct snd_kcontrol *kcontrol, int max = mc->max; int min = mc->min; int sign_bit = mc->sign_bit; - unsigned int mask = (1 << fls(max)) - 1; + unsigned int mask = (1ULL << fls(max)) - 1; unsigned int invert = mc->invert; int val; int ret; -- cgit v1.2.3 From 7f1dd39aedfccf60772328c5b88d56dbd39954c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 27 Mar 2024 08:33:10 +0100 Subject: clk: Provide !COMMON_CLK dummy for devm_clk_rate_exclusive_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be able to compile drivers using devm_clk_rate_exclusive_get() also on platforms without the common clk framework, add a dummy implementation that does the same as clk_rate_exclusive_get() in that case (i.e. nothing). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403270305.ydvX9xq1-lkp@intel.com/ Fixes: b0cde62e4c54 ("clk: Add a devm variant of clk_rate_exclusive_get()") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240327073310.520950-2-u.kleine-koenig@pengutronix.de Signed-off-by: Stephen Boyd --- include/linux/clk.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/clk.h b/include/linux/clk.h index 00623f4de5e1..0fa56d672532 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -286,6 +286,11 @@ static inline int clk_rate_exclusive_get(struct clk *clk) return 0; } +static inline int devm_clk_rate_exclusive_get(struct device *dev, struct clk *clk) +{ + return 0; +} + static inline void clk_rate_exclusive_put(struct clk *clk) {} #endif -- cgit v1.2.3 From c90847bcbfb65d0f1c48fcc73a2b3a2d4ceac6a1 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Tue, 26 Mar 2024 22:45:24 -0700 Subject: cache: sifive_ccache: Partially convert to a platform driver Commit 8ec99b033147 ("irqchip/sifive-plic: Convert PLIC driver into a platform driver") broke ccache initialization because the PLIC IRQ domain is no longer available during an arch_initcall: [ 0.087229] irq: no irq domain found for interrupt-controller@c000000 ! [ 0.087255] CCACHE: Could not request IRQ 0 Fix this by moving the IRQ handling code to a platform driver. Fixes: 8ec99b033147 ("irqchip/sifive-plic: Convert PLIC driver into a platform driver") Signed-off-by: Samuel Holland Tested-by: Geert Uytterhoeven Signed-off-by: Conor Dooley --- drivers/cache/sifive_ccache.c | 72 +++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/cache/sifive_ccache.c b/drivers/cache/sifive_ccache.c index 89ed6cd6b059..e9cc8b4786fb 100644 --- a/drivers/cache/sifive_ccache.c +++ b/drivers/cache/sifive_ccache.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include @@ -247,13 +249,49 @@ static irqreturn_t ccache_int_handler(int irq, void *device) return IRQ_HANDLED; } +static int sifive_ccache_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + unsigned long quirks; + int intr_num, rc; + + quirks = (unsigned long)device_get_match_data(dev); + + intr_num = platform_irq_count(pdev); + if (!intr_num) + return dev_err_probe(dev, -ENODEV, "No interrupts property\n"); + + for (int i = 0; i < intr_num; i++) { + if (i == DATA_UNCORR && (quirks & QUIRK_BROKEN_DATA_UNCORR)) + continue; + + g_irq[i] = platform_get_irq(pdev, i); + if (g_irq[i] < 0) + return g_irq[i]; + + rc = devm_request_irq(dev, g_irq[i], ccache_int_handler, 0, "ccache_ecc", NULL); + if (rc) + return dev_err_probe(dev, rc, "Could not request IRQ %d\n", g_irq[i]); + } + + return 0; +} + +static struct platform_driver sifive_ccache_driver = { + .probe = sifive_ccache_probe, + .driver = { + .name = "sifive_ccache", + .of_match_table = sifive_ccache_ids, + }, +}; + static int __init sifive_ccache_init(void) { struct device_node *np; struct resource res; - int i, rc, intr_num; const struct of_device_id *match; unsigned long quirks; + int rc; np = of_find_matching_node_and_match(NULL, sifive_ccache_ids, &match); if (!np) @@ -277,28 +315,6 @@ static int __init sifive_ccache_init(void) goto err_unmap; } - intr_num = of_property_count_u32_elems(np, "interrupts"); - if (!intr_num) { - pr_err("No interrupts property\n"); - rc = -ENODEV; - goto err_unmap; - } - - for (i = 0; i < intr_num; i++) { - g_irq[i] = irq_of_parse_and_map(np, i); - - if (i == DATA_UNCORR && (quirks & QUIRK_BROKEN_DATA_UNCORR)) - continue; - - rc = request_irq(g_irq[i], ccache_int_handler, 0, "ccache_ecc", - NULL); - if (rc) { - pr_err("Could not request IRQ %d\n", g_irq[i]); - goto err_free_irq; - } - } - of_node_put(np); - #ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS if (quirks & QUIRK_NONSTANDARD_CACHE_OPS) { riscv_cbom_block_size = SIFIVE_CCACHE_LINE_SIZE; @@ -315,11 +331,15 @@ static int __init sifive_ccache_init(void) #ifdef CONFIG_DEBUG_FS setup_sifive_debug(); #endif + + rc = platform_driver_register(&sifive_ccache_driver); + if (rc) + goto err_unmap; + + of_node_put(np); + return 0; -err_free_irq: - while (--i >= 0) - free_irq(g_irq[i], NULL); err_unmap: iounmap(ccache_base); err_node_put: -- cgit v1.2.3 From 931ec1e4cb7fd81fe01e85419238a9cfb9d930c9 Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 25 Mar 2024 11:12:28 -0700 Subject: Documentation: Add documentation for eswitch attribute Provide devlink documentation for three eswitch attributes: mode, inline-mode, and encap-mode. Signed-off-by: William Tu Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240325181228.6244-1-witu@nvidia.com Signed-off-by: Jakub Kicinski --- .../networking/devlink/devlink-eswitch-attr.rst | 76 ++++++++++++++++++++++ Documentation/networking/devlink/index.rst | 1 + Documentation/networking/representors.rst | 1 + 3 files changed, 78 insertions(+) create mode 100644 Documentation/networking/devlink/devlink-eswitch-attr.rst diff --git a/Documentation/networking/devlink/devlink-eswitch-attr.rst b/Documentation/networking/devlink/devlink-eswitch-attr.rst new file mode 100644 index 000000000000..08bb39ab1528 --- /dev/null +++ b/Documentation/networking/devlink/devlink-eswitch-attr.rst @@ -0,0 +1,76 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +Devlink E-Switch Attribute +========================== + +Devlink E-Switch supports two modes of operation: legacy and switchdev. +Legacy mode operates based on traditional MAC/VLAN steering rules. Switching +decisions are made based on MAC addresses, VLANs, etc. There is limited ability +to offload switching rules to hardware. + +On the other hand, switchdev mode allows for more advanced offloading +capabilities of the E-Switch to hardware. In switchdev mode, more switching +rules and logic can be offloaded to the hardware switch ASIC. It enables +representor netdevices that represent the slow path of virtual functions (VFs) +or scalable-functions (SFs) of the device. See more information about +:ref:`Documentation/networking/switchdev.rst ` and +:ref:`Documentation/networking/representors.rst `. + +In addition, the devlink E-Switch also comes with other attributes listed +in the following section. + +Attributes Description +====================== + +The following is a list of E-Switch attributes. + +.. list-table:: E-Switch attributes + :widths: 8 5 45 + + * - Name + - Type + - Description + * - ``mode`` + - enum + - The mode of the device. The mode can be one of the following: + + * ``legacy`` operates based on traditional MAC/VLAN steering + rules. + * ``switchdev`` allows for more advanced offloading capabilities of + the E-Switch to hardware. + * - ``inline-mode`` + - enum + - Some HWs need the VF driver to put part of the packet + headers on the TX descriptor so the e-switch can do proper + matching and steering. Support for both switchdev mode and legacy mode. + + * ``none`` none. + * ``link`` L2 mode. + * ``network`` L3 mode. + * ``transport`` L4 mode. + * - ``encap-mode`` + - enum + - The encapsulation mode of the device. Support for both switchdev mode + and legacy mode. The mode can be one of the following: + + * ``none`` Disable encapsulation support. + * ``basic`` Enable encapsulation support. + +Example Usage +============= + +.. code:: shell + + # enable switchdev mode + $ devlink dev eswitch set pci/0000:08:00.0 mode switchdev + + # set inline-mode and encap-mode + $ devlink dev eswitch set pci/0000:08:00.0 inline-mode none encap-mode basic + + # display devlink device eswitch attributes + $ devlink dev eswitch show pci/0000:08:00.0 + pci/0000:08:00.0: mode switchdev inline-mode none encap-mode basic + + # enable encap-mode with legacy mode + $ devlink dev eswitch set pci/0000:08:00.0 mode legacy inline-mode none encap-mode basic diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst index e14d7a701b72..948c8c44e233 100644 --- a/Documentation/networking/devlink/index.rst +++ b/Documentation/networking/devlink/index.rst @@ -67,6 +67,7 @@ general. devlink-selftests devlink-trap devlink-linecard + devlink-eswitch-attr Driver-specific documentation ----------------------------- diff --git a/Documentation/networking/representors.rst b/Documentation/networking/representors.rst index decb39c19b9e..5e23386f6968 100644 --- a/Documentation/networking/representors.rst +++ b/Documentation/networking/representors.rst @@ -1,4 +1,5 @@ .. SPDX-License-Identifier: GPL-2.0 +.. _representors: ============================= Network Function Representors -- cgit v1.2.3 From fa84513997e9703fbac94b73bbe50aafdb29040e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Mar 2024 09:14:13 +0100 Subject: ptp: MAINTAINERS: drop Jeff Sipek Emails to Jeff Sipek bounce: Your message to jsipek@vmware.com couldn't be delivered. Recipient is not authorized to accept external mail Status code: 550 5.7.1_ETR Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240327081413.306054-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6a233e1a3cf2..7b8f97bf7975 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23661,7 +23661,6 @@ F: drivers/scsi/vmw_pvscsi.c F: drivers/scsi/vmw_pvscsi.h VMWARE VIRTUAL PTP CLOCK DRIVER -M: Jeff Sipek R: Ajay Kaher R: Alexey Makhalov R: VMware PV-Drivers Reviewers -- cgit v1.2.3 From 037965402a010898d34f4e35327d22c0a95cd51f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Mar 2024 13:14:56 +0100 Subject: xen-netfront: Add missing skb_mark_for_recycle Notice that skb_mark_for_recycle() is introduced later than fixes tag in commit 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling"). It is believed that fixes tag were missing a call to page_pool_release_page() between v5.9 to v5.14, after which is should have used skb_mark_for_recycle(). Since v6.6 the call page_pool_release_page() were removed (in commit 535b9c61bdef ("net: page_pool: hide page_pool_release_page()") and remaining callers converted (in commit 6bfef2ec0172 ("Merge branch 'net-page_pool-remove-page_pool_release_page'")). This leak became visible in v6.8 via commit dba1b8a7ab68 ("mm/page_pool: catch page_pool memory leaks"). Cc: stable@vger.kernel.org Fixes: 6c5aa6fc4def ("xen networking: add basic XDP support for xen-netfront") Reported-by: Leonidas Spyropoulos Link: https://bugzilla.kernel.org/show_bug.cgi?id=218654 Reported-by: Arthur Borsboom Signed-off-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/171154167446.2671062.9127105384591237363.stgit@firesoul Signed-off-by: Jakub Kicinski --- drivers/net/xen-netfront.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index ad29f370034e..8d2aee88526c 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -285,6 +285,7 @@ static struct sk_buff *xennet_alloc_one_rx_buffer(struct netfront_queue *queue) return NULL; } skb_add_rx_frag(skb, 0, page, 0, 0, PAGE_SIZE); + skb_mark_for_recycle(skb); /* Align ip header to a 16 bytes boundary */ skb_reserve(skb, NET_IP_ALIGN); -- cgit v1.2.3 From e9c856cabefb71d47b2eeb197f72c9c88e9b45b0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 27 Mar 2024 22:24:25 -0700 Subject: bpf: put uprobe link's path and task in release callback There is no need to delay putting either path or task to deallocation step. It can be done right after bpf_uprobe_unregister. Between release and dealloc, there could be still some running BPF programs, but they don't access either task or path, only data in link->uprobes, so it is safe to do. On the other hand, doing path_put() in dealloc callback makes this dealloc sleepable because path_put() itself might sleep. Which is problematic due to the need to call uprobe's dealloc through call_rcu(), which is what is done in the next bug fix patch. So solve the problem by releasing these resources early. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240328052426.3042617-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0a5c4efc73c3..0b73fe5f7206 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3157,6 +3157,9 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link) umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + if (umulti_link->task) + put_task_struct(umulti_link->task); + path_put(&umulti_link->path); } static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) @@ -3164,9 +3167,6 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - if (umulti_link->task) - put_task_struct(umulti_link->task); - path_put(&umulti_link->path); kvfree(umulti_link->uprobes); kfree(umulti_link); } -- cgit v1.2.3 From 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 27 Mar 2024 22:24:26 -0700 Subject: bpf: support deferring bpf_link dealloc to after RCU grace period BPF link for some program types is passed as a "context" which can be used by those BPF programs to look up additional information. E.g., for multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values. Because of this runtime dependency, when bpf_link refcnt drops to zero there could still be active BPF programs running accessing link data. This patch adds generic support to defer bpf_link dealloc callback to after RCU GP, if requested. This is done by exposing two different deallocation callbacks, one synchronous and one deferred. If deferred one is provided, bpf_link_free() will schedule dealloc_deferred() callback to happen after RCU GP. BPF is using two flavors of RCU: "classic" non-sleepable one and RCU tasks trace one. The latter is used when sleepable BPF programs are used. bpf_link_free() accommodates that by checking underlying BPF program's sleepable flag, and goes either through normal RCU GP only for non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP (taking into account rcu_trace_implies_rcu_gp() optimization), if BPF program is sleepable. We use this for multi-kprobe and multi-uprobe links, which dereference link during program run. We also preventively switch raw_tp link to use deferred dealloc callback, as upcoming changes in bpf-next tree expose raw_tp link data (specifically, cookie value) to BPF program at runtime as well. Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 +++++++++++++++- kernel/bpf/syscall.c | 35 ++++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 4 ++-- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f20f62f9d63..890e152d553e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1574,12 +1574,26 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; - struct work_struct work; + /* rcu is used before freeing, work can be used to schedule that + * RCU-based freeing before that, so they never overlap + */ + union { + struct rcu_head rcu; + struct work_struct work; + }; }; struct bpf_link_ops { void (*release)(struct bpf_link *link); + /* deallocate link resources callback, called without RCU grace period + * waiting + */ void (*dealloc)(struct bpf_link *link); + /* deallocate link resources callback, called after RCU grace period; + * if underlying BPF program is sleepable we go through tasks trace + * RCU GP and then "classic" RCU GP + */ + void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae2ff73bde7e..c287925471f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3024,17 +3024,46 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); } +static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) +{ + struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); + + /* free bpf_link and its containing memory */ + link->ops->dealloc_deferred(link); +} + +static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_link_defer_dealloc_rcu_gp(rcu); + else + call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp); +} + /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bool sleepable = false; + bpf_link_free_id(link->id); if (link->prog) { + sleepable = link->prog->sleepable; /* detach BPF program, clean up used resources */ link->ops->release(link); bpf_prog_put(link->prog); } - /* free bpf_link and its containing memory */ - link->ops->dealloc(link); + if (link->ops->dealloc_deferred) { + /* schedule BPF link deallocation; if underlying BPF program + * is sleepable, we need to first wait for RCU tasks trace + * sync, then go through "classic" RCU grace period + */ + if (sleepable) + call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); + else + call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); + } + if (link->ops->dealloc) + link->ops->dealloc(link); } static void bpf_link_put_deferred(struct work_struct *work) @@ -3544,7 +3573,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, - .dealloc = bpf_raw_tp_link_dealloc, + .dealloc_deferred = bpf_raw_tp_link_dealloc, .show_fdinfo = bpf_raw_tp_link_show_fdinfo, .fill_link_info = bpf_raw_tp_link_fill_link_info, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0b73fe5f7206..9dc605f08a23 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2728,7 +2728,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, - .dealloc = bpf_kprobe_multi_link_dealloc, + .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, }; @@ -3242,7 +3242,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, - .dealloc = bpf_uprobe_multi_link_dealloc, + .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, }; -- cgit v1.2.3 From 62248b22d01e96a4d669cde0d7005bd51ebf9e76 Mon Sep 17 00:00:00 2001 From: Natanael Copa Date: Thu, 28 Mar 2024 11:59:13 +0100 Subject: tools/resolve_btfids: fix build with musl libc Include the header that defines u32. This fixes build of 6.6.23 and 6.1.83 kernels for Alpine Linux, which uses musl libc. I assume that GNU libc indirecly pulls in linux/types.h. Fixes: 9707ac4fe2f5 ("tools/resolve_btfids: Refactor set sorting with types from btf_ids.h") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218647 Cc: stable@vger.kernel.org Signed-off-by: Natanael Copa Tested-by: Greg Thelen Link: https://lore.kernel.org/r/20240328110103.28734-1-ncopa@alpinelinux.org Signed-off-by: Alexei Starovoitov --- tools/include/linux/btf_ids.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h index 72535f00572f..72ea363d434d 100644 --- a/tools/include/linux/btf_ids.h +++ b/tools/include/linux/btf_ids.h @@ -3,6 +3,8 @@ #ifndef _LINUX_BTF_IDS_H #define _LINUX_BTF_IDS_H +#include /* for u32 */ + struct btf_id_set { u32 cnt; u32 ids[]; -- cgit v1.2.3 From 8cb10cba124c4798b6cb333245ecdc8dde78aeae Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Wed, 28 Feb 2024 12:02:15 -0800 Subject: arm64: dts: freescale: imx8mp-venice-gw72xx-2x: fix USB vbus regulator When using usb-conn-gpio to control USB role and VBUS, the vbus-supply property must be present in the usb-conn-gpio node. Additionally it should not be present in the phy node as that isn't what controls vbus and will upset the use count. This resolves an issue where VBUS is enabled with OTG in peripheral mode. Fixes: ad9a12f7a522 ("arm64: dts: imx8mp-venice: Fix USB connector description") Signed-off-by: Tim Harvey Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi index 41c79d2ebdd6..f24b14744799 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi @@ -14,6 +14,7 @@ pinctrl-0 = <&pinctrl_usbcon1>; type = "micro"; label = "otg"; + vbus-supply = <®_usb1_vbus>; id-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; port { @@ -183,7 +184,6 @@ }; &usb3_phy0 { - vbus-supply = <®_usb1_vbus>; status = "okay"; }; -- cgit v1.2.3 From 6f8e0aca838e163e81fde176e945161d50679339 Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Wed, 28 Feb 2024 12:02:16 -0800 Subject: arm64: dts: freescale: imx8mp-venice-gw73xx-2x: fix USB vbus regulator When using usb-conn-gpio to control USB role and VBUS, the vbus-supply property must be present in the usb-conn-gpio node. Additionally it should not be present in the phy node as that isn't what controls vbus and will upset the use count. This resolves an issue where VBUS is enabled with OTG in peripheral mode. Fixes: ad9a12f7a522 ("arm64: dts: imx8mp-venice: Fix USB connector description") Signed-off-by: Tim Harvey Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi index d5c400b355af..f5491a608b2f 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi @@ -14,6 +14,7 @@ pinctrl-0 = <&pinctrl_usbcon1>; type = "micro"; label = "otg"; + vbus-supply = <®_usb1_vbus>; id-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; port { @@ -202,7 +203,6 @@ }; &usb3_phy0 { - vbus-supply = <®_usb1_vbus>; status = "okay"; }; -- cgit v1.2.3 From 831ec5e3538e989c7995137b5c5c661991a09504 Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Thu, 28 Mar 2024 23:47:37 +0100 Subject: ASoC: tas2781: mark dvc_tlv with __maybe_unused Since we put dvc_tlv static variable to a header file it's copied to each module that includes the header. But not all of them are actually used it. Fix this W=1 build warning: include/sound/tas2781-tlv.h:18:35: warning: 'dvc_tlv' defined but not used [-Wunused-const-variable=] Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403290354.v0StnRpc-lkp@intel.com/ Fixes: ae065d0ce9e3 ("ALSA: hda/tas2781: remove digital gain kcontrol") Signed-off-by: Gergo Koteles Message-ID: <0e461545a2a6e9b6152985143e50526322e5f76b.1711665731.git.soyer@irl.hu> Signed-off-by: Takashi Iwai --- include/sound/tas2781-tlv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sound/tas2781-tlv.h b/include/sound/tas2781-tlv.h index 4038dd421150..1dc59005d241 100644 --- a/include/sound/tas2781-tlv.h +++ b/include/sound/tas2781-tlv.h @@ -15,7 +15,7 @@ #ifndef __TAS2781_TLV_H__ #define __TAS2781_TLV_H__ -static const DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 100, 0); +static const __maybe_unused DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 100, 0); static const DECLARE_TLV_DB_SCALE(amp_vol_tlv, 1100, 50, 0); #endif -- cgit v1.2.3 From 10e52ad5ced2a7dcdb3fb18c9cef111d5f30471d Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Tue, 26 Mar 2024 09:56:49 +0100 Subject: net: hsr: Use full string description when opening HSR network device Up till now only single character ('A' or 'B') was used to provide information of HSR slave network device status. As it is also possible and valid, that Interlink network device may be supported as well, the description must be more verbose. As a result the full string description is now used. Signed-off-by: Lukasz Majewski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index c98b5b71ad7c..e9d45133d641 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -132,30 +132,29 @@ static int hsr_dev_open(struct net_device *dev) { struct hsr_priv *hsr; struct hsr_port *port; - char designation; + const char *designation = NULL; hsr = netdev_priv(dev); - designation = '\0'; hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: - designation = 'A'; + designation = "Slave A"; break; case HSR_PT_SLAVE_B: - designation = 'B'; + designation = "Slave B"; break; default: - designation = '?'; + designation = "Unknown"; } if (!is_slave_up(port->dev)) - netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a fully working HSR network\n", + netdev_warn(dev, "%s (%s) is not up; please bring it up to get a fully working HSR network\n", designation, port->dev->name); } - if (designation == '\0') + if (!designation) netdev_warn(dev, "No slave devices configured\n"); return 0; -- cgit v1.2.3 From 3d010c8031e39f5fa1e8b13ada77e0321091011f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:33:58 +0100 Subject: udp: do not accept non-tunnel GSO skbs landing in a tunnel When rx-udp-gro-forwarding is enabled UDP packets might be GROed when being forwarded. If such packets might land in a tunnel this can cause various issues and udp_gro_receive makes sure this isn't the case by looking for a matching socket. This is performed in udp4/6_gro_lookup_skb but only in the current netns. This is an issue with tunneled packets when the endpoint is in another netns. In such cases the packets will be GROed at the UDP level, which leads to various issues later on. The same thing can happen with rx-gro-list. We saw this with geneve packets being GROed at the UDP level. In such case gso_size is set; later the packet goes through the geneve rx path, the geneve header is pulled, the offset are adjusted and frag_list skbs are not adjusted with regard to geneve. When those skbs hit skb_fragment, it will misbehave. Different outcomes are possible depending on what the GROed skbs look like; from corrupted packets to kernel crashes. One example is a BUG_ON[1] triggered in skb_segment while processing the frag_list. Because gso_size is wrong (geneve header was pulled) skb_segment thinks there is "geneve header size" of data in frag_list, although it's in fact the next packet. The BUG_ON itself has nothing to do with the issue. This is only one of the potential issues. Looking up for a matching socket in udp_gro_receive is fragile: the lookup could be extended to all netns (not speaking about performances) but nothing prevents those packets from being modified in between and we could still not find a matching socket. It's OK to keep the current logic there as it should cover most cases but we also need to make sure we handle tunnel packets being GROed too early. This is done by extending the checks in udp_unexpected_gso: GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits and landing in a tunnel must be segmented. [1] kernel BUG at net/core/skbuff.c:4408! RIP: 0010:skb_segment+0xd2a/0xf70 __udp_gso_segment+0xaa/0x560 Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Fixes: 36707061d6ba ("udp: allow forwarding of plain (non-fraglisted) UDP GRO packets") Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 28 ++++++++++++++++++++++++++++ net/ipv4/udp.c | 7 +++++++ net/ipv4/udp_offload.c | 6 ++++-- net/ipv6/udp.c | 2 +- 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 3748e82b627b..17539d089666 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -150,6 +150,24 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, } } +DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); +#if IS_ENABLED(CONFIG_IPV6) +DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +#endif + +static inline bool udp_encap_needed(void) +{ + if (static_branch_unlikely(&udp_encap_needed_key)) + return true; + +#if IS_ENABLED(CONFIG_IPV6) + if (static_branch_unlikely(&udpv6_encap_needed_key)) + return true; +#endif + + return false; +} + static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) { if (!skb_is_gso(skb)) @@ -163,6 +181,16 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) !udp_test_bit(ACCEPT_FRAGLIST, sk)) return true; + /* GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits might still + * land in a tunnel as the socket check in udp_gro_receive cannot be + * foolproof. + */ + if (udp_encap_needed() && + READ_ONCE(udp_sk(sk)->encap_rcv) && + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM))) + return true; + return false; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 661d0e0d273f..c02bf011d4a6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -582,6 +582,13 @@ static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +EXPORT_SYMBOL(udp_encap_needed_key); + +#if IS_ENABLED(CONFIG_IPV6) +DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +EXPORT_SYMBOL(udpv6_encap_needed_key); +#endif + void udp_encap_enable(void) { static_branch_inc(&udp_encap_needed_key); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b9880743765c..e9719afe91cf 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -551,8 +551,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, unsigned int off = skb_gro_offset(skb); int flush = 1; - /* we can do L4 aggregation only if the packet can't land in a tunnel - * otherwise we could corrupt the inner stream + /* We can do L4 aggregation only if the packet can't land in a tunnel + * otherwise we could corrupt the inner stream. Detecting such packets + * cannot be foolproof and the aggregation might still happen in some + * cases. Such packets should be caught in udp_unexpected_gso later. */ NAPI_GRO_CB(skb)->is_flist = 0; if (!sk || !udp_sk(sk)->gro_receive) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7c1e6469d091..8b1dd7f51249 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -447,7 +447,7 @@ csum_copy_err: goto try_again; } -DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void) { static_branch_inc(&udpv6_encap_needed_key); -- cgit v1.2.3 From ed4cccef64c1d0d5b91e69f7a8a6697c3a865486 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:33:59 +0100 Subject: gro: fix ownership transfer If packets are GROed with fraglist they might be segmented later on and continue their journey in the stack. In skb_segment_list those skbs can be reused as-is. This is an issue as their destructor was removed in skb_gro_receive_list but not the reference to their socket, and then they can't be orphaned. Fix this by also removing the reference to the socket. For example this could be observed, kernel BUG at include/linux/skbuff.h:3131! (skb_orphan) RIP: 0010:ip6_rcv_core+0x11bc/0x19a0 Call Trace: ipv6_list_rcv+0x250/0x3f0 __netif_receive_skb_list_core+0x49d/0x8f0 netif_receive_skb_list_internal+0x634/0xd40 napi_complete_done+0x1d2/0x7d0 gro_cell_poll+0x118/0x1f0 A similar construction is found in skb_gro_receive, apply the same change there. Fixes: 5e10da5385d2 ("skbuff: allow 'slow_gro' for skb carring sock reference") Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/gro.c | 3 ++- net/ipv4/udp_offload.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/gro.c b/net/core/gro.c index ee30d4f0c038..83f35d99a682 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -192,8 +192,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) } merge: - /* sk owenrship - if any - completely transferred to the aggregated packet */ + /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; + skb->sk = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e9719afe91cf..3bb69464930b 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -449,8 +449,9 @@ static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) NAPI_GRO_CB(p)->count++; p->data_len += skb->len; - /* sk owenrship - if any - completely transferred to the aggregated packet */ + /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; + skb->sk = NULL; p->truesize += skb->truesize; p->len += skb->len; -- cgit v1.2.3 From f0b8c30345565344df2e33a8417a27503589247d Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:34:00 +0100 Subject: udp: do not transition UDP GRO fraglist partial checksums to unnecessary UDP GRO validates checksums and in udp4/6_gro_complete fraglist packets are converted to CHECKSUM_UNNECESSARY to avoid later checks. However this is an issue for CHECKSUM_PARTIAL packets as they can be looped in an egress path and then their partial checksums are not fixed. Different issues can be observed, from invalid checksum on packets to traces like: gen01: hw csum failure skb len=3008 headroom=160 headlen=1376 tailroom=0 mac=(106,14) net=(120,40) trans=160 shinfo(txflags=0 nr_frags=0 gso(size=0 type=0 segs=0)) csum(0xffff232e ip_summed=2 complete_sw=0 valid=0 level=0) hash(0x77e3d716 sw=1 l4=1) proto=0x86dd pkttype=0 iif=12 ... Fix this by only converting CHECKSUM_NONE packets to CHECKSUM_UNNECESSARY by reusing __skb_incr_checksum_unnecessary. All other checksum types are kept as-is, including CHECKSUM_COMPLETE as fraglist packets being segmented back would have their skb->csum valid. Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 8 +------- net/ipv6/udp_offload.c | 8 +------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 3bb69464930b..548476d78237 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -722,13 +722,7 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (skb->csum_level < SKB_MAX_CSUM_LEVEL) - skb->csum_level++; - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = 0; - } + __skb_incr_checksum_unnecessary(skb); return 0; } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 312bcaeea96f..bbd347de00b4 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -174,13 +174,7 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (skb->csum_level < SKB_MAX_CSUM_LEVEL) - skb->csum_level++; - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = 0; - } + __skb_incr_checksum_unnecessary(skb); return 0; } -- cgit v1.2.3 From 64235eabc4b5b18c507c08a1f16cdac6c5661220 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:34:01 +0100 Subject: udp: prevent local UDP tunnel packets from being GROed GRO has a fundamental issue with UDP tunnel packets as it can't detect those in a foolproof way and GRO could happen before they reach the tunnel endpoint. Previous commits have fixed issues when UDP tunnel packets come from a remote host, but if those packets are issued locally they could run into checksum issues. If the inner packet has a partial checksum the information will be lost in the GRO logic, either in udp4/6_gro_complete or in udp_gro_complete_segment and packets will have an invalid checksum when leaving the host. Prevent local UDP tunnel packets from ever being GROed at the outer UDP level. Due to skb->encapsulation being wrongly used in some drivers this is actually only preventing UDP tunnel packets with a partial checksum to be GROed (see iptunnel_handle_offloads) but those were also the packets triggering issues so in practice this should be sufficient. Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Fixes: 36707061d6ba ("udp: allow forwarding of plain (non-fraglisted) UDP GRO packets") Suggested-by: Paolo Abeni Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 548476d78237..3498dd1d0694 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -559,6 +559,12 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, */ NAPI_GRO_CB(skb)->is_flist = 0; if (!sk || !udp_sk(sk)->gro_receive) { + /* If the packet was locally encapsulated in a UDP tunnel that + * wasn't detected above, do not GRO. + */ + if (skb->encapsulation) + goto out; + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1; -- cgit v1.2.3 From 0fb101be97ca27850c5ecdbd1269423ce4d1f607 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:34:02 +0100 Subject: selftests: net: gro fwd: update vxlan GRO test expectations UDP tunnel packets can't be GRO in-between their endpoints as this causes different issues. The UDP GRO fwd vxlan tests were relying on this and their expectations have to be fixed. We keep both vxlan tests and expected no GRO from happening. The vxlan UDP GRO bench test was removed as it's not providing any valuable information now. Fixes: a062260a9d5f ("selftests: net: add UDP GRO forwarding self-tests") Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgro_fwd.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index 380cb15e942e..83ed987cff34 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -244,7 +244,7 @@ for family in 4 6; do create_vxlan_pair ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on - run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1 + run_test "GRO frag list over UDP tunnel" $OL_NET$DST 10 10 cleanup # use NAT to circumvent GRO FWD check @@ -258,13 +258,7 @@ for family in 4 6; do # load arp cache before running the test to reduce the amount of # stray traffic on top of the UDP tunnel ip netns exec $NS_SRC $PING -q -c 1 $OL_NET$DST_NAT >/dev/null - run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST - cleanup - - create_vxlan_pair - run_bench "UDP tunnel fwd perf" $OL_NET$DST - ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on - run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST + run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 10 10 $OL_NET$DST cleanup done -- cgit v1.2.3 From 0ba80d96585662299d4ea4624043759ce9015421 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Tue, 26 Mar 2024 17:51:49 +0530 Subject: octeontx2-af: Fix issue with loading coalesced KPU profiles The current implementation for loading coalesced KPU profiles has a limitation. The "offset" field, which is used to locate profiles within the profile is restricted to a u16. This restricts the number of profiles that can be loaded. This patch addresses this limitation by increasing the size of the "offset" field. Fixes: 11c730bfbf5b ("octeontx2-af: support for coalescing KPU profiles") Signed-off-by: Hariprasad Kelam Reviewed-by: Kalesh AP Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index e350242bbafb..be709f83f331 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -1657,7 +1657,7 @@ static int npc_fwdb_detect_load_prfl_img(struct rvu *rvu, uint64_t prfl_sz, struct npc_coalesced_kpu_prfl *img_data = NULL; int i = 0, rc = -EINVAL; void __iomem *kpu_prfl_addr; - u16 offset; + u32 offset; img_data = (struct npc_coalesced_kpu_prfl __force *)rvu->kpu_prfl_addr; if (le64_to_cpu(img_data->signature) == KPU_SIGN && -- cgit v1.2.3 From 73dfe970c038d0548beccc5bfb2707e1d543b01f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 29 Mar 2024 11:35:40 +0100 Subject: pwm: Fix setting period with #pwm-cells = <1> and of_pwm_single_xlate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For drivers making use of of_pwm_single_xlate() (i.e. those that don't pass a hwpwm index) and also don't pass flags, setting period was wrongly skipped. This affects the pwm-pxa and ti-sn65dsi86 drivers. Reported-by: Karel Balej Link: https://lore.kernel.org/r/D05IVTPYH35N.2CLDG6LSILRSN@matfyz.cz Fixes: 40ade0c2e794 ("pwm: Let the of_xlate callbacks accept references without period") Tested-by: Karel Balej Link: https://lore.kernel.org/r/20240329103544.545290-2-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index d70f793ce4b3..403525cc1783 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -443,7 +443,7 @@ of_pwm_single_xlate(struct pwm_chip *chip, const struct of_phandle_args *args) if (IS_ERR(pwm)) return pwm; - if (args->args_count > 1) + if (args->args_count > 0) pwm->args.period = args->args[0]; pwm->args.polarity = PWM_POLARITY_NORMAL; -- cgit v1.2.3 From a3d3eab627bbbb0cb175910cf8d0f7022628a642 Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Fri, 29 Mar 2024 17:58:40 +0900 Subject: spi: s3c64xx: Use DMA mode from fifo size If the SPI data size is smaller than FIFO, it operates in PIO mode, and if it is larger than FIFO size, it oerates in DMA mode. If the SPI data size is equal to fifo, it operates in PIO mode and it is separated to 2 transfers. To prevent it, it must operate in DMA mode from the case where the data size and the fifo size are the same. Fixes: 1ee806718d5e ("spi: s3c64xx: support interrupt based pio mode") Signed-off-by: Jaewon Kim Reviewed-by: Sam Protsenko Link: https://lore.kernel.org/r/20240329085840.65856-1-jaewon02.kim@samsung.com Signed-off-by: Mark Brown --- drivers/spi/spi-s3c64xx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c index 9fcbe040cb2f..f726d8670428 100644 --- a/drivers/spi/spi-s3c64xx.c +++ b/drivers/spi/spi-s3c64xx.c @@ -430,7 +430,7 @@ static bool s3c64xx_spi_can_dma(struct spi_controller *host, struct s3c64xx_spi_driver_data *sdd = spi_controller_get_devdata(host); if (sdd->rx_dma.ch && sdd->tx_dma.ch) - return xfer->len > sdd->fifo_depth; + return xfer->len >= sdd->fifo_depth; return false; } @@ -826,10 +826,9 @@ static int s3c64xx_spi_transfer_one(struct spi_controller *host, return status; } - if (!is_polling(sdd) && (xfer->len > fifo_len) && + if (!is_polling(sdd) && xfer->len >= fifo_len && sdd->rx_dma.ch && sdd->tx_dma.ch) { use_dma = 1; - } else if (xfer->len >= fifo_len) { tx_buf = xfer->tx_buf; rx_buf = xfer->rx_buf; -- cgit v1.2.3 From 4790a73ace86f3d165bbedba898e0758e6e1b82d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 14 Mar 2024 09:44:12 +0100 Subject: Revert "Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT" This reverts commit 7dcd3e014aa7faeeaf4047190b22d8a19a0db696. Qualcomm Bluetooth controllers like WCN6855 do not have persistent storage for the Bluetooth address and must therefore start as unconfigured to allow the user to set a valid address unless one has been provided by the boot firmware in the devicetree. A recent change snuck into v6.8-rc7 and incorrectly started marking the default (non-unique) address as valid. This specifically also breaks the Bluetooth setup for some user of the Lenovo ThinkPad X13s. Note that this is the second time Qualcomm breaks the driver this way and that this was fixed last year by commit 6945795bc81a ("Bluetooth: fix use-bdaddr-property quirk"), which also has some further details. Fixes: 7dcd3e014aa7 ("Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT") Cc: stable@vger.kernel.org # 6.8 Cc: Janaki Ramaiah Thota Signed-off-by: Johan Hovold Reported-by: Clayton Craft Tested-by: Clayton Craft Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_qca.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 8a60ad7acd70..4ecbcb1644cc 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -7,7 +7,6 @@ * * Copyright (C) 2007 Texas Instruments, Inc. * Copyright (c) 2010, 2012, 2018 The Linux Foundation. All rights reserved. - * Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved. * * Acknowledgements: * This file is based on hci_ll.c, which was... @@ -1904,17 +1903,7 @@ retry: case QCA_WCN6750: case QCA_WCN6855: case QCA_WCN7850: - - /* Set BDA quirk bit for reading BDA value from fwnode property - * only if that property exist in DT. - */ - if (fwnode_property_present(dev_fwnode(hdev->dev.parent), "local-bd-address")) { - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); - bt_dev_info(hdev, "setting quirk bit to read BDA from fwnode later"); - } else { - bt_dev_dbg(hdev, "local-bd-address` is not present in the devicetree so not setting quirk bit for BDA"); - } - + set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); hci_set_aosp_capable(hdev); ret = qca_read_soc_version(hdev, &ver, soc_type); -- cgit v1.2.3 From 7003de8a226ea07d36e9461a30633af26dc79248 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:51 +0100 Subject: dt-bindings: bluetooth: add 'qcom,local-bd-address-broken' Several Qualcomm Bluetooth controllers lack persistent storage for the device address and instead one can be provided by the boot firmware using the 'local-bd-address' devicetree property. The Bluetooth bindings clearly states that the address should be specified in little-endian order, but due to a long-standing bug in the Qualcomm driver which reversed the address some boot firmware has been providing the address in big-endian order instead. The only device out there that should be affected by this is the WCN3991 used in some Chromebooks. Add a 'qcom,local-bd-address-broken' property which can be set on these platforms to indicate that the boot firmware is using the wrong byte order. Note that ChromeOS always updates the kernel and devicetree in lockstep so that there is no need to handle backwards compatibility with older devicetrees. Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Reviewed-by: Rob Herring Signed-off-by: Luiz Augusto von Dentz --- .../devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml b/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml index 528ef3572b62..055a3351880b 100644 --- a/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml +++ b/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml @@ -94,6 +94,10 @@ properties: local-bd-address: true + qcom,local-bd-address-broken: + type: boolean + description: + boot firmware is incorrectly passing the address in big-endian order required: - compatible -- cgit v1.2.3 From e12e28009e584c8f8363439f6a928ec86278a106 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:52 +0100 Subject: arm64: dts: qcom: sc7180-trogdor: mark bluetooth address as broken Several Qualcomm Bluetooth controllers lack persistent storage for the device address and instead one can be provided by the boot firmware using the 'local-bd-address' devicetree property. The Bluetooth bindings clearly states that the address should be specified in little-endian order, but due to a long-standing bug in the Qualcomm driver which reversed the address some boot firmware has been providing the address in big-endian order instead. The boot firmware in SC7180 Trogdor Chromebooks is known to be affected so mark the 'local-bd-address' property as broken to maintain backwards compatibility with older firmware when fixing the underlying driver bug. Note that ChromeOS always updates the kernel and devicetree in lockstep so that there is no need to handle backwards compatibility with older devicetrees. Fixes: 7ec3e67307f8 ("arm64: dts: qcom: sc7180-trogdor: add initial trogdor and lazor dt") Cc: stable@vger.kernel.org # 5.10 Cc: Rob Clark Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Acked-by: Bjorn Andersson Reviewed-by: Bjorn Andersson Signed-off-by: Luiz Augusto von Dentz --- arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi index f3a6da8b2890..5260c63db007 100644 --- a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi @@ -944,6 +944,8 @@ ap_spi_fp: &spi10 { vddrf-supply = <&pp1300_l2c>; vddch0-supply = <&pp3300_l10c>; max-speed = <3200000>; + + qcom,local-bd-address-broken; }; }; -- cgit v1.2.3 From 39646f29b100566451d37abc4cc8cdd583756dfe Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:53 +0100 Subject: Bluetooth: add quirk for broken address properties Some Bluetooth controllers lack persistent storage for the device address and instead one can be provided by the boot firmware using the 'local-bd-address' devicetree property. The Bluetooth devicetree bindings clearly states that the address should be specified in little-endian order, but due to a long-standing bug in the Qualcomm driver which reversed the address some boot firmware has been providing the address in big-endian order instead. Add a new quirk that can be set on platforms with broken firmware and use it to reverse the address when parsing the property so that the underlying driver bug can be fixed. Fixes: 5c0a1001c8be ("Bluetooth: hci_qca: Add helper to set device address") Cc: stable@vger.kernel.org # 5.1 Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 9 +++++++++ net/bluetooth/hci_sync.c | 5 ++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8701ca5f31ee..5c12761cbc0e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -176,6 +176,15 @@ enum { */ HCI_QUIRK_USE_BDADDR_PROPERTY, + /* When this quirk is set, the Bluetooth Device Address provided by + * the 'local-bd-address' fwnode property is incorrectly specified in + * big-endian order. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BDADDR_PROPERTY_BROKEN, + /* When this quirk is set, the duplicate filtering during * scanning is based on Bluetooth devices addresses. To allow * RSSI based updates, restart scanning if needed. diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index f6b662369322..639090b9f4b8 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3416,7 +3416,10 @@ static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev) if (ret < 0 || !bacmp(&ba, BDADDR_ANY)) return; - bacpy(&hdev->public_addr, &ba); + if (test_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks)) + baswap(&hdev->public_addr, &ba); + else + bacpy(&hdev->public_addr, &ba); } struct hci_init_stage { -- cgit v1.2.3 From 77f45cca8bc55d00520a192f5a7715133591c83e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:54 +0100 Subject: Bluetooth: qca: fix device-address endianness The WCN6855 firmware on the Lenovo ThinkPad X13s expects the Bluetooth device address in big-endian order when setting it using the EDL_WRITE_BD_ADDR_OPCODE command. Presumably, this is the case for all non-ROME devices which all use the EDL_WRITE_BD_ADDR_OPCODE command for this (unlike the ROME devices which use a different command and expect the address in little-endian order). Reverse the little-endian address before setting it to make sure that the address can be configured using tools like btmgmt or using the 'local-bd-address' devicetree property. Note that this can potentially break systems with boot firmware which has started relying on the broken behaviour and is incorrectly passing the address via devicetree in big-endian order. The only device affected by this should be the WCN3991 used in some Chromebooks. As ChromeOS updates the kernel and devicetree in lockstep, the new 'qcom,local-bd-address-broken' property can be used to determine if the firmware is buggy so that the underlying driver bug can be fixed without breaking backwards compatibility. Set the HCI_QUIRK_BDADDR_PROPERTY_BROKEN quirk for such platforms so that the address is reversed when parsing the address property. Fixes: 5c0a1001c8be ("Bluetooth: hci_qca: Add helper to set device address") Cc: stable@vger.kernel.org # 5.1 Cc: Balakrishna Godavarthi Cc: Matthias Kaehlcke Tested-by: Nikita Travkin # sc7180 Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btqca.c | 8 ++++++-- drivers/bluetooth/hci_qca.c | 10 ++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index b40b32fa7f1c..19cfc342fc7b 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -826,11 +826,15 @@ EXPORT_SYMBOL_GPL(qca_uart_setup); int qca_set_bdaddr(struct hci_dev *hdev, const bdaddr_t *bdaddr) { + bdaddr_t bdaddr_swapped; struct sk_buff *skb; int err; - skb = __hci_cmd_sync_ev(hdev, EDL_WRITE_BD_ADDR_OPCODE, 6, bdaddr, - HCI_EV_VENDOR, HCI_INIT_TIMEOUT); + baswap(&bdaddr_swapped, bdaddr); + + skb = __hci_cmd_sync_ev(hdev, EDL_WRITE_BD_ADDR_OPCODE, 6, + &bdaddr_swapped, HCI_EV_VENDOR, + HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); bt_dev_err(hdev, "QCA Change address cmd failed (%d)", err); diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 4ecbcb1644cc..ecbc52eaf101 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -225,6 +225,7 @@ struct qca_serdev { struct qca_power *bt_power; u32 init_speed; u32 oper_speed; + bool bdaddr_property_broken; const char *firmware_name; }; @@ -1842,6 +1843,7 @@ static int qca_setup(struct hci_uart *hu) const char *firmware_name = qca_get_firmware_name(hu); int ret; struct qca_btsoc_version ver; + struct qca_serdev *qcadev; const char *soc_name; ret = qca_check_speeds(hu); @@ -1904,6 +1906,11 @@ retry: case QCA_WCN6855: case QCA_WCN7850: set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + + qcadev = serdev_device_get_drvdata(hu->serdev); + if (qcadev->bdaddr_property_broken) + set_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks); + hci_set_aosp_capable(hdev); ret = qca_read_soc_version(hdev, &ver, soc_type); @@ -2284,6 +2291,9 @@ static int qca_serdev_probe(struct serdev_device *serdev) if (!qcadev->oper_speed) BT_DBG("UART will pick default operating speed"); + qcadev->bdaddr_property_broken = device_property_read_bool(&serdev->dev, + "qcom,local-bd-address-broken"); + if (data) qcadev->btsoc_type = data->soc_type; else -- cgit v1.2.3 From 6946b9c99bde45f3ba74e00a7af9a3458cc24bea Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 26 Mar 2024 12:43:17 -0400 Subject: Bluetooth: hci_sync: Fix not checking error on hci_cmd_sync_cancel_sync hci_cmd_sync_cancel_sync shall check the error passed to it since it will be propagated using req_result which is __u32 it needs to be properly set to a positive value if it was passed as negative othertise IS_ERR will not trigger as -(errno) would be converted to a positive value. Fixes: 63298d6e752f ("Bluetooth: hci_core: Cancel request on command timeout") Signed-off-by: Luiz Augusto von Dentz Reported-and-tested-by: Thorsten Leemhuis Closes: https://lore.kernel.org/all/08275279-7462-4f4a-a0ee-8aa015f829bc@leemhuis.info/ --- net/bluetooth/hci_core.c | 6 +++--- net/bluetooth/hci_sync.c | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 1690ae57a09d..a7028d38c1f5 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2874,7 +2874,7 @@ static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err) cancel_delayed_work_sync(&hdev->ncmd_timer); atomic_set(&hdev->cmd_cnt, 1); - hci_cmd_sync_cancel_sync(hdev, -err); + hci_cmd_sync_cancel_sync(hdev, err); } /* Suspend HCI device */ @@ -2894,7 +2894,7 @@ int hci_suspend_dev(struct hci_dev *hdev) return 0; /* Cancel potentially blocking sync operation before suspend */ - hci_cancel_cmd_sync(hdev, -EHOSTDOWN); + hci_cancel_cmd_sync(hdev, EHOSTDOWN); hci_req_sync_lock(hdev); ret = hci_suspend_sync(hdev); @@ -4210,7 +4210,7 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) err = hci_send_frame(hdev, skb); if (err < 0) { - hci_cmd_sync_cancel_sync(hdev, err); + hci_cmd_sync_cancel_sync(hdev, -err); return; } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 639090b9f4b8..8fe02921adf1 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -617,7 +617,10 @@ void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err) bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hdev->req_status == HCI_REQ_PEND) { - hdev->req_result = err; + /* req_result is __u32 so error must be positive to be properly + * propagated. + */ + hdev->req_result = err < 0 ? -err : err; hdev->req_status = HCI_REQ_CANCELED; wake_up_interruptible(&hdev->req_wait_q); -- cgit v1.2.3 From c569242cd49287d53b73a94233db40097d838535 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Wed, 27 Mar 2024 12:30:30 +0800 Subject: Bluetooth: hci_event: set the conn encrypted before conn establishes We have a BT headset (Lenovo Thinkplus XT99), the pairing and connecting has no problem, once this headset is paired, bluez will remember this device and will auto re-connect it whenever the device is powered on. The auto re-connecting works well with Windows and Android, but with Linux, it always fails. Through debugging, we found at the rfcomm connection stage, the bluetooth stack reports "Connection refused - security block (0x0003)". For this device, the re-connecting negotiation process is different from other BT headsets, it sends the Link_KEY_REQUEST command before the CONNECT_REQUEST completes, and it doesn't send ENCRYPT_CHANGE command during the negotiation. When the device sends the "connect complete" to hci, the ev->encr_mode is 1. So here in the conn_complete_evt(), if ev->encr_mode is 1, link type is ACL and HCI_CONN_ENCRYPT is not set, we set HCI_CONN_ENCRYPT to this conn, and update conn->enc_key_size accordingly. After this change, this BT headset could re-connect with Linux successfully. This is the btmon log after applying the patch, after receiving the "Connect Complete" with "Encryption: Enabled", will send the command to read encryption key size: > HCI Event: Connect Request (0x04) plen 10 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) Class: 0x240404 Major class: Audio/Video (headset, speaker, stereo, video, vcr) Minor class: Wearable Headset Device Rendering (Printing, Speaker) Audio (Speaker, Microphone, Headset) Link type: ACL (0x01) ... > HCI Event: Link Key Request (0x17) plen 6 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) < HCI Command: Link Key Request Reply (0x01|0x000b) plen 22 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) Link key: ${32-hex-digits-key} ... > HCI Event: Connect Complete (0x03) plen 11 Status: Success (0x00) Handle: 256 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) Link type: ACL (0x01) Encryption: Enabled (0x01) < HCI Command: Read Encryption Key... (0x05|0x0008) plen 2 Handle: 256 < ACL Data TX: Handle 256 flags 0x00 dlen 10 L2CAP: Information Request (0x0a) ident 1 len 2 Type: Extended features supported (0x0002) > HCI Event: Command Complete (0x0e) plen 7 Read Encryption Key Size (0x05|0x0008) ncmd 1 Status: Success (0x00) Handle: 256 Key size: 16 Cc: stable@vger.kernel.org Link: https://github.com/bluez/bluez/issues/704 Reviewed-by: Paul Menzel Reviewed-by: Luiz Augusto von Dentz Signed-off-by: Hui Wang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 4ae224824012..a8b8cfebe018 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3208,6 +3208,31 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, if (test_bit(HCI_ENCRYPT, &hdev->flags)) set_bit(HCI_CONN_ENCRYPT, &conn->flags); + /* "Link key request" completed ahead of "connect request" completes */ + if (ev->encr_mode == 1 && !test_bit(HCI_CONN_ENCRYPT, &conn->flags) && + ev->link_type == ACL_LINK) { + struct link_key *key; + struct hci_cp_read_enc_key_size cp; + + key = hci_find_link_key(hdev, &ev->bdaddr); + if (key) { + set_bit(HCI_CONN_ENCRYPT, &conn->flags); + + if (!(hdev->commands[20] & 0x10)) { + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } else { + cp.handle = cpu_to_le16(conn->handle); + if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, + sizeof(cp), &cp)) { + bt_dev_err(hdev, "sending read key size failed"); + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } + } + + hci_encrypt_cfm(conn, ev->status); + } + } + /* Get remote features */ if (conn->type == ACL_LINK) { struct hci_cp_read_remote_features cp; -- cgit v1.2.3 From 7835fcfd132eb88b87e8eb901f88436f63ab60f7 Mon Sep 17 00:00:00 2001 From: Bastien Nocera Date: Wed, 27 Mar 2024 15:24:56 +0100 Subject: Bluetooth: Fix TOCTOU in HCI debugfs implementation struct hci_dev members conn_info_max_age, conn_info_min_age, le_conn_max_interval, le_conn_min_interval, le_adv_max_interval, and le_adv_min_interval can be modified from the HCI core code, as well through debugfs. The debugfs implementation, that's only available to privileged users, will check for boundaries, making sure that the minimum value being set is strictly above the maximum value that already exists, and vice-versa. However, as both minimum and maximum values can be changed concurrently to us modifying them, we need to make sure that the value we check is the value we end up using. For example, with ->conn_info_max_age set to 10, conn_info_min_age_set() gets called from vfs handlers to set conn_info_min_age to 8. In conn_info_min_age_set(), this goes through: if (val == 0 || val > hdev->conn_info_max_age) return -EINVAL; Concurrently, conn_info_max_age_set() gets called to set to set the conn_info_max_age to 7: if (val == 0 || val > hdev->conn_info_max_age) return -EINVAL; That check will also pass because we used the old value (10) for conn_info_max_age. After those checks that both passed, the struct hci_dev access is mutex-locked, disabling concurrent access, but that does not matter because the invalid value checks both passed, and we'll end up with conn_info_min_age = 8 and conn_info_max_age = 7 To fix this problem, we need to lock the structure access before so the check and assignment are not interrupted. This fix was originally devised by the BassCheck[1] team, and considered the problem to be an atomicity one. This isn't the case as there aren't any concerns about the variable changing while we check it, but rather after we check it parallel to another change. This patch fixes CVE-2024-24858 and CVE-2024-24857. [1] https://sites.google.com/view/basscheck/ Co-developed-by: Gui-Dong Han <2045gemini@gmail.com> Signed-off-by: Gui-Dong Han <2045gemini@gmail.com> Link: https://lore.kernel.org/linux-bluetooth/20231222161317.6255-1-2045gemini@gmail.com/ Link: https://nvd.nist.gov/vuln/detail/CVE-2024-24858 Link: https://lore.kernel.org/linux-bluetooth/20231222162931.6553-1-2045gemini@gmail.com/ Link: https://lore.kernel.org/linux-bluetooth/20231222162310.6461-1-2045gemini@gmail.com/ Link: https://nvd.nist.gov/vuln/detail/CVE-2024-24857 Fixes: 31ad169148df ("Bluetooth: Add conn info lifetime parameters to debugfs") Fixes: 729a1051da6f ("Bluetooth: Expose default LE advertising interval via debugfs") Fixes: 71c3b60ec6d2 ("Bluetooth: Move BR/EDR debugfs file creation into hci_debugfs.c") Signed-off-by: Bastien Nocera Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_debugfs.c | 48 ++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 233453807b50..ce3ff2fa72e5 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -218,10 +218,12 @@ static int conn_info_min_age_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val > hdev->conn_info_max_age) + hci_dev_lock(hdev); + if (val == 0 || val > hdev->conn_info_max_age) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->conn_info_min_age = val; hci_dev_unlock(hdev); @@ -246,10 +248,12 @@ static int conn_info_max_age_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val < hdev->conn_info_min_age) + hci_dev_lock(hdev); + if (val == 0 || val < hdev->conn_info_min_age) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->conn_info_max_age = val; hci_dev_unlock(hdev); @@ -567,10 +571,12 @@ static int sniff_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val % 2 || val > hdev->sniff_max_interval) + hci_dev_lock(hdev); + if (val == 0 || val % 2 || val > hdev->sniff_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->sniff_min_interval = val; hci_dev_unlock(hdev); @@ -595,10 +601,12 @@ static int sniff_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val % 2 || val < hdev->sniff_min_interval) + hci_dev_lock(hdev); + if (val == 0 || val % 2 || val < hdev->sniff_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->sniff_max_interval = val; hci_dev_unlock(hdev); @@ -850,10 +858,12 @@ static int conn_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) + hci_dev_lock(hdev); + if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_conn_min_interval = val; hci_dev_unlock(hdev); @@ -878,10 +888,12 @@ static int conn_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) + hci_dev_lock(hdev); + if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_conn_max_interval = val; hci_dev_unlock(hdev); @@ -990,10 +1002,12 @@ static int adv_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) + hci_dev_lock(hdev); + if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_adv_min_interval = val; hci_dev_unlock(hdev); @@ -1018,10 +1032,12 @@ static int adv_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) + hci_dev_lock(hdev); + if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_adv_max_interval = val; hci_dev_unlock(hdev); -- cgit v1.2.3 From 2c603a4947a1247102ccb008d5eb6f37a4043c98 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Fri, 29 Mar 2024 11:08:12 +0530 Subject: ASoC: amd: acp: fix for acp_init function error handling If acp_init() fails, acp pci driver probe should return error. Add acp_init() function return value check logic. Fixes: e61b415515d3 ("ASoC: amd: acp: refactor the acp init and de-init sequence") Signed-off-by: Vijendar Mukunda Link: https://lore.kernel.org/r/20240329053815.2373979-1-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-pci.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/soc/amd/acp/acp-pci.c b/sound/soc/amd/acp/acp-pci.c index 440b91d4f261..5f35b90eab8d 100644 --- a/sound/soc/amd/acp/acp-pci.c +++ b/sound/soc/amd/acp/acp-pci.c @@ -115,7 +115,10 @@ static int acp_pci_probe(struct pci_dev *pci, const struct pci_device_id *pci_id goto unregister_dmic_dev; } - acp_init(chip); + ret = acp_init(chip); + if (ret) + goto unregister_dmic_dev; + res = devm_kcalloc(&pci->dev, num_res, sizeof(struct resource), GFP_KERNEL); if (!res) { ret = -ENOMEM; -- cgit v1.2.3 From 42fb9cfd5b186fe2e615564f0a1bdd424aa1b151 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 28 Mar 2024 12:49:47 +0000 Subject: Documentation: dev-tools: Add link to RV docs I could not remember the name of this system and it's pretty hard to find without the right keywords. I had to ask an LLM! Drop a breadcrumb to help people find it in the future. Signed-off-by: Brendan Jackman Acked-by: Daniel Bristot de Oliveira Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240328124947.2107524-1-jackmanb@google.com --- Documentation/dev-tools/testing-overview.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/dev-tools/testing-overview.rst b/Documentation/dev-tools/testing-overview.rst index 0aaf6ea53608..1619e5e5cc9c 100644 --- a/Documentation/dev-tools/testing-overview.rst +++ b/Documentation/dev-tools/testing-overview.rst @@ -104,6 +104,8 @@ Some of these tools are listed below: KASAN and can be used in production. See Documentation/dev-tools/kfence.rst * lockdep is a locking correctness validator. See Documentation/locking/lockdep-design.rst +* Runtime Verification (RV) supports checking specific behaviours for a given + subsystem. See Documentation/trace/rv/runtime-verification.rst * There are several other pieces of debug instrumentation in the kernel, many of which can be found in lib/Kconfig.debug -- cgit v1.2.3 From 0ec69b3bed23a4a5a88b4261afeee44ade709ed3 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Tue, 26 Mar 2024 17:38:25 +0000 Subject: docs: Fix bitfield handling in kernel-doc kernel-doc doesn't handle bitfields that are specified with symbolic name, e.g. u32 cs_index_mask : SPI_CS_CNT_MAX This results in the following warnings when running `make htmldocs`: include/linux/spi/spi.h:246: warning: Function parameter or struct member 'cs_index_mask:SPI_CS_CNT_MAX' not described in 'spi_device' include/linux/spi/spi.h:246: warning: Excess struct member 'cs_index_mask' description in 'spi_device' Update the regexp for bitfields to accept all word chars, not just digits. Signed-off-by: Donald Hunter Acked-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240326173825.99190-1-donald.hunter@gmail.com --- scripts/kernel-doc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 967f1abb0edb..cb1be22afc65 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1541,7 +1541,7 @@ sub create_parameterlist($$$$) { save_struct_actual($2); push_parameter($2, "$type $1", $arg, $file, $declaration_name); - } elsif ($param =~ m/(.*?):(\d+)/) { + } elsif ($param =~ m/(.*?):(\w+)/) { if ($type ne "") { # skip unnamed bit-fields save_struct_actual($1); push_parameter($1, "$type:$2", $arg, $file, $declaration_name) -- cgit v1.2.3 From b75d85218fdfd8774f2f8397d1f6092ed06bd311 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Sun, 24 Mar 2024 02:17:04 +0300 Subject: tracing: Fix documentation on tp_printk cmdline option kernel-parameters.txt incorrectly states that workings of kernel.tracepoint_printk sysctl depends on "tracepoint_printk kernel cmdline option", this is a bit misleading for new users since the actual cmdline option name is tp_printk. Fixes: 0daa2302968c ("tracing: Add tp_printk cmdline to have tracepoints go to printk()") Signed-off-by: Vitaly Chikunov Reviewed-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240323231704.1217926-1-vt@altlinux.org --- Documentation/admin-guide/kernel-parameters.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bb884c14b2f6..623fce7d5fcd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6599,7 +6599,7 @@ To turn off having tracepoints sent to printk, echo 0 > /proc/sys/kernel/tracepoint_printk Note, echoing 1 into this file without the - tracepoint_printk kernel cmdline option has no effect. + tp_printk kernel cmdline option has no effect. The tp_printk_stop_on_boot (see below) can also be used to stop the printing of events to console at -- cgit v1.2.3 From e9c44c1beaba623b12201d2028bc20f535464d9b Mon Sep 17 00:00:00 2001 From: Weiji Wang Date: Tue, 19 Mar 2024 19:42:15 +0800 Subject: docs: zswap: fix shell command format Format the shell commands as code block to keep the documentation in the same style Signed-off-by: Weiji Wang Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240319114253.2647-1-nebclllo0444@gmail.com --- Documentation/admin-guide/mm/zswap.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index b42132969e31..13632671adae 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -155,7 +155,7 @@ Setting this parameter to 100 will disable the hysteresis. Some users cannot tolerate the swapping that comes with zswap store failures and zswap writebacks. Swapping can be disabled entirely (without disabling -zswap itself) on a cgroup-basis as follows: +zswap itself) on a cgroup-basis as follows:: echo 0 > /sys/fs/cgroup//memory.zswap.writeback @@ -166,7 +166,7 @@ writeback (because the same pages might be rejected again and again). When there is a sizable amount of cold memory residing in the zswap pool, it can be advantageous to proactively write these cold pages to swap and reclaim the memory for other use cases. By default, the zswap shrinker is disabled. -User can enable it as follows: +User can enable it as follows:: echo Y > /sys/module/zswap/parameters/shrinker_enabled -- cgit v1.2.3 From 09ba28e1cd3cf715daab1fca6e1623e22fd754a6 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Mon, 25 Mar 2024 17:09:29 -0400 Subject: mlxbf_gige: stop interface during shutdown The mlxbf_gige driver intermittantly encounters a NULL pointer exception while the system is shutting down via "reboot" command. The mlxbf_driver will experience an exception right after executing its shutdown() method. One example of this exception is: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000070 Mem abort info: ESR = 0x0000000096000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=000000011d373000 [0000000000000070] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 96000004 [#1] SMP CPU: 0 PID: 13 Comm: ksoftirqd/0 Tainted: G S OE 5.15.0-bf.6.gef6992a #1 Hardware name: https://www.mellanox.com BlueField SoC/BlueField SoC, BIOS 4.0.2.12669 Apr 21 2023 pstate: 20400009 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : mlxbf_gige_handle_tx_complete+0xc8/0x170 [mlxbf_gige] lr : mlxbf_gige_poll+0x54/0x160 [mlxbf_gige] sp : ffff8000080d3c10 x29: ffff8000080d3c10 x28: ffffcce72cbb7000 x27: ffff8000080d3d58 x26: ffff0000814e7340 x25: ffff331cd1a05000 x24: ffffcce72c4ea008 x23: ffff0000814e4b40 x22: ffff0000814e4d10 x21: ffff0000814e4128 x20: 0000000000000000 x19: ffff0000814e4a80 x18: ffffffffffffffff x17: 000000000000001c x16: ffffcce72b4553f4 x15: ffff80008805b8a7 x14: 0000000000000000 x13: 0000000000000030 x12: 0101010101010101 x11: 7f7f7f7f7f7f7f7f x10: c2ac898b17576267 x9 : ffffcce720fa5404 x8 : ffff000080812138 x7 : 0000000000002e9a x6 : 0000000000000080 x5 : ffff00008de3b000 x4 : 0000000000000000 x3 : 0000000000000001 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000 Call trace: mlxbf_gige_handle_tx_complete+0xc8/0x170 [mlxbf_gige] mlxbf_gige_poll+0x54/0x160 [mlxbf_gige] __napi_poll+0x40/0x1c8 net_rx_action+0x314/0x3a0 __do_softirq+0x128/0x334 run_ksoftirqd+0x54/0x6c smpboot_thread_fn+0x14c/0x190 kthread+0x10c/0x110 ret_from_fork+0x10/0x20 Code: 8b070000 f9000ea0 f95056c0 f86178a1 (b9407002) ---[ end trace 7cc3941aa0d8e6a4 ]--- Kernel panic - not syncing: Oops: Fatal exception in interrupt Kernel Offset: 0x4ce722520000 from 0xffff800008000000 PHYS_OFFSET: 0x80000000 CPU features: 0x000005c1,a3330e5a Memory Limit: none ---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]--- During system shutdown, the mlxbf_gige driver's shutdown() is always executed. However, the driver's stop() method will only execute if networking interface configuration logic within the Linux distribution has been setup to do so. If shutdown() executes but stop() does not execute, NAPI remains enabled and this can lead to an exception if NAPI is scheduled while the hardware interface has only been partially deinitialized. The networking interface managed by the mlxbf_gige driver must be properly stopped during system shutdown so that IFF_UP is cleared, the hardware interface is put into a clean state, and NAPI is fully deinitialized. Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") Signed-off-by: David Thompson Link: https://lore.kernel.org/r/20240325210929.25362-1-davthompson@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c index 77134ca92938..ba303868686a 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "mlxbf_gige.h" @@ -492,8 +493,13 @@ static void mlxbf_gige_shutdown(struct platform_device *pdev) { struct mlxbf_gige *priv = platform_get_drvdata(pdev); - writeq(0, priv->base + MLXBF_GIGE_INT_EN); - mlxbf_gige_clean_port(priv); + rtnl_lock(); + netif_device_detach(priv->netdev); + + if (netif_running(priv->netdev)) + dev_close(priv->netdev); + + rtnl_unlock(); } static const struct acpi_device_id __maybe_unused mlxbf_gige_acpi_match[] = { -- cgit v1.2.3 From 6dae957c8eef6eae5b386462767de97303235d5c Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 29 Mar 2024 07:11:06 +0000 Subject: bpf: fix possible file descriptor leaks in verifier The resolve_pseudo_ldimm64() function might have leaked file descriptors when BPF_MAP_TYPE_ARENA was used in a program (some error paths missed a corresponding fdput). Add missing fdputs. v2: remove unrelated changes from the fix Fixes: 6082b6c328b5 ("bpf: Recognize addr_space_cast instruction in the verifier.") Signed-off-by: Anton Protopopov Acked-by: Yonghong Song Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20240329071106.67968-1-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 353985b2b6a2..98188379d5c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18379,15 +18379,18 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } if (!env->prog->jit_requested) { verbose(env, "JIT is required to use arena\n"); + fdput(f); return -EOPNOTSUPP; } if (!bpf_jit_supports_arena()) { verbose(env, "JIT doesn't support arena\n"); + fdput(f); return -EOPNOTSUPP; } env->prog->aux->arena = (void *)map; if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) { verbose(env, "arena's user address must be set via map_extra or mmap()\n"); + fdput(f); return -EINVAL; } } -- cgit v1.2.3 From eaa03486d932572dfd1c5f64f9dfebe572ad88c0 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 29 Mar 2024 14:46:30 +0000 Subject: regmap: maple: Fix uninitialized symbol 'ret' warnings Fix warnings reported by smatch by initializing local 'ret' variable to 0. drivers/base/regmap/regcache-maple.c:186 regcache_maple_drop() error: uninitialized symbol 'ret'. drivers/base/regmap/regcache-maple.c:290 regcache_maple_sync() error: uninitialized symbol 'ret'. Signed-off-by: Richard Fitzgerald Fixes: f033c26de5a5 ("regmap: Add maple tree based register cache") Link: https://lore.kernel.org/r/20240329144630.1965159-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/base/regmap/regcache-maple.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/regmap/regcache-maple.c b/drivers/base/regmap/regcache-maple.c index c1776127a572..55999a50ccc0 100644 --- a/drivers/base/regmap/regcache-maple.c +++ b/drivers/base/regmap/regcache-maple.c @@ -112,7 +112,7 @@ static int regcache_maple_drop(struct regmap *map, unsigned int min, unsigned long *entry, *lower, *upper; unsigned long lower_index, lower_last; unsigned long upper_index, upper_last; - int ret; + int ret = 0; lower = NULL; upper = NULL; @@ -244,7 +244,7 @@ static int regcache_maple_sync(struct regmap *map, unsigned int min, unsigned long lmin = min; unsigned long lmax = max; unsigned int r, v, sync_start; - int ret; + int ret = 0; bool sync_needed = false; map->cache_bypass = true; -- cgit v1.2.3 From 302b84e84d108b878efc56ebfea09474159be56b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 28 Mar 2024 16:07:23 -0500 Subject: Revert "PCI: Mark LSI FW643 to avoid bus reset" This reverts commit 29a43dc130ce65d365a8ea9e1cc4bc51005a353e. 29a43dc130ce ("PCI: Mark LSI FW643 to avoid bus reset") by Edmund was based on the assumption that the LSI / Agere FW643 has a defect such that it can't recover after a Secondary Bus Reset (SBR). But Takashi Sakamoto reported that SBR works fine on this same FW643 device in an AMD Ryzen 5 2400G system, so apparently there is some other aspect of Edmund's system that accounts for the issue. The down side of 29a43dc130ce is that when the FW643 is assigned to a VM, avoiding the SBR means we leak data out of the VM. Revert 29a43dc130ce until we figure out a better solution. In the meantime, we can use the sysfs "reset_method" interface to restrict the available reset methods. Link: https://lore.kernel.org/r/20240328212302.1582483-1-helgaas@kernel.org Fixes: 29a43dc130ce ("PCI: Mark LSI FW643 to avoid bus reset") Reported-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20240325012135.36861-1-o-takashi@sakamocchi.jp Signed-off-by: Bjorn Helgaas Reviewed-by: Takashi Sakamoto --- drivers/pci/quirks.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index bf4833221816..eff7f5df08e2 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3765,14 +3765,6 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003e, quirk_no_bus_reset); */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_CAVIUM, 0xa100, quirk_no_bus_reset); -/* - * Apparently the LSI / Agere FW643 can't recover after a Secondary Bus - * Reset and requires a power-off or suspend/resume and rescan. Prevent - * use of that reset. - */ -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATT, 0x5900, quirk_no_bus_reset); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATT, 0x5901, quirk_no_bus_reset); - /* * Some TI KeyStone C667X devices do not support bus/hot reset. The PCIESS * automatically disables LTSSM when Secondary Bus Reset is received and -- cgit v1.2.3 From 625aefac340f45a4fc60908da763f437599a0d6f Mon Sep 17 00:00:00 2001 From: Michael Krummsdorf Date: Tue, 26 Mar 2024 13:36:54 +0100 Subject: net: dsa: mv88e6xxx: fix usable ports on 88e6020 The switch has 4 ports with 2 internal PHYs, but ports are numbered up to 6, with ports 0, 1, 5 and 6 being usable. Fixes: 71d94a432a15 ("net: dsa: mv88e6xxx: add support for MV88E6020 switch") Signed-off-by: Michael Krummsdorf Signed-off-by: Matthias Schiffer Reviewed-by: Andrew Lunn Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240326123655.40666-1-matthias.schiffer@ew.tq-group.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 9ed1821184ec..c95787cb9086 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -5503,8 +5503,12 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .family = MV88E6XXX_FAMILY_6250, .name = "Marvell 88E6020", .num_databases = 64, - .num_ports = 4, + /* Ports 2-4 are not routed to pins + * => usable ports 0, 1, 5, 6 + */ + .num_ports = 7, .num_internal_phys = 2, + .invalid_port_mask = BIT(2) | BIT(3) | BIT(4), .max_vid = 4095, .port_base_addr = 0x8, .phy_base_addr = 0x0, -- cgit v1.2.3 From 62fc3357e079a07a22465b9b6ef71bb6ea75ee4b Mon Sep 17 00:00:00 2001 From: Mahmoud Adam Date: Tue, 26 Mar 2024 16:31:33 +0100 Subject: net/rds: fix possible cp null dereference cp might be null, calling cp->cp_conn would produce null dereference [Simon Horman adds:] Analysis: * cp is a parameter of __rds_rdma_map and is not reassigned. * The following call-sites pass a NULL cp argument to __rds_rdma_map() - rds_get_mr() - rds_get_mr_for_dest * Prior to the code above, the following assumes that cp may be NULL (which is indicative, but could itself be unnecessary) trans_private = rs->rs_transport->get_mr( sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, args->vec.addr, args->vec.bytes, need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); * The code modified by this patch is guarded by IS_ERR(trans_private), where trans_private is assigned as per the previous point in this analysis. The only implementation of get_mr that I could locate is rds_ib_get_mr() which can return an ERR_PTR if the conn (4th) argument is NULL. * ret is set to PTR_ERR(trans_private). rds_ib_get_mr can return ERR_PTR(-ENODEV) if the conn (4th) argument is NULL. Thus ret may be -ENODEV in which case the code in question will execute. Conclusion: * cp may be NULL at the point where this patch adds a check; this patch does seem to address a possible bug Fixes: c055fc00c07b ("net/rds: fix WARNING in rds_conn_connect_if_down") Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Mahmoud Adam Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240326153132.55580-1-mngyadam@amazon.com Signed-off-by: Jakub Kicinski --- net/rds/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index a4e3c5de998b..00dbcd4d28e6 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -302,7 +302,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, } ret = PTR_ERR(trans_private); /* Trigger connection so that its ready for the next retry */ - if (ret == -ENODEV) + if (ret == -ENODEV && cp) rds_conn_connect_if_down(cp->cp_conn); goto out; } -- cgit v1.2.3 From b1f532a3b1e6d2e5559c7ace49322922637a28aa Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 12 Feb 2024 13:58:33 +0100 Subject: batman-adv: Avoid infinite loop trying to resize local TT If the MTU of one of an attached interface becomes too small to transmit the local translation table then it must be resized to fit inside all fragments (when enabled) or a single packet. But if the MTU becomes too low to transmit even the header + the VLAN specific part then the resizing of the local TT will never succeed. This can for example happen when the usable space is 110 bytes and 11 VLANs are on top of batman-adv. In this case, at least 116 byte would be needed. There will just be an endless spam of batman_adv: batadv0: Forced to purge local tt entries to fit new maximum fragment MTU (110) in the log but the function will never finish. Problem here is that the timeout will be halved all the time and will then stagnate at 0 and therefore never be able to reduce the table even more. There are other scenarios possible with a similar result. The number of BATADV_TT_CLIENT_NOPURGE entries in the local TT can for example be too high to fit inside a packet. Such a scenario can therefore happen also with only a single VLAN + 7 non-purgable addresses - requiring at least 120 bytes. While this should be handled proactively when: * interface with too low MTU is added * VLAN is added * non-purgeable local mac is added * MTU of an attached interface is reduced * fragmentation setting gets disabled (which most likely requires dropping attached interfaces) not all of these scenarios can be prevented because batman-adv is only consuming events without the the possibility to prevent these actions (non-purgable MAC address added, MTU of an attached interface is reduced). It is therefore necessary to also make sure that the code is able to handle also the situations when there were already incompatible system configuration are present. Cc: stable@vger.kernel.org Fixes: a19d3d85e1b8 ("batman-adv: limit local translation table max size") Reported-by: syzbot+a6a4b5bb3da165594cff@syzkaller.appspotmail.com Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index b95c36765d04..2243cec18ecc 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -3948,7 +3948,7 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface) spin_lock_bh(&bat_priv->tt.commit_lock); - while (true) { + while (timeout) { table_size = batadv_tt_local_table_transmit_size(bat_priv); if (packet_size_max >= table_size) break; -- cgit v1.2.3 From 5086f0fe46dcf8687c8c2e41e1f07826affebbba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 17:34:48 +0000 Subject: net: do not consume a cacheline for system_page_pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no reason to consume a full cacheline to store system_page_pool. We can eventually move it to softnet_data later for full locality control. Fixes: 2b0cfa6e4956 ("net: add generic percpu page_pool allocator") Signed-off-by: Eric Dumazet Cc: Lorenzo Bianconi Cc: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20240328173448.2262593-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 9a67003e49db..984ff8b9d0e1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -429,7 +429,7 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ -static DEFINE_PER_CPU_ALIGNED(struct page_pool *, system_page_pool); +static DEFINE_PER_CPU(struct page_pool *, system_page_pool); #ifdef CONFIG_LOCKDEP /* -- cgit v1.2.3 From e709acbd84fb6ef32736331b0147f027a3ef4c20 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 28 Mar 2024 10:06:21 +0800 Subject: octeontx2-pf: check negative error code in otx2_open() otx2_rxtx_enable() return negative error code such as -EIO, check -EIO rather than EIO to fix this problem. Fixes: c926252205c4 ("octeontx2-pf: Disable packet I/O for graceful exit") Signed-off-by: Su Hui Reviewed-by: Subbaraya Sundeep Reviewed-by: Simon Horman Reviewed-by: Kalesh AP Link: https://lore.kernel.org/r/20240328020620.4054692-1-suhui@nfschina.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index b40bd0e46751..3f46d5e0fb2e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1933,7 +1933,7 @@ int otx2_open(struct net_device *netdev) * mcam entries are enabled to receive the packets. Hence disable the * packet I/O. */ - if (err == EIO) + if (err == -EIO) goto err_disable_rxtx; else if (err) goto err_tx_stop_queues; -- cgit v1.2.3 From 5e864d90b20803edf6bd44a99fb9afa7171785f2 Mon Sep 17 00:00:00 2001 From: Atlas Yu Date: Thu, 28 Mar 2024 13:51:52 +0800 Subject: r8169: skip DASH fw status checks when DASH is disabled On devices that support DASH, the current code in the "rtl_loop_wait" function raises false alarms when DASH is disabled. This occurs because the function attempts to wait for the DASH firmware to be ready, even though it's not relevant in this case. r8169 0000:0c:00.0 eth0: RTL8168ep/8111ep, 38:7c:76:49:08:d9, XID 502, IRQ 86 r8169 0000:0c:00.0 eth0: jumbo features [frames: 9194 bytes, tx checksumming: ko] r8169 0000:0c:00.0 eth0: DASH disabled ... r8169 0000:0c:00.0 eth0: rtl_ep_ocp_read_cond == 0 (loop: 30, delay: 10000). This patch modifies the driver start/stop functions to skip checking the DASH firmware status when DASH is explicitly disabled. This prevents unnecessary delays and false alarms. The patch has been tested on several ThinkStation P8/PX workstations. Fixes: 0ab0c45d8aae ("r8169: add handling DASH when DASH is disabled") Signed-off-by: Atlas Yu Reviewed-by: Heiner Kallweit Link: https://lore.kernel.org/r/20240328055152.18443-1-atlas.yu@canonical.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 5c879a5c86d7..4ac444eb269f 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1314,17 +1314,40 @@ static void rtl8168ep_stop_cmac(struct rtl8169_private *tp) RTL_W8(tp, IBCR0, RTL_R8(tp, IBCR0) & ~0x01); } +static void rtl_dash_loop_wait(struct rtl8169_private *tp, + const struct rtl_cond *c, + unsigned long usecs, int n, bool high) +{ + if (!tp->dash_enabled) + return; + rtl_loop_wait(tp, c, usecs, n, high); +} + +static void rtl_dash_loop_wait_high(struct rtl8169_private *tp, + const struct rtl_cond *c, + unsigned long d, int n) +{ + rtl_dash_loop_wait(tp, c, d, n, true); +} + +static void rtl_dash_loop_wait_low(struct rtl8169_private *tp, + const struct rtl_cond *c, + unsigned long d, int n) +{ + rtl_dash_loop_wait(tp, c, d, n, false); +} + static void rtl8168dp_driver_start(struct rtl8169_private *tp) { r8168dp_oob_notify(tp, OOB_CMD_DRIVER_START); - rtl_loop_wait_high(tp, &rtl_dp_ocp_read_cond, 10000, 10); + rtl_dash_loop_wait_high(tp, &rtl_dp_ocp_read_cond, 10000, 10); } static void rtl8168ep_driver_start(struct rtl8169_private *tp) { r8168ep_ocp_write(tp, 0x01, 0x180, OOB_CMD_DRIVER_START); r8168ep_ocp_write(tp, 0x01, 0x30, r8168ep_ocp_read(tp, 0x30) | 0x01); - rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30); + rtl_dash_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30); } static void rtl8168_driver_start(struct rtl8169_private *tp) @@ -1338,7 +1361,7 @@ static void rtl8168_driver_start(struct rtl8169_private *tp) static void rtl8168dp_driver_stop(struct rtl8169_private *tp) { r8168dp_oob_notify(tp, OOB_CMD_DRIVER_STOP); - rtl_loop_wait_low(tp, &rtl_dp_ocp_read_cond, 10000, 10); + rtl_dash_loop_wait_low(tp, &rtl_dp_ocp_read_cond, 10000, 10); } static void rtl8168ep_driver_stop(struct rtl8169_private *tp) @@ -1346,7 +1369,7 @@ static void rtl8168ep_driver_stop(struct rtl8169_private *tp) rtl8168ep_stop_cmac(tp); r8168ep_ocp_write(tp, 0x01, 0x180, OOB_CMD_DRIVER_STOP); r8168ep_ocp_write(tp, 0x01, 0x30, r8168ep_ocp_read(tp, 0x30) | 0x01); - rtl_loop_wait_low(tp, &rtl_ep_ocp_read_cond, 10000, 10); + rtl_dash_loop_wait_low(tp, &rtl_ep_ocp_read_cond, 10000, 10); } static void rtl8168_driver_stop(struct rtl8169_private *tp) -- cgit v1.2.3 From 17af420545a750f763025149fa7b833a4fc8b8f0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 11:22:48 +0000 Subject: erspan: make sure erspan_base_hdr is present in skb->head syzbot reported a problem in ip6erspan_rcv() [1] Issue is that ip6erspan_rcv() (and erspan_rcv()) no longer make sure erspan_base_hdr is present in skb linear part (skb->head) before getting @ver field from it. Add the missing pskb_may_pull() calls. v2: Reload iph pointer in erspan_rcv() after pskb_may_pull() because skb->head might have changed. [1] BUG: KMSAN: uninit-value in pskb_may_pull_reason include/linux/skbuff.h:2742 [inline] BUG: KMSAN: uninit-value in pskb_may_pull include/linux/skbuff.h:2756 [inline] BUG: KMSAN: uninit-value in ip6erspan_rcv net/ipv6/ip6_gre.c:541 [inline] BUG: KMSAN: uninit-value in gre_rcv+0x11f8/0x1930 net/ipv6/ip6_gre.c:610 pskb_may_pull_reason include/linux/skbuff.h:2742 [inline] pskb_may_pull include/linux/skbuff.h:2756 [inline] ip6erspan_rcv net/ipv6/ip6_gre.c:541 [inline] gre_rcv+0x11f8/0x1930 net/ipv6/ip6_gre.c:610 ip6_protocol_deliver_rcu+0x1d4c/0x2ca0 net/ipv6/ip6_input.c:438 ip6_input_finish net/ipv6/ip6_input.c:483 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] ip6_input+0x15d/0x430 net/ipv6/ip6_input.c:492 ip6_mc_input+0xa7e/0xc80 net/ipv6/ip6_input.c:586 dst_input include/net/dst.h:460 [inline] ip6_rcv_finish+0x955/0x970 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:314 [inline] ipv6_rcv+0xde/0x390 net/ipv6/ip6_input.c:310 __netif_receive_skb_one_core net/core/dev.c:5538 [inline] __netif_receive_skb+0x1da/0xa00 net/core/dev.c:5652 netif_receive_skb_internal net/core/dev.c:5738 [inline] netif_receive_skb+0x58/0x660 net/core/dev.c:5798 tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1549 tun_get_user+0x5566/0x69e0 drivers/net/tun.c:2002 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2108 [inline] new_sync_write fs/read_write.c:497 [inline] vfs_write+0xb63/0x1520 fs/read_write.c:590 ksys_write+0x20f/0x4c0 fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __x64_sys_write+0x93/0xe0 fs/read_write.c:652 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was created at: slab_post_alloc_hook mm/slub.c:3804 [inline] slab_alloc_node mm/slub.c:3845 [inline] kmem_cache_alloc_node+0x613/0xc50 mm/slub.c:3888 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:577 __alloc_skb+0x35b/0x7a0 net/core/skbuff.c:668 alloc_skb include/linux/skbuff.h:1318 [inline] alloc_skb_with_frags+0xc8/0xbf0 net/core/skbuff.c:6504 sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2795 tun_alloc_skb drivers/net/tun.c:1525 [inline] tun_get_user+0x209a/0x69e0 drivers/net/tun.c:1846 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2108 [inline] new_sync_write fs/read_write.c:497 [inline] vfs_write+0xb63/0x1520 fs/read_write.c:590 ksys_write+0x20f/0x4c0 fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __x64_sys_write+0x93/0xe0 fs/read_write.c:652 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 CPU: 1 PID: 5045 Comm: syz-executor114 Not tainted 6.9.0-rc1-syzkaller-00021-g962490525cff #0 Fixes: cb73ee40b1b3 ("net: ip_gre: use erspan key field for tunnel lookup") Reported-by: syzbot+1c1cf138518bf0c53d68@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/000000000000772f2c0614b66ef7@google.com/ Signed-off-by: Eric Dumazet Cc: Lorenzo Bianconi Link: https://lore.kernel.org/r/20240328112248.1101491-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_gre.c | 5 +++++ net/ipv6/ip6_gre.c | 3 +++ 2 files changed, 8 insertions(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7b16c211b904..57ddcd8c62f6 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -280,8 +280,13 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, tpi->flags | TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); } else { + if (unlikely(!pskb_may_pull(skb, + gre_hdr_len + sizeof(*ershdr)))) + return PACKET_REJECT; + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; + iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index ca7e77e84283..c89aef524df9 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -528,6 +528,9 @@ static int ip6erspan_rcv(struct sk_buff *skb, struct ip6_tnl *tunnel; u8 ver; + if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) + return PACKET_REJECT; + ipv6h = ipv6_hdr(skb); ershdr = (struct erspan_base_hdr *)skb->data; ver = ershdr->ver; -- cgit v1.2.3 From 0640f47b742667fca6aac174f7cd62b6c2c7532c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 13 Mar 2024 17:43:05 +0100 Subject: drm/msm/dp: fix runtime PM leak on disconnect Make sure to put the runtime PM usage count (and suspend) also when receiving a disconnect event while in the ST_MAINLINK_READY state. This specifically avoids leaking a runtime PM usage count on every disconnect with display servers that do not automatically enable external displays when receiving a hotplug notification. Fixes: 5814b8bf086a ("drm/msm/dp: incorporate pm_runtime framework into DP driver") Cc: stable@vger.kernel.org # 6.8 Cc: Kuogee Hsieh Signed-off-by: Johan Hovold Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/582744/ Link: https://lore.kernel.org/r/20240313164306.23133-2-johan+linaro@kernel.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_display.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index d80f89581760..ffa785ec641a 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -629,6 +629,7 @@ static int dp_hpd_unplug_handle(struct dp_display_private *dp, u32 data) dp_display_host_phy_exit(dp); dp->hpd_state = ST_DISCONNECTED; dp_display_notify_disconnect(&dp->dp_display.pdev->dev); + pm_runtime_put_sync(&pdev->dev); mutex_unlock(&dp->event_mutex); return 0; } -- cgit v1.2.3 From e86750b01a1560f198e4b3e21bb3f78bfd5bb2c3 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 13 Mar 2024 17:43:06 +0100 Subject: drm/msm/dp: fix runtime PM leak on connect failure Make sure to balance the runtime PM usage counter (and suspend) before returning on connect failures (e.g. DPCD read failures after a spurious connect event or if link training fails). Fixes: 5814b8bf086a ("drm/msm/dp: incorporate pm_runtime framework into DP driver") Cc: stable@vger.kernel.org # 6.8 Cc: Kuogee Hsieh Signed-off-by: Johan Hovold Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/582746/ Link: https://lore.kernel.org/r/20240313164306.23133-3-johan+linaro@kernel.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_display.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index ffa785ec641a..2ed40bd0acb6 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -572,6 +572,7 @@ static int dp_hpd_plug_handle(struct dp_display_private *dp, u32 data) ret = dp_display_usbpd_configure_cb(&pdev->dev); if (ret) { /* link train failed */ dp->hpd_state = ST_DISCONNECTED; + pm_runtime_put_sync(&pdev->dev); } else { dp->hpd_state = ST_MAINLINK_READY; } -- cgit v1.2.3 From c588f7d67044d6d59ef92d75a970b64929984d89 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 14:08:09 -0700 Subject: drm/msm: Add newlines to some debug prints These debug prints are missing newlines, leading to multiple messages being printed on one line and hard to read logs. Add newlines to have the debug prints on separate lines. The DBG macro used to add a newline, but I missed that while migrating to drm_dbg wrappers. Fixes: 7cb017db1896 ("drm/msm: Move FB debug prints to drm_dbg_state()") Fixes: 721c6e0c6aed ("drm/msm: Move vblank debug prints to drm_dbg_vbl()") Signed-off-by: Stephen Boyd Reviewed-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/584769/ Link: https://lore.kernel.org/r/20240325210810.1340820-1-swboyd@chromium.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/msm_fb.c | 6 +++--- drivers/gpu/drm/msm/msm_kms.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_fb.c b/drivers/gpu/drm/msm/msm_fb.c index e3f61c39df69..80166f702a0d 100644 --- a/drivers/gpu/drm/msm/msm_fb.c +++ b/drivers/gpu/drm/msm/msm_fb.c @@ -89,7 +89,7 @@ int msm_framebuffer_prepare(struct drm_framebuffer *fb, for (i = 0; i < n; i++) { ret = msm_gem_get_and_pin_iova(fb->obj[i], aspace, &msm_fb->iova[i]); - drm_dbg_state(fb->dev, "FB[%u]: iova[%d]: %08llx (%d)", + drm_dbg_state(fb->dev, "FB[%u]: iova[%d]: %08llx (%d)\n", fb->base.id, i, msm_fb->iova[i], ret); if (ret) return ret; @@ -176,7 +176,7 @@ static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev, const struct msm_format *format; int ret, i, n; - drm_dbg_state(dev, "create framebuffer: mode_cmd=%p (%dx%d@%4.4s)", + drm_dbg_state(dev, "create framebuffer: mode_cmd=%p (%dx%d@%4.4s)\n", mode_cmd, mode_cmd->width, mode_cmd->height, (char *)&mode_cmd->pixel_format); @@ -232,7 +232,7 @@ static struct drm_framebuffer *msm_framebuffer_init(struct drm_device *dev, refcount_set(&msm_fb->dirtyfb, 1); - drm_dbg_state(dev, "create: FB ID: %d (%p)", fb->base.id, fb); + drm_dbg_state(dev, "create: FB ID: %d (%p)\n", fb->base.id, fb); return fb; diff --git a/drivers/gpu/drm/msm/msm_kms.c b/drivers/gpu/drm/msm/msm_kms.c index 84c21ec2ceea..af6a6fcb1173 100644 --- a/drivers/gpu/drm/msm/msm_kms.c +++ b/drivers/gpu/drm/msm/msm_kms.c @@ -149,7 +149,7 @@ int msm_crtc_enable_vblank(struct drm_crtc *crtc) struct msm_kms *kms = priv->kms; if (!kms) return -ENXIO; - drm_dbg_vbl(dev, "crtc=%u", crtc->base.id); + drm_dbg_vbl(dev, "crtc=%u\n", crtc->base.id); return vblank_ctrl_queue_work(priv, crtc, true); } @@ -160,7 +160,7 @@ void msm_crtc_disable_vblank(struct drm_crtc *crtc) struct msm_kms *kms = priv->kms; if (!kms) return; - drm_dbg_vbl(dev, "crtc=%u", crtc->base.id); + drm_dbg_vbl(dev, "crtc=%u\n", crtc->base.id); vblank_ctrl_queue_work(priv, crtc, false); } -- cgit v1.2.3 From 4f3b77ae5ff5b5ba9d99c5d5450db388dbee5107 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 14 Mar 2024 03:10:41 +0200 Subject: drm/msm/dpu: don't allow overriding data from catalog The data from catalog is marked as const, so it is a part of the RO segment. Allowing userspace to write to it through debugfs can cause protection faults. Set debugfs file mode to read-only for debug entries corresponding to perf_cfg coming from catalog. Fixes: abda0d925f9c ("drm/msm/dpu: Mark various data tables as const") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/582844/ Link: https://lore.kernel.org/r/20240314-dpu-perf-rework-v3-1-79fa4e065574@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c index ef871239adb2..68fae048a9a8 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c @@ -459,15 +459,15 @@ int dpu_core_perf_debugfs_init(struct dpu_kms *dpu_kms, struct dentry *parent) &perf->core_clk_rate); debugfs_create_u32("enable_bw_release", 0600, entry, (u32 *)&perf->enable_bw_release); - debugfs_create_u32("threshold_low", 0600, entry, + debugfs_create_u32("threshold_low", 0400, entry, (u32 *)&perf->perf_cfg->max_bw_low); - debugfs_create_u32("threshold_high", 0600, entry, + debugfs_create_u32("threshold_high", 0400, entry, (u32 *)&perf->perf_cfg->max_bw_high); - debugfs_create_u32("min_core_ib", 0600, entry, + debugfs_create_u32("min_core_ib", 0400, entry, (u32 *)&perf->perf_cfg->min_core_ib); - debugfs_create_u32("min_llcc_ib", 0600, entry, + debugfs_create_u32("min_llcc_ib", 0400, entry, (u32 *)&perf->perf_cfg->min_llcc_ib); - debugfs_create_u32("min_dram_ib", 0600, entry, + debugfs_create_u32("min_dram_ib", 0400, entry, (u32 *)&perf->perf_cfg->min_dram_ib); debugfs_create_file("perf_mode", 0600, entry, (u32 *)perf, &dpu_core_perf_mode_fops); -- cgit v1.2.3 From ee15c8bf5d77a306614bdefe33828310662dee05 Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Fri, 29 Mar 2024 12:46:26 -0700 Subject: drm/msm/dp: assign correct DP controller ID to x1e80100 interface table At current x1e80100 interface table, interface #3 is wrongly connected to DP controller #0 and interface #4 wrongly connected to DP controller #2. Fix this problem by connect Interface #3 to DP controller #0 and interface #4 connect to DP controller #1. Also add interface #6, #7 and #8 connections to DP controller to complete x1e80100 interface table. Changs in V3: -- add v2 changes log Changs in V2: -- add x1e80100 to subject -- add Fixes Fixes: e3b1f369db5a ("drm/msm/dpu: Add X1E80100 support") Signed-off-by: Kuogee Hsieh Reviewed-by: Abhinav Kumar Reviewed-by: Abel Vesa Patchwork: https://patchwork.freedesktop.org/patch/585549/ Link: https://lore.kernel.org/r/1711741586-9037-1-git-send-email-quic_khsieh@quicinc.com Signed-off-by: Abhinav Kumar --- .../drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h | 34 ++++++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h index 9a9f7092c526..a3e60ac70689 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_2_x1e80100.h @@ -324,6 +324,7 @@ static const struct dpu_wb_cfg x1e80100_wb[] = { }, }; +/* TODO: INTF 3, 8 and 7 are used for MST, marked as INTF_NONE for now */ static const struct dpu_intf_cfg x1e80100_intf[] = { { .name = "intf_0", .id = INTF_0, @@ -358,8 +359,8 @@ static const struct dpu_intf_cfg x1e80100_intf[] = { .name = "intf_3", .id = INTF_3, .base = 0x37000, .len = 0x280, .features = INTF_SC7280_MASK, - .type = INTF_DP, - .controller_id = MSM_DP_CONTROLLER_1, + .type = INTF_NONE, + .controller_id = MSM_DP_CONTROLLER_0, /* pair with intf_0 for DP MST */ .prog_fetch_lines_worst_case = 24, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 30), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 31), @@ -368,7 +369,7 @@ static const struct dpu_intf_cfg x1e80100_intf[] = { .base = 0x38000, .len = 0x280, .features = INTF_SC7280_MASK, .type = INTF_DP, - .controller_id = MSM_DP_CONTROLLER_2, + .controller_id = MSM_DP_CONTROLLER_1, .prog_fetch_lines_worst_case = 24, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 20), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 21), @@ -381,6 +382,33 @@ static const struct dpu_intf_cfg x1e80100_intf[] = { .prog_fetch_lines_worst_case = 24, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 22), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 23), + }, { + .name = "intf_6", .id = INTF_6, + .base = 0x3A000, .len = 0x280, + .features = INTF_SC7280_MASK, + .type = INTF_DP, + .controller_id = MSM_DP_CONTROLLER_2, + .prog_fetch_lines_worst_case = 24, + .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 17), + .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 16), + }, { + .name = "intf_7", .id = INTF_7, + .base = 0x3b000, .len = 0x280, + .features = INTF_SC7280_MASK, + .type = INTF_NONE, + .controller_id = MSM_DP_CONTROLLER_2, /* pair with intf_6 for DP MST */ + .prog_fetch_lines_worst_case = 24, + .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 18), + .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 19), + }, { + .name = "intf_8", .id = INTF_8, + .base = 0x3c000, .len = 0x280, + .features = INTF_SC7280_MASK, + .type = INTF_NONE, + .controller_id = MSM_DP_CONTROLLER_1, /* pair with intf_4 for DP MST */ + .prog_fetch_lines_worst_case = 24, + .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 12), + .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 13), }, }; -- cgit v1.2.3 From ea111449501ea32bf6da82750de860243691efc7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:44 -0700 Subject: tcp: Fix bind() regression for v6-only wildcard and v4-mapped-v6 non-wildcard addresses. Commit 5e07e672412b ("tcp: Use bhash2 for v4-mapped-v6 non-wildcard address.") introduced bind() regression for v4-mapped-v6 address. When we bind() the following two addresses on the same port, the 2nd bind() should succeed but fails now. 1. [::] w/ IPV6_ONLY 2. ::ffff:127.0.0.1 After the chagne, v4-mapped-v6 uses bhash2 instead of bhash to detect conflict faster, but I forgot to add a necessary change. During the 2nd bind(), inet_bind2_bucket_match_addr_any() returns the tb2 bucket of [::], and inet_bhash2_conflict() finally calls inet_bind_conflict(), which returns true, meaning conflict. inet_bhash2_addr_any_conflict |- inet_bind2_bucket_match_addr_any <-- return [::] bucket `- inet_bhash2_conflict `- __inet_bhash2_conflict <-- checks IPV6_ONLY for AF_INET | but not for v4-mapped-v6 address `- inet_bind_conflict <-- does not check address inet_bind_conflict() does not check socket addresses because __inet_bhash2_conflict() is expected to do so. However, it checks IPV6_V6ONLY attribute only against AF_INET socket, and not for v4-mapped-v6 address. As a result, v4-mapped-v6 address conflicts with v6-only wildcard address. To avoid that, let's add the missing test to use bhash2 for v4-mapped-v6 address. Fixes: 5e07e672412b ("tcp: Use bhash2 for v4-mapped-v6 non-wildcard address.") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index c038e28e2f1e..4184d45f890c 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -203,8 +203,15 @@ static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { - if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) - return false; + if (ipv6_only_sock(sk2)) { + if (sk->sk_family == AF_INET) + return false; + +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return false; +#endif + } return inet_bind_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok); -- cgit v1.2.3 From d91ef1e1b55f730bee8ce286b02b7bdccbc42973 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:45 -0700 Subject: tcp: Fix bind() regression for v6-only wildcard and v4(-mapped-v6) non-wildcard addresses. Jianguo Wu reported another bind() regression introduced by bhash2. Calling bind() for the following 3 addresses on the same port, the 3rd one should fail but now succeeds. 1. 0.0.0.0 or ::ffff:0.0.0.0 2. [::] w/ IPV6_V6ONLY 3. IPv4 non-wildcard address or v4-mapped-v6 non-wildcard address The first two bind() create tb2 like this: bhash2 -> tb2(:: w/ IPV6_V6ONLY) -> tb2(0.0.0.0) The 3rd bind() will match with the IPv6 only wildcard address bucket in inet_bind2_bucket_match_addr_any(), however, no conflicting socket exists in the bucket. So, inet_bhash2_conflict() will returns false, and thus, inet_bhash2_addr_any_conflict() returns false consequently. As a result, the 3rd bind() bypasses conflict check, which should be done against the IPv4 wildcard address bucket. So, in inet_bhash2_addr_any_conflict(), we must iterate over all buckets. Note that we cannot add ipv6_only flag for inet_bind2_bucket as it would confuse the following patetrn. 1. [::] w/ SO_REUSE{ADDR,PORT} and IPV6_V6ONLY 2. [::] w/ SO_REUSE{ADDR,PORT} 3. IPv4 non-wildcard address or v4-mapped-v6 non-wildcard address The first bind() would create a bucket with ipv6_only flag true, the second bind() would add the [::] socket into the same bucket, and the third bind() could succeed based on the wrong assumption that ipv6_only bucket would not conflict with v4(-mapped-v6) address. Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Diagnosed-by: Jianguo Wu Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4184d45f890c..3b38610958ee 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -294,6 +294,7 @@ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l struct sock_reuseport *reuseport_cb; struct inet_bind_hashbucket *head2; struct inet_bind2_bucket *tb2; + bool conflict = false; bool reuseport_cb_ok; rcu_read_lock(); @@ -306,18 +307,20 @@ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l spin_lock(&head2->lock); - inet_bind_bucket_for_each(tb2, &head2->chain) - if (inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) - break; + inet_bind_bucket_for_each(tb2, &head2->chain) { + if (!inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) + continue; - if (tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, - reuseport_ok)) { - spin_unlock(&head2->lock); - return true; + if (!inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok)) + continue; + + conflict = true; + break; } spin_unlock(&head2->lock); - return false; + + return conflict; } /* -- cgit v1.2.3 From c48baf567dedbba731d66f5a2cd46f1b6def50aa Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:46 -0700 Subject: selftest: tcp: Make bind() selftest flexible. Currently, bind_wildcard.c tests only (IPv4, IPv6) pairs, but we will add more tests for the same protocol pairs. This patch makes it possible by changing the address pointer to void. No functional changes are intended. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 92 ++++++++++++++++++----------- 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index a2662348cdb1..d65c3bb6ba13 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -6,7 +6,9 @@ #include "../kselftest_harness.h" -struct in6_addr in6addr_v4mapped_any = { +static const __u32 in4addr_any = INADDR_ANY; +static const __u32 in4addr_loopback = INADDR_LOOPBACK; +static const struct in6_addr in6addr_v4mapped_any = { .s6_addr = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -14,8 +16,7 @@ struct in6_addr in6addr_v4mapped_any = { 0, 0, 0, 0 } }; - -struct in6_addr in6addr_v4mapped_loopback = { +static const struct in6_addr in6addr_v4mapped_loopback = { .s6_addr = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -26,82 +27,105 @@ struct in6_addr in6addr_v4mapped_loopback = { FIXTURE(bind_wildcard) { - struct sockaddr_in addr4; - struct sockaddr_in6 addr6; + socklen_t addrlen[2]; + union { + struct sockaddr addr; + struct sockaddr_in addr4; + struct sockaddr_in6 addr6; + } addr[2]; }; FIXTURE_VARIANT(bind_wildcard) { - const __u32 addr4_const; - const struct in6_addr *addr6_const; + sa_family_t family[2]; + const void *addr[2]; int expected_errno; }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_loopback}, .expected_errno = 0, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_v4mapped_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_v4mapped_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_v4mapped_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_v4mapped_loopback}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_loopback}, .expected_errno = 0, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_v4mapped_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_v4mapped_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_v4mapped_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_v4mapped_loopback}, .expected_errno = EADDRINUSE, }; -FIXTURE_SETUP(bind_wildcard) +static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, + int family, const void *addr_const) { - self->addr4.sin_family = AF_INET; - self->addr4.sin_port = htons(0); - self->addr4.sin_addr.s_addr = htonl(variant->addr4_const); + if (family == AF_INET) { + struct sockaddr_in *addr4 = &self->addr[i].addr4; + const __u32 *addr4_const = addr_const; + + addr4->sin_family = AF_INET; + addr4->sin_port = htons(0); + addr4->sin_addr.s_addr = htonl(*addr4_const); + + self->addrlen[i] = sizeof(struct sockaddr_in); + } else { + struct sockaddr_in6 *addr6 = &self->addr[i].addr6; + const struct in6_addr *addr6_const = addr_const; + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(0); + addr6->sin6_addr = *addr6_const; - self->addr6.sin6_family = AF_INET6; - self->addr6.sin6_port = htons(0); - self->addr6.sin6_addr = *variant->addr6_const; + self->addrlen[i] = sizeof(struct sockaddr_in6); + } +} + +FIXTURE_SETUP(bind_wildcard) +{ + setup_addr(self, 0, variant->family[0], variant->addr[0]); + setup_addr(self, 1, variant->family[1], variant->addr[1]); } FIXTURE_TEARDOWN(bind_wildcard) @@ -146,15 +170,15 @@ void bind_sockets(struct __test_metadata *_metadata, TEST_F(bind_wildcard, v4_v6) { bind_sockets(_metadata, self, variant->expected_errno, - (struct sockaddr *)&self->addr4, sizeof(self->addr4), - (struct sockaddr *)&self->addr6, sizeof(self->addr6)); + &self->addr[0].addr, self->addrlen[0], + &self->addr[1].addr, self->addrlen[1]); } TEST_F(bind_wildcard, v6_v4) { bind_sockets(_metadata, self, variant->expected_errno, - (struct sockaddr *)&self->addr6, sizeof(self->addr6), - (struct sockaddr *)&self->addr4, sizeof(self->addr4)); + &self->addr[1].addr, self->addrlen[1], + &self->addr[0].addr, self->addrlen[0]); } TEST_HARNESS_MAIN -- cgit v1.2.3 From 6f9bc755c0215501c45897aa5c8b8b56fb65724e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:47 -0700 Subject: selftest: tcp: Define the reverse order bind() tests explicitly. Currently, bind_wildcard.c calls bind() twice for two addresses and checks the pre-defined errno against the 2nd call. Also, the two bind() calls are swapped to cover various patterns how bind buckets are created. However, only testing two addresses is insufficient to detect regression. So, we will add more bind() calls, and then, we need to define different errno for each bind() per test case. As a prepartion, let's define the reverse order bind() test cases as fixtures. No functional changes are intended. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 67 +++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index d65c3bb6ba13..143aae383da3 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -42,6 +42,7 @@ FIXTURE_VARIANT(bind_wildcard) int expected_errno; }; +/* (IPv4, IPv6) */ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { .family = {AF_INET, AF_INET6}, @@ -98,6 +99,63 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) .expected_errno = EADDRINUSE, }; +/* (IPv6, IPv4) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_loopback, &in4addr_any}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_loopback, &in4addr_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_any, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_any, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_loopback, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_loopback, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, int family, const void *addr_const) { @@ -167,18 +225,11 @@ void bind_sockets(struct __test_metadata *_metadata, close(fd[0]); } -TEST_F(bind_wildcard, v4_v6) +TEST_F(bind_wildcard, plain) { bind_sockets(_metadata, self, variant->expected_errno, &self->addr[0].addr, self->addrlen[0], &self->addr[1].addr, self->addrlen[1]); } -TEST_F(bind_wildcard, v6_v4) -{ - bind_sockets(_metadata, self, variant->expected_errno, - &self->addr[1].addr, self->addrlen[1], - &self->addr[0].addr, self->addrlen[0]); -} - TEST_HARNESS_MAIN -- cgit v1.2.3 From 5e9e9afdb50449f35d3e65dd6b1cdf87e8ce185e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:48 -0700 Subject: selftest: tcp: Add v4-v4 and v6-v6 bind() conflict tests. We don't have bind() conflict tests for the same protocol pairs. Let's add them except for the same address pair, which will be covered by the following patch adding 6 more bind() calls for each test case. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 100 ++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 143aae383da3..5100dd713a49 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -42,6 +42,21 @@ FIXTURE_VARIANT(bind_wildcard) int expected_errno; }; +/* (IPv4, IPv4) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v4_local) +{ + .family = {AF_INET, AF_INET}, + .addr = {&in4addr_any, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) +{ + .family = {AF_INET, AF_INET}, + .addr = {&in4addr_loopback, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + /* (IPv4, IPv6) */ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { @@ -156,6 +171,91 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) .expected_errno = EADDRINUSE, }; +/* (IPv6, IPv6) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_v4mapped_any}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_v4mapped_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_v4mapped_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_v4mapped_any}, + .expected_errno = EADDRINUSE, +}; + static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, int family, const void *addr_const) { -- cgit v1.2.3 From f40742c22a6e9ffb53bf02f22ea5eda55fbcfcc5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:49 -0700 Subject: selftest: tcp: Add more bind() calls. In addtition to the two addresses defined in the fixtures, this patch add 6 more bind calls(): * 0.0.0.0 * 127.0.0.1 * :: * ::1 * ::ffff:0.0.0.0 * ::ffff:127.0.0.1 The first two per-fixture bind() calls control how inet_bind2_bucket is created, and the rest 6 bind() calls cover as many conflicting patterns as possible. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 225 ++++++++++++++++++++-------- 1 file changed, 166 insertions(+), 59 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 5100dd713a49..4ecd3835f33c 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -25,21 +25,34 @@ static const struct in6_addr in6addr_v4mapped_loopback = { } }; +#define NR_SOCKETS 8 + FIXTURE(bind_wildcard) { - socklen_t addrlen[2]; + int fd[NR_SOCKETS]; + socklen_t addrlen[NR_SOCKETS]; union { struct sockaddr addr; struct sockaddr_in addr4; struct sockaddr_in6 addr6; - } addr[2]; + } addr[NR_SOCKETS]; }; FIXTURE_VARIANT(bind_wildcard) { sa_family_t family[2]; const void *addr[2]; - int expected_errno; + + /* 6 bind() calls below follow two bind() for the defined 2 addresses: + * + * 0.0.0.0 + * 127.0.0.1 + * :: + * ::1 + * ::ffff:0.0.0.0 + * ::ffff:127.0.0.1 + */ + int expected_errno[NR_SOCKETS]; }; /* (IPv4, IPv4) */ @@ -47,14 +60,20 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v4_local) { .family = {AF_INET, AF_INET}, .addr = {&in4addr_any, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) { .family = {AF_INET, AF_INET}, .addr = {&in4addr_loopback, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv4, IPv6) */ @@ -62,56 +81,80 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv4) */ @@ -119,56 +162,80 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_any, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_any, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_loopback, &in4addr_any}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_loopback, &in4addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_any, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_any, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_loopback, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_loopback, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv6) */ @@ -176,84 +243,120 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_any, &in6addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_any, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_any, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_loopback, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_loopback, &in6addr_v4mapped_any}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_loopback, &in6addr_v4mapped_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_any, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_any, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_any, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_loopback, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_loopback, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_loopback, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, @@ -284,52 +387,56 @@ FIXTURE_SETUP(bind_wildcard) { setup_addr(self, 0, variant->family[0], variant->addr[0]); setup_addr(self, 1, variant->family[1], variant->addr[1]); + + setup_addr(self, 2, AF_INET, &in4addr_any); + setup_addr(self, 3, AF_INET, &in4addr_loopback); + + setup_addr(self, 4, AF_INET6, &in6addr_any); + setup_addr(self, 5, AF_INET6, &in6addr_loopback); + setup_addr(self, 6, AF_INET6, &in6addr_v4mapped_any); + setup_addr(self, 7, AF_INET6, &in6addr_v4mapped_loopback); } FIXTURE_TEARDOWN(bind_wildcard) { + int i; + + for (i = 0; i < NR_SOCKETS; i++) + close(self->fd[i]); } -void bind_sockets(struct __test_metadata *_metadata, - FIXTURE_DATA(bind_wildcard) *self, - int expected_errno, - struct sockaddr *addr1, socklen_t addrlen1, - struct sockaddr *addr2, socklen_t addrlen2) +void bind_socket(struct __test_metadata *_metadata, + FIXTURE_DATA(bind_wildcard) *self, + const FIXTURE_VARIANT(bind_wildcard) *variant, + int i) { - int fd[2]; int ret; - fd[0] = socket(addr1->sa_family, SOCK_STREAM, 0); - ASSERT_GT(fd[0], 0); - - ret = bind(fd[0], addr1, addrlen1); - ASSERT_EQ(ret, 0); + self->fd[i] = socket(self->addr[i].addr.sa_family, SOCK_STREAM, 0); + ASSERT_GT(self->fd[i], 0); - ret = getsockname(fd[0], addr1, &addrlen1); - ASSERT_EQ(ret, 0); + self->addr[i].addr4.sin_port = self->addr[0].addr4.sin_port; - ((struct sockaddr_in *)addr2)->sin_port = ((struct sockaddr_in *)addr1)->sin_port; - - fd[1] = socket(addr2->sa_family, SOCK_STREAM, 0); - ASSERT_GT(fd[1], 0); - - ret = bind(fd[1], addr2, addrlen2); - if (expected_errno) { + ret = bind(self->fd[i], &self->addr[i].addr, self->addrlen[i]); + if (variant->expected_errno[i]) { ASSERT_EQ(ret, -1); - ASSERT_EQ(errno, expected_errno); + ASSERT_EQ(errno, variant->expected_errno[i]); } else { ASSERT_EQ(ret, 0); } - close(fd[1]); - close(fd[0]); + if (i == 0) { + ret = getsockname(self->fd[0], &self->addr[0].addr, &self->addrlen[0]); + ASSERT_EQ(ret, 0); + } } TEST_F(bind_wildcard, plain) { - bind_sockets(_metadata, self, variant->expected_errno, - &self->addr[0].addr, self->addrlen[0], - &self->addr[1].addr, self->addrlen[1]); + int i; + + for (i = 0; i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i); } TEST_HARNESS_MAIN -- cgit v1.2.3 From d37f2f72c91f2c5b61db7e6685c8b4bfdff85cb8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:50 -0700 Subject: selftest: tcp: Add bind() tests for IPV6_V6ONLY. bhash2 was not well tested for IPv6-only sockets. This patch adds test cases where we set IPV6_V6ONLY for per-fixture bind() calls if variant->ipv6_only[i] is true. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 116 ++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 4ecd3835f33c..6d7f02441b9d 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -42,6 +42,7 @@ FIXTURE_VARIANT(bind_wildcard) { sa_family_t family[2]; const void *addr[2]; + bool ipv6_only[2]; /* 6 bind() calls below follow two bind() for the defined 2 addresses: * @@ -87,6 +88,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any_only) +{ + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) { .family = {AF_INET, AF_INET6}, @@ -127,6 +139,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any_only) +{ + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) { .family = {AF_INET, AF_INET6}, @@ -168,6 +191,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) { .family = {AF_INET6, AF_INET}, @@ -178,6 +212,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) { .family = {AF_INET6, AF_INET}, @@ -249,6 +294,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, @@ -259,6 +315,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, @@ -269,6 +336,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) { .family = {AF_INET6, AF_INET6}, @@ -279,6 +357,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, @@ -309,6 +398,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) { .family = {AF_INET6, AF_INET6}, @@ -339,6 +439,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) { .family = {AF_INET6, AF_INET6}, @@ -415,6 +526,11 @@ void bind_socket(struct __test_metadata *_metadata, self->fd[i] = socket(self->addr[i].addr.sa_family, SOCK_STREAM, 0); ASSERT_GT(self->fd[i], 0); + if (i < 2 && variant->ipv6_only[i]) { + ret = setsockopt(self->fd[i], SOL_IPV6, IPV6_V6ONLY, &(int){1}, sizeof(int)); + ASSERT_EQ(ret, 0); + } + self->addr[i].addr4.sin_port = self->addr[0].addr4.sin_port; ret = bind(self->fd[i], &self->addr[i].addr, self->addrlen[i]); -- cgit v1.2.3 From 7679f0968d01878b8da80c5078eebe23231a19e8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:51 -0700 Subject: selftest: tcp: Add bind() tests for SO_REUSEADDR/SO_REUSEPORT. This patch adds two tests using SO_REUSEADDR and SO_REUSEPORT and defines errno for each test case. SO_REUSEADDR/SO_REUSEPORT is set for the per-fixture two bind() calls. The notable pattern is the pair of v6only [::] and plain [::]. The two sockets are put into the same tb2, where per-bucket v6only flag would be useless to detect bind() conflict. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 263 +++++++++++++++++++++++++++- 1 file changed, 257 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 6d7f02441b9d..b7b54d646b93 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -54,6 +54,7 @@ FIXTURE_VARIANT(bind_wildcard) * ::ffff:127.0.0.1 */ int expected_errno[NR_SOCKETS]; + int expected_reuse_errno[NR_SOCKETS]; }; /* (IPv4, IPv4) */ @@ -65,6 +66,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) @@ -75,6 +80,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv4, IPv6) */ @@ -86,6 +95,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any_only) @@ -97,6 +110,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) @@ -107,6 +124,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) @@ -117,6 +138,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) @@ -127,6 +152,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) @@ -137,6 +166,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any_only) @@ -148,6 +181,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) @@ -158,6 +195,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) @@ -168,6 +209,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) @@ -178,6 +223,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv4) */ @@ -189,6 +238,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_any) @@ -200,6 +253,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) @@ -210,6 +267,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_local) @@ -221,6 +282,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) @@ -231,6 +296,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) @@ -241,6 +310,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) @@ -251,6 +324,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) @@ -261,6 +338,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) @@ -271,6 +352,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) @@ -281,9 +366,72 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv6) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {true, true}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) { .family = {AF_INET6, AF_INET6}, @@ -292,6 +440,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_local) @@ -303,6 +455,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_local) 0, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) @@ -313,6 +469,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_any) @@ -324,6 +484,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) @@ -334,6 +498,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_local) @@ -345,6 +513,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) @@ -355,6 +527,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) 0, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any_only) @@ -366,6 +542,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any_only) 0, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) @@ -376,6 +556,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) @@ -386,6 +570,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) @@ -396,6 +584,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any_only) @@ -407,6 +599,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) @@ -417,6 +613,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) @@ -427,6 +627,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) @@ -437,6 +641,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any_only) @@ -448,6 +656,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) @@ -458,6 +670,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) @@ -468,6 +684,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, @@ -519,7 +739,7 @@ FIXTURE_TEARDOWN(bind_wildcard) void bind_socket(struct __test_metadata *_metadata, FIXTURE_DATA(bind_wildcard) *self, const FIXTURE_VARIANT(bind_wildcard) *variant, - int i) + int i, int reuse) { int ret; @@ -531,14 +751,29 @@ void bind_socket(struct __test_metadata *_metadata, ASSERT_EQ(ret, 0); } + if (i < 2 && reuse) { + ret = setsockopt(self->fd[i], SOL_SOCKET, reuse, &(int){1}, sizeof(int)); + ASSERT_EQ(ret, 0); + } + self->addr[i].addr4.sin_port = self->addr[0].addr4.sin_port; ret = bind(self->fd[i], &self->addr[i].addr, self->addrlen[i]); - if (variant->expected_errno[i]) { - ASSERT_EQ(ret, -1); - ASSERT_EQ(errno, variant->expected_errno[i]); + + if (reuse) { + if (variant->expected_reuse_errno[i]) { + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, variant->expected_reuse_errno[i]); + } else { + ASSERT_EQ(ret, 0); + } } else { - ASSERT_EQ(ret, 0); + if (variant->expected_errno[i]) { + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, variant->expected_errno[i]); + } else { + ASSERT_EQ(ret, 0); + } } if (i == 0) { @@ -552,7 +787,23 @@ TEST_F(bind_wildcard, plain) int i; for (i = 0; i < NR_SOCKETS; i++) - bind_socket(_metadata, self, variant, i); + bind_socket(_metadata, self, variant, i, 0); +} + +TEST_F(bind_wildcard, reuseaddr) +{ + int i; + + for (i = 0; i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i, SO_REUSEADDR); +} + +TEST_F(bind_wildcard, reuseport) +{ + int i; + + for (i = 0; i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i, SO_REUSEPORT); } TEST_HARNESS_MAIN -- cgit v1.2.3 From c33f0d4fcfe072adbbb7f3cf93f1b146e181bf3b Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Fri, 29 Mar 2024 11:28:03 +0000 Subject: ALSA: hda/realtek: Add quirks for ASUS Laptops using CS35L56 These ASUS laptops use the Realtek HDA codec combined with a number of CS35L56 amplifiers. The SSID of the GA403U matches a previous ASUS laptop - we can tell them apart because they use different codecs. Signed-off-by: Simon Trimmer Message-ID: <20240329112803.23897-1-simont@opensource.cirrus.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 53 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c31e9be257a9..e866b8a75cda 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6875,11 +6875,38 @@ static void alc287_fixup_legion_16ithg6_speakers(struct hda_codec *cdc, const st comp_generic_fixup(cdc, action, "i2c", "CLSA0101", "-%s:00-cs35l41-hda.%d", 2); } +static void cs35l56_fixup_i2c_two(struct hda_codec *cdc, const struct hda_fixup *fix, int action) +{ + comp_generic_fixup(cdc, action, "i2c", "CSC3556", "-%s:00-cs35l56-hda.%d", 2); +} + +static void cs35l56_fixup_i2c_four(struct hda_codec *cdc, const struct hda_fixup *fix, int action) +{ + comp_generic_fixup(cdc, action, "i2c", "CSC3556", "-%s:00-cs35l56-hda.%d", 4); +} + +static void cs35l56_fixup_spi_two(struct hda_codec *cdc, const struct hda_fixup *fix, int action) +{ + comp_generic_fixup(cdc, action, "spi", "CSC3556", "-%s:00-cs35l56-hda.%d", 2); +} + static void cs35l56_fixup_spi_four(struct hda_codec *cdc, const struct hda_fixup *fix, int action) { comp_generic_fixup(cdc, action, "spi", "CSC3556", "-%s:00-cs35l56-hda.%d", 4); } +static void alc285_fixup_asus_ga403u(struct hda_codec *cdc, const struct hda_fixup *fix, int action) +{ + /* + * The same SSID has been re-used in different hardware, they have + * different codecs and the newer GA403U has a ALC285. + */ + if (cdc->core.vendor_id == 0x10ec0285) + cs35l56_fixup_i2c_two(cdc, fix, action); + else + alc_fixup_inv_dmic(cdc, fix, action); +} + static void tas2781_fixup_i2c(struct hda_codec *cdc, const struct hda_fixup *fix, int action) { @@ -7436,6 +7463,10 @@ enum { ALC256_FIXUP_ACER_SFG16_MICMUTE_LED, ALC256_FIXUP_HEADPHONE_AMP_VOL, ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX, + ALC285_FIXUP_CS35L56_SPI_2, + ALC285_FIXUP_CS35L56_I2C_2, + ALC285_FIXUP_CS35L56_I2C_4, + ALC285_FIXUP_ASUS_GA403U, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -9643,6 +9674,22 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc245_fixup_hp_spectre_x360_eu0xxx, }, + [ALC285_FIXUP_CS35L56_SPI_2] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35l56_fixup_spi_two, + }, + [ALC285_FIXUP_CS35L56_I2C_2] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35l56_fixup_i2c_two, + }, + [ALC285_FIXUP_CS35L56_I2C_4] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35l56_fixup_i2c_four, + }, + [ALC285_FIXUP_ASUS_GA403U] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_asus_ga403u, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -10096,7 +10143,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1a83, "ASUS UM5302LA", ALC294_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1a8f, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1b11, "ASUS UX431DA", ALC294_FIXUP_ASUS_COEF_1B), - SND_PCI_QUIRK(0x1043, 0x1b13, "Asus U41SV", ALC269_FIXUP_INV_DMIC), + SND_PCI_QUIRK(0x1043, 0x1b13, "ASUS U41SV/GA403U", ALC285_FIXUP_ASUS_GA403U), SND_PCI_QUIRK(0x1043, 0x1b93, "ASUS G614JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1bbd, "ASUS Z550MA", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1c03, "ASUS UM3406HA", ALC287_FIXUP_CS35L41_I2C_2), @@ -10104,6 +10151,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1c33, "ASUS UX5304MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c43, "ASUS UX8406MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c62, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x1c63, "ASUS GU605M", ALC285_FIXUP_CS35L56_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c92, "ASUS ROG Strix G15", ALC285_FIXUP_ASUS_G533Z_PINS), SND_PCI_QUIRK(0x1043, 0x1c9f, "ASUS G614JU/JV/JI", ALC285_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1caf, "ASUS G634JY/JZ/JI/JG", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), @@ -10115,11 +10163,14 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1d42, "ASUS Zephyrus G14 2022", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1d4e, "ASUS TM420", ALC256_FIXUP_ASUS_HPE), SND_PCI_QUIRK(0x1043, 0x1da2, "ASUS UP6502ZA/ZD", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x1df3, "ASUS UM5606", ALC285_FIXUP_CS35L56_I2C_4), SND_PCI_QUIRK(0x1043, 0x1e02, "ASUS UX3402ZA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1e11, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA502), SND_PCI_QUIRK(0x1043, 0x1e12, "ASUS UM3402", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1e51, "ASUS Zephyrus M15", ALC294_FIXUP_ASUS_GU502_PINS), SND_PCI_QUIRK(0x1043, 0x1e5e, "ASUS ROG Strix G513", ALC294_FIXUP_ASUS_G513_PINS), + SND_PCI_QUIRK(0x1043, 0x1e63, "ASUS H7606W", ALC285_FIXUP_CS35L56_I2C_2), + SND_PCI_QUIRK(0x1043, 0x1e83, "ASUS GA605W", ALC285_FIXUP_CS35L56_I2C_2), SND_PCI_QUIRK(0x1043, 0x1e8e, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1ee2, "ASUS UM6702RA/RC", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1c52, "ASUS Zephyrus G15 2022", ALC289_FIXUP_ASUS_GA401), -- cgit v1.2.3 From ebd9779683aaf089ad0173862553cdd3288ad9b4 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Thu, 28 Mar 2024 21:44:48 +0000 Subject: smb: client: replace deprecated strncpy with strscpy strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. In cifssmb.c: Using strncpy with a length argument equal to strlen(src) is generally dangerous because it can cause string buffers to not be NUL-terminated. In this case, however, there was extra effort made to ensure the buffer was NUL-terminated via a manual NUL-byte assignment. In an effort to rid the kernel of strncpy() use, let's swap over to using strscpy() which guarantees NUL-termination on the destination buffer. To handle the case where ea_name is NULL, let's use the ?: operator to substitute in an empty string, thereby allowing strscpy to still NUL-terminate the destintation string. Interesting note: this flex array buffer may go on to also have some value encoded after the NUL-termination: | if (ea_value_len) | memcpy(parm_data->list.name + name_len + 1, | ea_value, ea_value_len); Now for smb2ops.c and smb2transport.c: Both of these cases are simple, strncpy() is used to copy string literals which have a length less than the destination buffer's size. We can simply swap in the new 2-argument version of strscpy() introduced in Commit e6584c3964f2f ("string: Allow 2-argument strscpy()"). Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html [2] Link: https://github.com/KSPP/linux/issues/90 Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Reviewed-by: Kees Cook Signed-off-by: Steve French --- fs/smb/client/cifssmb.c | 6 ++---- fs/smb/client/smb2ops.c | 2 +- fs/smb/client/smb2transport.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 5aee55551573..23b5709ddc31 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -5854,10 +5854,8 @@ SetEARetry: parm_data->list.EA_flags = 0; /* we checked above that name len is less than 255 */ parm_data->list.name_len = (__u8)name_len; - /* EA names are always ASCII */ - if (ea_name) - strncpy(parm_data->list.name, ea_name, name_len); - parm_data->list.name[name_len] = '\0'; + /* EA names are always ASCII and NUL-terminated */ + strscpy(parm_data->list.name, ea_name ?: "", name_len + 1); parm_data->list.value_len = cpu_to_le16(ea_value_len); /* caller ensures that ea_value_len is less than 64K but we need to ensure that it fits within the smb */ diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 2ed456948f34..35bf7eb315cd 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3913,7 +3913,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, strcat(message, "W"); } if (!new_oplock) - strncpy(message, "None", sizeof(message)); + strscpy(message, "None"); cinode->oplock = new_oplock; cifs_dbg(FYI, "%s Lease granted on inode %p\n", message, diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 5a3ca62d2f07..1d6e54f7879e 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -659,7 +659,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) } spin_unlock(&server->srv_lock); if (!is_binding && !server->session_estab) { - strncpy(shdr->Signature, "BSRSPYL", 8); + strscpy(shdr->Signature, "BSRSPYL"); return 0; } -- cgit v1.2.3 From 52f80bb181a9a1530ade30bc18991900bbb9697f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 26 Mar 2024 15:53:37 +0100 Subject: ata: sata_sx4: fix pdc20621_get_from_dimm() on 64-bit gcc warns about a memcpy() with overlapping pointers because of an incorrect size calculation: In file included from include/linux/string.h:369, from drivers/ata/sata_sx4.c:66: In function 'memcpy_fromio', inlined from 'pdc20621_get_from_dimm.constprop' at drivers/ata/sata_sx4.c:962:2: include/linux/fortify-string.h:97:33: error: '__builtin_memcpy' accessing 4294934464 bytes at offsets 0 and [16, 16400] overlaps 6442385281 bytes at offset -2147450817 [-Werror=restrict] 97 | #define __underlying_memcpy __builtin_memcpy | ^ include/linux/fortify-string.h:620:9: note: in expansion of macro '__underlying_memcpy' 620 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ include/linux/fortify-string.h:665:26: note: in expansion of macro '__fortify_memcpy_chk' 665 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ include/asm-generic/io.h:1184:9: note: in expansion of macro 'memcpy' 1184 | memcpy(buffer, __io_virt(addr), size); | ^~~~~~ The problem here is the overflow of an unsigned 32-bit number to a negative that gets converted into a signed 'long', keeping a large positive number. Replace the complex calculation with a more readable min() variant that avoids the warning. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Arnd Bergmann Signed-off-by: Damien Le Moal --- drivers/ata/sata_sx4.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/ata/sata_sx4.c b/drivers/ata/sata_sx4.c index b51d7a9d0d90..a482741eb181 100644 --- a/drivers/ata/sata_sx4.c +++ b/drivers/ata/sata_sx4.c @@ -957,8 +957,7 @@ static void pdc20621_get_from_dimm(struct ata_host *host, void *psource, offset -= (idx * window_size); idx++; - dist = ((long) (window_size - (offset + size))) >= 0 ? size : - (long) (window_size - offset); + dist = min(size, window_size - offset); memcpy_fromio(psource, dimm_mmio + offset / 4, dist); psource += dist; @@ -1005,8 +1004,7 @@ static void pdc20621_put_to_dimm(struct ata_host *host, void *psource, readl(mmio + PDC_DIMM_WINDOW_CTLR); offset -= (idx * window_size); idx++; - dist = ((long)(s32)(window_size - (offset + size))) >= 0 ? size : - (long) (window_size - offset); + dist = min(size, window_size - offset); memcpy_toio(dimm_mmio + offset / 4, psource, dist); writel(0x01, mmio + PDC_GENERAL_CTLR); readl(mmio + PDC_GENERAL_CTLR); -- cgit v1.2.3 From 7d899947bca5e1dc2447d9cffb2b31c989e0ceb4 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Mar 2024 18:49:36 +0100 Subject: ata: pata_macio: drop driver owner assignment PCI core in pci_register_driver() already sets the .owner, so driver does not need to. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Sergey Shtylyov Signed-off-by: Damien Le Moal --- drivers/ata/pata_macio.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c index 4ac854f6b057..88b2e9817f49 100644 --- a/drivers/ata/pata_macio.c +++ b/drivers/ata/pata_macio.c @@ -1371,9 +1371,6 @@ static struct pci_driver pata_macio_pci_driver = { .suspend = pata_macio_pci_suspend, .resume = pata_macio_pci_resume, #endif - .driver = { - .owner = THIS_MODULE, - }, }; MODULE_DEVICE_TABLE(pci, pata_macio_pci_match); -- cgit v1.2.3 From a5e3dce493d4b12b74000b6a99b6712afa5d1a4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2024 00:15:41 -0400 Subject: bcachefs: Fix assert in bch2_backpointer_invalid() Backpointers that point to invalid devices are caught by fsck, not .key_invalid; so .key_invalid needs to check for them instead of hitting asserts. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 8cb35ea572cb..21b3ae7df824 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -44,6 +44,11 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + + /* these will be caught by fsck */ + if (!bch2_dev_exists2(c, bp.k->p.inode)) + return 0; + struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); int ret = 0; -- cgit v1.2.3 From 8aad8e1f659fcea1b24072e816e434e4cd12382d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2024 04:01:27 -0400 Subject: bcachefs: Fix journal pins in btree write buffer btree write buffer flush has two phases - in natural key order, which is more efficient but may fail - then in journal order The journal order flush was assuming that keys were still correctly ordered by journal sequence number - but due to coalescing by the previous phase, we need an additional sort. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 5cbad8445782..baf63e2fddb6 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -11,6 +11,7 @@ #include "journal_reclaim.h" #include +#include static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); @@ -46,6 +47,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke #endif } +static int wb_key_seq_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq); +} + /* Compare excluding idx, the low 24 bits: */ static inline bool wb_key_eq(const void *_l, const void *_r) { @@ -357,6 +366,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) */ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); + sort(wb->flushing.keys.data, + wb->flushing.keys.nr, + sizeof(wb->flushing.keys.data[0]), + wb_key_seq_cmp, NULL); + darray_for_each(wb->flushing.keys, i) { if (!i->journal_seq) continue; -- cgit v1.2.3 From 688d750d10aa9c4fb71c5154521c775f94c887e0 Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Wed, 20 Mar 2024 21:42:42 -0600 Subject: bcachefs: fix misplaced newline in __bch2_inode_unpacked_to_text() before: u64s 18 type inode_v3 0:1879048192:U32_MAX len 0 ver 0: mode=40700 flags= (15300000) journal_seq=4 bi_size=0 bi_sectors=0 bi_version=0bi_atime=227064388944 ... after: u64s 18 type inode_v3 0:1879048192:U32_MAX len 0 ver 0: mode=40700 flags= (15300000) journal_seq=4 bi_size=0 bi_sectors=0 bi_version=0 bi_atime=227064388944 ... Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 2b5e06770ab3..ca4a066e9a54 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -552,8 +552,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); prt_newline(out); - prt_newline(out); prt_printf(out, "bi_version=%llu", inode->bi_version); + prt_newline(out); #define x(_name, _bits) \ prt_printf(out, #_name "=%llu", (u64) inode->_name); \ -- cgit v1.2.3 From 4bd02d3fb33d8a46e73085b8d47d21c0ccb3de9d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2024 01:20:36 -0400 Subject: bcachefs: fix mount error path Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 0ccee05f6887..b5ea9fa1259d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1997,6 +1997,7 @@ out: return dget(sb->s_root); err_put_super: + __bch2_fs_stop(c); deactivate_locked_super(sb); return ERR_PTR(bch2_err_class(ret)); } -- cgit v1.2.3 From aa6e130e3c2965a5c26a4033ff63b5dc9549bd76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Mar 2024 19:52:03 -0400 Subject: bcachefs: Add an assertion for trying to evict btree root Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 562561a9a510..8ff21b2e1463 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1134,6 +1134,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) b = btree_cache_find(bc, k); if (!b) return; + + BUG_ON(b == btree_node_root(trans->c, b)); wait_on_io: /* not allowed to wait on io with btree locks held: */ -- cgit v1.2.3 From 63332394c7e1f4f26e8e5b1387212016aaa7eae2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Mar 2024 20:16:23 -0400 Subject: bcachefs: Move snapshot table size to struct snapshot_table We need to add bounds checking for snapshot table accesses - it turns out there are cases where we do need to use the snapshots table before fsck checks have completed (and indeed, fsck may not have been run). Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/snapshot.c | 25 ++++++++++++++----------- fs/bcachefs/snapshot.h | 6 +++++- fs/bcachefs/subvolume_types.h | 2 ++ 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 799aa32b6b4d..a9ade0b1f78c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -810,7 +810,6 @@ struct bch_fs { /* snapshot.c: */ struct snapshot_table __rcu *snapshots; - size_t snapshot_table_size; struct mutex snapshot_table_lock; struct rw_semaphore snapshot_create_lock; diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 39debe814bf3..9cd71e613dc9 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -151,36 +151,39 @@ out: static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) { size_t idx = U32_MAX - id; - size_t new_size; struct snapshot_table *new, *old; - new_size = max(16UL, roundup_pow_of_two(idx + 1)); + size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); + size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); - new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); + new = kvzalloc(new_bytes, GFP_KERNEL); if (!new) return NULL; + new->nr = new_size; + old = rcu_dereference_protected(c->snapshots, true); if (old) - memcpy(new->s, - rcu_dereference_protected(c->snapshots, true)->s, - sizeof(new->s[0]) * c->snapshot_table_size); + memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr); rcu_assign_pointer(c->snapshots, new); - c->snapshot_table_size = new_size; - kvfree_rcu_mightsleep(old); + kvfree_rcu(old, rcu); - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + return &rcu_dereference_protected(c->snapshots, + lockdep_is_held(&c->snapshot_table_lock))->s[idx]; } static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) { size_t idx = U32_MAX - id; + struct snapshot_table *table = + rcu_dereference_protected(c->snapshots, + lockdep_is_held(&c->snapshot_table_lock)); lockdep_assert_held(&c->snapshot_table_lock); - if (likely(idx < c->snapshot_table_size)) - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + if (likely(table && idx < table->nr)) + return &table->s[idx]; return __snapshot_t_mut(c, id); } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 7c66ffc06385..8538b7fddfed 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -33,7 +33,11 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) { - return &t->s[U32_MAX - id]; + u32 idx = U32_MAX - id; + + return likely(t && idx < t->nr) + ? &t->s[idx] + : NULL; } static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index ae644adfc391..9b10c8947828 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -20,6 +20,8 @@ struct snapshot_t { }; struct snapshot_table { + struct rcu_head rcu; + size_t nr; #ifndef RUST_BINDGEN DECLARE_FLEX_ARRAY(struct snapshot_t, s); #else -- cgit v1.2.3 From ec9cc18fc2e65b08c588e01f24aaeb71551a7132 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2024 16:29:23 -0400 Subject: bcachefs: Add checks for invalid snapshot IDs Previously, we assumed that keys were consistent with the snapshots btree - but that's not correct as fsck may not have been run or may not be complete. This adds checks and error handling when using the in-memory snapshots table (that mirrors the snapshots btree). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 2 +- fs/bcachefs/data_update.c | 9 +++++++ fs/bcachefs/errcode.h | 3 ++- fs/bcachefs/snapshot.c | 8 ++++-- fs/bcachefs/snapshot.h | 57 ++++++++++++++++------------------------ 5 files changed, 40 insertions(+), 39 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 30d69a6d133e..96669fede7d3 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -318,7 +318,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && i->k->k.p.snapshot && - bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0); } static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 4150feca42a2..b564404dad71 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -14,6 +14,7 @@ #include "move.h" #include "nocow_locking.h" #include "rebalance.h" +#include "snapshot.h" #include "subvolume.h" #include "trace.h" @@ -509,6 +510,14 @@ int bch2_data_update_init(struct btree_trans *trans, unsigned ptrs_locked = 0; int ret = 0; + /* + * fs is corrupt we have a key for a snapshot node that doesn't exist, + * and we have to check for this because we go rw before repairing the + * snapshots table - just skip it, we can move it later. + */ + if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) + return -BCH_ERR_data_update_done; + bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index af25d8ec60f2..01a79fa3eacb 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -252,7 +252,8 @@ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, need_inode_lock) + x(0, need_inode_lock) \ + x(0, invalid_snapshot_node) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 9cd71e613dc9..4e074136c490 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -93,8 +93,10 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans, static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor) { - while (id && id < ancestor) - id = __snapshot_t(t, id)->parent; + while (id && id < ancestor) { + const struct snapshot_t *s = __snapshot_t(t, id); + id = s ? s->parent : 0; + } return id == ancestor; } @@ -110,6 +112,8 @@ static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancest static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) { const struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return 0; if (s->skip[2] <= ancestor) return s->skip[2]; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 8538b7fddfed..331f20fd8d03 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -48,7 +48,8 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) { rcu_read_lock(); - id = snapshot_t(c, id)->tree; + const struct snapshot_t *s = snapshot_t(c, id); + id = s ? s->tree : 0; rcu_read_unlock(); return id; @@ -56,7 +57,8 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) { - return snapshot_t(c, id)->parent; + const struct snapshot_t *s = snapshot_t(c, id); + return s ? s->parent : 0; } static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) @@ -70,19 +72,19 @@ static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) { -#ifdef CONFIG_BCACHEFS_DEBUG - u32 parent = snapshot_t(c, id)->parent; + const struct snapshot_t *s = snapshot_t(c, id); + if (!s) + return 0; - if (parent && - snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + u32 parent = s->parent; + if (IS_ENABLED(CONFIG_BCACHEFS_DEBU) && + parent && + s->depth != snapshot_t(c, parent)->depth + 1) panic("id %u depth=%u parent %u depth=%u\n", id, snapshot_t(c, id)->depth, parent, snapshot_t(c, parent)->depth); return parent; -#else - return snapshot_t(c, id)->parent; -#endif } static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) @@ -120,7 +122,8 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) { - return snapshot_t(c, id)->equiv; + const struct snapshot_t *s = snapshot_t(c, id); + return s ? s->equiv : 0; } static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) @@ -137,38 +140,22 @@ static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) return id == bch2_snapshot_equiv(c, id); } -static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) +static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { - const struct snapshot_t *s; - bool ret; - rcu_read_lock(); - s = snapshot_t(c, id); - ret = s->children[0]; + const struct snapshot_t *s = snapshot_t(c, id); + int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; rcu_read_unlock(); return ret; } -static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) +static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) { - return !bch2_snapshot_is_internal_node(c, id); -} - -static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s; - u32 parent = __bch2_snapshot_parent(c, id); - - if (!parent) - return 0; - - s = snapshot_t(c, __bch2_snapshot_parent(c, id)); - if (id == s->children[0]) - return s->children[1]; - if (id == s->children[1]) - return s->children[0]; - return 0; + int ret = bch2_snapshot_is_internal_node(c, id); + if (ret < 0) + return ret; + return !ret; } static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) @@ -253,7 +240,7 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, struct bpos pos) { if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0) return 0; return __bch2_key_has_snapshot_overwrites(trans, id, pos); -- cgit v1.2.3 From 57339b24a0eda5433751e7e0f4a8ea1e23315f60 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2024 19:29:55 -0400 Subject: bcachefs: Don't do extent merging before journal replay is finished We don't normally do extent updates this early in recovery, but some of the repair paths have to and when we do, we don't want to do anything that requires the snapshots table. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index a4b40c1656a5..8e47e260eba5 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -38,6 +38,9 @@ static noinline int extent_front_merge(struct btree_trans *trans, struct bkey_i *update; int ret; + if (unlikely(trans->journal_replay_not_finished)) + return 0; + update = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(update); if (ret) @@ -69,6 +72,9 @@ static noinline int extent_back_merge(struct btree_trans *trans, struct bch_fs *c = trans->c; int ret; + if (unlikely(trans->journal_replay_not_finished)) + return 0; + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); if (ret < 0) -- cgit v1.2.3 From 36f9ef109b1c6935928d09a3e73d744291f71545 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Mon, 25 Mar 2024 10:50:48 +0800 Subject: bcachefs: fix trans->mem realloc in __bch2_trans_kmalloc The old code doesn't consider the mem alloced from mempool when call krealloc on trans->mem. Also in bch2_trans_put, using mempool_free to free trans->mem by condition "trans->mem_bytes == BTREE_TRANS_MEM_MAX" is inaccurate when trans->mem was allocated by krealloc function. Instead, we use used_mempool stuff to record the situation, and realloc or free the trans->mem in elegant way. Also, after krealloc failed in __bch2_trans_kmalloc, the old data should be copied to the new buffer when alloc from mempool_alloc. Fixes: 31403dca5bb1 ("bcachefs: optimize __bch2_trans_get(), kill DEBUG_TRANSACTIONS") Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 51bcdc6c6d1c..ae4a28a33ad1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2790,6 +2790,31 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) struct btree_transaction_stats *s = btree_trans_stats(trans); s->max_mem = max(s->max_mem, new_bytes); + if (trans->used_mempool) { + if (trans->mem_bytes >= new_bytes) + goto out_change_top; + + /* No more space from mempool item, need malloc new one */ + new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (unlikely(!new_mem)) { + bch2_trans_unlock(trans); + + new_mem = kmalloc(new_bytes, GFP_KERNEL); + if (!new_mem) + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_mem); + return ERR_PTR(ret); + } + } + memcpy(new_mem, trans->mem, trans->mem_top); + trans->used_mempool = false; + mempool_free(trans->mem, &c->btree_trans_mem_pool); + goto out_new_mem; + } + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); @@ -2798,6 +2823,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; + memcpy(new_mem, trans->mem, trans->mem_top); + trans->used_mempool = true; kfree(trans->mem); } @@ -2811,7 +2838,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (ret) return ERR_PTR(ret); } - +out_new_mem: trans->mem = new_mem; trans->mem_bytes = new_bytes; @@ -2819,7 +2846,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } - +out_change_top: p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3093,7 +3120,7 @@ void bch2_trans_put(struct btree_trans *trans) if (paths_allocated != trans->_paths_allocated) kvfree_rcu_mightsleep(paths_allocated); - if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) + if (trans->used_mempool) mempool_free(trans->mem, &c->btree_trans_mem_pool); else kfree(trans->mem); -- cgit v1.2.3 From 048f47e83fc315499dc1943176b3ebe1a55574fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Mar 2024 13:51:29 -0400 Subject: bcachefs: btree_and_journal_iter now respects trans->journal_replay_not_finished btree_and_journal_iter is now safe to use at runtime, not just during recovery before journal keys have been freed. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 50e04356d72c..0272ef0d7310 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -363,7 +363,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { - struct bkey_s_c btree_k, journal_k, ret; + struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; if (iter->prefetch && iter->journal.level) btree_and_journal_iter_prefetch(iter); @@ -375,9 +375,10 @@ again: bpos_lt(btree_k.k->p, iter->pos)) bch2_journal_iter_advance_btree(iter); - while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_lt(journal_k.k->p, iter->pos)) - bch2_journal_iter_advance(&iter->journal); + if (iter->trans->journal_replay_not_finished) + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_lt(journal_k.k->p, iter->pos)) + bch2_journal_iter_advance(&iter->journal); ret = journal_k.k && (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) @@ -435,7 +436,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, bch2_btree_node_iter_init_from_start(&node_iter, b); __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &trans->c->journal_iters); + if (trans->journal_replay_not_finished && + !test_bit(BCH_FS_may_go_rw, &trans->c->flags)) + list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ -- cgit v1.2.3 From 40cb26233a060aeb936de7ea1f6ac2659ed9951c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Mar 2024 17:14:43 -0400 Subject: bcachefs: Be careful about btree node splits during journal replay Don't pick a pivot that's going to be deleted. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 16 ++++++++++++++++ fs/bcachefs/btree_journal_iter.h | 4 ++-- fs/bcachefs/btree_update_interior.c | 9 ++++++++- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 0272ef0d7310..1f588264575d 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -261,6 +261,22 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, return bch2_journal_key_insert(c, id, level, &whiteout); } +bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &trans->c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (!trans->journal_replay_not_finished) + return false; + + return (idx < keys->size && + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos) && + bkey_deleted(&keys->data[idx].k->k)); +} + void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos pos) { diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index c9d19da3ea04..09cd53091a05 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -40,8 +40,8 @@ int bch2_journal_key_insert(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_delete(struct bch_fs *, enum btree_id, unsigned, struct bpos); -void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, - unsigned, struct bpos); +bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos); void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b2f5f2e50f7e..b45c7ced15d8 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1380,9 +1380,16 @@ static void __btree_split_node(struct btree_update *as, if (bkey_deleted(k)) continue; + uk = bkey_unpack_key(b, k); + + if (b->c.level && + u64s < n1_u64s && + u64s + k->u64s >= n1_u64s && + bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p)) + n1_u64s += k->u64s; + i = u64s >= n1_u64s; u64s += k->u64s; - uk = bkey_unpack_key(b, k); if (!i) n1_pos = uk.p; bch2_bkey_format_add_key(&format[i], &uk); -- cgit v1.2.3 From 79032b078173f87a13f8618cdab710798be67314 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2024 19:29:19 -0400 Subject: bcachefs: Improved topology repair checks Consolidate bch2_gc_check_topology() and btree_node_interior_verify(), and replace them with an improved version, bch2_btree_node_check_topology(). This checks that children of an interior node correctly span the full range of the parent node with no overlaps. Also, ensure that topology repairs at runtime are always a fatal error; in particular, this adds a check in btree_iter_down() - if we don't find a key while walking down the btree that's indicative of a topology error and should be flagged as such, not a null ptr deref. Some checks in btree_update_interior.c remaining BUG_ONS(), because we already checked the node for topology errors when starting the update, and the assertions indicate that we _just_ corrupted the btree node - i.e. the problem can't be that existing on disk corruption, they indicate an actual algorithmic bug. In the future, we'll be annotating the fsck errors list with which recovery pass corrects them; the open coded "run explicit recovery pass or fatal error" in bch2_btree_node_check_topology() will in the future be done for every fsck_err() call. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 3 +- fs/bcachefs/btree_gc.c | 133 ++++------------------------------- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_iter.c | 19 ++++- fs/bcachefs/btree_update_interior.c | 134 +++++++++++++++++++++++++----------- fs/bcachefs/btree_update_interior.h | 2 + fs/bcachefs/error.h | 6 ++ fs/bcachefs/sb-errors_types.h | 3 +- 8 files changed, 137 insertions(+), 165 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 8ff21b2e1463..84474324dba9 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -808,7 +808,8 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) prt_printf(&buf, "\nmax "); bch2_bpos_to_text(&buf, b->data->max_key); - bch2_fs_inconsistent(c, "%s", buf.buf); + bch2_fs_topology_error(c, "%s", buf.buf); + printbuf_exit(&buf); } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index bdaed29f084a..6d32ac9be243 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -70,90 +70,6 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) __gc_pos_set(c, new_pos); } -/* - * Missing: if an interior btree node is empty, we need to do something - - * perhaps just kill it - */ -static int bch2_gc_check_topology(struct bch_fs *c, - struct btree *b, - struct bkey_buf *prev, - struct bkey_buf cur, - bool is_last) -{ - struct bpos node_start = b->data->min_key; - struct bpos node_end = b->data->max_key; - struct bpos expected_start = bkey_deleted(&prev->k->k) - ? node_start - : bpos_successor(prev->k->k.p); - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - int ret = 0; - - if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { - struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - - if (!bpos_eq(expected_start, bp->v.min_key)) { - bch2_topology_error(c); - - if (bkey_deleted(&prev->k->k)) { - prt_printf(&buf1, "start of node: "); - bch2_bpos_to_text(&buf1, node_start); - } else { - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); - } - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); - - if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, - btree_node_topology_bad_min_key, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " cur %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto err; - } else { - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - } - } - } - - if (is_last && !bpos_eq(cur.k->k.p, node_end)) { - bch2_topology_error(c); - - printbuf_reset(&buf1); - printbuf_reset(&buf2); - - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); - bch2_bpos_to_text(&buf2, node_end); - - if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " %s\n" - " expected %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf) && - should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto err; - } else { - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - } - } - - bch2_bkey_buf_copy(prev, c, cur.k); -err: -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) { switch (b->key.k.type) { @@ -445,6 +361,7 @@ again: prev = NULL; if (ret == DROP_PREV_NODE) { + bch_info(c, "dropped prev node"); bch2_btree_node_evict(trans, prev_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, prev_k.k->k.p); @@ -841,42 +758,30 @@ err: static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) { - struct bch_fs *c = trans->c; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; - struct bkey_buf prev, cur; int ret = 0; + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + if (!btree_node_type_needs_gc(btree_node_type(b))) return 0; bch2_btree_node_iter_init_from_start(&iter, b); - bch2_bkey_buf_init(&prev); - bch2_bkey_buf_init(&cur); - bkey_init(&prev.k->k); while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, &k, initial); if (ret) - break; + return ret; bch2_btree_node_iter_advance(&iter, b); - - if (b->c.level) { - bch2_bkey_buf_reassemble(&cur, c, k); - - ret = bch2_gc_check_topology(c, b, &prev, cur, - bch2_btree_node_iter_end(&iter)); - if (ret) - break; - } } - bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); - return ret; + return 0; } static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, @@ -925,14 +830,16 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_buf cur, prev; + struct bkey_buf cur; struct printbuf buf = PRINTBUF; int ret = 0; + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - bch2_bkey_buf_init(&prev); bch2_bkey_buf_init(&cur); - bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -943,20 +850,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b if (ret) goto fsck_err; - if (b->c.level) { - bch2_bkey_buf_reassemble(&cur, c, k); - k = bkey_i_to_s_c(cur.k); - - bch2_btree_and_journal_iter_advance(&iter); - - ret = bch2_gc_check_topology(c, b, - &prev, cur, - !bch2_btree_and_journal_iter_peek(&iter).k); - if (ret) - goto fsck_err; - } else { - bch2_btree_and_journal_iter_advance(&iter); - } + bch2_btree_and_journal_iter_advance(&iter); } if (b->c.level > target_depth) { @@ -1015,7 +909,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b } fsck_err: bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); bch2_btree_and_journal_iter_exit(&iter); printbuf_exit(&buf); return ret; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 34df8ccc5fec..b85e2cd61c38 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1657,7 +1657,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, prt_str(&buf, "btree node read error: no device to read from\n at "); bch2_btree_pos_to_text(&buf, c, b); - bch_err(c, "%s", buf.buf); + bch_err_ratelimited(c, "%s", buf.buf); if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ae4a28a33ad1..2a211a4bebd1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -927,8 +927,22 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (ret) goto err; } else { - bch2_bkey_buf_unpack(&tmp, c, l->b, - bch2_btree_node_iter_peek(&l->iter, l->b)); + struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); + if (!k) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " within parent node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key)); + + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + ret = -BCH_ERR_btree_need_topology_repair; + goto err; + } + + bch2_bkey_buf_unpack(&tmp, c, l->b, k); if ((flags & BTREE_ITER_PREFETCH) && c->opts.btree_node_prefetch) { @@ -962,7 +976,6 @@ err: return ret; } - static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b45c7ced15d8..aae7a2687eee 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" @@ -18,6 +19,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -44,56 +46,103 @@ static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, return path_idx; } -/* Debug code: */ - /* * Verify that child nodes correctly span parent node's range: */ -static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) +int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bpos next_node = b->data->min_key; - struct btree_node_iter iter; + struct bch_fs *c = trans->c; + struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key + : b->data->min_key; + struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_s_c_btree_ptr_v2 bp; - struct bkey unpacked; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; + struct bkey_buf prev; + int ret = 0; - BUG_ON(!b->c.level); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; + if (!b->c.level) + return 0; - bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bkey_init(&prev.k->k); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - while (1) { - k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { if (k.k->type != KEY_TYPE_btree_ptr_v2) - break; - bp = bkey_s_c_to_btree_ptr_v2(k); + goto out; - if (!bpos_eq(next_node, bp.v->min_key)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, next_node); - bch2_bpos_to_text(&buf2, bp.v->min_key); - panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); - } + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - bch2_btree_node_iter_advance(&iter, b); + struct bpos expected_min = bkey_deleted(&prev.k->k) + ? node_min + : bpos_successor(prev.k->k.p); - if (bch2_btree_node_iter_end(&iter)) { - if (!bpos_eq(k.k->p, b->key.k.p)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, b->key.k.p); - bch2_bpos_to_text(&buf2, k.k->p); - panic("expected end %s got %s\n", buf1.buf, buf2.buf); - } - break; + if (!bpos_eq(expected_min, bp.v->min_key)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "end of prev node doesn't match start of next node\n"), + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n prev "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_str(&buf, "\n next "); + bch2_bkey_val_to_text(&buf, c, k); + + need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf); + goto topology_repair; } - next_node = bpos_successor(k.k->p); + bch2_bkey_buf_reassemble(&prev, c, k); + bch2_btree_and_journal_iter_advance(&iter); + } + + if (bkey_deleted(&prev.k->k)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "empty interior node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf); + goto topology_repair; + } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "last child node doesn't end at end of parent node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n last key "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + + need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf); + goto topology_repair; + } +out: +fsck_err: + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev, c); + printbuf_exit(&buf); + return ret; +topology_repair: + if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { + bch2_inconsistent_error(c); + ret = -BCH_ERR_btree_need_topology_repair; + } else { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); } -#endif + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -1448,8 +1497,7 @@ static void __btree_split_node(struct btree_update *as, bch2_verify_btree_nr_keys(n[i]); - if (b->c.level) - btree_node_interior_verify(as->c, n[i]); + BUG_ON(bch2_btree_node_check_topology(trans, n[i])); } } @@ -1480,7 +1528,7 @@ static void btree_split_insert_keys(struct btree_update *as, __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - btree_node_interior_verify(as->c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); } } @@ -1498,6 +1546,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, BUG_ON(!parent && (b != btree_node_root(c, b))); BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + bch2_btree_interior_update_will_free_node(as, b); if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { @@ -1717,7 +1769,11 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t goto split; } - btree_node_interior_verify(c, b); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) { + bch2_btree_node_unlock_write(trans, path, b); + return ret; + } bch2_btree_insert_keys_interior(as, trans, path, b, keys); @@ -1735,7 +1791,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_btree_node_unlock_write(trans, path, b); - btree_node_interior_verify(c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); return 0; split: /* diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index f651dd48aaa0..7ac3e17b84fd 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -10,6 +10,8 @@ #define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) +int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); + /* * Tracks an in progress split/rewrite of a btree node and the update to the * parent node: diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index ae1d6674c512..36caedf72d89 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -32,6 +32,12 @@ bool bch2_inconsistent_error(struct bch_fs *); int bch2_topology_error(struct bch_fs *); +#define bch2_fs_topology_error(c, ...) \ +({ \ + bch_err(c, "btree topology error: " __VA_ARGS__); \ + bch2_topology_error(c); \ +}) + #define bch2_fs_inconsistent(c, ...) \ ({ \ bch_err(c, __VA_ARGS__); \ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 5178bf579f7c..8edbd9ef994c 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -265,7 +265,8 @@ x(subvol_children_bad, 257) \ x(subvol_loop, 258) \ x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) + x(btree_node_bkey_bad_u64s, 260) \ + x(btree_node_topology_empty_interior_node, 261) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From bb66009958b277a9baffaa53d835661852550151 Mon Sep 17 00:00:00 2001 From: zhuxiaohui Date: Tue, 26 Mar 2024 20:03:45 +0800 Subject: bcachefs: add REQ_SYNC and REQ_IDLE in write dio when writing file with direct_IO on bcachefs, then performance is much lower than other fs due to write back throttle in block layer: wbt_wait+1 __rq_qos_throttle+32 blk_mq_submit_bio+394 submit_bio_noacct_nocheck+649 bch2_submit_wbio_replicas+538 __bch2_write+2539 bch2_direct_write+1663 bch2_write_iter+318 aio_write+355 io_submit_one+1224 __x64_sys_io_submit+169 do_syscall_64+134 entry_SYSCALL_64_after_hwframe+110 add set REQ_SYNC and REQ_IDLE in bio->bi_opf as standard dirct-io Signed-off-by: zhuxiaohui Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-direct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 33cb6da3a5ad..f49e6c0f0f68 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -536,7 +536,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) if (likely(!dio->iter.count) || dio->op.error) break; - bio_reset(bio, NULL, REQ_OP_WRITE); + bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); } out: return bch2_dio_write_done(dio); @@ -618,7 +618,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) bio = bio_alloc_bioset(NULL, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_WRITE, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE, GFP_KERNEL, &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); -- cgit v1.2.3 From 805b535a8afbcd8073a03eb25aafd82cb816bff6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Mar 2024 21:58:07 -0400 Subject: bcachefs: Check btree ptr min_key in .invalid Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 9 +++++++-- fs/bcachefs/sb-errors_types.h | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 61395b113df9..b2432d88cda6 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -189,13 +189,18 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err, - btree_ptr_v2_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, + c, err, btree_ptr_v2_val_too_big, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); + bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), + c, err, btree_ptr_v2_min_key_bad, + "min_key > key"); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); fsck_err: return ret; diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 8edbd9ef994c..73e9634df8ff 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -266,7 +266,8 @@ x(subvol_loop, 258) \ x(subvol_unreachable, 259) \ x(btree_node_bkey_bad_u64s, 260) \ - x(btree_node_topology_empty_interior_node, 261) + x(btree_node_topology_empty_interior_node, 261) \ + x(btree_ptr_v2_min_key_bad, 262) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From 812a9297936a959c98a2e9e44a9a622bbe30b162 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Mar 2024 16:39:54 -0400 Subject: bcachefs: Fix btree node keys accounting in topology repair path When dropping keys now outside a now because we're changing the node min/max, we need to redo the node's accounting as well. Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 14 ++++++++++---- fs/bcachefs/bset.h | 2 ++ fs/bcachefs/btree_io.c | 1 + fs/bcachefs/btree_update_interior.c | 1 + 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 3fd1085b6c61..3bb477840eab 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -134,18 +134,24 @@ void bch2_dump_btree_node_iter(struct btree *b, printbuf_exit(&buf); } -#ifdef CONFIG_BCACHEFS_DEBUG - -void __bch2_verify_btree_nr_keys(struct btree *b) +struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) { struct bset_tree *t; struct bkey_packed *k; - struct btree_nr_keys nr = { 0 }; + struct btree_nr_keys nr = {}; for_each_bset(b, t) bset_tree_for_each_key(b, t, k) if (!bkey_deleted(k)) btree_keys_account_key_add(&nr, t - b->set, k); + return nr; +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_verify_btree_nr_keys(struct btree *b) +{ + struct btree_nr_keys nr = bch2_btree_node_count_keys(b); BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 79c77baaa383..120a79fd456b 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -458,6 +458,8 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, /* Accounting: */ +struct btree_nr_keys bch2_btree_node_count_keys(struct btree *); + static inline void btree_keys_account_key(struct btree_nr_keys *n, unsigned bset, struct bkey_packed *k, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b85e2cd61c38..9c71e6fb9c41 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -654,6 +654,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) */ bch2_bset_set_no_aux_tree(b, b->set); bch2_btree_build_aux_trees(b); + b->nr = bch2_btree_node_count_keys(b); struct bkey_s_c k; struct bkey unpacked; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index aae7a2687eee..826cebe4bce0 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1543,6 +1543,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; + bch2_verify_btree_nr_keys(b); BUG_ON(!parent && (b != btree_node_root(c, b))); BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); -- cgit v1.2.3 From 6f5869ffd9f111b81b95b73c6e54f07406591911 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Mar 2024 17:38:22 -0400 Subject: bcachefs: Fix use after free in bch2_check_fix_ptrs() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6d32ac9be243..26e51af9acdc 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -691,12 +691,6 @@ found: } } - ret = bch2_journal_key_insert_take(c, btree_id, level, new); - if (ret) { - kfree(new); - goto err; - } - if (level) bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); @@ -710,6 +704,12 @@ found: bch_info(c, "new key %s", buf.buf); } + ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) { + kfree(new); + goto err; + } + *k = bkey_i_to_s_c(new); } err: -- cgit v1.2.3 From 83bb58539045b15653b61c6e8eb65f3f9c671cdf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Mar 2024 18:46:20 -0400 Subject: bcachefs: Fix repair path for missing indirect extents Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index c47c66c2b394..ff7864731a07 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -185,8 +185,7 @@ not_found: } else { bkey_error_init(update); update->k.p = p.k->p; - update->k.p.offset = next_idx; - update->k.size = next_idx - *idx; + update->k.size = p.k->size; set_bkey_val_u64s(&update->k, 0); } -- cgit v1.2.3 From dcc1c04587aa9bc3515153f4c89cff73f2cb45b2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Mar 2024 18:46:38 -0400 Subject: bcachefs: Fix use after free in check_root_trans() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 47d4eefaba7b..6d8367ab5ddd 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2098,17 +2098,21 @@ static int check_root_trans(struct btree_trans *trans) if (mustfix_fsck_err_on(ret, c, root_subvol_missing, "root subvol missing")) { - struct bkey_i_subvolume root_subvol; + struct bkey_i_subvolume *root_subvol = + bch2_trans_kmalloc(trans, sizeof(*root_subvol)); + ret = PTR_ERR_OR_ZERO(root_subvol); + if (ret) + goto err; snapshot = U32_MAX; inum = BCACHEFS_ROOT_INO; - bkey_subvolume_init(&root_subvol.k_i); - root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_subvol.v.flags = 0; - root_subvol.v.snapshot = cpu_to_le32(snapshot); - root_subvol.v.inode = cpu_to_le64(inum); - ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0); + bkey_subvolume_init(&root_subvol->k_i); + root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_subvol->v.flags = 0; + root_subvol->v.snapshot = cpu_to_le32(snapshot); + root_subvol->v.inode = cpu_to_le64(inum); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); bch_err_msg(c, ret, "writing root subvol"); if (ret) goto err; -- cgit v1.2.3 From 47d2080e30b0b9fc636eba4e74f9e4bdc01543d7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Mar 2024 19:26:05 -0400 Subject: bcachefs: Kill bch2_bkey_ptr_data_type() Remove some duplication, and inconsistency between check_fix_ptrs and the main ptr marking paths Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 6 ++--- fs/bcachefs/backpointers.h | 32 ++++++++++++++++++++----- fs/bcachefs/btree_gc.c | 59 +++++++++++++++++++++++++--------------------- fs/bcachefs/buckets.c | 12 ++++++---- fs/bcachefs/extents.h | 24 ------------------- 5 files changed, 67 insertions(+), 66 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 21b3ae7df824..7537d78aa9db 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -29,8 +29,7 @@ static bool extent_matches_bp(struct bch_fs *c, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, - &bucket2, &bp2); + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2); if (bpos_eq(bucket, bucket2) && !memcmp(&bp, &bp2, sizeof(bp))) return true; @@ -507,8 +506,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree, level, - k, p, &bucket_pos, &bp); + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp); ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 327365a9feac..da012ca7daee 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -90,20 +90,40 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); } -static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p) +static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, + struct extent_ptr_decoded p, + const union bch_extent_entry *entry) { - return level ? BCH_DATA_btree : - p.has_ec ? BCH_DATA_stripe : - BCH_DATA_user; + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return BCH_DATA_btree; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user; + case KEY_TYPE_stripe: { + const struct bch_extent_ptr *ptr = &entry->ptr; + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + BUG_ON(ptr < s.v->ptrs || + ptr >= s.v->ptrs + s.v->nr_blocks); + + return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity + : BCH_DATA_user; + } + default: + BUG(); + } } static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, + const union bch_extent_entry *entry, struct bpos *bucket_pos, struct bch_backpointer *bp) { - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); s64 sectors = level ? btree_sectors(c) : k.k->size; u32 bucket_offset; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 26e51af9acdc..0afefccf4e52 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -7,6 +7,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "bkey_methods.h" #include "bkey_buf.h" #include "btree_journal_iter.h" @@ -508,7 +509,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c); if (fsck_err_on(!g->gen_valid, c, ptr_to_missing_alloc_key, @@ -574,7 +575,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id continue; if (fsck_err_on(bucket_data_type(g->data_type) && - bucket_data_type(g->data_type) != data_type, c, + bucket_data_type(g->data_type) != + bucket_data_type(data_type), c, ptr_bucket_data_type_mismatch, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", @@ -615,18 +617,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } if (do_update) { - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct bkey_i *new; - if (is_root) { bch_err(c, "cannot update btree roots yet"); ret = -EINVAL; goto err; } - new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { ret = -BCH_ERR_ENOMEM_gc_repair_key; bch_err_msg(c, ret, "allocating new key"); @@ -641,7 +638,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id * btree node isn't there anymore, the read path will * sort it out: */ - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_GC_BUCKET(ca, ptr); @@ -649,19 +646,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id ptr->gen = g->gen; } } else { - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); - - (ptr->cached && - (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || - (!ptr->cached && - gen_cmp(ptr->gen, g->gen) < 0) || - gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || - (g->data_type && - g->data_type != data_type); - })); + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); + + if ((p.ptr.cached && + (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || + (!p.ptr.cached && + gen_cmp(p.ptr.gen, g->gen) < 0) || + gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type)) { + bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); + goto restart_drop_ptrs; + } + } again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_extent_entry_for_each(ptrs, entry) { @@ -736,10 +740,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, BUG_ON(bch2_journal_seq_verify && k->k->version.lo > atomic64_read(&c->journal.seq)); - ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); - if (ret) - goto err; - if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, bkey_version_in_future, "key version number higher than recorded: %llu > %llu", @@ -748,8 +748,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, atomic64_set(&c->key_version, k->k->version.lo); } + ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); + if (ret) + goto err; + ret = commit_do(trans, NULL, NULL, 0, - bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); + bch2_key_trigger(trans, btree_id, level, old, + unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); fsck_err: err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 96edf2c34d43..941401a210f5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -525,6 +525,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), bch2_data_type_str(data_type))) { + BUG(); ret = -EIO; goto err; } @@ -628,6 +629,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + BUG(); ret = -EIO; goto err; } @@ -815,14 +817,14 @@ static int __mark_pointer(struct btree_trans *trans, static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, - s64 *sectors, - unsigned flags) + const union bch_extent_entry *entry, + s64 *sectors, unsigned flags) { bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); struct bpos bucket; struct bch_backpointer bp; - bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp); *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len); if (flags & BTREE_TRIGGER_TRANSACTIONAL) { @@ -851,7 +853,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (flags & BTREE_TRIGGER_GC) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); percpu_down_read(&c->mark_lock); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); @@ -979,7 +981,7 @@ static int __trigger_extent(struct btree_trans *trans, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors; - ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags); + ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); if (ret < 0) return ret; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index fd2669cdd76f..3fd0169b98c1 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -596,30 +596,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) return ret; } -static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - return BCH_DATA_btree; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return BCH_DATA_user; - case KEY_TYPE_stripe: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - BUG_ON(ptr < s.v->ptrs || - ptr >= s.v->ptrs + s.v->nr_blocks); - - return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity - : BCH_DATA_user; - } - default: - BUG(); - } -} - unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); -- cgit v1.2.3 From 7f9e5080366726084eb765a5d689bdf502e7e2ed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Mar 2024 19:39:26 -0400 Subject: bcachefs: Fix bch2_btree_increase_depth() When we haven't yet allocated any btree nodes for a given btree, we first need to call the regular split path to allocate one. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 826cebe4bce0..9c4325786ba8 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1882,9 +1882,12 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, { struct bch_fs *c = trans->c; struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; + + if (btree_node_fake(b)) + return bch2_btree_split_leaf(trans, path, flags); + struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, - b->c.level, true, flags); + bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); if (IS_ERR(as)) return PTR_ERR(as); -- cgit v1.2.3 From 11d5568d3e04a2e6734d1eccc394cfcf5ca8523c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2024 01:41:03 -0400 Subject: bcachefs: fix backpointer for missing alloc key msg Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 7537d78aa9db..762c8ddfc5e7 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -382,7 +382,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ backpointer_to_missing_alloc, "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, bp_iter, 0); goto out; } -- cgit v1.2.3 From d2554263adcb4041f3608cb7476f102fda036ccc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2024 20:07:46 -0400 Subject: bcachefs: Split out recovery_passes.c We've grown a fair amount of code for managing recovery passes; tracking which ones we're running, which ones need to be run, and flagging in the superblock which ones need to be run on the next recovery. So it's worth splitting out into its own file, this code is pretty different from the code in recovery.c. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/chardev.c | 2 +- fs/bcachefs/error.c | 2 +- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/recovery.c | 249 +----------------------------------- fs/bcachefs/recovery.h | 32 +---- fs/bcachefs/recovery_passes.c | 203 +++++++++++++++++++++++++++++ fs/bcachefs/recovery_passes.h | 16 +++ fs/bcachefs/recovery_passes_types.h | 69 ++++++++++ fs/bcachefs/recovery_types.h | 68 ---------- fs/bcachefs/sb-downgrade.c | 2 +- fs/bcachefs/subvolume.c | 72 +++++++++++ fs/bcachefs/subvolume.h | 3 + fs/bcachefs/super-io.c | 2 +- 17 files changed, 378 insertions(+), 351 deletions(-) create mode 100644 fs/bcachefs/recovery_passes.c create mode 100644 fs/bcachefs/recovery_passes.h create mode 100644 fs/bcachefs/recovery_passes_types.h delete mode 100644 fs/bcachefs/recovery_types.h diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index b02796c8a595..f6d86eb4b976 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -67,6 +67,7 @@ bcachefs-y := \ quota.o \ rebalance.o \ recovery.o \ + recovery_passes.o \ reflink.o \ replicas.o \ sb-clean.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a9ade0b1f78c..963162a627cd 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -209,7 +209,7 @@ #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" -#include "recovery_types.h" +#include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" #include "time_stats.h" diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0afefccf4e52..e5d2c6daa663 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -25,7 +25,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" -#include "recovery.h" +#include "recovery_passes.h" #include "reflink.h" #include "replicas.h" #include "super-io.h" diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 9c4325786ba8..983bb27298cc 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -19,7 +19,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "super-io.h" #include "trace.h" diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 38defa19d52d..cbfa6459bdbc 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -7,7 +7,7 @@ #include "chardev.h" #include "journal.h" #include "move.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "super.h" #include "super-io.h" diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 043431206799..f942a3947496 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" -#include "recovery.h" +#include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6d8367ab5ddd..aca6fae409cd 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -12,7 +12,7 @@ #include "fsck.h" #include "inode.h" #include "keylist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "snapshot.h" #include "super.h" #include "xattr.h" diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 03f9d6afe467..f9e2b011f6d4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1,35 +1,30 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "backpointers.h" -#include "bkey_buf.h" #include "alloc_background.h" -#include "btree_gc.h" +#include "bkey_buf.h" #include "btree_journal_iter.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" #include "buckets.h" #include "dirent.h" -#include "ec.h" #include "errcode.h" #include "error.h" #include "fs-common.h" -#include "fsck.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" -#include "lru.h" #include "logged_ops.h" #include "move.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-clean.h" #include "sb-downgrade.h" #include "snapshot.h" -#include "subvolume.h" #include "super-io.h" #include @@ -186,7 +181,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(l->journal_seq, r->journal_seq); } -static int bch2_journal_replay(struct bch_fs *c) +int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; DARRAY(struct journal_key *) keys_sorted = { 0 }; @@ -471,150 +466,6 @@ fsck_err: return ret; } -static int bch2_initialize_subvolumes(struct bch_fs *c) -{ - struct bkey_i_snapshot_tree root_tree; - struct bkey_i_snapshot root_snapshot; - struct bkey_i_subvolume root_volume; - int ret; - - bkey_snapshot_tree_init(&root_tree.k_i); - root_tree.k.p.offset = 1; - root_tree.v.master_subvol = cpu_to_le32(1); - root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); - - bkey_snapshot_init(&root_snapshot.k_i); - root_snapshot.k.p.offset = U32_MAX; - root_snapshot.v.flags = 0; - root_snapshot.v.parent = 0; - root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); - root_snapshot.v.tree = cpu_to_le32(1); - SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); - - bkey_subvolume_init(&root_volume.k_i); - root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_volume.v.flags = 0; - root_volume.v.snapshot = cpu_to_le32(U32_MAX); - root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); - bch_err_fn(c, ret); - return ret; -} - -static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); - ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_is_inode(k.k)) { - bch_err(trans->c, "root inode not found"); - ret = -BCH_ERR_ENOENT_inode; - goto err; - } - - ret = bch2_inode_unpack(k, &inode); - BUG_ON(ret); - - inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - - ret = bch2_inode_write(trans, &iter, &inode); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* set bi_subvol on root inode */ -noinline_for_stack -static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) -{ - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, - __bch2_fs_upgrade_for_subvolumes(trans)); - bch_err_fn(c, ret); - return ret; -} - -const char * const bch2_recovery_passes[] = { -#define x(_fn, ...) #_fn, - BCH_RECOVERY_PASSES() -#undef x - NULL -}; - -static int bch2_check_allocations(struct bch_fs *c) -{ - return bch2_gc(c, true, c->opts.norecovery); -} - -static int bch2_set_may_go_rw(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - /* - * After we go RW, the journal keys buffer can't be modified (except for - * setting journal_key->overwritten: it will be accessed by multiple - * threads - */ - move_gap(keys, keys->nr); - - set_bit(BCH_FS_may_go_rw, &c->flags); - - if (keys->nr || c->opts.fsck || !c->sb.clean) - return bch2_fs_read_write_early(c); - return 0; -} - -struct recovery_pass_fn { - int (*fn)(struct bch_fs *); - unsigned when; -}; - -static struct recovery_pass_fn recovery_pass_fns[] = { -#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, - BCH_RECOVERY_PASSES() -#undef x -}; - -u64 bch2_recovery_passes_to_stable(u64 v) -{ - static const u8 map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x - }; - - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; -} - -u64 bch2_recovery_passes_from_stable(u64 v) -{ - static const u8 map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - }; - - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; -} - static bool check_version_upgrade(struct bch_fs *c) { unsigned latest_version = bcachefs_metadata_version_current; @@ -687,96 +538,6 @@ static bool check_version_upgrade(struct bch_fs *c) return false; } -u64 bch2_fsck_recovery_passes(void) -{ - u64 ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & PASS_FSCK) - ret |= BIT_ULL(i); - return ret; -} - -static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - - if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) - return false; - if (c->recovery_passes_explicit & BIT_ULL(pass)) - return true; - if ((p->when & PASS_FSCK) && c->opts.fsck) - return true; - if ((p->when & PASS_UNCLEAN) && !c->sb.clean) - return true; - if (p->when & PASS_ALWAYS) - return true; - return false; -} - -static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - int ret; - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) - return ret; - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_CONT " done\n"); - - return 0; -} - -static int bch2_run_recovery_passes(struct bch_fs *c) -{ - int ret = 0; - - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { - if (should_run_recovery_pass(c, c->curr_recovery_pass)) { - unsigned pass = c->curr_recovery_pass; - - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || - (ret && c->curr_recovery_pass < pass)) - continue; - if (ret) - break; - - c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); - } - c->curr_recovery_pass++; - c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); - } - - return ret; -} - -int bch2_run_online_recovery_passes(struct bch_fs *c) -{ - int ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { - struct recovery_pass_fn *p = recovery_pass_fns + i; - - if (!(p->when & PASS_ONLINE)) - continue; - - ret = bch2_run_recovery_pass(c, i); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { - i = c->curr_recovery_pass; - continue; - } - if (ret) - break; - } - - return ret; -} - int bch2_fs_recovery(struct bch_fs *c) { struct bch_sb_field_clean *clean = NULL; @@ -1155,7 +916,7 @@ int bch2_fs_initialize(struct bch_fs *c) } mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); + c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; set_bit(BCH_FS_may_go_rw, &c->flags); for (unsigned i = 0; i < BTREE_ID_NR; i++) @@ -1230,7 +991,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1; + c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 4e9d24719b2e..3962fd87b50d 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,37 +2,7 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -extern const char * const bch2_recovery_passes[]; - -u64 bch2_recovery_passes_to_stable(u64 v); -u64 bch2_recovery_passes_from_stable(u64 v); - -/* - * For when we need to rewind recovery passes and run a pass we skipped: - */ -static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - if (c->recovery_passes_explicit & BIT_ULL(pass)) - return 0; - - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); - - c->recovery_passes_explicit |= BIT_ULL(pass); - - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; - c->recovery_passes_complete &= (1ULL << pass) >> 1; - return -BCH_ERR_restart_recovery; - } else { - return 0; - } -} - -int bch2_run_online_recovery_passes(struct bch_fs *); -u64 bch2_fsck_recovery_passes(void); +int bch2_journal_replay(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c new file mode 100644 index 000000000000..bab586dc395a --- /dev/null +++ b/fs/bcachefs/recovery_passes.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_gc.h" +#include "ec.h" +#include "fsck.h" +#include "inode.h" +#include "journal.h" +#include "lru.h" +#include "logged_ops.h" +#include "rebalance.h" +#include "recovery.h" +#include "recovery_passes.h" +#include "snapshot.h" +#include "subvolume.h" +#include "super.h" + +const char * const bch2_recovery_passes[] = { +#define x(_fn, ...) #_fn, + BCH_RECOVERY_PASSES() +#undef x + NULL +}; + +static int bch2_check_allocations(struct bch_fs *c) +{ + return bch2_gc(c, true, c->opts.norecovery); +} + +static int bch2_set_may_go_rw(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + + /* + * After we go RW, the journal keys buffer can't be modified (except for + * setting journal_key->overwritten: it will be accessed by multiple + * threads + */ + move_gap(keys, keys->nr); + + set_bit(BCH_FS_may_go_rw, &c->flags); + + if (keys->nr || c->opts.fsck || !c->sb.clean) + return bch2_fs_read_write_early(c); + return 0; +} + +struct recovery_pass_fn { + int (*fn)(struct bch_fs *); + unsigned when; +}; + +static struct recovery_pass_fn recovery_pass_fns[] = { +#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, + BCH_RECOVERY_PASSES() +#undef x +}; + +u64 bch2_recovery_passes_to_stable(u64 v) +{ + static const u8 map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, + BCH_RECOVERY_PASSES() +#undef x + }; + + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(map[i]); + return ret; +} + +u64 bch2_recovery_passes_from_stable(u64 v) +{ + static const u8 map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x + }; + + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(map[i]); + return ret; +} + +/* + * For when we need to rewind recovery passes and run a pass we skipped: + */ +int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return 0; + + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + + c->recovery_passes_explicit |= BIT_ULL(pass); + + if (c->curr_recovery_pass >= pass) { + c->curr_recovery_pass = pass; + c->recovery_passes_complete &= (1ULL << pass) >> 1; + return -BCH_ERR_restart_recovery; + } else { + return 0; + } +} + +u64 bch2_fsck_recovery_passes(void) +{ + u64 ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & PASS_FSCK) + ret |= BIT_ULL(i); + return ret; +} + +static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_pass_fns + pass; + + if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) + return false; + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return true; + if ((p->when & PASS_FSCK) && c->opts.fsck) + return true; + if ((p->when & PASS_UNCLEAN) && !c->sb.clean) + return true; + if (p->when & PASS_ALWAYS) + return true; + return false; +} + +static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_pass_fns + pass; + int ret; + + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_CONT " done\n"); + + return 0; +} + +int bch2_run_online_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { + struct recovery_pass_fn *p = recovery_pass_fns + i; + + if (!(p->when & PASS_ONLINE)) + continue; + + ret = bch2_run_recovery_pass(c, i); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { + i = c->curr_recovery_pass; + continue; + } + if (ret) + break; + } + + return ret; +} + +int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { + unsigned pass = c->curr_recovery_pass; + + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || + (ret && c->curr_recovery_pass < pass)) + continue; + if (ret) + break; + + c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); + } + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + } + + return ret; +} diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h new file mode 100644 index 000000000000..abefa67749eb --- /dev/null +++ b/fs/bcachefs/recovery_passes.h @@ -0,0 +1,16 @@ +#ifndef _BCACHEFS_RECOVERY_PASSES_H +#define _BCACHEFS_RECOVERY_PASSES_H + +extern const char * const bch2_recovery_passes[]; + +u64 bch2_recovery_passes_to_stable(u64 v); +u64 bch2_recovery_passes_from_stable(u64 v); + +u64 bch2_fsck_recovery_passes(void); + +int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); + +int bch2_run_online_recovery_passes(struct bch_fs *); +int bch2_run_recovery_passes(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h new file mode 100644 index 000000000000..09c1ab032a67 --- /dev/null +++ b/fs/bcachefs/recovery_passes_types.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H +#define _BCACHEFS_RECOVERY_PASSES_TYPES_H + +#define PASS_SILENT BIT(0) +#define PASS_FSCK BIT(1) +#define PASS_UNCLEAN BIT(2) +#define PASS_ALWAYS BIT(3) +#define PASS_ONLINE BIT(4) + +/* + * Passes may be reordered, but the second field is a persistent identifier and + * must never change: + */ +#define BCH_RECOVERY_PASSES() \ + x(check_topology, 4, 0) \ + x(alloc_read, 0, PASS_ALWAYS) \ + x(stripes_read, 1, PASS_ALWAYS) \ + x(initialize_subvolumes, 2, 0) \ + x(snapshots_read, 3, PASS_ALWAYS) \ + x(check_allocations, 5, PASS_FSCK) \ + x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ + x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ + x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, 9, PASS_ALWAYS) \ + x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ + x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ + x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ + x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \ + x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ + x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ + x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 17, 0) \ + x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ + x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ + x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ + x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ + x(fs_upgrade_for_subvolumes, 22, 0) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ + x(check_inodes, 24, PASS_FSCK) \ + x(check_extents, 25, PASS_FSCK) \ + x(check_indirect_extents, 26, PASS_FSCK) \ + x(check_dirents, 27, PASS_FSCK) \ + x(check_xattrs, 28, PASS_FSCK) \ + x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ + x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ + x(check_nlinks, 31, PASS_FSCK) \ + x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ + x(fix_reflink_p, 33, 0) \ + x(set_fs_needs_rebalance, 34, 0) \ + +/* We normally enumerate recovery passes in the order we run them: */ +enum bch_recovery_pass { +#define x(n, id, when) BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x + BCH_RECOVERY_PASS_NR +}; + +/* But we also need stable identifiers that can be used in the superblock */ +enum bch_recovery_pass_stable { +#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, + BCH_RECOVERY_PASSES() +#undef x +}; + +#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h deleted file mode 100644 index 4959e95e7c74..000000000000 --- a/fs/bcachefs/recovery_types.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_TYPES_H -#define _BCACHEFS_RECOVERY_TYPES_H - -#define PASS_SILENT BIT(0) -#define PASS_FSCK BIT(1) -#define PASS_UNCLEAN BIT(2) -#define PASS_ALWAYS BIT(3) -#define PASS_ONLINE BIT(4) - -/* - * Passes may be reordered, but the second field is a persistent identifier and - * must never change: - */ -#define BCH_RECOVERY_PASSES() \ - x(check_topology, 4, 0) \ - x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, PASS_ALWAYS) \ - x(initialize_subvolumes, 2, 0) \ - x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_allocations, 5, PASS_FSCK) \ - x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ - x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ - x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ - x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ - x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ - x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \ - x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ - x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ - x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 17, 0) \ - x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ - x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ - x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ - x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 22, 0) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(check_inodes, 24, PASS_FSCK) \ - x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_FSCK) \ - x(check_dirents, 27, PASS_FSCK) \ - x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ - x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ - x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ - x(check_nlinks, 31, PASS_FSCK) \ - x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ - x(fix_reflink_p, 33, 0) \ - x(set_fs_needs_rebalance, 34, 0) \ - -/* We normally enumerate recovery passes in the order we run them: */ -enum bch_recovery_pass { -#define x(n, id, when) BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x -}; - -/* But we also need stable identifiers that can be used in the superblock */ -enum bch_recovery_pass_stable { -#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, - BCH_RECOVERY_PASSES() -#undef x -}; - -#endif /* _BCACHEFS_RECOVERY_TYPES_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index e4396cb0bacb..d6f81179c3a2 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -7,7 +7,7 @@ #include "bcachefs.h" #include "darray.h" -#include "recovery.h" +#include "recovery_passes.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "super-io.h" diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index ce7aed121942..88a79c823276 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -595,6 +595,78 @@ err: return ret; } +int bch2_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot_tree root_tree; + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; + int ret; + + bkey_snapshot_tree_init(&root_tree.k_i); + root_tree.k.p.offset = 1; + root_tree.v.master_subvol = cpu_to_le32(1); + root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); + + bkey_snapshot_init(&root_snapshot.k_i); + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; + root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); + root_snapshot.v.tree = cpu_to_le32(1); + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + + bkey_subvolume_init(&root_volume.k_i); + root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_volume.v.flags = 0; + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); + bch_err_fn(c, ret); + return ret; +} + +static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); + ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_is_inode(k.k)) { + bch_err(trans->c, "root inode not found"); + ret = -BCH_ERR_ENOENT_inode; + goto err; + } + + ret = bch2_inode_unpack(k, &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + + ret = bch2_inode_write(trans, &iter, &inode); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* set bi_subvol on root inode */ +int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) +{ + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + __bch2_fs_upgrade_for_subvolumes(trans)); + bch_err_fn(c, ret); + return ret; +} + int bch2_fs_subvolumes_init(struct bch_fs *c) { INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 903c05162c06..d2015d549bd2 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -37,6 +37,9 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *); int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); +int bch2_initialize_subvolumes(struct bch_fs *); +int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); + int bch2_fs_subvolumes_init(struct bch_fs *); #endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ad28e370b640..e15f8b1f30c2 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -8,7 +8,7 @@ #include "journal.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "quota.h" #include "sb-clean.h" -- cgit v1.2.3 From e5aa80464155287cc309d18c1c93962357e3e393 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2024 19:30:58 -0400 Subject: bcachefs: Add error messages to logged ops fns Signed-off-by: Kent Overstreet --- fs/bcachefs/io_misc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 1baf78594cca..82f9170dab3f 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -264,6 +264,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, ret = 0; err: bch2_logged_op_finish(trans, op_k); + bch_err_fn(c, ret); return ret; } @@ -476,6 +477,7 @@ case LOGGED_OP_FINSERT_finish: break; } err: + bch_err_fn(c, ret); bch2_logged_op_finish(trans, op_k); bch2_trans_iter_exit(trans, &iter); return ret; -- cgit v1.2.3 From af855a5f5e74cf0ef1166759fca937ce692b4aac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2024 19:31:15 -0400 Subject: bcachefs: Resume logged ops after fsck Finishing logged ops requires the filesystem to be in a reasonably consistent state - and other fsck passes don't require it to have completed, so just run it last. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 09c1ab032a67..f30521285706 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -37,7 +37,6 @@ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ x(check_inodes, 24, PASS_FSCK) \ x(check_extents, 25, PASS_FSCK) \ x(check_indirect_extents, 26, PASS_FSCK) \ @@ -47,6 +46,7 @@ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ x(fix_reflink_p, 33, 0) \ x(set_fs_needs_rebalance, 34, 0) \ -- cgit v1.2.3 From 4fe0eeeae477328cbd26af1e6f81a94e2080ffa8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2024 02:36:10 -0400 Subject: bcachefs: Flush journal immediately after replay if we did early repair Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f9e2b011f6d4..dbedb5fd1dde 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -189,6 +189,7 @@ int bch2_journal_replay(struct bch_fs *c) u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; struct btree_trans *trans = bch2_trans_get(c); + bool immediate_flush = false; int ret = 0; if (keys->nr) { @@ -210,6 +211,13 @@ int bch2_journal_replay(struct bch_fs *c) darray_for_each(*keys, k) { cond_resched(); + /* + * k->allocated means the key wasn't read in from the journal, + * rather it was from early repair code + */ + if (k->allocated) + immediate_flush = true; + /* Skip fastpath if we're low on space in the journal */ ret = c->journal.watermark ? -1 : commit_do(trans, NULL, NULL, @@ -269,6 +277,12 @@ int bch2_journal_replay(struct bch_fs *c) bch2_journal_set_replay_done(j); + /* if we did any repair, flush it immediately */ + if (immediate_flush) { + bch2_journal_flush_all_pins(&c->journal); + ret = bch2_journal_meta(&c->journal); + } + if (keys->nr) bch2_journal_log_msg(c, "journal replay finished"); err: @@ -778,6 +792,12 @@ use_clean: clear_bit(BCH_FS_fsck_running, &c->flags); + /* fsync if we fixed errors */ + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); + } + /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && test_bit(BCH_FS_errors_fixed, &c->flags) && -- cgit v1.2.3 From 0a34c058fca84b10002228a1724e2e613e4dc3cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Mar 2024 18:57:53 -0400 Subject: bcachefs: Ensure bch_sb_field_ext always exists This makes bch_sb_field_ext more consistent with the rest of -o nochanges - we don't want to be varying other codepaths based on -o nochanges, since it's used for testing in dry run mode; also fixes some potential null ptr derefs. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 25 ++++++++----------------- fs/bcachefs/super.c | 8 ++++++++ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index dbedb5fd1dde..e3b06430d606 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -592,16 +592,9 @@ int bch2_fs_recovery(struct bch_fs *c) if (!c->opts.nochanges) { mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; - struct bch_sb_field_ext *ext = - bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); - if (!ext) { - ret = -BCH_ERR_ENOSPC_sb; - mutex_unlock(&c->sb_lock); - goto err; - } - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { ext->recovery_passes_required[0] |= cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); @@ -832,6 +825,7 @@ use_clean: } mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { @@ -845,15 +839,12 @@ use_clean: write_sb = true; } - if (!test_bit(BCH_FS_error, &c->flags)) { - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - if (ext && - (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { - memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); - memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); - write_sb = true; - } + if (!test_bit(BCH_FS_error, &c->flags) && + (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || + !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { + memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); + memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); + write_sb = true; } if (c->opts.fsck && diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 1ad6e5cd9476..89ee5d3ec119 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1015,8 +1015,16 @@ int bch2_fs_start(struct bch_fs *c) for_each_online_member(c, ca) bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); + struct bch_sb_field_ext *ext = + bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); mutex_unlock(&c->sb_lock); + if (!ext) { + bch_err(c, "insufficient space in superblock for sb_field_ext"); + ret = -BCH_ERR_ENOSPC_sb; + goto err; + } + for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); -- cgit v1.2.3 From 060ff30a8596b649a80c19935758000dde7855fe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Mar 2024 20:43:39 -0400 Subject: bcachefs: bch2_run_explicit_recovery_pass_persistent() Flag that we need to run a recovery pass and run it - persistenly, so if we crash it'll still get run. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 18 ++++++++++++++++++ fs/bcachefs/recovery_passes.h | 1 + 2 files changed, 19 insertions(+) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index bab586dc395a..fb22cce10f66 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -16,6 +16,7 @@ #include "snapshot.h" #include "subvolume.h" #include "super.h" +#include "super-io.h" const char * const bch2_recovery_passes[] = { #define x(_fn, ...) #_fn, @@ -112,6 +113,23 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, } } +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + __le64 s = cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(pass))); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (!(ext->recovery_passes_required[0] & s)) { + ext->recovery_passes_required[0] |= s; + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + + return bch2_run_explicit_recovery_pass(c, pass); +} + u64 bch2_fsck_recovery_passes(void) { u64 ret = 0; diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index abefa67749eb..99b464e127b8 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -9,6 +9,7 @@ u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); int bch2_run_online_recovery_passes(struct bch_fs *); int bch2_run_recovery_passes(struct bch_fs *); -- cgit v1.2.3 From 13c1e583f9179ad7953dc71ebb2f12e613b9d052 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Mar 2024 21:34:14 -0400 Subject: bcachefs: Improve -o norecovery; opts.recovery_pass_limit This adds opts.recovery_pass_limit, and redoes -o norecovery to make use of it; this fixes some issues with -o norecovery so it can be safely used for data recovery. Norecovery means "don't do journal replay"; it's an important data recovery tool when we're getting stuck in journal replay. When using it this way we need to make sure we don't free journal keys after startup, so we continue to overlay them: thus it needs to imply retain_recovery_info, as well as nochanges. recovery_pass_limit is an explicit option for telling recovery to exit after a specific recovery pass; this is a much cleaner way of implementing -o norecovery, as well as being a useful debug feature in its own right. Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 4 ++++ fs/bcachefs/opts.h | 7 ++++++- fs/bcachefs/recovery.c | 10 ++++------ fs/bcachefs/recovery_passes.c | 12 ++++++++---- fs/bcachefs/snapshot.c | 2 +- fs/bcachefs/super.c | 5 +++-- 6 files changed, 26 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 08ea0cfc4aef..e1800c4119b5 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -7,6 +7,7 @@ #include "disk_groups.h" #include "error.h" #include "opts.h" +#include "recovery_passes.h" #include "super-io.h" #include "util.h" @@ -205,6 +206,9 @@ const struct bch_option bch2_opt_table[] = { #define OPT_STR(_choices) .type = BCH_OPT_STR, \ .min = 0, .max = ARRAY_SIZE(_choices), \ .choices = _choices +#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ + .min = 0, .max = U64_MAX, \ + .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 136083c11f3a..084247c13bd8 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -362,7 +362,12 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ - NULL, "Don't replay the journal") \ + NULL, "Exit recovery immediately prior to journal replay")\ + x(recovery_pass_last, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_STR_NOLIMIT(bch2_recovery_passes), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Exit recovery after specified pass") \ x(keep_journal, u8, \ 0, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e3b06430d606..e8b434009293 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -269,7 +269,8 @@ int bch2_journal_replay(struct bch_fs *c) bch2_trans_put(trans); trans = NULL; - if (!c->opts.keep_journal) + if (!c->opts.keep_journal && + c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); replay_now_at(j, j->replay_journal_seq_end); @@ -584,11 +585,8 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (c->opts.fsck && c->opts.norecovery) { - bch_err(c, "cannot select both norecovery and fsck"); - ret = -EINVAL; - goto err; - } + if (c->opts.norecovery) + c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; if (!c->opts.nochanges) { mutex_lock(&c->sb_lock); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index fb22cce10f66..0089d9456b10 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -27,7 +27,7 @@ const char * const bch2_recovery_passes[] = { static int bch2_check_allocations(struct bch_fs *c) { - return bch2_gc(c, true, c->opts.norecovery); + return bch2_gc(c, true, false); } static int bch2_set_may_go_rw(struct bch_fs *c) @@ -144,8 +144,6 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa { struct recovery_pass_fn *p = recovery_pass_fns + pass; - if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) - return false; if (c->recovery_passes_explicit & BIT_ULL(pass)) return true; if ((p->when & PASS_FSCK) && c->opts.fsck) @@ -201,6 +199,10 @@ int bch2_run_recovery_passes(struct bch_fs *c) int ret = 0; while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (c->opts.recovery_pass_last && + c->curr_recovery_pass > c->opts.recovery_pass_last) + break; + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { unsigned pass = c->curr_recovery_pass; @@ -213,8 +215,10 @@ int bch2_run_recovery_passes(struct bch_fs *c) c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); } - c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + + c->curr_recovery_pass++; } return ret; diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 4e074136c490..4577ee7939a2 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -131,7 +131,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) rcu_read_lock(); struct snapshot_table *t = rcu_dereference(c->snapshots); - if (unlikely(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots)) { + if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); goto out; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 89ee5d3ec119..bc026a77eb99 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -365,7 +365,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_emergency_ro, &c->flags) && test_bit(BCH_FS_started, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) && - !c->opts.norecovery) { + c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_read(&c->btree_cache.dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); @@ -510,7 +510,8 @@ err: int bch2_fs_read_write(struct bch_fs *c) { - if (c->opts.norecovery) + if (c->opts.recovery_pass_last && + c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) return -BCH_ERR_erofs_norecovery; if (c->opts.nochanges) -- cgit v1.2.3 From cecfed9b446da5fba9d73e6448c9f0d1ff5d95ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 31 Mar 2024 22:34:45 -0400 Subject: bcachefs: Logged op errors should be ignored If something is wrong with a logged op, we just want to delete it - there's nothing to repair. Signed-off-by: Kent Overstreet --- fs/bcachefs/logged_ops.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 9fac838d123e..b82f8209041f 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -37,7 +37,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); struct bkey_buf sk; u32 restart_count = trans->restart_count; - int ret; if (!fn) return 0; @@ -45,11 +44,11 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_buf_init(&sk); bch2_bkey_buf_reassemble(&sk, c, k); - ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?: - fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count); + fn->resume(trans, sk.k); bch2_bkey_buf_exit(&sk, c); - return ret; + + return trans_was_restarted(trans, restart_count); } int bch2_resume_logged_ops(struct bch_fs *c) -- cgit v1.2.3 From 8ce1db8091b23f5d2a0dd1dabe8007954114cb68 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Apr 2024 00:00:32 -0400 Subject: bcachefs: Fix remove_dirent() We were missing an iter_traverse(). Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index aca6fae409cd..0a47b0a473d8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -158,9 +158,10 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); - ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); err: bch_err_fn(c, ret); -- cgit v1.2.3 From eab3a3ce2dea1a4013a3a553722b85f55a76ac2d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Mar 2024 01:00:50 -0400 Subject: bcachefs: Fix overlapping extent repair overlapping extent repair was colliding with extent past end of inode checks - don't update "extent ends at" until we know we have an extent. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0a47b0a473d8..cbb8b43e419f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1372,10 +1372,6 @@ static int check_overlapping_extents(struct btree_trans *trans, goto err; } - ret = extent_ends_at(c, extent_ends, seen, k); - if (ret) - goto err; - extent_ends->last_pos = k.k->p; err: return ret; @@ -1505,6 +1501,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, i->seen_this_pos = true; } + + if (k.k->type != KEY_TYPE_whiteout) { + ret = extent_ends_at(c, extent_ends, s, k); + if (ret) + goto err; + } out: err: fsck_err: -- cgit v1.2.3 From b3c7fd35c03c17a950737fb56a06b730a7962d28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Mar 2024 15:59:57 -0400 Subject: bcachefs: On emergency shutdown, print out current journal sequence number Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index f942a3947496..82a6656c941c 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" +#include "journal.h" #include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" @@ -16,7 +17,8 @@ bool bch2_inconsistent_error(struct bch_fs *c) return false; case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) - bch_err(c, "inconsistency detected - emergency read only"); + bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", + journal_cur_seq(&c->journal)); return true; case BCH_ON_ERROR_panic: panic(bch2_fmt(c, "panic after error")); -- cgit v1.2.3 From e23d7e82b707d1d0a627e334fb46370e4f772c11 Mon Sep 17 00:00:00 2001 From: Andrey Albershteyn Date: Thu, 14 Mar 2024 18:07:02 +0100 Subject: xfs: allow cross-linking special files without project quota There's an issue that if special files is created before quota project is enabled, then it's not possible to link this file. This works fine for normal files. This happens because xfs_quota skips special files (no ioctls to set necessary flags). The check for having the same project ID for source and destination then fails as source file doesn't have any ID. mkfs.xfs -f /dev/sda mount -o prjquota /dev/sda /mnt/test mkdir /mnt/test/foo mkfifo /mnt/test/foo/fifo1 xfs_quota -xc "project -sp /mnt/test/foo 9" /mnt/test > Setting up project 9 (path /mnt/test/foo)... > xfs_quota: skipping special file /mnt/test/foo/fifo1 > Processed 1 (/etc/projects and cmdline) paths for project 9 with recursion depth infinite (-1). ln /mnt/test/foo/fifo1 /mnt/test/foo/fifo1_link > ln: failed to create hard link '/mnt/test/testdir/fifo1_link' => '/mnt/test/testdir/fifo1': Invalid cross-device link mkfifo /mnt/test/foo/fifo2 ln /mnt/test/foo/fifo2 /mnt/test/foo/fifo2_link Fix this by allowing linking of special files to the project quota if special files doesn't have any ID set (ID = 0). Signed-off-by: Andrey Albershteyn Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_inode.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ea48774f6b76..d55b42b2480d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1301,8 +1301,19 @@ xfs_link( */ if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && tdp->i_projid != sip->i_projid)) { - error = -EXDEV; - goto error_return; + /* + * Project quota setup skips special files which can + * leave inodes in a PROJINHERIT directory without a + * project ID set. We need to allow links to be made + * to these "project-less" inodes because userspace + * expects them to succeed after project ID setup, + * but everything else should be rejected. + */ + if (!special_file(VFS_I(sip)->i_mode) || + sip->i_projid != 0) { + error = -EXDEV; + goto error_return; + } } if (!resblks) { -- cgit v1.2.3 From f62d4c3eb687d87b616b4279acec7862553bda77 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 27 Mar 2024 12:48:50 +0000 Subject: KVM: arm64: Don't defer TLB invalidation when zapping table entries Commit 7657ea920c54 ("KVM: arm64: Use TLBI range-based instructions for unmap") introduced deferred TLB invalidation for the stage-2 page-table so that range-based invalidation can be used for the accumulated addresses. This works fine if the structure of the page-tables remains unchanged, but if entire tables are zapped and subsequently freed then we transiently leave the hardware page-table walker with a reference to freed memory thanks to the translation walk caches. For example, stage2_unmap_walker() will free page-table pages: if (childp) mm_ops->put_page(childp); and issue the TLB invalidation later in kvm_pgtable_stage2_unmap(): if (stage2_unmap_defer_tlb_flush(pgt)) /* Perform the deferred TLB invalidations */ kvm_tlb_flush_vmid_range(pgt->mmu, addr, size); For now, take the conservative approach and invalidate the TLB eagerly when we clear a table entry. Note, however, that the existing level hint passed to __kvm_tlb_flush_vmid_ipa() is incorrect and will be fixed in a subsequent patch. Cc: Raghavendra Rao Ananta Cc: Shaoqin Huang Cc: Marc Zyngier Cc: Oliver Upton Signed-off-by: Will Deacon Reviewed-by: Shaoqin Huang Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240327124853.11206-2-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 3fae5830f8d2..de0b667ba296 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -896,9 +896,11 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, if (kvm_pte_valid(ctx->old)) { kvm_clear_pte(ctx->ptep); - if (!stage2_unmap_defer_tlb_flush(pgt)) + if (!stage2_unmap_defer_tlb_flush(pgt) || + kvm_pte_table(ctx->old, ctx->level)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + } } mm_ops->put_page(ctx->ptep); -- cgit v1.2.3 From 36e008323926036650299cfbb2dca704c7aba849 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 27 Mar 2024 12:48:51 +0000 Subject: KVM: arm64: Don't pass a TLBI level hint when zapping table entries The TLBI level hints are for leaf entries only, so take care not to pass them incorrectly after clearing a table entry. Cc: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Fixes: 82bb02445de5 ("KVM: arm64: Implement kvm_pgtable_hyp_unmap() at EL2") Fixes: 6d9d2115c480 ("KVM: arm64: Add support for stage-2 map()/unmap() in generic page-table") Signed-off-by: Will Deacon Reviewed-by: Shaoqin Huang Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240327124853.11206-3-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index de0b667ba296..a40dafc43bb6 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -528,7 +528,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); + __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN); } else { if (ctx->end - ctx->addr < granule) return -EINVAL; @@ -896,10 +896,12 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, if (kvm_pte_valid(ctx->old)) { kvm_clear_pte(ctx->ptep); - if (!stage2_unmap_defer_tlb_flush(pgt) || - kvm_pte_table(ctx->old, ctx->level)) { - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, - ctx->addr, ctx->level); + if (kvm_pte_table(ctx->old, ctx->level)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + TLBI_TTL_UNKNOWN); + } else if (!stage2_unmap_defer_tlb_flush(pgt)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + ctx->level); } } -- cgit v1.2.3 From 0f0ff097bf77663b8d2692e33d56119947611bb0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 27 Mar 2024 12:48:52 +0000 Subject: KVM: arm64: Use TLBI_TTL_UNKNOWN in __kvm_tlb_flush_vmid_range() Commit c910f2b65518 ("arm64/mm: Update tlb invalidation routines for FEAT_LPA2") updated the __tlbi_level() macro to take the target level as an argument, with TLBI_TTL_UNKNOWN (rather than 0) indicating that the caller cannot provide level information. Unfortunately, the two implementations of __kvm_tlb_flush_vmid_range() were not updated and so now ask for an level 0 invalidation if FEAT_LPA2 is implemented. Fix the problem by passing TLBI_TTL_UNKNOWN instead of 0 as the level argument to __flush_s2_tlb_range_op() in __kvm_tlb_flush_vmid_range(). Cc: Catalin Marinas Cc: Oliver Upton Cc: Marc Zyngier Reviewed-by: Ryan Roberts Fixes: c910f2b65518 ("arm64/mm: Update tlb invalidation routines for FEAT_LPA2") Signed-off-by: Will Deacon Reviewed-by: Shaoqin Huang Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240327124853.11206-4-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/tlb.c | 3 ++- arch/arm64/kvm/hyp/vhe/tlb.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index a60fb13e2192..2fc68da4036d 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -154,7 +154,8 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, /* Switch to requested VMID */ __tlb_switch_to_guest(mmu, &cxt, false); - __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0); + __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, + TLBI_TTL_UNKNOWN); dsb(ish); __tlbi(vmalle1is); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index b32e2940df7d..1a60b95381e8 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -171,7 +171,8 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, /* Switch to requested VMID */ __tlb_switch_to_guest(mmu, &cxt); - __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0); + __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, + TLBI_TTL_UNKNOWN); dsb(ish); __tlbi(vmalle1is); -- cgit v1.2.3 From 4c36a156738887c1edd78589fe192d757989bcde Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 27 Mar 2024 12:48:53 +0000 Subject: KVM: arm64: Ensure target address is granule-aligned for range TLBI When zapping a table entry in stage2_try_break_pte(), we issue range TLB invalidation for the region that was mapped by the table. However, we neglect to align the base address down to the granule size and so if we ended up reaching the table entry via a misaligned address then we will accidentally skip invalidation for some prefix of the affected address range. Align 'ctx->addr' down to the granule size when performing TLB invalidation for an unmapped table in stage2_try_break_pte(). Cc: Raghavendra Rao Ananta Cc: Gavin Shan Cc: Shaoqin Huang Cc: Quentin Perret Fixes: defc8cc7abf0 ("KVM: arm64: Invalidate the table entries upon a range") Signed-off-by: Will Deacon Reviewed-by: Shaoqin Huang Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240327124853.11206-5-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index a40dafc43bb6..5a59ef88b646 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -843,12 +843,15 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, * Perform the appropriate TLB invalidation based on the * evicted pte value (if any). */ - if (kvm_pte_table(ctx->old, ctx->level)) - kvm_tlb_flush_vmid_range(mmu, ctx->addr, - kvm_granule_size(ctx->level)); - else if (kvm_pte_valid(ctx->old)) + if (kvm_pte_table(ctx->old, ctx->level)) { + u64 size = kvm_granule_size(ctx->level); + u64 addr = ALIGN_DOWN(ctx->addr, size); + + kvm_tlb_flush_vmid_range(mmu, addr, size); + } else if (kvm_pte_valid(ctx->old)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + } } if (stage2_pte_is_counted(ctx->old)) -- cgit v1.2.3 From b3320142f3db9b3f2a23460abd3e22292e1530a5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 21 Mar 2024 11:54:14 +0000 Subject: arm64: Fix early handling of FEAT_E2H0 not being implemented Commit 3944382fa6f2 introduced checks for the FEAT_E2H0 not being implemented. However, the check is absolutely wrong and makes a point it testing a bit that is guaranteed to be zero. On top of that, the detection happens way too late, after the init_el2_state has done its job. This went undetected because the HW this was tested on has E2H being RAO/WI, and not RES1. However, the bug shows up when run as a nested guest, where HCR_EL2.E2H is not necessarily set to 1. As a result, booting the kernel in hVHE mode fails with timer accesses being cought in a trap loop (which was fun to debug). Fix the check for ID_AA64MMFR4_EL1.E2H0, and set the HCR_EL2.E2H bit early so that it can be checked by the rest of the init sequence. With this, hVHE works again in a NV environment that doesn't have FEAT_E2H0. Fixes: 3944382fa6f2 ("arm64: Treat HCR_EL2.E2H as RES1 when ID_AA64MMFR4_EL1.E2H0 is negative") Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20240321115414.3169115-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kernel/head.S | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index ce08b744aaab..06234c3a15f3 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -291,6 +291,21 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) blr x2 0: mov_q x0, HCR_HOST_NVHE_FLAGS + + /* + * Compliant CPUs advertise their VHE-onlyness with + * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be + * RES1 in that case. Publish the E2H bit early so that + * it can be picked up by the init_el2_state macro. + * + * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but + * don't advertise it (they predate this relaxation). + */ + mrs_s x1, SYS_ID_AA64MMFR4_EL1 + tbz x1, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f + + orr x0, x0, #HCR_E2H +1: msr hcr_el2, x0 isb @@ -303,22 +318,10 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) mov_q x1, INIT_SCTLR_EL1_MMU_OFF - /* - * Compliant CPUs advertise their VHE-onlyness with - * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be - * RES1 in that case. - * - * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, but - * don't advertise it (they predate this relaxation). - */ - mrs_s x0, SYS_ID_AA64MMFR4_EL1 - ubfx x0, x0, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH - tbnz x0, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f - mrs x0, hcr_el2 and x0, x0, #HCR_E2H cbz x0, 2f -1: + /* Set a sane SCTLR_EL1, the VHE way */ pre_disable_mmu_workaround msr_s SYS_SCTLR_EL12, x1 -- cgit v1.2.3 From d96c66ab9fb3ad8b243669cf6b41e68d0f7f9ecd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 21 Mar 2024 17:37:06 +0000 Subject: KVM: arm64: Rationalise KVM banner output We are not very consistent when it comes to displaying which mode we're in (VHE, {n,h}VHE, protected or not). For example, booting in protected mode with hVHE results in: [ 0.969545] kvm [1]: Protected nVHE mode initialized successfully which is mildly amusing considering that the machine is VHE only. We already cleaned this up a bit with commit 1f3ca7023fe6 ("KVM: arm64: print Hyp mode"), but that's still unsatisfactory. Unify the three strings into one and use a mess of conditional statements to sort it out (yes, it's a slow day). Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240321173706.3280796-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3dee5490eea9..c4a0a35e02c7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2597,14 +2597,11 @@ static __init int kvm_arm_init(void) if (err) goto out_hyp; - if (is_protected_kvm_enabled()) { - kvm_info("Protected nVHE mode initialized successfully\n"); - } else if (in_hyp_mode) { - kvm_info("VHE mode initialized successfully\n"); - } else { - char mode = cpus_have_final_cap(ARM64_KVM_HVHE) ? 'h' : 'n'; - kvm_info("Hyp mode (%cVHE) initialized successfully\n", mode); - } + kvm_info("%s%sVHE mode initialized successfully\n", + in_hyp_mode ? "" : (is_protected_kvm_enabled() ? + "Protected " : "Hyp "), + in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ? + "h" : "n")); /* * FIXME: Do something reasonable if kvm_init() fails after pKVM -- cgit v1.2.3 From aa7cbefe65e455178c33eca308349e687d262ea7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:47 -0700 Subject: time/timecounter: Fix inline documentation Fix kernel-doc warnings, text punctuation, and a kernel-doc marker (change '%' to '&' to indicate a struct): timecounter.h:72: warning: No description found for return value of 'cyclecounter_cyc2ns' timecounter.h:85: warning: Function parameter or member 'tc' not described in 'timecounter_adjtime' timecounter.h:111: warning: No description found for return value of 'timecounter_read' timecounter.h:128: warning: No description found for return value of 'timecounter_cyc2time' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-2-rdunlap@infradead.org --- include/linux/timecounter.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index c6540ceea143..0982d1d52b24 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -22,7 +22,7 @@ * * @read: returns the current cycle value * @mask: bitmask for two's complement - * subtraction of non 64 bit counters, + * subtraction of non-64-bit counters, * see CYCLECOUNTER_MASK() helper macro * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) @@ -35,7 +35,7 @@ struct cyclecounter { }; /** - * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds + * struct timecounter - layer above a &struct cyclecounter which counts nanoseconds * Contains the state needed by timecounter_read() to detect * cycle counter wrap around. Initialize with * timecounter_init(). Also used to convert cycle counts into the @@ -66,6 +66,8 @@ struct timecounter { * @cycles: Cycles * @mask: bit mask for maintaining the 'frac' field * @frac: pointer to storage for the fractional nanoseconds. + * + * Returns: cycle counter cycles converted to nanoseconds */ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, u64 cycles, u64 mask, u64 *frac) @@ -79,6 +81,7 @@ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, /** * timecounter_adjtime - Shifts the time of the clock. + * @tc: The &struct timecounter to adjust * @delta: Desired change in nanoseconds. */ static inline void timecounter_adjtime(struct timecounter *tc, s64 delta) @@ -107,6 +110,8 @@ extern void timecounter_init(struct timecounter *tc, * * In other words, keeps track of time since the same epoch as * the function which generated the initial time stamp. + * + * Returns: nanoseconds since the initial time stamp */ extern u64 timecounter_read(struct timecounter *tc); @@ -123,6 +128,8 @@ extern u64 timecounter_read(struct timecounter *tc); * * This allows conversion of cycle counter values which were generated * in the past. + * + * Returns: cycle counter converted to nanoseconds since the initial time stamp */ extern u64 timecounter_cyc2time(const struct timecounter *tc, u64 cycle_tstamp); -- cgit v1.2.3 From 76f788ee4a7d9f826738a034f9d2ee0bc4cd291b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:48 -0700 Subject: time/timekeeping: Fix kernel-doc warnings and typos Fix punctuation, spellos, and kernel-doc warnings: timekeeping.h:79: warning: No description found for return value of 'ktime_get_real' timekeeping.h:95: warning: No description found for return value of 'ktime_get_boottime' timekeeping.h:108: warning: No description found for return value of 'ktime_get_clocktai' timekeeping.h:149: warning: Function parameter or struct member 'mono' not described in 'ktime_mono_to_real' timekeeping.h:149: warning: No description found for return value of 'ktime_mono_to_real' timekeeping.h:255: warning: Function parameter or struct member 'cs_id' not described in 'system_time_snapshot' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-3-rdunlap@infradead.org --- include/linux/timekeeping.h | 49 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7e50cbd97f86..0ea7823b7f31 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -22,14 +22,14 @@ extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); /* - * ktime_get() family: read the current time in a multitude of ways, + * ktime_get() family - read the current time in a multitude of ways. * * The default time reference is CLOCK_MONOTONIC, starting at * boot time but not counting the time spent in suspend. * For other references, use the functions with "real", "clocktai", * "boottime" and "raw" suffixes. * - * To get the time in a different format, use the ones wit + * To get the time in a different format, use the ones with * "ns", "ts64" and "seconds" suffix. * * See Documentation/core-api/timekeeping.rst for more details. @@ -74,6 +74,8 @@ extern u32 ktime_get_resolution_ns(void); /** * ktime_get_real - get the real (wall-) time in ktime_t format + * + * Returns: real (wall) time in ktime_t format */ static inline ktime_t ktime_get_real(void) { @@ -86,10 +88,12 @@ static inline ktime_t ktime_get_coarse_real(void) } /** - * ktime_get_boottime - Returns monotonic time since boot in ktime_t format + * ktime_get_boottime - Get monotonic time since boot in ktime_t format * * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the * time spent in suspend. + * + * Returns: monotonic time since boot in ktime_t format */ static inline ktime_t ktime_get_boottime(void) { @@ -102,7 +106,9 @@ static inline ktime_t ktime_get_coarse_boottime(void) } /** - * ktime_get_clocktai - Returns the TAI time of day in ktime_t format + * ktime_get_clocktai - Get the TAI time of day in ktime_t format + * + * Returns: the TAI time of day in ktime_t format */ static inline ktime_t ktime_get_clocktai(void) { @@ -144,32 +150,60 @@ static inline u64 ktime_get_coarse_clocktai_ns(void) /** * ktime_mono_to_real - Convert monotonic time to clock realtime + * @mono: monotonic time to convert + * + * Returns: time converted to realtime clock */ static inline ktime_t ktime_mono_to_real(ktime_t mono) { return ktime_mono_to_any(mono, TK_OFFS_REAL); } +/** + * ktime_get_ns - Get the current time in nanoseconds + * + * Returns: current time converted to nanoseconds + */ static inline u64 ktime_get_ns(void) { return ktime_to_ns(ktime_get()); } +/** + * ktime_get_real_ns - Get the current real/wall time in nanoseconds + * + * Returns: current real time converted to nanoseconds + */ static inline u64 ktime_get_real_ns(void) { return ktime_to_ns(ktime_get_real()); } +/** + * ktime_get_boottime_ns - Get the monotonic time since boot in nanoseconds + * + * Returns: current boottime converted to nanoseconds + */ static inline u64 ktime_get_boottime_ns(void) { return ktime_to_ns(ktime_get_boottime()); } +/** + * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds + * + * Returns: current TAI time converted to nanoseconds + */ static inline u64 ktime_get_clocktai_ns(void) { return ktime_to_ns(ktime_get_clocktai()); } +/** + * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds + * + * Returns: current raw monotonic time converted to nanoseconds + */ static inline u64 ktime_get_raw_ns(void) { return ktime_to_ns(ktime_get_raw()); @@ -224,8 +258,8 @@ extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); -/* - * struct ktime_timestanps - Simultaneous mono/boot/real timestamps +/** + * struct ktime_timestamps - Simultaneous mono/boot/real timestamps * @mono: Monotonic timestamp * @boot: Boottime timestamp * @real: Realtime timestamp @@ -242,7 +276,8 @@ struct ktime_timestamps { * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @raw: Monotonic raw system time - * @clock_was_set_seq: The sequence number of clock was set events + * @cs_id: Clocksource ID + * @clock_was_set_seq: The sequence number of clock-was-set events * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { -- cgit v1.2.3 From b87752ef5cc15b0bae04583d599e873d92dc0618 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:49 -0700 Subject: timers: Fix kernel-doc format and add Return values Fix kernel-doc format and warnings: timer.h:26: warning: Cannot understand * @TIMER_DEFERRABLE: A deferrable timer will work normally when the on line 26 - I thought it was a doc line timer.h:146: warning: No description found for return value of 'timer_pending' timer.h:180: warning: No description found for return value of 'del_timer_sync' timer.h:193: warning: No description found for return value of 'del_timer' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-4-rdunlap@infradead.org --- include/linux/timer.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/timer.h b/include/linux/timer.h index 14a633ba61d6..e67ecd1cbc97 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -22,7 +22,7 @@ #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) #endif -/** +/* * @TIMER_DEFERRABLE: A deferrable timer will work normally when the * system is busy, but will not cause a CPU to come out of idle just * to service it; instead, the timer will be serviced when the CPU @@ -140,7 +140,7 @@ static inline void destroy_timer_on_stack(struct timer_list *timer) { } * or not. Callers must ensure serialization wrt. other operations done * to this timer, eg. interrupt contexts, or other CPUs on SMP. * - * return value: 1 if the timer is pending, 0 if not. + * Returns: 1 if the timer is pending, 0 if not. */ static inline int timer_pending(const struct timer_list * timer) { @@ -175,6 +175,10 @@ extern int timer_shutdown(struct timer_list *timer); * See timer_delete_sync() for detailed explanation. * * Do not use in new code. Use timer_delete_sync() instead. + * + * Returns: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated */ static inline int del_timer_sync(struct timer_list *timer) { @@ -188,6 +192,10 @@ static inline int del_timer_sync(struct timer_list *timer) * See timer_delete() for detailed explanation. * * Do not use in new code. Use timer_delete() instead. + * + * Returns: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated */ static inline int del_timer(struct timer_list *timer) { -- cgit v1.2.3 From f29536bf1721802d2ebdc7893ed2991d4da0a4b6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:50 -0700 Subject: tick/sched: Fix various kernel-doc warnings Fix a slew of kernel-doc warnings in tick-sched.c: tick-sched.c:650: warning: Function parameter or struct member 'now' not described in 'tick_nohz_update_jiffies' tick-sched.c:741: warning: No description found for return value of 'get_cpu_idle_time_us' tick-sched.c:767: warning: No description found for return value of 'get_cpu_iowait_time_us' tick-sched.c:1210: warning: No description found for return value of 'tick_nohz_idle_got_tick' tick-sched.c:1228: warning: No description found for return value of 'tick_nohz_get_next_hrtimer' tick-sched.c:1243: warning: No description found for return value of 'tick_nohz_get_sleep_length' tick-sched.c:1282: warning: Function parameter or struct member 'cpu' not described in 'tick_nohz_get_idle_calls_cpu' tick-sched.c:1282: warning: No description found for return value of 'tick_nohz_get_idle_calls_cpu' tick-sched.c:1294: warning: No description found for return value of 'tick_nohz_get_idle_calls' tick-sched.c:1577: warning: Function parameter or struct member 'hrtimer' not described in 'tick_setup_sched_timer' tick-sched.c:1577: warning: Excess function parameter 'mode' description in 'tick_setup_sched_timer' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-5-rdunlap@infradead.org --- kernel/time/tick-sched.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 269e21590df5..1331216a9cae 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -697,6 +697,7 @@ bool tick_nohz_tick_stopped_cpu(int cpu) /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * @now: current ktime_t * * Called from interrupt entry when the CPU was idle * @@ -794,7 +795,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * - * This function returns -1 if NOHZ is not enabled. + * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { @@ -820,7 +821,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * - * This function returns -1 if NOHZ is not enabled. + * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { @@ -1287,6 +1288,8 @@ void tick_nohz_irq_exit(void) /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + * + * Return: %true if the tick handler has run, otherwise %false */ bool tick_nohz_idle_got_tick(void) { @@ -1305,6 +1308,8 @@ bool tick_nohz_idle_got_tick(void) * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled + * + * Return: the next expiration time */ ktime_t tick_nohz_get_next_hrtimer(void) { @@ -1320,6 +1325,8 @@ ktime_t tick_nohz_get_next_hrtimer(void) * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. + * + * Return: the expected length of the current sleep */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { @@ -1357,8 +1364,11 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. + * @cpu: target CPU number * * Called from the schedutil frequency scaling governor in scheduler context. + * + * Return: the current idle calls counter value for @cpu */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { @@ -1371,6 +1381,8 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) * tick_nohz_get_idle_calls - return the current idle calls counter value * * Called from the schedutil frequency scaling governor in scheduler context. + * + * Return: the current idle calls counter value for the current CPU */ unsigned long tick_nohz_get_idle_calls(void) { @@ -1559,7 +1571,7 @@ early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer - * @mode: tick_nohz_mode to setup for + * @hrtimer: whether to use the hrtimer or not */ void tick_setup_sched_timer(bool hrtimer) { -- cgit v1.2.3 From ba6ad57b803e33ed509213a5e840427dbef501d6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:51 -0700 Subject: tick/sched: Fix struct tick_sched doc warnings Fix kernel-doc warnings in struct tick_sched: tick-sched.h:103: warning: Function parameter or struct member 'idle_sleeptime_seq' not described in 'tick_sched' tick-sched.h:104: warning: Excess struct member 'nohz_mode' description in 'tick_sched' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-6-rdunlap@infradead.org --- kernel/time/tick-sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index e11c4dc65bcb..b4a7822f495d 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -46,8 +46,8 @@ struct tick_device { * @next_tick: Next tick to be fired when in dynticks mode. * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_waketime: Time when the idle was interrupted + * @idle_sleeptime_seq: sequence counter for data consistency * @idle_entrytime: Time when the idle call was entered - * @nohz_mode: Mode - one state of tick_nohz_mode * @last_jiffies: Base jiffies snapshot when next event was last computed * @timer_expires_base: Base time clock monotonic for @timer_expires * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) -- cgit v1.2.3 From 9e643ab59d7ee4332994671720a9528bac62e9b7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:52 -0700 Subject: timers: Fix text inconsistencies and spelling Fix some text for consistency: s/lvl/level/ in a comment and use correct/full function names in comments. Correct spelling errors as reported by codespell. Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-7-rdunlap@infradead.org --- kernel/time/timer.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index dee29f1f5b75..3baf2fbe6848 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -64,15 +64,15 @@ EXPORT_SYMBOL(jiffies_64); /* * The timer wheel has LVL_DEPTH array levels. Each level provides an array of - * LVL_SIZE buckets. Each level is driven by its own clock and therefor each + * LVL_SIZE buckets. Each level is driven by its own clock and therefore each * level has a different granularity. * - * The level granularity is: LVL_CLK_DIV ^ lvl + * The level granularity is: LVL_CLK_DIV ^ level * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) * * The array level of a newly armed timer depends on the relative expiry * time. The farther the expiry time is away the higher the array level and - * therefor the granularity becomes. + * therefore the granularity becomes. * * Contrary to the original timer wheel implementation, which aims for 'exact' * expiry of the timers, this implementation removes the need for recascading @@ -207,7 +207,7 @@ EXPORT_SYMBOL(jiffies_64); * struct timer_base - Per CPU timer base (number of base depends on config) * @lock: Lock protecting the timer_base * @running_timer: When expiring timers, the lock is dropped. To make - * sure not to race agains deleting/modifying a + * sure not to race against deleting/modifying a * currently running timer, the pointer is set to the * timer, which expires at the moment. If no timer is * running, the pointer is NULL. @@ -737,7 +737,7 @@ static bool timer_is_static_object(void *addr) } /* - * fixup_init is called when: + * timer_fixup_init is called when: * - an active object is initialized */ static bool timer_fixup_init(void *addr, enum debug_obj_state state) @@ -761,7 +761,7 @@ static void stub_timer(struct timer_list *unused) } /* - * fixup_activate is called when: + * timer_fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ @@ -783,7 +783,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) } /* - * fixup_free is called when: + * timer_fixup_free is called when: * - an active object is freed */ static bool timer_fixup_free(void *addr, enum debug_obj_state state) @@ -801,7 +801,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state) } /* - * fixup_assert_init is called when: + * timer_fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) @@ -914,7 +914,7 @@ static void do_init_timer(struct timer_list *timer, * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * - * init_timer_key() must be done to a timer prior calling *any* of the + * init_timer_key() must be done to a timer prior to calling *any* of the * other timer functions. */ void init_timer_key(struct timer_list *timer, @@ -1417,7 +1417,7 @@ static int __timer_delete(struct timer_list *timer, bool shutdown) * If @shutdown is set then the lock has to be taken whether the * timer is pending or not to protect against a concurrent rearm * which might hit between the lockless pending check and the lock - * aquisition. By taking the lock it is ensured that such a newly + * acquisition. By taking the lock it is ensured that such a newly * enqueued timer is dequeued and cannot end up with * timer->function == NULL in the expiry code. * @@ -2306,7 +2306,7 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, /* * When timer base is not set idle, undo the effect of - * tmigr_cpu_deactivate() to prevent inconsitent states - active + * tmigr_cpu_deactivate() to prevent inconsistent states - active * timer base but inactive timer migration hierarchy. * * When timer base was already marked idle, nothing will be -- cgit v1.2.3 From 481047d7e8391d3842ae59025806531cdad710d9 Mon Sep 17 00:00:00 2001 From: "Yanjun.Zhu" Date: Thu, 14 Mar 2024 07:51:40 +0100 Subject: RDMA/rxe: Fix the problem "mutex_destroy missing" When a mutex lock is not used any more, the function mutex_destroy should be called to mark the mutex lock uninitialized. Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Yanjun.Zhu Link: https://lore.kernel.org/r/20240314065140.27468-1-yanjun.zhu@linux.dev Reviewed-by: Daisuke Matsuda Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index ae466e72fc43..255677bc12b2 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -33,6 +33,8 @@ void rxe_dealloc(struct ib_device *ib_dev) if (rxe->tfm) crypto_free_shash(rxe->tfm); + + mutex_destroy(&rxe->usdev_lock); } /* initialize rxe device parameters */ -- cgit v1.2.3 From 755795cd3da053b0565085d9950c44d7b6cba5c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 29 Mar 2024 22:54:42 +0100 Subject: OSS: dmasound/paula: Mark driver struct with __refdata to prevent section mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As described in the added code comment, a reference to .exit.text is ok for drivers registered via module_platform_driver_probe(). Make this explicit to prevent the following section mismatch warning WARNING: modpost: sound/oss/dmasound/dmasound_paula: section mismatch in reference: amiga_audio_driver+0x8 (section: .data) -> amiga_audio_remove (section: .exit.text) that triggers on an allmodconfig W=1 build. Signed-off-by: Uwe Kleine-König Message-ID: Signed-off-by: Takashi Iwai --- sound/oss/dmasound/dmasound_paula.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sound/oss/dmasound/dmasound_paula.c b/sound/oss/dmasound/dmasound_paula.c index 0ba8f0c4cd99..3a593da09280 100644 --- a/sound/oss/dmasound/dmasound_paula.c +++ b/sound/oss/dmasound/dmasound_paula.c @@ -725,7 +725,13 @@ static void __exit amiga_audio_remove(struct platform_device *pdev) dmasound_deinit(); } -static struct platform_driver amiga_audio_driver = { +/* + * amiga_audio_remove() lives in .exit.text. For drivers registered via + * module_platform_driver_probe() this is ok because they cannot get unbound at + * runtime. So mark the driver struct with __refdata to prevent modpost + * triggering a section mismatch warning. + */ +static struct platform_driver amiga_audio_driver __refdata = { .remove_new = __exit_p(amiga_audio_remove), .driver = { .name = "amiga-audio", -- cgit v1.2.3 From b68e1acb5834ed1a2ad42d9d002815a8bae7c0b6 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Fri, 22 Mar 2024 13:20:49 +0200 Subject: RDMA/cm: Print the old state when cm_destroy_id gets timeout The old state is helpful for debugging, as the current state is always IB_CM_IDLE when timeout happens. Fixes: 96d9cbe2f2ff ("RDMA/cm: add timeout to cm_destroy_id wait") Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/20240322112049.2022994-1-markzhang@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cm.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index bf0df6ee4f78..07fb8d3c037f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1026,23 +1026,26 @@ static void cm_reset_to_idle(struct cm_id_private *cm_id_priv) } } -static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id) +static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id, + enum ib_cm_state old_state) { struct cm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - pr_err("%s: cm_id=%p timed out. state=%d refcnt=%d\n", __func__, - cm_id, cm_id->state, refcount_read(&cm_id_priv->refcount)); + pr_err("%s: cm_id=%p timed out. state %d -> %d, refcnt=%d\n", __func__, + cm_id, old_state, cm_id->state, refcount_read(&cm_id_priv->refcount)); } static void cm_destroy_id(struct ib_cm_id *cm_id, int err) { struct cm_id_private *cm_id_priv; + enum ib_cm_state old_state; struct cm_work *work; int ret; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irq(&cm_id_priv->lock); + old_state = cm_id->state; retest: switch (cm_id->state) { case IB_CM_LISTEN: @@ -1151,7 +1154,7 @@ retest: msecs_to_jiffies( CM_DESTROY_ID_WAIT_TIMEOUT)); if (!ret) /* timeout happened */ - cm_destroy_id_wait_timeout(cm_id); + cm_destroy_id_wait_timeout(cm_id, old_state); } while (!ret); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) -- cgit v1.2.3 From 2a975d426c82ff05ec1f0b773798d909fe4a3105 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Apr 2024 11:27:33 -0600 Subject: io_uring/rw: don't allow multishot reads without NOWAIT support Supporting multishot reads requires support for NOWAIT, as the alternative would be always having io-wq execute the work item whenever the poll readiness triggered. Any fast file type will have NOWAIT support (eg it understands both O_NONBLOCK and IOCB_NOWAIT). If the given file type does not, then simply resort to single shot execution. Cc: stable@vger.kernel.org Fixes: fc68fcda04910 ("io_uring/rw: add support for IORING_OP_READ_MULTISHOT") Signed-off-by: Jens Axboe --- io_uring/rw.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 0585ebcc9773..c8d48287439e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -936,6 +936,13 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) ret = __io_read(req, issue_flags); + /* + * If the file doesn't support proper NOWAIT, then disable multishot + * and stay in single shot mode. + */ + if (!io_file_supports_nowait(req)) + req->flags &= ~REQ_F_APOLL_MULTISHOT; + /* * If we get -EAGAIN, recycle our buffer and just let normal poll * handling arm it. @@ -955,7 +962,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) /* * Any successful return value will keep the multishot read armed. */ - if (ret > 0) { + if (ret > 0 && req->flags & REQ_F_APOLL_MULTISHOT) { /* * Put our buffer and post a CQE. If we fail to post a CQE, then * jump to the termination path. This request is then done. -- cgit v1.2.3 From bee1d5becdf5bf23d4ca0cd9c6b60bdf3c61d72b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Apr 2024 11:30:06 -0600 Subject: io_uring: disable io-wq execution of multishot NOWAIT requests Do the same check for direct io-wq execution for multishot requests that commit 2a975d426c82 did for the inline execution, and disable multishot mode (and revert to single shot) if the file type doesn't support NOWAIT, and isn't opened in O_NONBLOCK mode. For multishot to work properly, it's a requirement that nonblocking read attempts can be done. Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5d4b448fdc50..8baf8afb79c2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1982,10 +1982,15 @@ fail: err = -EBADFD; if (!io_file_can_poll(req)) goto fail; - err = -ECANCELED; - if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK) - goto fail; - return; + if (req->file->f_flags & O_NONBLOCK || + req->file->f_mode & FMODE_NOWAIT) { + err = -ECANCELED; + if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK) + goto fail; + return; + } else { + req->flags &= ~REQ_F_APOLL_MULTISHOT; + } } if (req->flags & REQ_F_FORCE_ASYNC) { -- cgit v1.2.3 From 24a9799aa8efecd0eb55a75e35f9d8e6400063aa Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 1 Apr 2024 14:13:10 -0300 Subject: smb: client: fix UAF in smb2_reconnect_server() The UAF bug is due to smb2_reconnect_server() accessing a session that is already being teared down by another thread that is executing __cifs_put_smb_ses(). This can happen when (a) the client has connection to the server but no session or (b) another thread ends up setting @ses->ses_status again to something different than SES_EXITING. To fix this, we need to make sure to unconditionally set @ses->ses_status to SES_EXITING and prevent any other threads from setting a new status while we're still tearing it down. The following can be reproduced by adding some delay to right after the ipc is freed in __cifs_put_smb_ses() - which will give smb2_reconnect_server() worker a chance to run and then accessing @ses->ipc: kinit ... mount.cifs //srv/share /mnt/1 -o sec=krb5,nohandlecache,echo_interval=10 [disconnect srv] ls /mnt/1 &>/dev/null sleep 30 kdestroy [reconnect srv] sleep 10 umount /mnt/1 ... CIFS: VFS: Verify user has a krb5 ticket and keyutils is installed CIFS: VFS: \\srv Send error in SessSetup = -126 CIFS: VFS: Verify user has a krb5 ticket and keyutils is installed CIFS: VFS: \\srv Send error in SessSetup = -126 general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6b6b: 0000 [#1] PREEMPT SMP NOPTI CPU: 3 PID: 50 Comm: kworker/3:1 Not tainted 6.9.0-rc2 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39 04/01/2014 Workqueue: cifsiod smb2_reconnect_server [cifs] RIP: 0010:__list_del_entry_valid_or_report+0x33/0xf0 Code: 4f 08 48 85 d2 74 42 48 85 c9 74 59 48 b8 00 01 00 00 00 00 ad de 48 39 c2 74 61 48 b8 22 01 00 00 00 00 74 69 <48> 8b 01 48 39 f8 75 7b 48 8b 72 08 48 39 c6 0f 85 88 00 00 00 b8 RSP: 0018:ffffc900001bfd70 EFLAGS: 00010a83 RAX: dead000000000122 RBX: ffff88810da53838 RCX: 6b6b6b6b6b6b6b6b RDX: 6b6b6b6b6b6b6b6b RSI: ffffffffc02f6878 RDI: ffff88810da53800 RBP: ffff88810da53800 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff88810c064000 R13: 0000000000000001 R14: ffff88810c064000 R15: ffff8881039cc000 FS: 0000000000000000(0000) GS:ffff888157c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe3728b1000 CR3: 000000010caa4000 CR4: 0000000000750ef0 PKRU: 55555554 Call Trace: ? die_addr+0x36/0x90 ? exc_general_protection+0x1c1/0x3f0 ? asm_exc_general_protection+0x26/0x30 ? __list_del_entry_valid_or_report+0x33/0xf0 __cifs_put_smb_ses+0x1ae/0x500 [cifs] smb2_reconnect_server+0x4ed/0x710 [cifs] process_one_work+0x205/0x6b0 worker_thread+0x191/0x360 ? __pfx_worker_thread+0x10/0x10 kthread+0xe2/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x34/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 83 ++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 49 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 9b85b5341822..ee29bc57300c 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -232,7 +232,13 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, spin_lock(&cifs_tcp_ses_lock); list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { - /* check if iface is still active */ + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_EXITING) { + spin_unlock(&ses->ses_lock); + continue; + } + spin_unlock(&ses->ses_lock); + spin_lock(&ses->chan_lock); if (cifs_ses_get_chan_index(ses, server) == CIFS_INVAL_CHAN_INDEX) { @@ -1963,31 +1969,6 @@ out: return rc; } -/** - * cifs_free_ipc - helper to release the session IPC tcon - * @ses: smb session to unmount the IPC from - * - * Needs to be called everytime a session is destroyed. - * - * On session close, the IPC is closed and the server must release all tcons of the session. - * No need to send a tree disconnect here. - * - * Besides, it will make the server to not close durable and resilient files on session close, as - * specified in MS-SMB2 3.3.5.6 Receiving an SMB2 LOGOFF Request. - */ -static int -cifs_free_ipc(struct cifs_ses *ses) -{ - struct cifs_tcon *tcon = ses->tcon_ipc; - - if (tcon == NULL) - return 0; - - tconInfoFree(tcon); - ses->tcon_ipc = NULL; - return 0; -} - static struct cifs_ses * cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) { @@ -2019,48 +2000,52 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) void __cifs_put_smb_ses(struct cifs_ses *ses) { struct TCP_Server_Info *server = ses->server; + struct cifs_tcon *tcon; unsigned int xid; size_t i; + bool do_logoff; int rc; + spin_lock(&cifs_tcp_ses_lock); spin_lock(&ses->ses_lock); - if (ses->ses_status == SES_EXITING) { + cifs_dbg(FYI, "%s: id=0x%llx ses_count=%d ses_status=%u ipc=%s\n", + __func__, ses->Suid, ses->ses_count, ses->ses_status, + ses->tcon_ipc ? ses->tcon_ipc->tree_name : "none"); + if (ses->ses_status == SES_EXITING || --ses->ses_count > 0) { spin_unlock(&ses->ses_lock); + spin_unlock(&cifs_tcp_ses_lock); return; } - spin_unlock(&ses->ses_lock); + /* ses_count can never go negative */ + WARN_ON(ses->ses_count < 0); - cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); - cifs_dbg(FYI, - "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->tree_name : "NONE"); + spin_lock(&ses->chan_lock); + cifs_chan_clear_need_reconnect(ses, server); + spin_unlock(&ses->chan_lock); - spin_lock(&cifs_tcp_ses_lock); - if (--ses->ses_count > 0) { - spin_unlock(&cifs_tcp_ses_lock); - return; - } - spin_lock(&ses->ses_lock); - if (ses->ses_status == SES_GOOD) - ses->ses_status = SES_EXITING; + do_logoff = ses->ses_status == SES_GOOD && server->ops->logoff; + ses->ses_status = SES_EXITING; + tcon = ses->tcon_ipc; + ses->tcon_ipc = NULL; spin_unlock(&ses->ses_lock); spin_unlock(&cifs_tcp_ses_lock); - /* ses_count can never go negative */ - WARN_ON(ses->ses_count < 0); - - spin_lock(&ses->ses_lock); - if (ses->ses_status == SES_EXITING && server->ops->logoff) { - spin_unlock(&ses->ses_lock); - cifs_free_ipc(ses); + /* + * On session close, the IPC is closed and the server must release all + * tcons of the session. No need to send a tree disconnect here. + * + * Besides, it will make the server to not close durable and resilient + * files on session close, as specified in MS-SMB2 3.3.5.6 Receiving an + * SMB2 LOGOFF Request. + */ + tconInfoFree(tcon); + if (do_logoff) { xid = get_xid(); rc = server->ops->logoff(xid, ses); if (rc) cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n", __func__, rc); _free_xid(xid); - } else { - spin_unlock(&ses->ses_lock); - cifs_free_ipc(ses); } spin_lock(&cifs_tcp_ses_lock); -- cgit v1.2.3 From fddf09273807bf6e51537823aaae896e05f147f9 Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko Date: Fri, 19 Jan 2024 11:22:15 +0100 Subject: drm/display: fix typo While studying the code I've bumped into a small typo within the kernel-doc for two functions, apparently, due to copy-paste. This commit fixes "sizo" word to be "size". Signed-off-by: Oleksandr Natalenko Acked-by: Randy Dunlap Fixes: b3daa5ef52c2 ("drm: Add helper for DP++ adaptors") Reviewed-by: Dmitry Baryshkov Signed-off-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20240119102215.201474-1-oleksandr@natalenko.name --- drivers/gpu/drm/display/drm_dp_dual_mode_helper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c b/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c index bd61e20770a5..14a2a8473682 100644 --- a/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c +++ b/drivers/gpu/drm/display/drm_dp_dual_mode_helper.c @@ -52,7 +52,7 @@ * @adapter: I2C adapter for the DDC bus * @offset: register offset * @buffer: buffer for return data - * @size: sizo of the buffer + * @size: size of the buffer * * Reads @size bytes from the DP dual mode adaptor registers * starting at @offset. @@ -116,7 +116,7 @@ EXPORT_SYMBOL(drm_dp_dual_mode_read); * @adapter: I2C adapter for the DDC bus * @offset: register offset * @buffer: buffer for write data - * @size: sizo of the buffer + * @size: size of the buffer * * Writes @size bytes to the DP dual mode adaptor registers * starting at @offset. -- cgit v1.2.3 From 8844f467d6a58dc915f241e81c46e0c126f8c070 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 30 Mar 2024 05:53:22 +0200 Subject: drm/msm/dpu: make error messages at dpu_core_irq_register_callback() more sensible There is little point in using %ps to print a value known to be NULL. On the other hand it makes sense to print the callback symbol in the 'invalid IRQ' message. Correct those two error messages to make more sense. Fixes: 6893199183f8 ("drm/msm/dpu: stop using raw IRQ indices in the kernel output") Signed-off-by: Dmitry Baryshkov Reviewed-by: Marijn Suijten Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/585565/ Link: https://lore.kernel.org/r/20240330-dpu-irq-messages-v1-1-9ce782ae35f9@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c index 946dd0135dff..6a0a74832fb6 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_interrupts.c @@ -525,14 +525,14 @@ int dpu_core_irq_register_callback(struct dpu_kms *dpu_kms, int ret; if (!irq_cb) { - DPU_ERROR("invalid IRQ=[%d, %d] irq_cb:%ps\n", - DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx), irq_cb); + DPU_ERROR("IRQ=[%d, %d] NULL callback\n", + DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx)); return -EINVAL; } if (!dpu_core_irq_is_valid(irq_idx)) { - DPU_ERROR("invalid IRQ=[%d, %d]\n", - DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx)); + DPU_ERROR("invalid IRQ=[%d, %d] irq_cb:%ps\n", + DPU_IRQ_REG(irq_idx), DPU_IRQ_BIT(irq_idx), irq_cb); return -EINVAL; } -- cgit v1.2.3 From 1197c5b2099f716b3de327437fb50900a0b936c9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 26 Mar 2024 23:38:06 +0100 Subject: scsi: mylex: Fix sysfs buffer lengths The myrb and myrs drivers use an odd way of implementing their sysfs files, calling snprintf() with a fixed length of 32 bytes to print into a page sized buffer. One of the strings is actually longer than 32 bytes, which clang can warn about: drivers/scsi/myrb.c:1906:10: error: 'snprintf' will always be truncated; specified size is 32, but format string expands to at least 34 [-Werror,-Wformat-truncation] drivers/scsi/myrs.c:1089:10: error: 'snprintf' will always be truncated; specified size is 32, but format string expands to at least 34 [-Werror,-Wformat-truncation] These could all be plain sprintf() without a length as the buffer is always long enough. On the other hand, sysfs files should not be overly long either, so just double the length to make sure the longest strings don't get truncated here. Fixes: 77266186397c ("scsi: myrs: Add Mylex RAID controller (SCSI interface)") Fixes: 081ff398c56c ("scsi: myrb: Add Mylex RAID controller (block interface)") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240326223825.4084412-8-arnd@kernel.org Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/myrb.c | 20 ++++++++++---------- drivers/scsi/myrs.c | 24 ++++++++++++------------ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/scsi/myrb.c b/drivers/scsi/myrb.c index ca2e932dd9b7..f684eb5e0489 100644 --- a/drivers/scsi/myrb.c +++ b/drivers/scsi/myrb.c @@ -1775,9 +1775,9 @@ static ssize_t raid_state_show(struct device *dev, name = myrb_devstate_name(ldev_info->state); if (name) - ret = snprintf(buf, 32, "%s\n", name); + ret = snprintf(buf, 64, "%s\n", name); else - ret = snprintf(buf, 32, "Invalid (%02X)\n", + ret = snprintf(buf, 64, "Invalid (%02X)\n", ldev_info->state); } else { struct myrb_pdev_state *pdev_info = sdev->hostdata; @@ -1796,9 +1796,9 @@ static ssize_t raid_state_show(struct device *dev, else name = myrb_devstate_name(pdev_info->state); if (name) - ret = snprintf(buf, 32, "%s\n", name); + ret = snprintf(buf, 64, "%s\n", name); else - ret = snprintf(buf, 32, "Invalid (%02X)\n", + ret = snprintf(buf, 64, "Invalid (%02X)\n", pdev_info->state); } return ret; @@ -1886,11 +1886,11 @@ static ssize_t raid_level_show(struct device *dev, name = myrb_raidlevel_name(ldev_info->raid_level); if (!name) - return snprintf(buf, 32, "Invalid (%02X)\n", + return snprintf(buf, 64, "Invalid (%02X)\n", ldev_info->state); - return snprintf(buf, 32, "%s\n", name); + return snprintf(buf, 64, "%s\n", name); } - return snprintf(buf, 32, "Physical Drive\n"); + return snprintf(buf, 64, "Physical Drive\n"); } static DEVICE_ATTR_RO(raid_level); @@ -1903,15 +1903,15 @@ static ssize_t rebuild_show(struct device *dev, unsigned char status; if (sdev->channel < myrb_logical_channel(sdev->host)) - return snprintf(buf, 32, "physical device - not rebuilding\n"); + return snprintf(buf, 64, "physical device - not rebuilding\n"); status = myrb_get_rbld_progress(cb, &rbld_buf); if (rbld_buf.ldev_num != sdev->id || status != MYRB_STATUS_SUCCESS) - return snprintf(buf, 32, "not rebuilding\n"); + return snprintf(buf, 64, "not rebuilding\n"); - return snprintf(buf, 32, "rebuilding block %u of %u\n", + return snprintf(buf, 64, "rebuilding block %u of %u\n", rbld_buf.ldev_size - rbld_buf.blocks_left, rbld_buf.ldev_size); } diff --git a/drivers/scsi/myrs.c b/drivers/scsi/myrs.c index a1eec65a9713..e824be9d9bbb 100644 --- a/drivers/scsi/myrs.c +++ b/drivers/scsi/myrs.c @@ -947,9 +947,9 @@ static ssize_t raid_state_show(struct device *dev, name = myrs_devstate_name(ldev_info->dev_state); if (name) - ret = snprintf(buf, 32, "%s\n", name); + ret = snprintf(buf, 64, "%s\n", name); else - ret = snprintf(buf, 32, "Invalid (%02X)\n", + ret = snprintf(buf, 64, "Invalid (%02X)\n", ldev_info->dev_state); } else { struct myrs_pdev_info *pdev_info; @@ -958,9 +958,9 @@ static ssize_t raid_state_show(struct device *dev, pdev_info = sdev->hostdata; name = myrs_devstate_name(pdev_info->dev_state); if (name) - ret = snprintf(buf, 32, "%s\n", name); + ret = snprintf(buf, 64, "%s\n", name); else - ret = snprintf(buf, 32, "Invalid (%02X)\n", + ret = snprintf(buf, 64, "Invalid (%02X)\n", pdev_info->dev_state); } return ret; @@ -1066,13 +1066,13 @@ static ssize_t raid_level_show(struct device *dev, ldev_info = sdev->hostdata; name = myrs_raid_level_name(ldev_info->raid_level); if (!name) - return snprintf(buf, 32, "Invalid (%02X)\n", + return snprintf(buf, 64, "Invalid (%02X)\n", ldev_info->dev_state); } else name = myrs_raid_level_name(MYRS_RAID_PHYSICAL); - return snprintf(buf, 32, "%s\n", name); + return snprintf(buf, 64, "%s\n", name); } static DEVICE_ATTR_RO(raid_level); @@ -1086,7 +1086,7 @@ static ssize_t rebuild_show(struct device *dev, unsigned char status; if (sdev->channel < cs->ctlr_info->physchan_present) - return snprintf(buf, 32, "physical device - not rebuilding\n"); + return snprintf(buf, 64, "physical device - not rebuilding\n"); ldev_info = sdev->hostdata; ldev_num = ldev_info->ldev_num; @@ -1098,11 +1098,11 @@ static ssize_t rebuild_show(struct device *dev, return -EIO; } if (ldev_info->rbld_active) { - return snprintf(buf, 32, "rebuilding block %zu of %zu\n", + return snprintf(buf, 64, "rebuilding block %zu of %zu\n", (size_t)ldev_info->rbld_lba, (size_t)ldev_info->cfg_devsize); } else - return snprintf(buf, 32, "not rebuilding\n"); + return snprintf(buf, 64, "not rebuilding\n"); } static ssize_t rebuild_store(struct device *dev, @@ -1190,7 +1190,7 @@ static ssize_t consistency_check_show(struct device *dev, unsigned short ldev_num; if (sdev->channel < cs->ctlr_info->physchan_present) - return snprintf(buf, 32, "physical device - not checking\n"); + return snprintf(buf, 64, "physical device - not checking\n"); ldev_info = sdev->hostdata; if (!ldev_info) @@ -1198,11 +1198,11 @@ static ssize_t consistency_check_show(struct device *dev, ldev_num = ldev_info->ldev_num; myrs_get_ldev_info(cs, ldev_num, ldev_info); if (ldev_info->cc_active) - return snprintf(buf, 32, "checking block %zu of %zu\n", + return snprintf(buf, 64, "checking block %zu of %zu\n", (size_t)ldev_info->cc_lba, (size_t)ldev_info->cfg_devsize); else - return snprintf(buf, 32, "not checking\n"); + return snprintf(buf, 64, "not checking\n"); } static ssize_t consistency_check_store(struct device *dev, -- cgit v1.2.3 From ba947ecd39ea0e6a6f6f1101f99611fc30943bcb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Apr 2024 19:16:19 -0400 Subject: bcachefs: Fix btree node reserve Sign error when checking the watermark - oops. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 983bb27298cc..29aee215384a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -303,7 +303,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct open_buckets obs = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim + unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim ? BTREE_NODE_RESERVE : 0; int ret; -- cgit v1.2.3 From e2a316b3cc45a1198f3feb18707403bb7f0cbc15 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Apr 2024 19:20:36 -0400 Subject: bcachefs: BCH_WATERMARK_interior_updates This adds a new watermark, higher priority than BCH_WATERMARK_reclaim, for interior btree updates. We've seen a deadlock where journal replay triggers a ton of btree node merges, and these use up all available open buckets and then interior updates get stuck. One cause of this is that we're currently lacking btree node merging on write buffer btrees - that needs to be fixed as well. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 4 +++- fs/bcachefs/alloc_types.h | 3 ++- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/btree_trans_commit.c | 3 ++- fs/bcachefs/btree_update_interior.c | 6 +++--- fs/bcachefs/buckets.h | 1 + 6 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 214b15c84d1f..a1fc30adf912 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { switch (watermark) { - case BCH_WATERMARK_reclaim: + case BCH_WATERMARK_interior_updates: return 0; + case BCH_WATERMARK_reclaim: + return OPEN_BUCKETS_COUNT / 6; case BCH_WATERMARK_btree: case BCH_WATERMARK_btree_copygc: return OPEN_BUCKETS_COUNT / 4; diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index b91b7a461056..c2226e947c41 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -22,7 +22,8 @@ struct bucket_alloc_state { x(copygc) \ x(btree) \ x(btree_copygc) \ - x(reclaim) + x(reclaim) \ + x(interior_updates) enum bch_watermark { #define x(name) BCH_WATERMARK_##name, diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9c71e6fb9c41..f3f27bb85a5b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1861,7 +1861,7 @@ static void btree_node_write_work(struct work_struct *work) } else { ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, - BCH_WATERMARK_reclaim| + BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw, diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 96669fede7d3..aa9da4970740 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -887,6 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; switch (ret) { case -BCH_ERR_btree_insert_btree_node_full: @@ -905,7 +906,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, * flag */ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { + watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 29aee215384a..9fd2dd0f4682 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -687,7 +687,7 @@ static void btree_update_nodes_written(struct btree_update *as) * which may require allocations as well. */ ret = commit_do(trans, &as->disk_res, &journal_seq, - BCH_WATERMARK_reclaim| + BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_journal_reclaim, @@ -1121,7 +1121,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark != BCH_WATERMARK_reclaim) + watermark < BCH_WATERMARK_reclaim) journal_flags |= JOURNAL_RES_GET_NONBLOCK; ret = drop_locks_do(trans, @@ -1217,7 +1217,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, */ if (bch2_err_matches(ret, ENOSPC) && (flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark != BCH_WATERMARK_reclaim) { + watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 6387e039f789..00aaf4bb5139 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -226,6 +226,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma fallthrough; case BCH_WATERMARK_btree_copygc: case BCH_WATERMARK_reclaim: + case BCH_WATERMARK_interior_updates: break; } -- cgit v1.2.3 From 6bc5e70b1c792b31b497e48b4668a9a2909aca0d Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Fri, 29 Mar 2024 09:50:36 +0800 Subject: scsi: ufs: core: WLUN suspend dev/link state error recovery When wl suspend error occurs, for example BKOP or SSU timeout, the host triggers an error handler and returns -EBUSY to break the wl suspend process. However, it is possible for the runtime PM to enter wl suspend again before the error handler has finished, and return -EINVAL because the device is in an error state. To address this, ensure that the rumtime PM waits for the error handler to finish, or trigger the error handler in such cases, because returning -EINVAL can cause the I/O to hang. Signed-off-by: Peter Wang Link: https://lore.kernel.org/r/20240329015036.15707-1-peter.wang@mediatek.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index e30fd125988d..292b06f361a2 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -9791,7 +9791,10 @@ static int __ufshcd_wl_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op) /* UFS device & link must be active before we enter in this function */ if (!ufshcd_is_ufs_dev_active(hba) || !ufshcd_is_link_active(hba)) { - ret = -EINVAL; + /* Wait err handler finish or trigger err recovery */ + if (!ufshcd_eh_in_progress(hba)) + ufshcd_force_error_recovery(hba); + ret = -EBUSY; goto enable_scaling; } -- cgit v1.2.3 From 0296bea01cfa6526be6bd2d16dc83b4e7f1af91f Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 8 Dec 2023 16:23:35 +0800 Subject: scsi: sd: Unregister device if device_add_disk() failed in sd_probe() "if device_add() succeeds, you should call device_del() when you want to get rid of it." In sd_probe(), device_add_disk() fails when device_add() has already succeeded, so change put_device() to device_unregister() to ensure device resources are released. Fixes: 2a7a891f4c40 ("scsi: sd: Add error handling support for add_disk()") Signed-off-by: Li Nan Link: https://lore.kernel.org/r/20231208082335.1754205-1-linan666@huaweicloud.com Reviewed-by: Bart Van Assche Reviewed-by: Yu Kuai Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 3cf898670290..58fdf679341d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3920,7 +3920,7 @@ static int sd_probe(struct device *dev) error = device_add_disk(dev, gd, NULL); if (error) { - put_device(&sdkp->disk_dev); + device_unregister(&sdkp->disk_dev); put_disk(gd); goto out; } -- cgit v1.2.3 From e675a4fd6d1f8990d3bed5dada3d20edfa000423 Mon Sep 17 00:00:00 2001 From: Yihang Li Date: Thu, 28 Mar 2024 17:06:26 +0800 Subject: scsi: libsas: Align SMP request allocation to ARCH_DMA_MINALIGN This series [1] reduced the kmalloc() minimum alignment on arm64 to 8 bytes (from 128). In libsas, this will cause SMP requests to be 8-byte aligned through kmalloc() allocation. However, for hisi_sas hardware, all command addresses must be 16-byte-aligned. Otherwise, the commands fail to be executed. ARCH_DMA_MINALIGN represents the minimum (static) alignment for safe DMA operations, so use ARCH_DMA_MINALIGN as the alignment for SMP request. Link: https://lkml.kernel.org/r/20230612153201.554742-1-catalin.marinas@arm.com [1] Signed-off-by: Yihang Li Link: https://lore.kernel.org/r/20240328090626.621147-1-liyihang9@huawei.com Reviewed-by: Damien Le Moal Reviewed-by: John Garry Reviewed-by: Jason Yan Signed-off-by: Martin K. Petersen --- drivers/scsi/libsas/sas_expander.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 5c261005b74e..f6e6db8b8aba 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -135,7 +135,7 @@ static int smp_execute_task(struct domain_device *dev, void *req, int req_size, static inline void *alloc_smp_req(int size) { - u8 *p = kzalloc(size, GFP_KERNEL); + u8 *p = kzalloc(ALIGN(size, ARCH_DMA_MINALIGN), GFP_KERNEL); if (p) p[0] = SMP_REQUEST; return p; -- cgit v1.2.3 From 2a26a11e9c258b14be6fd98f8a85f20ac1fff66e Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Thu, 28 Mar 2024 19:12:44 +0800 Subject: scsi: ufs: core: Fix MCQ mode dev command timeout When a dev command times out in MCQ mode, a successfully cleared command should cause a retry. However, because we currently return 0, the caller considers the command a success which causes the following error to be logged: "Invalid offset 0x0 in descriptor IDN 0x9, length 0x0". Retry if clearing the command was successful. Signed-off-by: Peter Wang Link: https://lore.kernel.org/r/20240328111244.3599-1-peter.wang@mediatek.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 292b06f361a2..a0f8e930167d 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -3217,7 +3217,9 @@ retry: /* MCQ mode */ if (is_mcq_enabled(hba)) { - err = ufshcd_clear_cmd(hba, lrbp->task_tag); + /* successfully cleared the command, retry if needed */ + if (ufshcd_clear_cmd(hba, lrbp->task_tag) == 0) + err = -EAGAIN; hba->dev_cmd.complete = NULL; return err; } -- cgit v1.2.3 From cd49cca222bc5532105a1f20bff6ae60d7bf7713 Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Wed, 6 Mar 2024 11:35:15 -0800 Subject: drm/msm/dp: fix typo in dp_display_handle_port_status_changed() Fix the typo in the name of dp_display_handle_port_status_changed(). Fixes: c58eb1b54fee ("drm/msm/dp: fix connect/disconnect handled at irq_hpd") Signed-off-by: Abhinav Kumar Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/581746/ Link: https://lore.kernel.org/r/20240306193515.455388-1-quic_abhinavk@quicinc.com --- drivers/gpu/drm/msm/dp/dp_display.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index 2ed40bd0acb6..9575d4b80c29 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -468,7 +468,7 @@ static void dp_display_handle_video_request(struct dp_display_private *dp) } } -static int dp_display_handle_port_ststus_changed(struct dp_display_private *dp) +static int dp_display_handle_port_status_changed(struct dp_display_private *dp) { int rc = 0; @@ -525,7 +525,7 @@ static int dp_display_usbpd_attention_cb(struct device *dev) drm_dbg_dp(dp->drm_dev, "hpd_state=%d sink_request=%d\n", dp->hpd_state, sink_request); if (sink_request & DS_PORT_STATUS_CHANGED) - rc = dp_display_handle_port_ststus_changed(dp); + rc = dp_display_handle_port_status_changed(dp); else rc = dp_display_handle_irq_hpd(dp); } -- cgit v1.2.3 From be1b7acb929137e3943fe380671242beb485190c Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 2 Apr 2024 05:57:15 +0300 Subject: dt-bindings: display/msm: sm8150-mdss: add DP node As Qualcomm SM8150 got support for the DisplayPort, add displayport@ node as a valid child to the MDSS node. Fixes: 88806318e2c2 ("dt-bindings: display: msm: dp: declare compatible string for sm8150") Reviewed-by: Krzysztof Kozlowski Signed-off-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/586156/ Link: https://lore.kernel.org/r/20240402-fd-fix-schema-v3-1-817ea6ddf775@linaro.org Signed-off-by: Abhinav Kumar --- .../devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml index c0d6a4fdff97..e6dc5494baee 100644 --- a/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml +++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml @@ -53,6 +53,15 @@ patternProperties: compatible: const: qcom,sm8150-dpu + "^displayport-controller@[0-9a-f]+$": + type: object + additionalProperties: true + + properties: + compatible: + contains: + const: qcom,sm8150-dp + "^dsi@[0-9a-f]+$": type: object additionalProperties: true -- cgit v1.2.3 From c88b50a12f962f520dfab0a53ab393f43df9bbd4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 1 Apr 2024 14:44:29 +0200 Subject: ata: ahci_st: Remove an unused field in struct st_ahci_drv_data In "struct st_ahci_drv_data", the 'ahci' field is unused. Remove it. Found with cppcheck, unusedStructMember. Signed-off-by: Christophe JAILLET Signed-off-by: Damien Le Moal --- drivers/ata/ahci_st.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/ata/ahci_st.c b/drivers/ata/ahci_st.c index d4a626f87963..79a8b0aa37bf 100644 --- a/drivers/ata/ahci_st.c +++ b/drivers/ata/ahci_st.c @@ -30,7 +30,6 @@ #define ST_AHCI_OOBR_CIMAX_SHIFT 0 struct st_ahci_drv_data { - struct platform_device *ahci; struct reset_control *pwr; struct reset_control *sw_rst; struct reset_control *pwr_rst; -- cgit v1.2.3 From 37801a36b4d68892ce807264f784d818f8d0d39b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Thu, 28 Mar 2024 20:16:58 +0100 Subject: selinux: avoid dereference of garbage after mount failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case kern_mount() fails and returns an error pointer return in the error branch instead of continuing and dereferencing the error pointer. While on it drop the never read static variable selinuxfs_mount. Cc: stable@vger.kernel.org Fixes: 0619f0f5e36f ("selinux: wrap selinuxfs state") Signed-off-by: Christian Göttsche Signed-off-by: Paul Moore --- security/selinux/selinuxfs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 0619a1cbbfbe..074d6c2714eb 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -2123,7 +2123,6 @@ static struct file_system_type sel_fs_type = { .kill_sb = sel_kill_sb, }; -static struct vfsmount *selinuxfs_mount __ro_after_init; struct path selinux_null __ro_after_init; static int __init init_sel_fs(void) @@ -2145,18 +2144,21 @@ static int __init init_sel_fs(void) return err; } - selinux_null.mnt = selinuxfs_mount = kern_mount(&sel_fs_type); - if (IS_ERR(selinuxfs_mount)) { + selinux_null.mnt = kern_mount(&sel_fs_type); + if (IS_ERR(selinux_null.mnt)) { pr_err("selinuxfs: could not mount!\n"); - err = PTR_ERR(selinuxfs_mount); - selinuxfs_mount = NULL; + err = PTR_ERR(selinux_null.mnt); + selinux_null.mnt = NULL; + return err; } + selinux_null.dentry = d_hash_and_lookup(selinux_null.mnt->mnt_root, &null_name); if (IS_ERR(selinux_null.dentry)) { pr_err("selinuxfs: could not lookup null!\n"); err = PTR_ERR(selinux_null.dentry); selinux_null.dentry = NULL; + return err; } return err; -- cgit v1.2.3 From 9d98aa088386aee3db1b7b60b800c0fde0654a4a Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 1 Apr 2024 20:55:29 +0200 Subject: x86/bpf: Fix IP after emitting call depth accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adjust the IP passed to `emit_patch` so it calculates the correct offset for the CALL instruction if `x86_call_depth_emit_accounting` emits code. Otherwise we will skip some instructions and most likely crash. Fixes: b2e9dfe54be4 ("x86/bpf: Emit call depth accounting if required") Link: https://lore.kernel.org/lkml/20230105214922.250473-1-joanbrugueram@gmail.com/ Co-developed-by: Joan Bruguera Micó Signed-off-by: Joan Bruguera Micó Signed-off-by: Uros Bizjak Cc: Alexei Starovoitov Cc: Daniel Borkmann Link: https://lore.kernel.org/r/20240401185821.224068-2-ubizjak@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a7ba8e178645..e55745f512e1 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -480,7 +480,7 @@ static int emit_call(u8 **pprog, void *func, void *ip) static int emit_rsb_call(u8 **pprog, void *func, void *ip) { OPTIMIZER_HIDE_VAR(func); - x86_call_depth_emit_accounting(pprog, func); + ip += x86_call_depth_emit_accounting(pprog, func); return emit_patch(pprog, func, ip, 0xE8); } -- cgit v1.2.3 From 6a537453000a916392fcac1acb96c1d9d1e05b74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joan=20Bruguera=20Mic=C3=B3?= Date: Mon, 1 Apr 2024 20:55:30 +0200 Subject: x86/bpf: Fix IP for relocating call depth accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit: 59bec00ace28 ("x86/percpu: Introduce %rip-relative addressing to PER_CPU_VAR()") made PER_CPU_VAR() to use rip-relative addressing, hence INCREMENT_CALL_DEPTH macro and skl_call_thunk_template got rip-relative asm code inside of it. A follow up commit: 17bce3b2ae2d ("x86/callthunks: Handle %rip-relative relocations in call thunk template") changed x86_call_depth_emit_accounting() to use apply_relocation(), but mistakenly assumed that the code is being patched in-place (where the destination of the relocation matches the address of the code), using *pprog as the destination ip. This is not true for the call depth accounting, emitted by the BPF JIT, so the calculated address was wrong, JIT-ed BPF progs on kernels with call depth tracking got broken and usually caused a page fault. Pass the destination IP when the BPF JIT emits call depth accounting. Fixes: 17bce3b2ae2d ("x86/callthunks: Handle %rip-relative relocations in call thunk template") Signed-off-by: Joan Bruguera Micó Reviewed-by: Uros Bizjak Acked-by: Ingo Molnar Cc: Alexei Starovoitov Cc: Daniel Borkmann Link: https://lore.kernel.org/r/20240401185821.224068-3-ubizjak@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/include/asm/alternative.h | 4 ++-- arch/x86/kernel/callthunks.c | 4 ++-- arch/x86/net/bpf_jit_comp.c | 19 ++++++++----------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index fcd20c6dc7f9..67b68d0d17d1 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -117,7 +117,7 @@ extern void callthunks_patch_builtin_calls(void); extern void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod); extern void *callthunks_translate_call_dest(void *dest); -extern int x86_call_depth_emit_accounting(u8 **pprog, void *func); +extern int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip); #else static __always_inline void callthunks_patch_builtin_calls(void) {} static __always_inline void @@ -128,7 +128,7 @@ static __always_inline void *callthunks_translate_call_dest(void *dest) return dest; } static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, - void *func) + void *func, void *ip) { return 0; } diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 30335182b6b0..e92ff0c11db8 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -314,7 +314,7 @@ static bool is_callthunk(void *addr) return !bcmp(pad, insn_buff, tmpl_size); } -int x86_call_depth_emit_accounting(u8 **pprog, void *func) +int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) { unsigned int tmpl_size = SKL_TMPL_SIZE; u8 insn_buff[MAX_PATCH_LEN]; @@ -327,7 +327,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, tmpl_size, *pprog, + apply_relocation(insn_buff, tmpl_size, ip, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e55745f512e1..df5fac428408 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -480,7 +480,7 @@ static int emit_call(u8 **pprog, void *func, void *ip) static int emit_rsb_call(u8 **pprog, void *func, void *ip) { OPTIMIZER_HIDE_VAR(func); - ip += x86_call_depth_emit_accounting(pprog, func); + ip += x86_call_depth_emit_accounting(pprog, func, ip); return emit_patch(pprog, func, ip, 0xE8); } @@ -1972,20 +1972,17 @@ populate_extable: /* call */ case BPF_JMP | BPF_CALL: { - int offs; + u8 *ip = image + addrs[i - 1]; func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth); - if (!imm32) - return -EINVAL; - offs = 7 + x86_call_depth_emit_accounting(&prog, func); - } else { - if (!imm32) - return -EINVAL; - offs = x86_call_depth_emit_accounting(&prog, func); + ip += 7; } - if (emit_call(&prog, func, image + addrs[i - 1] + offs)) + if (!imm32) + return -EINVAL; + ip += x86_call_depth_emit_accounting(&prog, func, ip); + if (emit_call(&prog, func, ip)) return -EINVAL; break; } @@ -2835,7 +2832,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im * Direct-call fentry stub, as such it needs accounting for the * __fentry__ call. */ - x86_call_depth_emit_accounting(&prog, NULL); + x86_call_depth_emit_accounting(&prog, NULL, image); } EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ -- cgit v1.2.3 From 96c155943a703f0655c0c4cab540f67055960e91 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Fri, 29 Mar 2024 09:16:31 +0300 Subject: net: phy: micrel: Fix potential null pointer dereference In lan8814_get_sig_rx() and lan8814_get_sig_tx() ptp_parse_header() may return NULL as ptp_header due to abnormal packet type or corrupted packet. Fix this bug by adding ptp_header check. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy") Signed-off-by: Aleksandr Mishin Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240329061631.33199-1-amishin@t-argos.ru Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 8b8634600c51..0f8a8ad7ea0b 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -2537,7 +2537,7 @@ static void lan8814_txtstamp(struct mii_timestamper *mii_ts, } } -static void lan8814_get_sig_rx(struct sk_buff *skb, u16 *sig) +static bool lan8814_get_sig_rx(struct sk_buff *skb, u16 *sig) { struct ptp_header *ptp_header; u32 type; @@ -2547,7 +2547,11 @@ static void lan8814_get_sig_rx(struct sk_buff *skb, u16 *sig) ptp_header = ptp_parse_header(skb, type); skb_pull_inline(skb, ETH_HLEN); + if (!ptp_header) + return false; + *sig = (__force u16)(ntohs(ptp_header->sequence_id)); + return true; } static bool lan8814_match_rx_skb(struct kszphy_ptp_priv *ptp_priv, @@ -2559,7 +2563,8 @@ static bool lan8814_match_rx_skb(struct kszphy_ptp_priv *ptp_priv, bool ret = false; u16 skb_sig; - lan8814_get_sig_rx(skb, &skb_sig); + if (!lan8814_get_sig_rx(skb, &skb_sig)) + return ret; /* Iterate over all RX timestamps and match it with the received skbs */ spin_lock_irqsave(&ptp_priv->rx_ts_lock, flags); @@ -2834,7 +2839,7 @@ static int lan8814_ptpci_adjfine(struct ptp_clock_info *ptpci, long scaled_ppm) return 0; } -static void lan8814_get_sig_tx(struct sk_buff *skb, u16 *sig) +static bool lan8814_get_sig_tx(struct sk_buff *skb, u16 *sig) { struct ptp_header *ptp_header; u32 type; @@ -2842,7 +2847,11 @@ static void lan8814_get_sig_tx(struct sk_buff *skb, u16 *sig) type = ptp_classify_raw(skb); ptp_header = ptp_parse_header(skb, type); + if (!ptp_header) + return false; + *sig = (__force u16)(ntohs(ptp_header->sequence_id)); + return true; } static void lan8814_match_tx_skb(struct kszphy_ptp_priv *ptp_priv, @@ -2856,7 +2865,8 @@ static void lan8814_match_tx_skb(struct kszphy_ptp_priv *ptp_priv, spin_lock_irqsave(&ptp_priv->tx_queue.lock, flags); skb_queue_walk_safe(&ptp_priv->tx_queue, skb, skb_tmp) { - lan8814_get_sig_tx(skb, &skb_sig); + if (!lan8814_get_sig_tx(skb, &skb_sig)) + continue; if (memcmp(&skb_sig, &seq_id, sizeof(seq_id))) continue; @@ -2910,7 +2920,8 @@ static bool lan8814_match_skb(struct kszphy_ptp_priv *ptp_priv, spin_lock_irqsave(&ptp_priv->rx_queue.lock, flags); skb_queue_walk_safe(&ptp_priv->rx_queue, skb, skb_tmp) { - lan8814_get_sig_rx(skb, &skb_sig); + if (!lan8814_get_sig_rx(skb, &skb_sig)) + continue; if (memcmp(&skb_sig, &rx_ts->seq_id, sizeof(rx_ts->seq_id))) continue; -- cgit v1.2.3 From 31974122cfdeaf56abc18d8ab740d580d9833e90 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2024 09:05:59 -0700 Subject: selftests: reuseaddr_conflict: add missing new line at the end of the output The netdev CI runs in a VM and captures serial, so stdout and stderr get combined. Because there's a missing new line in stderr the test ends up corrupting KTAP: # Successok 1 selftests: net: reuseaddr_conflict which should have been: # Success ok 1 selftests: net: reuseaddr_conflict Fixes: 422d8dc6fd3a ("selftest: add a reuseaddr test") Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240329160559.249476-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/reuseaddr_conflict.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/reuseaddr_conflict.c b/tools/testing/selftests/net/reuseaddr_conflict.c index 7c5b12664b03..bfb07dc49518 100644 --- a/tools/testing/selftests/net/reuseaddr_conflict.c +++ b/tools/testing/selftests/net/reuseaddr_conflict.c @@ -109,6 +109,6 @@ int main(void) fd1 = open_port(0, 1); if (fd1 >= 0) error(1, 0, "Was allowed to create an ipv4 reuseport on an already bound non-reuseport socket with no ipv6"); - fprintf(stderr, "Success"); + fprintf(stderr, "Success\n"); return 0; } -- cgit v1.2.3 From fcf4692fa39e86a590c14a4af2de704e1d20a3b5 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 29 Mar 2024 19:50:36 +0100 Subject: mptcp: prevent BPF accessing lowat from a subflow socket. Alexei reported the following splat: WARNING: CPU: 32 PID: 3276 at net/mptcp/subflow.c:1430 subflow_data_ready+0x147/0x1c0 Modules linked in: dummy bpf_testmod(O) [last unloaded: bpf_test_no_cfi(O)] CPU: 32 PID: 3276 Comm: test_progs Tainted: GO 6.8.0-12873-g2c43c33bfd23 Call Trace: mptcp_set_rcvlowat+0x79/0x1d0 sk_setsockopt+0x6c0/0x1540 __bpf_setsockopt+0x6f/0x90 bpf_sock_ops_setsockopt+0x3c/0x90 bpf_prog_509ce5db2c7f9981_bpf_test_sockopt_int+0xb4/0x11b bpf_prog_dce07e362d941d2b_bpf_test_socket_sockopt+0x12b/0x132 bpf_prog_348c9b5faaf10092_skops_sockopt+0x954/0xe86 __cgroup_bpf_run_filter_sock_ops+0xbc/0x250 tcp_connect+0x879/0x1160 tcp_v6_connect+0x50c/0x870 mptcp_connect+0x129/0x280 __inet_stream_connect+0xce/0x370 inet_stream_connect+0x36/0x50 bpf_trampoline_6442491565+0x49/0xef inet_stream_connect+0x5/0x50 __sys_connect+0x63/0x90 __x64_sys_connect+0x14/0x20 The root cause of the issue is that bpf allows accessing mptcp-level proto_ops from a tcp subflow scope. Fix the issue detecting the problematic call and preventing any action. Reported-by: Alexei Starovoitov Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/482 Fixes: 5684ab1a0eff ("mptcp: give rcvlowat some love") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Reviewed-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/d8cb7d8476d66cb0812a6e29cd1e626869d9d53e.1711738080.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index dcd1c76d2a3b..73fdf423de44 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1493,6 +1493,10 @@ int mptcp_set_rcvlowat(struct sock *sk, int val) struct mptcp_subflow_context *subflow; int space, cap; + /* bpf can land here with a wrong sk type */ + if (sk->sk_protocol == IPPROTO_TCP) + return -EINVAL; + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else -- cgit v1.2.3 From 7a1b3490f47e88ec4cbde65f1a77a0f4bc972282 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 29 Mar 2024 13:08:52 +0100 Subject: mptcp: don't account accept() of non-MPC client as fallback to TCP Current MPTCP servers increment MPTcpExtMPCapableFallbackACK when they accept non-MPC connections. As reported by Christoph, this is "surprising" because the counter might become greater than MPTcpExtMPCapableSYNRX. MPTcpExtMPCapableFallbackACK counter's name suggests it should only be incremented when a connection was seen using MPTCP options, then a fallback to TCP has been done. Let's do that by incrementing it when the subflow context of an inbound MPC connection attempt is dropped. Also, update mptcp_connect.sh kselftest, to ensure that the above MIB does not increment in case a pure TCP client connects to a MPTCP server. Fixes: fc518953bc9c ("mptcp: add and use MIB counter infrastructure") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/449 Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240329-upstream-net-20240329-fallback-mib-v1-1-324a8981da48@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 2 -- net/mptcp/subflow.c | 2 ++ tools/testing/selftests/net/mptcp/mptcp_connect.sh | 9 +++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3a1967bc7bad..7e74b812e366 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3937,8 +3937,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, mptcp_set_state(newsk, TCP_CLOSE); } } else { - MPTCP_INC_STATS(sock_net(ssk), - MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); tcpfallback: newsk->sk_kern_sock = kern; lock_sock(newsk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 1626dd20c68f..6042a47da61b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -905,6 +905,8 @@ dispose_child: return child; fallback: + if (fallback) + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); mptcp_subflow_drop_ctx(child); return child; } diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 4c4248554826..4131f3263a48 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -383,12 +383,14 @@ do_transfer() local stat_cookierx_last local stat_csum_err_s local stat_csum_err_c + local stat_tcpfb_last_l stat_synrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") stat_ackrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") stat_cookietx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent") stat_cookierx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv") stat_csum_err_s=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtDataCsumErr") stat_csum_err_c=$(mptcp_lib_get_counter "${connector_ns}" "MPTcpExtDataCsumErr") + stat_tcpfb_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK") timeout ${timeout_test} \ ip netns exec ${listener_ns} \ @@ -457,11 +459,13 @@ do_transfer() local stat_cookietx_now local stat_cookierx_now local stat_ooo_now + local stat_tcpfb_now_l stat_synrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") stat_ackrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") stat_cookietx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent") stat_cookierx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv") stat_ooo_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtTCPOFOQueue") + stat_tcpfb_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK") expect_synrx=$((stat_synrx_last_l)) expect_ackrx=$((stat_ackrx_last_l)) @@ -508,6 +512,11 @@ do_transfer() fi fi + if [ ${stat_ooo_now} -eq 0 ] && [ ${stat_tcpfb_last_l} -ne ${stat_tcpfb_now_l} ]; then + mptcp_lib_pr_fail "unexpected fallback to TCP" + rets=1 + fi + if [ $cookies -eq 2 ];then if [ $stat_cookietx_last -ge $stat_cookietx_now ] ;then extra+=" WARN: CookieSent: did not advance" -- cgit v1.2.3 From 40061817d95bce6dd5634a61a65cd5922e6ccc92 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 29 Mar 2024 13:08:53 +0100 Subject: selftests: mptcp: join: fix dev in check_endpoint There's a bug in pm_nl_check_endpoint(), 'dev' didn't be parsed correctly. If calling it in the 2nd test of endpoint_tests() too, it fails with an error like this: creation [FAIL] expected '10.0.2.2 id 2 subflow dev dev' \ found '10.0.2.2 id 2 subflow dev ns2eth2' The reason is '$2' should be set to 'dev', not '$1'. This patch fixes it. Fixes: 69c6ce7b6eca ("selftests: mptcp: add implicit endpoint test case") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240329-upstream-net-20240329-fallback-mib-v1-2-324a8981da48@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 5e9211e89825..e4403236f655 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -729,7 +729,7 @@ pm_nl_check_endpoint() [ -n "$_flags" ]; flags="flags $_flags" shift elif [ $1 = "dev" ]; then - [ -n "$2" ]; dev="dev $1" + [ -n "$2" ]; dev="dev $2" shift elif [ $1 = "id" ]; then _id=$2 @@ -3610,6 +3610,8 @@ endpoint_tests() local tests_pid=$! wait_mpj $ns2 + pm_nl_check_endpoint "creation" \ + $ns2 10.0.2.2 id 2 flags subflow dev ns2eth2 chk_subflow_nr "before delete" 2 chk_mptcp_info subflows 1 subflows 1 -- cgit v1.2.3 From ea2a1cfc3b2019bdea6324acd3c03606b60d71ad Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 29 Mar 2024 11:06:37 -0700 Subject: i40e: Fix VF MAC filter removal Commit 73d9629e1c8c ("i40e: Do not allow untrusted VF to remove administratively set MAC") fixed an issue where untrusted VF was allowed to remove its own MAC address although this was assigned administratively from PF. Unfortunately the introduced check is wrong because it causes that MAC filters for other MAC addresses including multi-cast ones are not removed. if (ether_addr_equal(addr, vf->default_lan_addr.addr) && i40e_can_vf_change_mac(vf)) was_unimac_deleted = true; else continue; if (i40e_del_mac_filter(vsi, al->list[i].addr)) { ... The else path with `continue` effectively skips any MAC filter removal except one for primary MAC addr when VF is allowed to do so. Fix the check condition so the `continue` is only done for primary MAC address. Fixes: 73d9629e1c8c ("i40e: Do not allow untrusted VF to remove administratively set MAC") Signed-off-by: Ivan Vecera Reviewed-by: Michal Schmidt Reviewed-by: Brett Creeley Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20240329180638.211412-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 4c13f78af675..232b65b9c8ea 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3137,11 +3137,12 @@ static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg) /* Allow to delete VF primary MAC only if it was not set * administratively by PF or if VF is trusted. */ - if (ether_addr_equal(addr, vf->default_lan_addr.addr) && - i40e_can_vf_change_mac(vf)) - was_unimac_deleted = true; - else - continue; + if (ether_addr_equal(addr, vf->default_lan_addr.addr)) { + if (i40e_can_vf_change_mac(vf)) + was_unimac_deleted = true; + else + continue; + } if (i40e_del_mac_filter(vsi, al->list[i].addr)) { ret = -EINVAL; -- cgit v1.2.3 From c42cd606e4f004e9ba36a05b9adb9e4eead5834a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Apr 2024 01:03:58 -0400 Subject: bcachefs: fix nocow lock deadlock Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index b564404dad71..34731ee0217f 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -580,8 +580,7 @@ int bch2_data_update_init(struct btree_trans *trans, move_ctxt_wait_event(ctxt, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, PTR_BUCKET_POS(c, &p.ptr), 0)) || - (!atomic_read(&ctxt->read_sectors) && - !atomic_read(&ctxt->write_sectors))); + list_empty(&ctxt->ios)); if (!locked) bch2_bucket_nocow_lock(&c->nocow_locks, -- cgit v1.2.3 From c032cdd48b29549e8283c2fea99e7d91ddefebf7 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 26 Mar 2024 10:58:15 +0200 Subject: thunderbolt: Do not create DisplayPort tunnels on adapters of the same router Probably due to a firmware bug Dell TB16 dock announces that one of its DisplayPort adapters is actually DP IN. Now this is possible and used with some external GPUs but not likely in this case as we are dealing with a dock. Anyways the problem is that the driver tries to create a DisplayPort tunnel between adapters of the same router which then shows to user that there is no picture on the display (because there are no available DP OUT adapters on the dock anymore). Fix this by not creating DisplayPort tunnels between adapters that are on the same router. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10265 Fixes: 274baf695b08 ("thunderbolt: Add DP IN added last in the head of the list of DP resources") Cc: Gil Fine Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/tb.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c index c5ce7a694b27..360cb95f39aa 100644 --- a/drivers/thunderbolt/tb.c +++ b/drivers/thunderbolt/tb.c @@ -1801,6 +1801,12 @@ static struct tb_port *tb_find_dp_out(struct tb *tb, struct tb_port *in) continue; } + /* Needs to be on different routers */ + if (in->sw == port->sw) { + tb_port_dbg(port, "skipping DP OUT on same router\n"); + continue; + } + tb_port_dbg(port, "DP OUT available\n"); /* -- cgit v1.2.3 From 03f56ed4ead162551ac596c9e3076ff01f1c5836 Mon Sep 17 00:00:00 2001 From: Oswald Buddenhagen Date: Mon, 1 Apr 2024 16:58:05 +0200 Subject: Revert "ALSA: emu10k1: fix synthesizer sample playback position and caching" As already anticipated in the original commit, playback was broken for very short samples. I just didn't expect it to be an actual problem, because we're talking about less than 1.5 milliseconds here. But clearly such wavetable samples do actually exist. The problem was that for such short samples we'd set the current position beyond the end of the loop, so we'd run off the end of the sample and play garbage. This is a bigger (more audible) problem than the original one, which was that we'd start playback with garbage (whatever was still in the cache), which would be mostly masked by the note's attack phase. So revert to the old behavior for now. We'll subsequently fix it properly with a bigger patch series. Note that this isn't a full revert - the dead code is not re-introduced, because that would be silly. Fixes: df335e9a8bcb ("ALSA: emu10k1: fix synthesizer sample playback position and caching") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218625 Signed-off-by: Oswald Buddenhagen Message-ID: <20240401145805.528794-1-oswald.buddenhagen@gmx.de> Signed-off-by: Takashi Iwai --- sound/pci/emu10k1/emu10k1_callback.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sound/pci/emu10k1/emu10k1_callback.c b/sound/pci/emu10k1/emu10k1_callback.c index d36234b88fb4..941bfbf812ed 100644 --- a/sound/pci/emu10k1/emu10k1_callback.c +++ b/sound/pci/emu10k1/emu10k1_callback.c @@ -255,7 +255,7 @@ lookup_voices(struct snd_emux *emu, struct snd_emu10k1 *hw, /* check if sample is finished playing (non-looping only) */ if (bp != best + V_OFF && bp != best + V_FREE && (vp->reg.sample_mode & SNDRV_SFNT_SAMPLE_SINGLESHOT)) { - val = snd_emu10k1_ptr_read(hw, CCCA_CURRADDR, vp->ch) - 64; + val = snd_emu10k1_ptr_read(hw, CCCA_CURRADDR, vp->ch); if (val >= vp->reg.loopstart) bp = best + V_OFF; } @@ -362,7 +362,7 @@ start_voice(struct snd_emux_voice *vp) map = (hw->silent_page.addr << hw->address_mode) | (hw->address_mode ? MAP_PTI_MASK1 : MAP_PTI_MASK0); - addr = vp->reg.start + 64; + addr = vp->reg.start; temp = vp->reg.parm.filterQ; ccca = (temp << 28) | addr; if (vp->apitch < 0xe400) @@ -430,9 +430,6 @@ start_voice(struct snd_emux_voice *vp) /* Q & current address (Q 4bit value, MSB) */ CCCA, ccca, - /* cache */ - CCR, REG_VAL_PUT(CCR_CACHEINVALIDSIZE, 64), - /* reset volume */ VTFT, vtarget | vp->ftarget, CVCF, vtarget | CVCF_CURRENTFILTER_MASK, -- cgit v1.2.3 From b67a7dc418aabbddec41c752ac29b6fa0250d0a8 Mon Sep 17 00:00:00 2001 From: Christian Bendiksen Date: Mon, 1 Apr 2024 12:26:10 +0000 Subject: ALSA: hda/realtek: Add sound quirks for Lenovo Legion slim 7 16ARHA7 models This fixes the sound not working from internal speakers on Lenovo Legion Slim 7 16ARHA7 models. The correct subsystem ID have been added to cs35l41_hda_property.c and patch_realtek.c. Signed-off-by: Christian Bendiksen Cc: Message-ID: <20240401122603.6634-1-christian@bendiksen.me> Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda_property.c | 4 ++++ sound/pci/hda/patch_realtek.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index 72ec872afb8d..d6ea3ab72f75 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -109,6 +109,8 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "10431F1F", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 0, 0, 0 }, { "10431F62", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, { "17AA386F", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, -1, -1, 0, 0, 0 }, + { "17AA3877", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, + { "17AA3878", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38A9", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, { "17AA38AB", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, { "17AA38B4", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, @@ -497,6 +499,8 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "10431F1F", generic_dsd_config }, { "CSC3551", "10431F62", generic_dsd_config }, { "CSC3551", "17AA386F", generic_dsd_config }, + { "CSC3551", "17AA3877", generic_dsd_config }, + { "CSC3551", "17AA3878", generic_dsd_config }, { "CSC3551", "17AA38A9", generic_dsd_config }, { "CSC3551", "17AA38AB", generic_dsd_config }, { "CSC3551", "17AA38B4", generic_dsd_config }, diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e866b8a75cda..405620bd543c 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10384,6 +10384,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3869, "Lenovo Yoga7 14IAL7", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x386f, "Legion 7i 16IAX7", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x3870, "Lenovo Yoga 7 14ARB7", ALC287_FIXUP_YOGA7_14ARB7_I2C), + SND_PCI_QUIRK(0x17aa, 0x3877, "Lenovo Legion 7 Slim 16ARHA7", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x3878, "Lenovo Legion 7 Slim 16ARHA7", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x387d, "Yoga S780-16 pro Quad AAC", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x387e, "Yoga S780-16 pro Quad YC", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3881, "YB9 dual power mode2 YC", ALC287_FIXUP_TAS2781_I2C), -- cgit v1.2.3 From 1576f263ee2147dc395531476881058609ad3d38 Mon Sep 17 00:00:00 2001 From: I Gede Agastya Darma Laksana Date: Tue, 2 Apr 2024 00:46:02 +0700 Subject: ALSA: hda/realtek: Update Panasonic CF-SZ6 quirk to support headset with microphone This patch addresses an issue with the Panasonic CF-SZ6's existing quirk, specifically its headset microphone functionality. Previously, the quirk used ALC269_FIXUP_HEADSET_MODE, which does not support the CF-SZ6's design of a single 3.5mm jack for both mic and audio output effectively. The device uses pin 0x19 for the headset mic without jack detection. Following verification on the CF-SZ6 and discussions with the original patch author, i determined that the update to ALC269_FIXUP_ASPIRE_HEADSET_MIC is the appropriate solution. This change is custom-designed for the CF-SZ6's unique hardware setup, which includes a single 3.5mm jack for both mic and audio output, connecting the headset microphone to pin 0x19 without the use of jack detection. Fixes: 0fca97a29b83 ("ALSA: hda/realtek - Add Panasonic CF-SZ6 headset jack quirk") Signed-off-by: I Gede Agastya Darma Laksana Cc: Message-ID: <20240401174602.14133-1-gedeagas22@gmail.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 405620bd543c..e1729d209922 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10210,7 +10210,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x10ec, 0x1254, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x10ec, 0x12cc, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x10ec, 0x12f6, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), - SND_PCI_QUIRK(0x10f7, 0x8338, "Panasonic CF-SZ6", ALC269_FIXUP_HEADSET_MODE), + SND_PCI_QUIRK(0x10f7, 0x8338, "Panasonic CF-SZ6", ALC269_FIXUP_ASPIRE_HEADSET_MIC), SND_PCI_QUIRK(0x144d, 0xc109, "Samsung Ativ book 9 (NP900X3G)", ALC269_FIXUP_INV_DMIC), SND_PCI_QUIRK(0x144d, 0xc169, "Samsung Notebook 9 Pen (NP930SBE-K01US)", ALC298_FIXUP_SAMSUNG_AMP), SND_PCI_QUIRK(0x144d, 0xc176, "Samsung Notebook 9 Pro (NP930MBE-K04US)", ALC298_FIXUP_SAMSUNG_AMP), -- cgit v1.2.3 From 0bfe105018bd2d7b1e4373193d9b55b37cf4458b Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Tue, 2 Apr 2024 14:51:26 +1300 Subject: ALSA: hda/realtek: cs35l41: Support ASUS ROG G634JYR Fixes the realtek quirk to initialise the Cirrus amp correctly and adds related quirk for missing DSD properties. This model laptop has slightly updated internals compared to the previous version with Realtek Codec ID of 0x1caf. Signed-off-by: Luke D. Jones Cc: Message-ID: <20240402015126.21115-1-luke@ljones.dev> Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda_property.c | 2 ++ sound/pci/hda/patch_realtek.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index d6ea3ab72f75..8fb688e41414 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -108,6 +108,7 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "10431F12", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 1000, 4500, 24 }, { "10431F1F", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, -1, 0, 0, 0, 0 }, { "10431F62", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 0, 0, 0 }, + { "10433A60", 2, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 1, 2, 0, 1000, 4500, 24 }, { "17AA386F", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, -1, -1, 0, 0, 0 }, { "17AA3877", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA3878", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, @@ -498,6 +499,7 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "10431F12", generic_dsd_config }, { "CSC3551", "10431F1F", generic_dsd_config }, { "CSC3551", "10431F62", generic_dsd_config }, + { "CSC3551", "10433A60", generic_dsd_config }, { "CSC3551", "17AA386F", generic_dsd_config }, { "CSC3551", "17AA3877", generic_dsd_config }, { "CSC3551", "17AA3878", generic_dsd_config }, diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e1729d209922..cdcb28aa9d7b 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10184,7 +10184,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x3a30, "ASUS G814JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x3a40, "ASUS G814JZR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x3a50, "ASUS G834JYR/JZR", ALC245_FIXUP_CS35L41_SPI_2), - SND_PCI_QUIRK(0x1043, 0x3a60, "ASUS G634JYR/JZR", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x3a60, "ASUS G634JYR/JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x831a, "ASUS P901", ALC269_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x1043, 0x834a, "ASUS S101", ALC269_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x1043, 0x8398, "ASUS P1005", ALC269_FIXUP_STEREO_DMIC), -- cgit v1.2.3 From c6ddd6e7b166532a0816825442ff60f70aed9647 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 22 Mar 2024 12:47:05 -0400 Subject: arm64: dts: imx8-ss-conn: fix usdhc wrong lpcg clock order The actual clock show wrong frequency: echo on >/sys/devices/platform/bus\@5b000000/5b010000.mmc/power/control cat /sys/kernel/debug/mmc0/ios clock: 200000000 Hz actual clock: 166000000 Hz ^^^^^^^^^ ..... According to sdhc0_lpcg: clock-controller@5b200000 { compatible = "fsl,imx8qxp-lpcg"; reg = <0x5b200000 0x10000>; #clock-cells = <1>; clocks = <&clk IMX_SC_R_SDHC_0 IMX_SC_PM_CLK_PER>, <&conn_ipg_clk>, <&conn_axi_clk>; clock-indices = , , ; clock-output-names = "sdhc0_lpcg_per_clk", "sdhc0_lpcg_ipg_clk", "sdhc0_lpcg_ahb_clk"; power-domains = <&pd IMX_SC_R_SDHC_0>; } "per_clk" should be IMX_LPCG_CLK_0 instead of IMX_LPCG_CLK_5. After correct clocks order: echo on >/sys/devices/platform/bus\@5b000000/5b010000.mmc/power/control cat /sys/kernel/debug/mmc0/ios clock: 200000000 Hz actual clock: 198000000 Hz ^^^^^^^^ ... Fixes: 16c4ea7501b1 ("arm64: dts: imx8: switch to new lpcg clock binding") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi index 3c42240e78e2..af2259e99796 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi @@ -67,8 +67,8 @@ conn_subsys: bus@5b000000 { interrupts = ; reg = <0x5b010000 0x10000>; clocks = <&sdhc0_lpcg IMX_LPCG_CLK_4>, - <&sdhc0_lpcg IMX_LPCG_CLK_0>, - <&sdhc0_lpcg IMX_LPCG_CLK_5>; + <&sdhc0_lpcg IMX_LPCG_CLK_5>, + <&sdhc0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "ahb", "per"; power-domains = <&pd IMX_SC_R_SDHC_0>; status = "disabled"; @@ -78,8 +78,8 @@ conn_subsys: bus@5b000000 { interrupts = ; reg = <0x5b020000 0x10000>; clocks = <&sdhc1_lpcg IMX_LPCG_CLK_4>, - <&sdhc1_lpcg IMX_LPCG_CLK_0>, - <&sdhc1_lpcg IMX_LPCG_CLK_5>; + <&sdhc1_lpcg IMX_LPCG_CLK_5>, + <&sdhc1_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "ahb", "per"; power-domains = <&pd IMX_SC_R_SDHC_1>; fsl,tuning-start-tap = <20>; @@ -91,8 +91,8 @@ conn_subsys: bus@5b000000 { interrupts = ; reg = <0x5b030000 0x10000>; clocks = <&sdhc2_lpcg IMX_LPCG_CLK_4>, - <&sdhc2_lpcg IMX_LPCG_CLK_0>, - <&sdhc2_lpcg IMX_LPCG_CLK_5>; + <&sdhc2_lpcg IMX_LPCG_CLK_5>, + <&sdhc2_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "ahb", "per"; power-domains = <&pd IMX_SC_R_SDHC_2>; status = "disabled"; -- cgit v1.2.3 From c4e51e424e2c772ce1836912a8b0b87cd61bc9d5 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 2 Apr 2024 08:36:25 +0200 Subject: ALSA: line6: Zero-initialize message buffers For shutting up spurious KMSAN uninit-value warnings, just replace kmalloc() calls with kzalloc() for the buffers used for communications. There should be no real issue with the original code, but it's still better to cover. Reported-by: syzbot+7fb05ccf7b3d2f9617b3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/00000000000084b18706150bcca5@google.com Message-ID: <20240402063628.26609-1-tiwai@suse.de> Signed-off-by: Takashi Iwai --- sound/usb/line6/driver.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/usb/line6/driver.c b/sound/usb/line6/driver.c index b67617b68e50..f4437015d43a 100644 --- a/sound/usb/line6/driver.c +++ b/sound/usb/line6/driver.c @@ -202,7 +202,7 @@ int line6_send_raw_message_async(struct usb_line6 *line6, const char *buffer, struct urb *urb; /* create message: */ - msg = kmalloc(sizeof(struct message), GFP_ATOMIC); + msg = kzalloc(sizeof(struct message), GFP_ATOMIC); if (msg == NULL) return -ENOMEM; @@ -688,7 +688,7 @@ static int line6_init_cap_control(struct usb_line6 *line6) int ret; /* initialize USB buffers: */ - line6->buffer_listen = kmalloc(LINE6_BUFSIZE_LISTEN, GFP_KERNEL); + line6->buffer_listen = kzalloc(LINE6_BUFSIZE_LISTEN, GFP_KERNEL); if (!line6->buffer_listen) return -ENOMEM; @@ -697,7 +697,7 @@ static int line6_init_cap_control(struct usb_line6 *line6) return -ENOMEM; if (line6->properties->capabilities & LINE6_CAP_CONTROL_MIDI) { - line6->buffer_message = kmalloc(LINE6_MIDI_MESSAGE_MAXLEN, GFP_KERNEL); + line6->buffer_message = kzalloc(LINE6_MIDI_MESSAGE_MAXLEN, GFP_KERNEL); if (!line6->buffer_message) return -ENOMEM; -- cgit v1.2.3 From 73eaa2b583493b680c6f426531d6736c39643bfb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Apr 2024 15:16:19 -0600 Subject: io_uring: use private workqueue for exit work Rather than use the system unbound event workqueue, use an io_uring specific one. This avoids dependencies with the tty, which also uses the system_unbound_wq, and issues flushes of said workqueue from inside its poll handling. Cc: stable@vger.kernel.org Reported-by: Rasmus Karlsson Tested-by: Rasmus Karlsson Tested-by: Iskren Chernev Link: https://github.com/axboe/liburing/issues/1113 Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8baf8afb79c2..d1defb99b89e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -147,6 +147,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_queue_sqe(struct io_kiocb *req); struct kmem_cache *req_cachep; +static struct workqueue_struct *iou_wq __ro_after_init; static int __read_mostly sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; @@ -3166,7 +3167,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) * noise and overhead, there's no discernable change in runtime * over using system_wq. */ - queue_work(system_unbound_wq, &ctx->exit_work); + queue_work(iou_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) @@ -4190,6 +4191,8 @@ static int __init io_uring_init(void) io_buf_cachep = KMEM_CACHE(io_buffer, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); + iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); + #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); #endif -- cgit v1.2.3 From fbbd5d3ad9435748b8ae6451bc004ee9ac49b6b7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 30 Mar 2024 09:53:00 +0900 Subject: nullblk: Fix cleanup order in null_add_dev() error path In null_add_dev(), if an error happen after initializing the resources for a zoned null block device, we must free these resources before exiting the function. To ensure this, move the out_cleanup_zone label after out_cleanup_disk as we jump to this latter label if an error happens after calling null_init_zoned_dev(). Fixes: e440626b1caf ("null_blk: pass queue_limits to blk_mq_alloc_disk") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240330005300.1503252-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 71c39bcd872c..ed33cf7192d2 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1965,10 +1965,10 @@ static int null_add_dev(struct nullb_device *dev) out_ida_free: ida_free(&nullb_indexes, nullb->index); -out_cleanup_zone: - null_free_zoned_dev(dev); out_cleanup_disk: put_disk(nullb->disk); +out_cleanup_zone: + null_free_zoned_dev(dev); out_cleanup_tags: if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); -- cgit v1.2.3 From 22d24a544b0d49bbcbd61c8c0eaf77d3c9297155 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 29 Mar 2024 09:23:19 +0800 Subject: block: fix overflow in blk_ioctl_discard() There is no check for overflow of 'start + len' in blk_ioctl_discard(). Hung task occurs if submit an discard ioctl with the following param: start = 0x80000000000ff000, len = 0x8000000000fff000; Add the overflow validation now. Signed-off-by: Li Nan Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240329012319.2034550-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- block/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 0c76137adcaa..a9028a2c2db5 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -96,7 +96,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; - uint64_t start, len; + uint64_t start, len, end; struct inode *inode = bdev->bd_inode; int err; @@ -117,7 +117,8 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, if (len & 511) return -EINVAL; - if (start + len > bdev_nr_bytes(bdev)) + if (check_add_overflow(start, len, &end) || + end > bdev_nr_bytes(bdev)) return -EINVAL; filemap_invalidate_lock(inode->i_mapping); -- cgit v1.2.3 From c1832f67035dc04fb89e6b591b64e4d515843cda Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 31 Mar 2024 21:58:26 +0900 Subject: ksmbd: don't send oplock break if rename fails Don't send oplock break if rename fails. This patch fix smb2.oplock.batch20 test. Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index d478fa0c57ab..5723bbf372d7 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5857,8 +5857,9 @@ static int smb2_rename(struct ksmbd_work *work, if (!file_info->ReplaceIfExists) flags = RENAME_NOREPLACE; - smb_break_all_levII_oplock(work, fp, 0); rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags); + if (!rc) + smb_break_all_levII_oplock(work, fp, 0); out: kfree(new_name); return rc; -- cgit v1.2.3 From a677ebd8ca2f2632ccdecbad7b87641274e15aac Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 31 Mar 2024 21:59:10 +0900 Subject: ksmbd: validate payload size in ipc response If installing malicious ksmbd-tools, ksmbd.mountd can return invalid ipc response to ksmbd kernel server. ksmbd should validate payload size of ipc response from ksmbd.mountd to avoid memory overrun or slab-out-of-bounds. This patch validate 3 ipc response that has payload. Cc: stable@vger.kernel.org Reported-by: Chao Ma Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/ksmbd_netlink.h | 3 ++- fs/smb/server/mgmt/share_config.c | 7 ++++++- fs/smb/server/transport_ipc.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index 8ca8a45c4c62..686b321c5a8b 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -167,7 +167,8 @@ struct ksmbd_share_config_response { __u16 force_uid; __u16 force_gid; __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; - __u32 reserved[112]; /* Reserved room */ + __u32 reserved[111]; /* Reserved room */ + __u32 payload_sz; __u32 veto_list_sz; __s8 ____payload[]; }; diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c index 328a412259dc..a2f0a2edceb8 100644 --- a/fs/smb/server/mgmt/share_config.c +++ b/fs/smb/server/mgmt/share_config.c @@ -158,7 +158,12 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um, share->name = kstrdup(name, GFP_KERNEL); if (!test_share_config_flag(share, KSMBD_SHARE_FLAG_PIPE)) { - share->path = kstrdup(ksmbd_share_config_path(resp), + int path_len = PATH_MAX; + + if (resp->payload_sz) + path_len = resp->payload_sz - resp->veto_list_sz; + + share->path = kstrndup(ksmbd_share_config_path(resp), path_len, GFP_KERNEL); if (share->path) share->path_sz = strlen(share->path); diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index f29bb03f0dc4..8752ac82c557 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -65,6 +65,7 @@ struct ipc_msg_table_entry { struct hlist_node ipc_table_hlist; void *response; + unsigned int msg_sz; }; static struct delayed_work ipc_timer_work; @@ -275,6 +276,7 @@ static int handle_response(int type, void *payload, size_t sz) } memcpy(entry->response, payload, sz); + entry->msg_sz = sz; wake_up_interruptible(&entry->wait); ret = 0; break; @@ -453,6 +455,34 @@ out: return ret; } +static int ipc_validate_msg(struct ipc_msg_table_entry *entry) +{ + unsigned int msg_sz = entry->msg_sz; + + if (entry->type == KSMBD_EVENT_RPC_REQUEST) { + struct ksmbd_rpc_command *resp = entry->response; + + msg_sz = sizeof(struct ksmbd_rpc_command) + resp->payload_sz; + } else if (entry->type == KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST) { + struct ksmbd_spnego_authen_response *resp = entry->response; + + msg_sz = sizeof(struct ksmbd_spnego_authen_response) + + resp->session_key_len + resp->spnego_blob_len; + } else if (entry->type == KSMBD_EVENT_SHARE_CONFIG_REQUEST) { + struct ksmbd_share_config_response *resp = entry->response; + + if (resp->payload_sz) { + if (resp->payload_sz < resp->veto_list_sz) + return -EINVAL; + + msg_sz = sizeof(struct ksmbd_share_config_response) + + resp->payload_sz; + } + } + + return entry->msg_sz != msg_sz ? -EINVAL : 0; +} + static void *ipc_msg_send_request(struct ksmbd_ipc_msg *msg, unsigned int handle) { struct ipc_msg_table_entry entry; @@ -477,6 +507,13 @@ static void *ipc_msg_send_request(struct ksmbd_ipc_msg *msg, unsigned int handle ret = wait_event_interruptible_timeout(entry.wait, entry.response != NULL, IPC_WAIT_TIMEOUT); + if (entry.response) { + ret = ipc_validate_msg(&entry); + if (ret) { + kvfree(entry.response); + entry.response = NULL; + } + } out: down_write(&ipc_msg_table_lock); hash_del(&entry.ipc_table_hlist); -- cgit v1.2.3 From 5ed11af19e56f0434ce0959376d136005745a936 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 2 Apr 2024 09:31:22 +0900 Subject: ksmbd: do not set SMB2_GLOBAL_CAP_ENCRYPTION for SMB 3.1.1 SMB2_GLOBAL_CAP_ENCRYPTION flag should be used only for 3.0 and 3.0.2 dialects. This flags set cause compatibility problems with other SMB clients. Reported-by: James Christopher Adduono Tested-by: James Christopher Adduono Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2ops.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/smb/server/smb2ops.c b/fs/smb/server/smb2ops.c index a45f7dca482e..606aa3c5189a 100644 --- a/fs/smb/server/smb2ops.c +++ b/fs/smb/server/smb2ops.c @@ -228,6 +228,11 @@ void init_smb3_0_server(struct ksmbd_conn *conn) conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION) conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || + (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; } @@ -278,11 +283,6 @@ int init_smb3_11_server(struct ksmbd_conn *conn) conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_DIRECTORY_LEASING; - if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || - (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && - conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)) - conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; - if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; -- cgit v1.2.3 From d725ce9d7c78fb4e22c6c7676106e135ade14fa8 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Mon, 11 Mar 2024 16:56:26 +0200 Subject: drm/i915/dp: Fix DSC state HW readout for SST connectors Commit 0848814aa296 ("drm/i915/dp: Fix connector DSC HW state readout") moved the DSC HW state readout to a connector specific hook, however only added the hook for DP MST connectors, not for DP SST ones. Fix adding the hook for SST connectors as well. This fixes the following warn on platforms where BIOS enables DSC: [ 66.208601] i915 0000:00:02.0: drm_WARN_ON(!connector->dp.dsc_decompression_aux || !connector->dp.dsc_decompression_enabled) ... [ 66.209024] RIP: 0010:intel_dp_sink_disable_decompression+0x76/0x110 [i915] ... [ 66.209333] ? intel_dp_sink_disable_decompression+0x76/0x110 [i915] ... [ 66.210068] intel_disable_ddi+0x135/0x1d0 [i915] [ 66.210302] intel_encoders_disable+0x9b/0xc0 [i915] [ 66.210565] hsw_crtc_disable+0x153/0x170 [i915] [ 66.210823] intel_old_crtc_state_disables+0x52/0xb0 [i915] [ 66.211107] intel_atomic_commit_tail+0x5cf/0x1330 [i915] [ 66.211366] intel_atomic_commit+0x39d/0x3f0 [i915] [ 66.211612] ? intel_atomic_commit+0x39d/0x3f0 [i915] [ 66.211872] drm_atomic_commit+0x9d/0xd0 [drm] [ 66.211921] ? __pfx___drm_printfn_info+0x10/0x10 [drm] [ 66.211975] intel_initial_commit+0x1a8/0x260 [i915] [ 66.212234] intel_display_driver_probe+0x2a/0x80 [i915] [ 66.212479] i915_driver_probe+0x7c6/0xc60 [i915] [ 66.212664] ? drm_privacy_screen_get+0x168/0x190 [drm] [ 66.212711] i915_pci_probe+0xe2/0x1c0 [i915] Fixes: 0848814aa296 ("drm/i915/dp: Fix connector DSC HW state readout") Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10410 Cc: Ankit Nautiyal Reviewed-by: Ankit Nautiyal Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20240311145626.2454923-1-imre.deak@intel.com (cherry picked from commit 7a51a2aa2384ea8bee76698ae586a2bea5b8ddb5) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index f98ef4b42a44..af7ca00e9bc0 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -6557,6 +6557,7 @@ intel_dp_init_connector(struct intel_digital_port *dig_port, intel_connector->get_hw_state = intel_ddi_connector_get_hw_state; else intel_connector->get_hw_state = intel_connector_get_hw_state; + intel_connector->sync_state = intel_dp_connector_sync_state; if (!intel_edp_init_connector(intel_dp, intel_connector)) { intel_dp_aux_fini(intel_dp); -- cgit v1.2.3 From caf3d748f646889425312897e81307441160d485 Mon Sep 17 00:00:00 2001 From: Arun R Murthy Date: Wed, 28 Feb 2024 20:13:50 +0530 Subject: drm/i915/dp: Remove support for UHBR13.5 UHBR13.5 is not supported in MTL and also the DP2.1 spec says UHBR13.5 is optional. Hence removing UHBR135 from the supported link rates. v2: Reframed the commit message and added link to the issue. Signed-off-by: Arun R Murthy Fixes: 62618c7f117e ("drm/i915/mtl: C20 PLL programming") Reviewed-by: Jani Nikula Signed-off-by: Animesh Manna Link: https://patchwork.freedesktop.org/patch/msgid/20240228144350.3184930-1-arun.r.murthy@intel.com (cherry picked from commit ddf8a8bbb5643265883bab0c59adf0648422c4bb) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index af7ca00e9bc0..4016cf6b6c61 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -499,7 +499,7 @@ intel_dp_set_source_rates(struct intel_dp *intel_dp) /* The values must be in increasing order */ static const int mtl_rates[] = { 162000, 216000, 243000, 270000, 324000, 432000, 540000, 675000, - 810000, 1000000, 1350000, 2000000, + 810000, 1000000, 2000000, }; static const int icl_rates[] = { 162000, 216000, 270000, 324000, 432000, 540000, 648000, 810000, -- cgit v1.2.3 From e9e62243a3e2322cf639f653a0b0a88a76446ce7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Apr 2024 10:11:35 +0100 Subject: cifs: Fix caching to try to do open O_WRONLY as rdwr on server When we're engaged in local caching of a cifs filesystem, we cannot perform caching of a partially written cache granule unless we can read the rest of the granule. This can result in unexpected access errors being reported to the user. Fix this by the following: if a file is opened O_WRONLY locally, but the mount was given the "-o fsc" flag, try first opening the remote file with GENERIC_READ|GENERIC_WRITE and if that returns -EACCES, try dropping the GENERIC_READ and doing the open again. If that last succeeds, invalidate the cache for that file as for O_DIRECT. Fixes: 70431bfd825d ("cifs: Support fscache indexing rewrite") Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: Rohith Surabattula cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/dir.c | 15 +++++++++++++++ fs/smb/client/file.c | 48 ++++++++++++++++++++++++++++++++++++++---------- fs/smb/client/fscache.h | 6 ++++++ 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index d11dc3aa458b..864b194dbaa0 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -189,6 +189,7 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int disposition; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; + int rdwr_for_fscache = 0; *oplock = 0; if (tcon->ses->server->oplocks) @@ -200,6 +201,10 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned return PTR_ERR(full_path); } + /* If we're caching, we need to be able to fill in around partial writes. */ + if (cifs_fscache_enabled(inode) && (oflags & O_ACCMODE) == O_WRONLY) + rdwr_for_fscache = 1; + #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open && (CIFS_UNIX_POSIX_PATH_OPS_CAP & @@ -276,6 +281,8 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned desired_access |= GENERIC_READ; /* is this too little? */ if (OPEN_FMODE(oflags) & FMODE_WRITE) desired_access |= GENERIC_WRITE; + if (rdwr_for_fscache == 1) + desired_access |= GENERIC_READ; disposition = FILE_OVERWRITE_IF; if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) @@ -304,6 +311,7 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; +retry_open: oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -317,8 +325,15 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned rc = server->ops->open(xid, &oparms, oplock, buf); if (rc) { cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc); + if (rc == -EACCES && rdwr_for_fscache == 1) { + desired_access &= ~GENERIC_READ; + rdwr_for_fscache = 2; + goto retry_open; + } goto out; } + if (rdwr_for_fscache == 2) + cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 16aadce492b2..ab536ce8a04a 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -206,12 +206,12 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) */ } -static inline int cifs_convert_flags(unsigned int flags) +static inline int cifs_convert_flags(unsigned int flags, int rdwr_for_fscache) { if ((flags & O_ACCMODE) == O_RDONLY) return GENERIC_READ; else if ((flags & O_ACCMODE) == O_WRONLY) - return GENERIC_WRITE; + return rdwr_for_fscache == 1 ? (GENERIC_READ | GENERIC_WRITE) : GENERIC_WRITE; else if ((flags & O_ACCMODE) == O_RDWR) { /* GENERIC_ALL is too much permission to request can cause unnecessary access denied on create */ @@ -348,11 +348,16 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_ int create_options = CREATE_NOT_DIR; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; + int rdwr_for_fscache = 0; if (!server->ops->open) return -ENOSYS; - desired_access = cifs_convert_flags(f_flags); + /* If we're caching, we need to be able to fill in around partial writes. */ + if (cifs_fscache_enabled(inode) && (f_flags & O_ACCMODE) == O_WRONLY) + rdwr_for_fscache = 1; + + desired_access = cifs_convert_flags(f_flags, rdwr_for_fscache); /********************************************************************* * open flag mapping table: @@ -389,6 +394,7 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_ if (f_flags & O_DIRECT) create_options |= CREATE_NO_BUFFER; +retry_open: oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -400,8 +406,16 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_ }; rc = server->ops->open(xid, &oparms, oplock, buf); - if (rc) + if (rc) { + if (rc == -EACCES && rdwr_for_fscache == 1) { + desired_access = cifs_convert_flags(f_flags, 0); + rdwr_for_fscache = 2; + goto retry_open; + } return rc; + } + if (rdwr_for_fscache == 2) + cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE); /* TODO: Add support for calling posix query info but with passing in fid */ if (tcon->unix_ext) @@ -834,11 +848,11 @@ int cifs_open(struct inode *inode, struct file *file) use_cache: fscache_use_cookie(cifs_inode_cookie(file_inode(file)), file->f_mode & FMODE_WRITE); - if (file->f_flags & O_DIRECT && - (!((file->f_flags & O_ACCMODE) != O_RDONLY) || - file->f_flags & O_APPEND)) - cifs_invalidate_cache(file_inode(file), - FSCACHE_INVAL_DIO_WRITE); + if (!(file->f_flags & O_DIRECT)) + goto out; + if ((file->f_flags & (O_ACCMODE | O_APPEND)) == O_RDONLY) + goto out; + cifs_invalidate_cache(file_inode(file), FSCACHE_INVAL_DIO_WRITE); out: free_dentry_path(page); @@ -903,6 +917,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) int disposition = FILE_OPEN; int create_options = CREATE_NOT_DIR; struct cifs_open_parms oparms; + int rdwr_for_fscache = 0; xid = get_xid(); mutex_lock(&cfile->fh_mutex); @@ -966,7 +981,11 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - desired_access = cifs_convert_flags(cfile->f_flags); + /* If we're caching, we need to be able to fill in around partial writes. */ + if (cifs_fscache_enabled(inode) && (cfile->f_flags & O_ACCMODE) == O_WRONLY) + rdwr_for_fscache = 1; + + desired_access = cifs_convert_flags(cfile->f_flags, rdwr_for_fscache); /* O_SYNC also has bit for O_DSYNC so following check picks up either */ if (cfile->f_flags & O_SYNC) @@ -978,6 +997,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &cfile->fid); +retry_open: oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -1003,6 +1023,11 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) /* indicate that we need to relock the file */ oparms.reconnect = true; } + if (rc == -EACCES && rdwr_for_fscache == 1) { + desired_access = cifs_convert_flags(cfile->f_flags, 0); + rdwr_for_fscache = 2; + goto retry_open; + } if (rc) { mutex_unlock(&cfile->fh_mutex); @@ -1011,6 +1036,9 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) goto reopen_error_exit; } + if (rdwr_for_fscache == 2) + cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE); + #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY reopen_success: #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h index a3d73720914f..1f2ea9f5cc9a 100644 --- a/fs/smb/client/fscache.h +++ b/fs/smb/client/fscache.h @@ -109,6 +109,11 @@ static inline void cifs_readahead_to_fscache(struct inode *inode, __cifs_readahead_to_fscache(inode, pos, len); } +static inline bool cifs_fscache_enabled(struct inode *inode) +{ + return fscache_cookie_enabled(cifs_inode_cookie(inode)); +} + #else /* CONFIG_CIFS_FSCACHE */ static inline void cifs_fscache_fill_coherency(struct inode *inode, @@ -124,6 +129,7 @@ static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} static inline void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) {} static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return NULL; } static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {} +static inline bool cifs_fscache_enabled(struct inode *inode) { return false; } static inline int cifs_fscache_query_occupancy(struct inode *inode, pgoff_t first, unsigned int nr_pages, -- cgit v1.2.3 From ff91059932401894e6c86341915615c5eb0eca48 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 2 Apr 2024 12:46:21 +0200 Subject: bpf, sockmap: Prevent lock inversion deadlock in map delete elem syzkaller started using corpuses where a BPF tracing program deletes elements from a sockmap/sockhash map. Because BPF tracing programs can be invoked from any interrupt context, locks taken during a map_delete_elem operation must be hardirq-safe. Otherwise a deadlock due to lock inversion is possible, as reported by lockdep: CPU0 CPU1 ---- ---- lock(&htab->buckets[i].lock); local_irq_disable(); lock(&host->lock); lock(&htab->buckets[i].lock); lock(&host->lock); Locks in sockmap are hardirq-unsafe by design. We expects elements to be deleted from sockmap/sockhash only in task (normal) context with interrupts enabled, or in softirq context. Detect when map_delete_elem operation is invoked from a context which is _not_ hardirq-unsafe, that is interrupts are disabled, and bail out with an error. Note that map updates are not affected by this issue. BPF verifier does not allow updating sockmap/sockhash from a BPF tracing program today. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Reported-by: xingwei lee Reported-by: yue sun Reported-by: syzbot+bc922f476bd65abbd466@syzkaller.appspotmail.com Reported-by: syzbot+d4066896495db380182e@syzkaller.appspotmail.com Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Tested-by: syzbot+d4066896495db380182e@syzkaller.appspotmail.com Acked-by: John Fastabend Closes: https://syzkaller.appspot.com/bug?extid=d4066896495db380182e Closes: https://syzkaller.appspot.com/bug?extid=bc922f476bd65abbd466 Link: https://lore.kernel.org/bpf/20240402104621.1050319-1-jakub@cloudflare.com --- net/core/sock_map.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 27d733c0f65e..8598466a3805 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -411,6 +411,9 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock *sk; int err = 0; + if (irqs_disabled()) + return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ + spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) @@ -933,6 +936,9 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) struct bpf_shtab_elem *elem; int ret = -ENOENT; + if (irqs_disabled()) + return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ + hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); -- cgit v1.2.3 From b91695b50d5b5c7c6a2cae08637b0a119cec0e12 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 25 Mar 2024 09:14:04 -0300 Subject: ARM: dts: imx7-mba7: Use 'no-mmc' property 'no-emmc' is not a valid property. The original intention was to use the 'no-mmc' property. Change it accordingly to fix the following dt-schema warning: imx7s-mba7.dtb: mmc@30b40000: Unevaluated properties are not allowed ('no-emmc' was unexpected) Fixes: d430a7e0e181 ("ARM: dts: imx7-mba7: restrict usdhc interface modes") Signed-off-by: Fabio Estevam Reviewed-by: Alexander Stein Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi b/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi index 1235a71c6abe..52869e68f833 100644 --- a/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx7-mba7.dtsi @@ -666,7 +666,7 @@ bus-width = <4>; no-1-8-v; no-sdio; - no-emmc; + no-mmc; status = "okay"; }; -- cgit v1.2.3 From 8a655cee6c9d4588570ad0cb099c5660f9a44a12 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 2 Apr 2024 14:20:40 +0800 Subject: ASoC: codecs: ES8326: Solve error interruption issue We got an error report about headphone type detection and button detection. We fixed the headphone type detection error by adjusting the debounce timer configuration. And we fixed the button detection error by disabling the button detection feature when the headphone are unplugged and enabling it when headphone are plugged in. Signed-off-by: Zhang Yi Link: https://msgid.link/r/20240402062043.20608-2-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index 15289dadafea..a6783fd6553d 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -843,6 +843,7 @@ static void es8326_jack_detect_handler(struct work_struct *work) regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x03, 0x01); regmap_write(es8326->regmap, ES8326_SYS_BIAS, 0x0a); regmap_update_bits(es8326->regmap, ES8326_HP_DRIVER_REF, 0x0f, 0x03); + regmap_write(es8326->regmap, ES8326_INT_SOURCE, ES8326_INT_SRC_PIN9); /* * Inverted HPJACK_POL bit to trigger one IRQ to double check HP Removal event */ @@ -865,6 +866,8 @@ static void es8326_jack_detect_handler(struct work_struct *work) * set auto-check mode, then restart jack_detect_work after 400ms. * Don't report jack status. */ + regmap_write(es8326->regmap, ES8326_INT_SOURCE, + (ES8326_INT_SRC_PIN9 | ES8326_INT_SRC_BUTTON)); regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x03, 0x01); es8326_enable_micbias(es8326->component); usleep_range(50000, 70000); @@ -987,7 +990,7 @@ static int es8326_resume(struct snd_soc_component *component) regmap_write(es8326->regmap, ES8326_VMIDSEL, 0x0E); regmap_write(es8326->regmap, ES8326_ANA_LP, 0xf0); usleep_range(10000, 15000); - regmap_write(es8326->regmap, ES8326_HPJACK_TIMER, 0xe9); + regmap_write(es8326->regmap, ES8326_HPJACK_TIMER, 0xd9); regmap_write(es8326->regmap, ES8326_ANA_MICBIAS, 0xcb); /* set headphone default type and detect pin */ regmap_write(es8326->regmap, ES8326_HPDET_TYPE, 0x83); @@ -1038,8 +1041,7 @@ static int es8326_resume(struct snd_soc_component *component) es8326_enable_micbias(es8326->component); usleep_range(50000, 70000); regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x03, 0x00); - regmap_write(es8326->regmap, ES8326_INT_SOURCE, - (ES8326_INT_SRC_PIN9 | ES8326_INT_SRC_BUTTON)); + regmap_write(es8326->regmap, ES8326_INT_SOURCE, ES8326_INT_SRC_PIN9); regmap_write(es8326->regmap, ES8326_INTOUT_IO, es8326->interrupt_clk); regmap_write(es8326->regmap, ES8326_SDINOUT1_IO, -- cgit v1.2.3 From 4581468d071b64a2e3c2ae333fff82dc0391a306 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 2 Apr 2024 14:20:41 +0800 Subject: ASoC: codecs: ES8326: modify clock table We got a digital microphone feature issue. And we fixed it by modifying the clock table. Also, we changed the marco ES8326_CLK_ON declaration Signed-off-by: Zhang Yi Link: https://msgid.link/r/20240402062043.20608-3-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 22 +++++++++++----------- sound/soc/codecs/es8326.h | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index a6783fd6553d..275db81d10d4 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -412,9 +412,9 @@ static const struct _coeff_div coeff_div_v3[] = { {125, 48000, 6000000, 0x04, 0x04, 0x1F, 0x2D, 0x8A, 0x0A, 0x27, 0x27}, {128, 8000, 1024000, 0x60, 0x00, 0x05, 0x75, 0x8A, 0x1B, 0x1F, 0x7F}, - {128, 16000, 2048000, 0x20, 0x00, 0x31, 0x35, 0x8A, 0x1B, 0x1F, 0x3F}, - {128, 44100, 5644800, 0xE0, 0x00, 0x01, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, - {128, 48000, 6144000, 0xE0, 0x00, 0x01, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, + {128, 16000, 2048000, 0x20, 0x00, 0x31, 0x35, 0x08, 0x19, 0x1F, 0x3F}, + {128, 44100, 5644800, 0xE0, 0x00, 0x01, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, + {128, 48000, 6144000, 0xE0, 0x00, 0x01, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, {144, 8000, 1152000, 0x20, 0x00, 0x03, 0x35, 0x8A, 0x1B, 0x23, 0x47}, {144, 16000, 2304000, 0x20, 0x00, 0x11, 0x35, 0x8A, 0x1B, 0x23, 0x47}, {192, 8000, 1536000, 0x60, 0x02, 0x0D, 0x75, 0x8A, 0x1B, 0x1F, 0x7F}, @@ -423,10 +423,10 @@ static const struct _coeff_div coeff_div_v3[] = { {200, 48000, 9600000, 0x04, 0x04, 0x0F, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, {250, 48000, 12000000, 0x04, 0x04, 0x0F, 0x2D, 0xCA, 0x0A, 0x27, 0x27}, - {256, 8000, 2048000, 0x60, 0x00, 0x31, 0x35, 0x8A, 0x1B, 0x1F, 0x7F}, - {256, 16000, 4096000, 0x20, 0x00, 0x01, 0x35, 0x8A, 0x1B, 0x1F, 0x3F}, - {256, 44100, 11289600, 0xE0, 0x00, 0x30, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, - {256, 48000, 12288000, 0xE0, 0x00, 0x30, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, + {256, 8000, 2048000, 0x60, 0x00, 0x31, 0x35, 0x08, 0x19, 0x1F, 0x7F}, + {256, 16000, 4096000, 0x20, 0x00, 0x01, 0x35, 0x08, 0x19, 0x1F, 0x3F}, + {256, 44100, 11289600, 0xE0, 0x01, 0x01, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, + {256, 48000, 12288000, 0xE0, 0x01, 0x01, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, {288, 8000, 2304000, 0x20, 0x00, 0x01, 0x35, 0x8A, 0x1B, 0x23, 0x47}, {384, 8000, 3072000, 0x60, 0x02, 0x05, 0x75, 0x8A, 0x1B, 0x1F, 0x7F}, {384, 16000, 6144000, 0x20, 0x02, 0x03, 0x35, 0x8A, 0x1B, 0x1F, 0x3F}, @@ -435,10 +435,10 @@ static const struct _coeff_div coeff_div_v3[] = { {400, 48000, 19200000, 0xE4, 0x04, 0x35, 0x6d, 0xCA, 0x0A, 0x1F, 0x1F}, {500, 48000, 24000000, 0xF8, 0x04, 0x3F, 0x6D, 0xCA, 0x0A, 0x1F, 0x1F}, - {512, 8000, 4096000, 0x60, 0x00, 0x01, 0x35, 0x8A, 0x1B, 0x1F, 0x7F}, - {512, 16000, 8192000, 0x20, 0x00, 0x30, 0x35, 0x8A, 0x1B, 0x1F, 0x3F}, - {512, 44100, 22579200, 0xE0, 0x00, 0x00, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, - {512, 48000, 24576000, 0xE0, 0x00, 0x00, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, + {512, 8000, 4096000, 0x60, 0x00, 0x01, 0x08, 0x19, 0x1B, 0x1F, 0x7F}, + {512, 16000, 8192000, 0x20, 0x00, 0x30, 0x35, 0x08, 0x19, 0x1F, 0x3F}, + {512, 44100, 22579200, 0xE0, 0x00, 0x00, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, + {512, 48000, 24576000, 0xE0, 0x00, 0x00, 0x2D, 0x48, 0x08, 0x1F, 0x1F}, {768, 8000, 6144000, 0x60, 0x02, 0x11, 0x35, 0x8A, 0x1B, 0x1F, 0x7F}, {768, 16000, 12288000, 0x20, 0x02, 0x01, 0x35, 0x8A, 0x1B, 0x1F, 0x3F}, {768, 32000, 24576000, 0xE0, 0x02, 0x30, 0x2D, 0xCA, 0x0A, 0x1F, 0x1F}, diff --git a/sound/soc/codecs/es8326.h b/sound/soc/codecs/es8326.h index ee12caef8105..c3e52e7bdef5 100644 --- a/sound/soc/codecs/es8326.h +++ b/sound/soc/codecs/es8326.h @@ -104,7 +104,7 @@ #define ES8326_MUTE (3 << 0) /* ES8326_CLK_CTL */ -#define ES8326_CLK_ON (0x7e << 0) +#define ES8326_CLK_ON (0x7f << 0) #define ES8326_CLK_OFF (0 << 0) /* ES8326_CLK_INV */ -- cgit v1.2.3 From 6e5f5bf894eb9260f07ad0da4e2dd2efd616ed59 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 2 Apr 2024 14:20:42 +0800 Subject: ASoC: codecs: ES8326: Solve a headphone detection issue after suspend and resume We got a headphone detection issue after suspend and resume. And we fixed it by modifying the configuration at es8326_suspend and invoke es8326_irq at es8326_resume. Signed-off-by: Zhang Yi Link: https://msgid.link/r/20240402062043.20608-4-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index 275db81d10d4..fa809ab41a4a 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -1062,6 +1062,8 @@ static int es8326_resume(struct snd_soc_component *component) es8326->hp = 0; es8326->hpl_vol = 0x03; es8326->hpr_vol = 0x03; + + es8326_irq(es8326->irq, es8326); return 0; } @@ -1072,6 +1074,9 @@ static int es8326_suspend(struct snd_soc_component *component) cancel_delayed_work_sync(&es8326->jack_detect_work); es8326_disable_micbias(component); es8326->calibrated = false; + regmap_write(es8326->regmap, ES8326_CLK_MUX, 0x2d); + regmap_write(es8326->regmap, ES8326_DAC2HPMIX, 0x00); + regmap_write(es8326->regmap, ES8326_ANA_PDN, 0x3b); regmap_write(es8326->regmap, ES8326_CLK_CTL, ES8326_CLK_OFF); regcache_cache_only(es8326->regmap, true); regcache_mark_dirty(es8326->regmap); -- cgit v1.2.3 From fec9c7f668ac5dd107f4da5a3b18379e07ec1a41 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 2 Apr 2024 14:20:43 +0800 Subject: ASoC: codecs: ES8326: Removing the control of ADC_SCALE We removed the configuration of ES8326_ADC_SCALE in es8326_jack_detect_handler because user changed the configuration by snd_controls Signed-off-by: Zhang Yi Link: https://msgid.link/r/20240402062043.20608-5-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index fa809ab41a4a..17bd6b516077 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -835,7 +835,6 @@ static void es8326_jack_detect_handler(struct work_struct *work) dev_dbg(comp->dev, "Report hp remove event\n"); snd_soc_jack_report(es8326->jack, 0, SND_JACK_HEADSET); /* mute adc when mic path switch */ - regmap_write(es8326->regmap, ES8326_ADC_SCALE, 0x33); regmap_write(es8326->regmap, ES8326_ADC1_SRC, 0x44); regmap_write(es8326->regmap, ES8326_ADC2_SRC, 0x66); es8326->hp = 0; @@ -894,7 +893,6 @@ static void es8326_jack_detect_handler(struct work_struct *work) snd_soc_jack_report(es8326->jack, SND_JACK_HEADSET, SND_JACK_HEADSET); - regmap_write(es8326->regmap, ES8326_ADC_SCALE, 0x33); regmap_update_bits(es8326->regmap, ES8326_PGA_PDN, 0x08, 0x08); regmap_update_bits(es8326->regmap, ES8326_PGAGAIN, -- cgit v1.2.3 From d619b0b70dc4f160f2b95d95ccfed2631ab7ac3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Tue, 2 Apr 2024 15:06:40 +0200 Subject: ASoC: Intel: avs: boards: Add modules description MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modpost warns about missing module description, add it. Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://msgid.link/r/20240402130640.3310999-1-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/avs/boards/da7219.c | 1 + sound/soc/intel/avs/boards/dmic.c | 1 + sound/soc/intel/avs/boards/es8336.c | 1 + sound/soc/intel/avs/boards/i2s_test.c | 1 + sound/soc/intel/avs/boards/max98357a.c | 1 + sound/soc/intel/avs/boards/max98373.c | 1 + sound/soc/intel/avs/boards/max98927.c | 1 + sound/soc/intel/avs/boards/nau8825.c | 1 + sound/soc/intel/avs/boards/probe.c | 1 + sound/soc/intel/avs/boards/rt274.c | 1 + sound/soc/intel/avs/boards/rt286.c | 1 + sound/soc/intel/avs/boards/rt298.c | 1 + sound/soc/intel/avs/boards/rt5514.c | 1 + sound/soc/intel/avs/boards/rt5663.c | 1 + sound/soc/intel/avs/boards/rt5682.c | 1 + sound/soc/intel/avs/boards/ssm4567.c | 1 + 16 files changed, 16 insertions(+) diff --git a/sound/soc/intel/avs/boards/da7219.c b/sound/soc/intel/avs/boards/da7219.c index c018f84fe025..fc072dc58968 100644 --- a/sound/soc/intel/avs/boards/da7219.c +++ b/sound/soc/intel/avs/boards/da7219.c @@ -296,5 +296,6 @@ static struct platform_driver avs_da7219_driver = { module_platform_driver(avs_da7219_driver); +MODULE_DESCRIPTION("Intel da7219 machine driver"); MODULE_AUTHOR("Cezary Rojewski "); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/dmic.c b/sound/soc/intel/avs/boards/dmic.c index ba2bc7f689eb..d9e5e85f5233 100644 --- a/sound/soc/intel/avs/boards/dmic.c +++ b/sound/soc/intel/avs/boards/dmic.c @@ -96,4 +96,5 @@ static struct platform_driver avs_dmic_driver = { module_platform_driver(avs_dmic_driver); +MODULE_DESCRIPTION("Intel DMIC machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/es8336.c b/sound/soc/intel/avs/boards/es8336.c index 1090082e7d5b..5c90a6007577 100644 --- a/sound/soc/intel/avs/boards/es8336.c +++ b/sound/soc/intel/avs/boards/es8336.c @@ -326,4 +326,5 @@ static struct platform_driver avs_es8336_driver = { module_platform_driver(avs_es8336_driver); +MODULE_DESCRIPTION("Intel es8336 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/i2s_test.c b/sound/soc/intel/avs/boards/i2s_test.c index 28f254eb0d03..027373d6a16d 100644 --- a/sound/soc/intel/avs/boards/i2s_test.c +++ b/sound/soc/intel/avs/boards/i2s_test.c @@ -204,4 +204,5 @@ static struct platform_driver avs_i2s_test_driver = { module_platform_driver(avs_i2s_test_driver); +MODULE_DESCRIPTION("Intel i2s test machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/max98357a.c b/sound/soc/intel/avs/boards/max98357a.c index a83b95f25129..1ff85e4d8e16 100644 --- a/sound/soc/intel/avs/boards/max98357a.c +++ b/sound/soc/intel/avs/boards/max98357a.c @@ -154,4 +154,5 @@ static struct platform_driver avs_max98357a_driver = { module_platform_driver(avs_max98357a_driver) +MODULE_DESCRIPTION("Intel max98357a machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/max98373.c b/sound/soc/intel/avs/boards/max98373.c index 3b980a025e6f..8d31586b73ea 100644 --- a/sound/soc/intel/avs/boards/max98373.c +++ b/sound/soc/intel/avs/boards/max98373.c @@ -211,4 +211,5 @@ static struct platform_driver avs_max98373_driver = { module_platform_driver(avs_max98373_driver) +MODULE_DESCRIPTION("Intel max98373 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/max98927.c b/sound/soc/intel/avs/boards/max98927.c index 86dd2b228df3..572ec58073d0 100644 --- a/sound/soc/intel/avs/boards/max98927.c +++ b/sound/soc/intel/avs/boards/max98927.c @@ -208,4 +208,5 @@ static struct platform_driver avs_max98927_driver = { module_platform_driver(avs_max98927_driver) +MODULE_DESCRIPTION("Intel max98927 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/nau8825.c b/sound/soc/intel/avs/boards/nau8825.c index 1c1e2083f474..55db75efae41 100644 --- a/sound/soc/intel/avs/boards/nau8825.c +++ b/sound/soc/intel/avs/boards/nau8825.c @@ -313,4 +313,5 @@ static struct platform_driver avs_nau8825_driver = { module_platform_driver(avs_nau8825_driver) +MODULE_DESCRIPTION("Intel nau8825 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/probe.c b/sound/soc/intel/avs/boards/probe.c index a9469b5ecb40..8be6887bbc6e 100644 --- a/sound/soc/intel/avs/boards/probe.c +++ b/sound/soc/intel/avs/boards/probe.c @@ -69,4 +69,5 @@ static struct platform_driver avs_probe_mb_driver = { module_platform_driver(avs_probe_mb_driver); +MODULE_DESCRIPTION("Intel probe machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt274.c b/sound/soc/intel/avs/boards/rt274.c index bfcb8845fd15..1cf524216087 100644 --- a/sound/soc/intel/avs/boards/rt274.c +++ b/sound/soc/intel/avs/boards/rt274.c @@ -276,4 +276,5 @@ static struct platform_driver avs_rt274_driver = { module_platform_driver(avs_rt274_driver); +MODULE_DESCRIPTION("Intel rt274 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt286.c b/sound/soc/intel/avs/boards/rt286.c index 28d7d86b1cc9..4740bba10570 100644 --- a/sound/soc/intel/avs/boards/rt286.c +++ b/sound/soc/intel/avs/boards/rt286.c @@ -247,4 +247,5 @@ static struct platform_driver avs_rt286_driver = { module_platform_driver(avs_rt286_driver); +MODULE_DESCRIPTION("Intel rt286 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt298.c b/sound/soc/intel/avs/boards/rt298.c index 80f490b9e118..6e409e29f697 100644 --- a/sound/soc/intel/avs/boards/rt298.c +++ b/sound/soc/intel/avs/boards/rt298.c @@ -266,4 +266,5 @@ static struct platform_driver avs_rt298_driver = { module_platform_driver(avs_rt298_driver); +MODULE_DESCRIPTION("Intel rt298 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt5514.c b/sound/soc/intel/avs/boards/rt5514.c index 60105f453ae2..097ae5f73241 100644 --- a/sound/soc/intel/avs/boards/rt5514.c +++ b/sound/soc/intel/avs/boards/rt5514.c @@ -192,4 +192,5 @@ static struct platform_driver avs_rt5514_driver = { module_platform_driver(avs_rt5514_driver); +MODULE_DESCRIPTION("Intel rt5514 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt5663.c b/sound/soc/intel/avs/boards/rt5663.c index b4762c2a7bf2..1880c315cc4d 100644 --- a/sound/soc/intel/avs/boards/rt5663.c +++ b/sound/soc/intel/avs/boards/rt5663.c @@ -265,4 +265,5 @@ static struct platform_driver avs_rt5663_driver = { module_platform_driver(avs_rt5663_driver); +MODULE_DESCRIPTION("Intel rt5663 machine driver"); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/rt5682.c b/sound/soc/intel/avs/boards/rt5682.c index 243f979fda98..594a971ded9e 100644 --- a/sound/soc/intel/avs/boards/rt5682.c +++ b/sound/soc/intel/avs/boards/rt5682.c @@ -341,5 +341,6 @@ static struct platform_driver avs_rt5682_driver = { module_platform_driver(avs_rt5682_driver) +MODULE_DESCRIPTION("Intel rt5682 machine driver"); MODULE_AUTHOR("Cezary Rojewski "); MODULE_LICENSE("GPL"); diff --git a/sound/soc/intel/avs/boards/ssm4567.c b/sound/soc/intel/avs/boards/ssm4567.c index 4a0e136835ff..d6f7f046c24e 100644 --- a/sound/soc/intel/avs/boards/ssm4567.c +++ b/sound/soc/intel/avs/boards/ssm4567.c @@ -200,4 +200,5 @@ static struct platform_driver avs_ssm4567_driver = { module_platform_driver(avs_ssm4567_driver) +MODULE_DESCRIPTION("Intel ssm4567 machine driver"); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From ac229a2d0939edfb469310c55b16d9321a858a46 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 Mar 2024 07:08:19 +1000 Subject: nvme-multipath: don't inherit LBA-related fields for the multipath node Linux 6.9 made the nvme multipath nodes not properly pick up changes when the LBA size goes smaller after an nvme format. This is because we now try to inherit the queue settings for the multipath node entirely from the individual paths. That is the right thing to do for I/O size limitations, which make up most of the queue limits, but it is wrong for changes to the namespace configuration, where we do want to pick up the new format, which will eventually show up on all paths once they are re-queried. Fix this by not inheriting the block size and related fields and always for updating them. Fixes: 8f03cfa117e0 ("nvme: don't use nvme_update_disk_info for the multipath disk") Reported-by: Nilay Shroff Tested-by: Nilay Shroff Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 943d72bdd794..0cf46068f1d0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2201,6 +2201,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) } if (!ret && nvme_ns_head_multipath(ns->head)) { + struct queue_limits *ns_lim = &ns->disk->queue->limits; struct queue_limits lim; blk_mq_freeze_queue(ns->head->disk->queue); @@ -2212,7 +2213,26 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); nvme_mpath_revalidate_paths(ns); + /* + * queue_limits mixes values that are the hardware limitations + * for bio splitting with what is the device configuration. + * + * For NVMe the device configuration can change after e.g. a + * Format command, and we really want to pick up the new format + * value here. But we must still stack the queue limits to the + * least common denominator for multipathing to split the bios + * properly. + * + * To work around this, we explicitly set the device + * configuration to those that we just queried, but only stack + * the splitting limits in to make sure we still obey possibly + * lower limitations of other controllers. + */ lim = queue_limits_start_update(ns->head->disk->queue); + lim.logical_block_size = ns_lim->logical_block_size; + lim.physical_block_size = ns_lim->physical_block_size; + lim.io_min = ns_lim->io_min; + lim.io_opt = ns_lim->io_opt; queue_limits_stack_bdev(&lim, ns->disk->part0, 0, ns->head->disk->disk_name); ret = queue_limits_commit_update(ns->head->disk->queue, &lim); -- cgit v1.2.3 From af133562d5aff41fcdbe51f1a504ae04788b5fc0 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 25 Mar 2024 09:31:04 +0100 Subject: swiotlb: extend buffer pre-padding to alloc_align_mask if necessary Allow a buffer pre-padding of up to alloc_align_mask, even if it requires allocating additional IO TLB slots. If the allocation alignment is bigger than IO_TLB_SIZE and min_align_mask covers any non-zero bits in the original address between IO_TLB_SIZE and alloc_align_mask, these bits are not preserved in the swiotlb buffer address. To fix this case, increase the allocation size and use a larger offset within the allocated buffer. As a result, extra padding slots may be allocated before the mapping start address. Leave orig_addr in these padding slots initialized to INVALID_PHYS_ADDR. These slots do not correspond to any CPU buffer, so attempts to sync the data should be ignored. The padding slots should be automatically released when the buffer is unmapped. However, swiotlb_tbl_unmap_single() takes only the address of the DMA buffer slot, not the first padding slot. Save the number of padding slots in struct io_tlb_slot and use it to adjust the slot index in swiotlb_release_slots(), so all allocated slots are properly freed. Fixes: 2fd4fa5d3fb5 ("swiotlb: Fix alignment checks when both allocation and DMA masks are present") Link: https://lore.kernel.org/linux-iommu/20240311210507.217daf8b@meshulam.tesarici.cz/ Signed-off-by: Petr Tesarik Reviewed-by: Michael Kelley Tested-by: Michael Kelley Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 59 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 86fe172b5958..d7a8cb93ef2d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -69,11 +69,14 @@ * @alloc_size: Size of the allocated buffer. * @list: The free list describing the number of free entries available * from each index. + * @pad_slots: Number of preceding padding slots. Valid only in the first + * allocated non-padding slot. */ struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; - unsigned int list; + unsigned short list; + unsigned short pad_slots; }; static bool swiotlb_force_bounce; @@ -287,6 +290,7 @@ static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start, mem->nslabs - i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; + mem->slots[i].pad_slots = 0; } memset(vaddr, 0, bytes); @@ -821,12 +825,30 @@ void swiotlb_dev_init(struct device *dev) #endif } -/* - * Return the offset into a iotlb slot required to keep the device happy. +/** + * swiotlb_align_offset() - Get required offset into an IO TLB allocation. + * @dev: Owning device. + * @align_mask: Allocation alignment mask. + * @addr: DMA address. + * + * Return the minimum offset from the start of an IO TLB allocation which is + * required for a given buffer address and allocation alignment to keep the + * device happy. + * + * First, the address bits covered by min_align_mask must be identical in the + * original address and the bounce buffer address. High bits are preserved by + * choosing a suitable IO TLB slot, but bits below IO_TLB_SHIFT require extra + * padding bytes before the bounce buffer. + * + * Second, @align_mask specifies which bits of the first allocated slot must + * be zero. This may require allocating additional padding slots, and then the + * offset (in bytes) from the first such padding slot is returned. */ -static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) +static unsigned int swiotlb_align_offset(struct device *dev, + unsigned int align_mask, u64 addr) { - return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); + return addr & dma_get_min_align_mask(dev) & + (align_mask | (IO_TLB_SIZE - 1)); } /* @@ -847,7 +869,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size return; tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); - orig_addr_offset = swiotlb_align_offset(dev, orig_addr); + orig_addr_offset = swiotlb_align_offset(dev, 0, orig_addr); if (tlb_offset < orig_addr_offset) { dev_WARN_ONCE(dev, 1, "Access before mapping start detected. orig offset %u, requested offset %u.\n", @@ -1005,7 +1027,7 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = dma_get_min_align_mask(dev); unsigned int nslots = nr_slots(alloc_size), stride; - unsigned int offset = swiotlb_align_offset(dev, orig_addr); + unsigned int offset = swiotlb_align_offset(dev, 0, orig_addr); unsigned int index, slots_checked, count = 0, i; unsigned long flags; unsigned int slot_base; @@ -1328,11 +1350,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, unsigned long attrs) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; - unsigned int offset = swiotlb_align_offset(dev, orig_addr); + unsigned int offset; struct io_tlb_pool *pool; unsigned int i; int index; phys_addr_t tlb_addr; + unsigned short pad_slots; if (!mem || !mem->nslabs) { dev_warn_ratelimited(dev, @@ -1349,6 +1372,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } + offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr); index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset, alloc_align_mask, &pool); if (index == -1) { @@ -1364,6 +1388,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ + pad_slots = offset >> IO_TLB_SHIFT; + offset &= (IO_TLB_SIZE - 1); + index += pad_slots; + pool->slots[index].pad_slots = pad_slots; for (i = 0; i < nr_slots(alloc_size + offset); i++) pool->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(pool->start, index) + offset; @@ -1384,13 +1412,17 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) { struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr); unsigned long flags; - unsigned int offset = swiotlb_align_offset(dev, tlb_addr); - int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; - int nslots = nr_slots(mem->slots[index].alloc_size + offset); - int aindex = index / mem->area_nslabs; - struct io_tlb_area *area = &mem->areas[aindex]; + unsigned int offset = swiotlb_align_offset(dev, 0, tlb_addr); + int index, nslots, aindex; + struct io_tlb_area *area; int count, i; + index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; + index -= mem->slots[index].pad_slots; + nslots = nr_slots(mem->slots[index].alloc_size + offset); + aindex = index / mem->area_nslabs; + area = &mem->areas[aindex]; + /* * Return the buffer to the free list by setting the corresponding * entries to indicate the number of contiguous entries available. @@ -1413,6 +1445,7 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) mem->slots[i].list = ++count; mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; + mem->slots[i].pad_slots = 0; } /* -- cgit v1.2.3 From e8068f2d756d57a5206fa3180ade365a8c12ed85 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Tue, 26 Mar 2024 20:45:48 -0700 Subject: swiotlb: fix swiotlb_bounce() to do partial sync's correctly In current code, swiotlb_bounce() may do partial sync's correctly in some circumstances, but may incorrectly fail in other circumstances. The failure cases require both of these to be true: 1) swiotlb_align_offset() returns a non-zero "offset" value 2) the tlb_addr of the partial sync area points into the first "offset" bytes of the _second_ or subsequent swiotlb slot allocated for the mapping Code added in commit 868c9ddc182b ("swiotlb: add overflow checks to swiotlb_bounce") attempts to WARN on the invalid case where tlb_addr points into the first "offset" bytes of the _first_ allocated slot. But there's no way for swiotlb_bounce() to distinguish the first slot from the second and subsequent slots, so the WARN can be triggered incorrectly when #2 above is true. Related, current code calculates an adjustment to the orig_addr stored in the swiotlb slot. The adjustment compensates for the difference in the tlb_addr used for the partial sync vs. the tlb_addr for the full mapping. The adjustment is stored in the local variable tlb_offset. But when #1 and #2 above are true, it's valid for this adjustment to be negative. In such case the arithmetic to adjust orig_addr produces the wrong result due to tlb_offset being declared as unsigned. Fix these problems by removing the over-constraining validations added in 868c9ddc182b. Change the declaration of tlb_offset to be signed instead of unsigned so the adjustment arithmetic works correctly. Tested with a test-only hack to how swiotlb_tbl_map_single() calls swiotlb_bounce(). Instead of calling swiotlb_bounce() just once for the entire mapped area, do a loop with each iteration doing only a 128 byte partial sync until the entire mapped area is sync'ed. Then with swiotlb=force on the kernel boot line, run a variety of raw disk writes followed by read and verification of all bytes of the written data. The storage device has DMA min_align_mask set, and the writes are done with a variety of original buffer memory address alignments and overall buffer sizes. For many of the combinations, current code triggers the WARN statements, or the data verification fails. With the fixes, no WARNs occur and all verifications pass. Fixes: 5f89468e2f06 ("swiotlb: manipulate orig_addr when tlb_addr has offset") Fixes: 868c9ddc182b ("swiotlb: add overflow checks to swiotlb_bounce") Signed-off-by: Michael Kelley Dominique Martinet Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d7a8cb93ef2d..d57c8837c813 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -863,27 +863,23 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; - unsigned int tlb_offset, orig_addr_offset; + int tlb_offset; if (orig_addr == INVALID_PHYS_ADDR) return; - tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); - orig_addr_offset = swiotlb_align_offset(dev, 0, orig_addr); - if (tlb_offset < orig_addr_offset) { - dev_WARN_ONCE(dev, 1, - "Access before mapping start detected. orig offset %u, requested offset %u.\n", - orig_addr_offset, tlb_offset); - return; - } - - tlb_offset -= orig_addr_offset; - if (tlb_offset > alloc_size) { - dev_WARN_ONCE(dev, 1, - "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n", - alloc_size, size, tlb_offset); - return; - } + /* + * It's valid for tlb_offset to be negative. This can happen when the + * "offset" returned by swiotlb_align_offset() is non-zero, and the + * tlb_addr is pointing within the first "offset" bytes of the second + * or subsequent slots of the allocated swiotlb area. While it's not + * valid for tlb_addr to be pointing within the first "offset" bytes + * of the first slot, there's no way to check for such an error since + * this function can't distinguish the first slot from the second and + * subsequent slots. + */ + tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) - + swiotlb_align_offset(dev, 0, orig_addr); orig_addr += tlb_offset; alloc_size -= tlb_offset; -- cgit v1.2.3 From a1255ccab8ecee89905ddb12161139b0d878a7f2 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 29 Mar 2024 12:28:09 -0700 Subject: swiotlb: do not set total_used to 0 in swiotlb_create_debugfs_files() Sometimes the readout of /sys/kernel/debug/swiotlb/io_tlb_used and io_tlb_used_hiwater can be a huge number (e.g. 18446744073709551615), which is actually a negative number if we use "%ld" to print the number. When swiotlb_create_default_debugfs() is running from late_initcall, mem->total_used may already be non-zero, because the storage driver may have already started to perform I/O operations: if the storage driver is built-in, its probe() callback is called before late_initcall. swiotlb_create_debugfs_files() should not blindly set mem->total_used and mem->used_hiwater to 0; actually it doesn't have to initialize the fields at all, because the fields, as part of the global struct io_tlb_default_mem, have been implicitly initialized to zero. Also don't explicitly set mem->transient_nslabs to 0. Fixes: 8b0977ecc8b3 ("swiotlb: track and report io_tlb_used high water marks in debugfs") Fixes: 02e765697038 ("swiotlb: add debugfs to track swiotlb transient pool usage") Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Reviewed-by: ZhangPeng Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d57c8837c813..a5e0dfc44d24 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1676,9 +1676,6 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get, static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { - atomic_long_set(&mem->total_used, 0); - atomic_long_set(&mem->used_hiwater, 0); - mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); if (!mem->nslabs) return; @@ -1689,7 +1686,6 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem, &fops_io_tlb_hiwater); #ifdef CONFIG_SWIOTLB_DYNAMIC - atomic_long_set(&mem->transient_nslabs, 0); debugfs_create_file("io_tlb_transient_nslabs", 0400, mem->debugfs, mem, &fops_io_tlb_transient_used); #endif -- cgit v1.2.3 From 062a7f0ff46eb57aff526897bd2bebfdb1d3046a Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 1 Apr 2024 22:37:42 -0500 Subject: smb: client: guarantee refcounted children from parent session Avoid potential use-after-free bugs when walking DFS referrals, mounting and performing DFS failover by ensuring that all children from parent @tcon->ses are also refcounted. They're all needed across the entire DFS mount. Get rid of @tcon->dfs_ses_list while we're at it, too. Cc: stable@vger.kernel.org # 6.4+ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404021527.ZlRkIxgv-lkp@intel.com/ Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 2 -- fs/smb/client/cifsproto.h | 20 +++++++++---------- fs/smb/client/connect.c | 25 ++++++++++++++++++----- fs/smb/client/dfs.c | 51 ++++++++++++++++++++++------------------------- fs/smb/client/dfs.h | 33 +++++++++++++++++++----------- fs/smb/client/dfs_cache.c | 11 +--------- fs/smb/client/misc.c | 6 ------ 7 files changed, 76 insertions(+), 72 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 7ed9d05f6890..286afbe346be 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1281,7 +1281,6 @@ struct cifs_tcon { struct cached_fids *cfids; /* BB add field for back pointer to sb struct(s)? */ #ifdef CONFIG_CIFS_DFS_UPCALL - struct list_head dfs_ses_list; struct delayed_work dfs_cache_work; #endif struct delayed_work query_interfaces; /* query interfaces workqueue job */ @@ -1804,7 +1803,6 @@ struct cifs_mount_ctx { struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; - struct list_head dfs_ses_list; }; static inline void __free_dfs_info_param(struct dfs_info3_param *param) diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 0723e1b57256..8e0a348f1f66 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -725,31 +725,31 @@ struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon); void cifs_put_tcon_super(struct super_block *sb); int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry); -/* Put references of @ses and @ses->dfs_root_ses */ +/* Put references of @ses and its children */ static inline void cifs_put_smb_ses(struct cifs_ses *ses) { - struct cifs_ses *rses = ses->dfs_root_ses; + struct cifs_ses *next; - __cifs_put_smb_ses(ses); - if (rses) - __cifs_put_smb_ses(rses); + do { + next = ses->dfs_root_ses; + __cifs_put_smb_ses(ses); + } while ((ses = next)); } -/* Get an active reference of @ses and @ses->dfs_root_ses. +/* Get an active reference of @ses and its children. * * NOTE: make sure to call this function when incrementing reference count of * @ses to ensure that any DFS root session attached to it (@ses->dfs_root_ses) * will also get its reference count incremented. * - * cifs_put_smb_ses() will put both references, so call it when you're done. + * cifs_put_smb_ses() will put all references, so call it when you're done. */ static inline void cifs_smb_ses_inc_refcount(struct cifs_ses *ses) { lockdep_assert_held(&cifs_tcp_ses_lock); - ses->ses_count++; - if (ses->dfs_root_ses) - ses->dfs_root_ses->ses_count++; + for (; ses; ses = ses->dfs_root_ses) + ses->ses_count++; } static inline bool dfs_src_pathname_equal(const char *s1, const char *s2) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index ee29bc57300c..38b75cfa06e3 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1866,6 +1866,9 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx) ctx->sectype != ses->sectype) return 0; + if (ctx->dfs_root_ses != ses->dfs_root_ses) + return 0; + /* * If an existing session is limited to less channels than * requested, it should not be reused @@ -2358,9 +2361,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) * need to lock before changing something in the session. */ spin_lock(&cifs_tcp_ses_lock); + if (ctx->dfs_root_ses) + cifs_smb_ses_inc_refcount(ctx->dfs_root_ses); ses->dfs_root_ses = ctx->dfs_root_ses; - if (ses->dfs_root_ses) - ses->dfs_root_ses->ses_count++; list_add(&ses->smb_ses_list, &server->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); @@ -3311,6 +3314,9 @@ void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx) cifs_put_smb_ses(mnt_ctx->ses); else if (mnt_ctx->server) cifs_put_tcp_session(mnt_ctx->server, 0); + mnt_ctx->ses = NULL; + mnt_ctx->tcon = NULL; + mnt_ctx->server = NULL; mnt_ctx->cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS; free_xid(mnt_ctx->xid); } @@ -3589,8 +3595,6 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) bool isdfs; int rc; - INIT_LIST_HEAD(&mnt_ctx.dfs_ses_list); - rc = dfs_mount_share(&mnt_ctx, &isdfs); if (rc) goto error; @@ -3621,7 +3625,6 @@ out: return rc; error: - dfs_put_root_smb_sessions(&mnt_ctx.dfs_ses_list); cifs_mount_put_conns(&mnt_ctx); return rc; } @@ -3636,6 +3639,18 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) goto error; rc = cifs_mount_get_tcon(&mnt_ctx); + if (!rc) { + /* + * Prevent superblock from being created with any missing + * connections. + */ + if (WARN_ON(!mnt_ctx.server)) + rc = -EHOSTDOWN; + else if (WARN_ON(!mnt_ctx.ses)) + rc = -EACCES; + else if (WARN_ON(!mnt_ctx.tcon)) + rc = -ENOENT; + } if (rc) goto error; diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index 449c59830039..3ec965547e3d 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -66,33 +66,20 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path) } /* - * Track individual DFS referral servers used by new DFS mount. - * - * On success, their lifetime will be shared by final tcon (dfs_ses_list). - * Otherwise, they will be put by dfs_put_root_smb_sessions() in cifs_mount(). + * Get an active reference of @ses so that next call to cifs_put_tcon() won't + * release it as any new DFS referrals must go through its IPC tcon. */ -static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) +static void add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; - struct dfs_root_ses *root_ses; struct cifs_ses *ses = mnt_ctx->ses; if (ses) { - root_ses = kmalloc(sizeof(*root_ses), GFP_KERNEL); - if (!root_ses) - return -ENOMEM; - - INIT_LIST_HEAD(&root_ses->list); - spin_lock(&cifs_tcp_ses_lock); cifs_smb_ses_inc_refcount(ses); spin_unlock(&cifs_tcp_ses_lock); - root_ses->ses = ses; - list_add_tail(&root_ses->list, &mnt_ctx->dfs_ses_list); } - /* Select new DFS referral server so that new referrals go through it */ ctx->dfs_root_ses = ses; - return 0; } static inline int parse_dfs_target(struct smb3_fs_context *ctx, @@ -185,11 +172,8 @@ again: continue; } - if (is_refsrv) { - rc = add_root_smb_session(mnt_ctx); - if (rc) - goto out; - } + if (is_refsrv) + add_root_smb_session(mnt_ctx); rc = ref_walk_advance(rw); if (!rc) { @@ -232,6 +216,7 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; struct cifs_tcon *tcon; char *origin_fullpath; + bool new_tcon = true; int rc; origin_fullpath = dfs_get_path(cifs_sb, ctx->source); @@ -239,6 +224,18 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) return PTR_ERR(origin_fullpath); rc = dfs_referral_walk(mnt_ctx); + if (!rc) { + /* + * Prevent superblock from being created with any missing + * connections. + */ + if (WARN_ON(!mnt_ctx->server)) + rc = -EHOSTDOWN; + else if (WARN_ON(!mnt_ctx->ses)) + rc = -EACCES; + else if (WARN_ON(!mnt_ctx->tcon)) + rc = -ENOENT; + } if (rc) goto out; @@ -247,15 +244,14 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) if (!tcon->origin_fullpath) { tcon->origin_fullpath = origin_fullpath; origin_fullpath = NULL; + } else { + new_tcon = false; } spin_unlock(&tcon->tc_lock); - if (list_empty(&tcon->dfs_ses_list)) { - list_replace_init(&mnt_ctx->dfs_ses_list, &tcon->dfs_ses_list); + if (new_tcon) { queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, dfs_cache_get_ttl() * HZ); - } else { - dfs_put_root_smb_sessions(&mnt_ctx->dfs_ses_list); } out: @@ -298,7 +294,6 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) if (rc) return rc; - ctx->dfs_root_ses = mnt_ctx->ses; /* * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally * try to get an DFS referral (even cached) to determine whether it is an DFS mount. @@ -324,7 +319,9 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) *isdfs = true; add_root_smb_session(mnt_ctx); - return __dfs_mount_share(mnt_ctx); + rc = __dfs_mount_share(mnt_ctx); + dfs_put_root_smb_sessions(mnt_ctx); + return rc; } /* Update dfs referral path of superblock */ diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h index 875ab7ae57fc..e5c4dcf83750 100644 --- a/fs/smb/client/dfs.h +++ b/fs/smb/client/dfs.h @@ -7,7 +7,9 @@ #define _CIFS_DFS_H #include "cifsglob.h" +#include "cifsproto.h" #include "fs_context.h" +#include "dfs_cache.h" #include "cifs_unicode.h" #include @@ -114,11 +116,6 @@ static inline void ref_walk_set_tgt_hint(struct dfs_ref_walk *rw) ref_walk_tit(rw)); } -struct dfs_root_ses { - struct list_head list; - struct cifs_ses *ses; -}; - int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_param *ref, struct smb3_fs_context *ctx); int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs); @@ -133,20 +130,32 @@ static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *p { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; + struct cifs_ses *rses = ctx->dfs_root_ses ?: mnt_ctx->ses; - return dfs_cache_find(mnt_ctx->xid, ctx->dfs_root_ses, cifs_sb->local_nls, + return dfs_cache_find(mnt_ctx->xid, rses, cifs_sb->local_nls, cifs_remap(cifs_sb), path, ref, tl); } -static inline void dfs_put_root_smb_sessions(struct list_head *head) +/* + * cifs_get_smb_ses() already guarantees an active reference of + * @ses->dfs_root_ses when a new session is created, so we need to put extra + * references of all DFS root sessions that were used across the mount process + * in dfs_mount_share(). + */ +static inline void dfs_put_root_smb_sessions(struct cifs_mount_ctx *mnt_ctx) { - struct dfs_root_ses *root, *tmp; + const struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; + struct cifs_ses *ses = ctx->dfs_root_ses; + struct cifs_ses *cur; + + if (!ses) + return; - list_for_each_entry_safe(root, tmp, head, list) { - list_del_init(&root->list); - cifs_put_smb_ses(root->ses); - kfree(root); + for (cur = ses; cur; cur = cur->dfs_root_ses) { + if (cur->dfs_root_ses) + cifs_put_smb_ses(cur->dfs_root_ses); } + cifs_put_smb_ses(ses); } #endif /* _CIFS_DFS_H */ diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 508d831fabe3..0552a864ff08 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1278,21 +1278,12 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb) void dfs_cache_refresh(struct work_struct *work) { struct TCP_Server_Info *server; - struct dfs_root_ses *rses; struct cifs_tcon *tcon; struct cifs_ses *ses; tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work); - ses = tcon->ses; - server = ses->server; - mutex_lock(&server->refpath_lock); - if (server->leaf_fullpath) - __refresh_tcon(server->leaf_fullpath + 1, ses, false); - mutex_unlock(&server->refpath_lock); - - list_for_each_entry(rses, &tcon->dfs_ses_list, list) { - ses = rses->ses; + for (ses = tcon->ses; ses; ses = ses->dfs_root_ses) { server = ses->server; mutex_lock(&server->refpath_lock); if (server->leaf_fullpath) diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index c3771fc81328..1ea22b3955a2 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -138,9 +138,6 @@ tcon_info_alloc(bool dir_leases_enabled) atomic_set(&ret_buf->num_local_opens, 0); atomic_set(&ret_buf->num_remote_opens, 0); ret_buf->stats_from_time = ktime_get_real_seconds(); -#ifdef CONFIG_CIFS_DFS_UPCALL - INIT_LIST_HEAD(&ret_buf->dfs_ses_list); -#endif return ret_buf; } @@ -156,9 +153,6 @@ tconInfoFree(struct cifs_tcon *tcon) atomic_dec(&tconInfoAllocCount); kfree(tcon->nativeFileSystem); kfree_sensitive(tcon->password); -#ifdef CONFIG_CIFS_DFS_UPCALL - dfs_put_root_smb_sessions(&tcon->dfs_ses_list); -#endif kfree(tcon->origin_fullpath); kfree(tcon); } -- cgit v1.2.3 From 0a05ad21d77a188d06481c36d6016805a881bcc0 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 1 Apr 2024 22:44:07 -0300 Subject: smb: client: refresh referral without acquiring refpath_lock Avoid refreshing DFS referral with refpath_lock acquired as the I/O could block for a while due to a potentially disconnected or slow DFS root server and then making other threads - that use same @server and don't require a DFS root server - unable to make any progress. Cc: stable@vger.kernel.org # 6.4+ Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/dfs_cache.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 0552a864ff08..11c8efecf7aa 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1172,8 +1172,8 @@ static bool is_ses_good(struct cifs_ses *ses) return ret; } -/* Refresh dfs referral of tcon and mark it for reconnect if needed */ -static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_refresh) +/* Refresh dfs referral of @ses and mark it for reconnect if needed */ +static void __refresh_ses_referral(struct cifs_ses *ses, bool force_refresh) { struct TCP_Server_Info *server = ses->server; DFS_CACHE_TGT_LIST(old_tl); @@ -1181,10 +1181,21 @@ static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_ref bool needs_refresh = false; struct cache_entry *ce; unsigned int xid; + char *path = NULL; int rc = 0; xid = get_xid(); + mutex_lock(&server->refpath_lock); + if (server->leaf_fullpath) { + path = kstrdup(server->leaf_fullpath + 1, GFP_ATOMIC); + if (!path) + rc = -ENOMEM; + } + mutex_unlock(&server->refpath_lock); + if (!path) + goto out; + down_read(&htable_rw_lock); ce = lookup_cache_entry(path); needs_refresh = force_refresh || IS_ERR(ce) || cache_entry_expired(ce); @@ -1218,19 +1229,17 @@ out: free_xid(xid); dfs_cache_free_tgts(&old_tl); dfs_cache_free_tgts(&new_tl); - return rc; + kfree(path); } -static int refresh_tcon(struct cifs_tcon *tcon, bool force_refresh) +static inline void refresh_ses_referral(struct cifs_ses *ses) { - struct TCP_Server_Info *server = tcon->ses->server; - struct cifs_ses *ses = tcon->ses; + __refresh_ses_referral(ses, false); +} - mutex_lock(&server->refpath_lock); - if (server->leaf_fullpath) - __refresh_tcon(server->leaf_fullpath + 1, ses, force_refresh); - mutex_unlock(&server->refpath_lock); - return 0; +static inline void force_refresh_ses_referral(struct cifs_ses *ses) +{ + __refresh_ses_referral(ses, true); } /** @@ -1271,25 +1280,20 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb) */ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; - return refresh_tcon(tcon, true); + force_refresh_ses_referral(tcon->ses); + return 0; } /* Refresh all DFS referrals related to DFS tcon */ void dfs_cache_refresh(struct work_struct *work) { - struct TCP_Server_Info *server; struct cifs_tcon *tcon; struct cifs_ses *ses; tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work); - for (ses = tcon->ses; ses; ses = ses->dfs_root_ses) { - server = ses->server; - mutex_lock(&server->refpath_lock); - if (server->leaf_fullpath) - __refresh_tcon(server->leaf_fullpath + 1, ses, false); - mutex_unlock(&server->refpath_lock); - } + for (ses = tcon->ses; ses; ses = ses->dfs_root_ses) + refresh_ses_referral(ses); queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, atomic_read(&dfs_cache_ttl) * HZ); -- cgit v1.2.3 From 4a5ba0e0bfe552ac7451f57e304f6343c3d87f89 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 1 Apr 2024 22:44:08 -0300 Subject: smb: client: handle DFS tcons in cifs_construct_tcon() The tcons created by cifs_construct_tcon() on multiuser mounts must also be able to failover and refresh DFS referrals, so set the appropriate fields in order to get a full DFS tcon. They could be shared among different superblocks later, too. Cc: stable@vger.kernel.org # 6.4+ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404021518.3Xu2VU4s-lkp@intel.com/ Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 38b75cfa06e3..9a3578881496 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3995,6 +3995,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) struct cifs_ses *ses; struct cifs_tcon *tcon = NULL; struct smb3_fs_context *ctx; + char *origin_fullpath = NULL; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (ctx == NULL) @@ -4018,6 +4019,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ctx->sign = master_tcon->ses->sign; ctx->seal = master_tcon->seal; ctx->witness = master_tcon->use_witness; + ctx->dfs_root_ses = master_tcon->ses->dfs_root_ses; rc = cifs_set_vol_auth(ctx, master_tcon->ses); if (rc) { @@ -4037,12 +4039,39 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) goto out; } +#ifdef CONFIG_CIFS_DFS_UPCALL + spin_lock(&master_tcon->tc_lock); + if (master_tcon->origin_fullpath) { + spin_unlock(&master_tcon->tc_lock); + origin_fullpath = dfs_get_path(cifs_sb, cifs_sb->ctx->source); + if (IS_ERR(origin_fullpath)) { + tcon = ERR_CAST(origin_fullpath); + origin_fullpath = NULL; + cifs_put_smb_ses(ses); + goto out; + } + } else { + spin_unlock(&master_tcon->tc_lock); + } +#endif + tcon = cifs_get_tcon(ses, ctx); if (IS_ERR(tcon)) { cifs_put_smb_ses(ses); goto out; } +#ifdef CONFIG_CIFS_DFS_UPCALL + if (origin_fullpath) { + spin_lock(&tcon->tc_lock); + tcon->origin_fullpath = origin_fullpath; + spin_unlock(&tcon->tc_lock); + origin_fullpath = NULL; + queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, + dfs_cache_get_ttl() * HZ); + } +#endif + #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(ses)) reset_cifs_unix_caps(0, tcon, NULL, ctx); @@ -4051,6 +4080,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) out: kfree(ctx->username); kfree_sensitive(ctx->password); + kfree(origin_fullpath); kfree(ctx); return tcon; -- cgit v1.2.3 From 93cee45ccfebc62a3bb4cd622b89e00c8c7d8493 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 1 Apr 2024 22:44:09 -0300 Subject: smb: client: serialise cifs_construct_tcon() with cifs_mount_mutex Serialise cifs_construct_tcon() with cifs_mount_mutex to handle parallel mounts that may end up reusing the session and tcon created by it. Cc: stable@vger.kernel.org # 6.4+ Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 13 ++++++++++++- fs/smb/client/fs_context.c | 6 +++--- fs/smb/client/fs_context.h | 12 ++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 9a3578881496..95e4bda4fd51 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3988,7 +3988,7 @@ cifs_set_vol_auth(struct smb3_fs_context *ctx, struct cifs_ses *ses) } static struct cifs_tcon * -cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) +__cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) { int rc; struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb); @@ -4086,6 +4086,17 @@ out: return tcon; } +static struct cifs_tcon * +cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) +{ + struct cifs_tcon *ret; + + cifs_mount_lock(); + ret = __cifs_construct_tcon(cifs_sb, fsuid); + cifs_mount_unlock(); + return ret; +} + struct cifs_tcon * cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) { diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index bdcbe6ff2739..b7bfe705b2c4 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -37,7 +37,7 @@ #include "rfc1002pdu.h" #include "fs_context.h" -static DEFINE_MUTEX(cifs_mount_mutex); +DEFINE_MUTEX(cifs_mount_mutex); static const match_table_t cifs_smb_version_tokens = { { Smb_1, SMB1_VERSION_STRING }, @@ -783,9 +783,9 @@ static int smb3_get_tree(struct fs_context *fc) if (err) return err; - mutex_lock(&cifs_mount_mutex); + cifs_mount_lock(); ret = smb3_get_tree_common(fc); - mutex_unlock(&cifs_mount_mutex); + cifs_mount_unlock(); return ret; } diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 7863f2248c4d..8a35645e0b65 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -304,4 +304,16 @@ extern void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb); #define MAX_CACHED_FIDS 16 extern char *cifs_sanitize_prepath(char *prepath, gfp_t gfp); +extern struct mutex cifs_mount_mutex; + +static inline void cifs_mount_lock(void) +{ + mutex_lock(&cifs_mount_mutex); +} + +static inline void cifs_mount_unlock(void) +{ + mutex_unlock(&cifs_mount_mutex); +} + #endif -- cgit v1.2.3 From c85c9ab926a592e2f59f7d9a6ca7d6562843d8fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Apr 2024 16:47:54 +0200 Subject: nvme: split nvme_update_zone_info nvme_update_zone_info does (admin queue) I/O to the device and can fail. We fail to abort the queue limits update if that happen, but really should avoid with the frozen I/O queue as much as possible anyway. Split the logic into a helper to query the information that can be called on an unfrozen queue and one to apply it to the queue limits. Fixes: 9b130d681443 ("nvme: use the atomic queue limits update API") Reported-by: Kanchan Joshi Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 19 +++++++++++-------- drivers/nvme/host/nvme.h | 12 ++++++++++-- drivers/nvme/host/zns.c | 33 ++++++++++++++++++++------------- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0cf46068f1d0..504dc352c458 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2076,6 +2076,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT; struct queue_limits lim; struct nvme_id_ns_nvm *nvm = NULL; + struct nvme_zone_info zi = {}; struct nvme_id_ns *id; sector_t capacity; unsigned lbaf; @@ -2091,6 +2092,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, ret = -ENODEV; goto out; } + lbaf = nvme_lbaf_index(id->flbas); if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) { ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm); @@ -2098,8 +2100,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, goto out; } + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + ns->head->ids.csi == NVME_CSI_ZNS) { + ret = nvme_query_zone_info(ns, lbaf, &zi); + if (ret < 0) + goto out; + } + blk_mq_freeze_queue(ns->disk->queue); - lbaf = nvme_lbaf_index(id->flbas); ns->head->lba_shift = id->lbaf[lbaf].ds; ns->head->nuse = le64_to_cpu(id->nuse); capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze)); @@ -2112,13 +2120,8 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, capacity = 0; nvme_config_discard(ns, &lim); if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - ns->head->ids.csi == NVME_CSI_ZNS) { - ret = nvme_update_zone_info(ns, lbaf, &lim); - if (ret) { - blk_mq_unfreeze_queue(ns->disk->queue); - goto out; - } - } + ns->head->ids.csi == NVME_CSI_ZNS) + nvme_update_zone_info(ns, &lim, &zi); ret = queue_limits_commit_update(ns->disk->queue, &lim); if (ret) { blk_mq_unfreeze_queue(ns->disk->queue); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 24193fcb8bd5..d0ed64dc7380 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1036,10 +1036,18 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk) } #endif /* CONFIG_NVME_MULTIPATH */ +struct nvme_zone_info { + u64 zone_size; + unsigned int max_open_zones; + unsigned int max_active_zones; +}; + int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); -int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, - struct queue_limits *lim); +int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf, + struct nvme_zone_info *zi); +void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, + struct nvme_zone_info *zi); #ifdef CONFIG_BLK_DEV_ZONED blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd, diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 722384bcc765..77aa0f440a6d 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -35,8 +35,8 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl) return 0; } -int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, - struct queue_limits *lim) +int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf, + struct nvme_zone_info *zi) { struct nvme_effects_log *log = ns->head->effects; struct nvme_command c = { }; @@ -89,27 +89,34 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, goto free_data; } - ns->head->zsze = - nvme_lba_to_sect(ns->head, le64_to_cpu(id->lbafe[lbaf].zsze)); - if (!is_power_of_2(ns->head->zsze)) { + zi->zone_size = le64_to_cpu(id->lbafe[lbaf].zsze); + if (!is_power_of_2(zi->zone_size)) { dev_warn(ns->ctrl->device, - "invalid zone size:%llu for namespace:%u\n", - ns->head->zsze, ns->head->ns_id); + "invalid zone size: %llu for namespace: %u\n", + zi->zone_size, ns->head->ns_id); status = -ENODEV; goto free_data; } + zi->max_open_zones = le32_to_cpu(id->mor) + 1; + zi->max_active_zones = le32_to_cpu(id->mar) + 1; - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue); - lim->zoned = 1; - lim->max_open_zones = le32_to_cpu(id->mor) + 1; - lim->max_active_zones = le32_to_cpu(id->mar) + 1; - lim->chunk_sectors = ns->head->zsze; - lim->max_zone_append_sectors = ns->ctrl->max_zone_append; free_data: kfree(id); return status; } +void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, + struct nvme_zone_info *zi) +{ + lim->zoned = 1; + lim->max_open_zones = zi->max_open_zones; + lim->max_active_zones = zi->max_active_zones; + lim->max_zone_append_sectors = ns->ctrl->max_zone_append; + lim->chunk_sectors = ns->head->zsze = + nvme_lba_to_sect(ns->head, zi->zone_size); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue); +} + static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, unsigned int nr_zones, size_t *buflen) { -- cgit v1.2.3 From 538d505fde20393bce1e6fb95cec82b56cdd22ef Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 22 Jan 2024 15:22:35 +0100 Subject: tools/power turbostat: Read base_hz and bclk from CPUID.16H if available If MSRs cannot be read, values can be obtained from cpuid. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a4a40a6e1b95..c35c48b6a99a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5848,6 +5848,15 @@ void process_cpuid() base_mhz = max_mhz = bus_mhz = edx = 0; __cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx); + + bclk = bus_mhz; + + base_hz = base_mhz * 1000000; + has_base_hz = 1; + + if (platform->enable_tsc_tweak) + tsc_tweak = base_hz / tsc_hz; + if (!quiet) fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n", base_mhz, max_mhz, bus_mhz); -- cgit v1.2.3 From b6fe938317eed58e8c687bd5965a956e15fb5828 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 19 Jan 2024 12:25:42 -0600 Subject: tools/power turbostat: Fix warning upon failed /dev/cpu_dma_latency read Previously a failed read of /dev/cpu_dma_latency erroneously complained turbostat: capget(CAP_SYS_ADMIN) failed, try "# setcap cap_sys_admin=ep ./turbostat This went unnoticed because this file is typically visible to root, and turbostat was typically run as root. Going forward, when a non-root user can run turbostat... Complain about failed read access to this file only if --debug is used. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c35c48b6a99a..531f37e5f92a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5545,7 +5545,8 @@ void print_dev_latency(void) fd = open(path, O_RDONLY); if (fd < 0) { - warnx("capget(CAP_SYS_ADMIN) failed, try \"# setcap cap_sys_admin=ep %s\"", progname); + if (debug) + warnx("Read %s failed", path); return; } -- cgit v1.2.3 From 2d2ccd57338779469777d4319152151272994182 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 5 Feb 2024 15:56:25 -0600 Subject: tools/power turbostat: enhance -D (debug counter dump) output Eliminate redundant debug output for core and package scope counters. Include name and path for all "ADDED" counters. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 531f37e5f92a..60432753fe6a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1673,11 +1673,13 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "SMI: %d\n", t->smi_count); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, t->counter[i]); + outp += + sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + t->counter[i], mp->path); } } - if (c) { + if (c && is_cpu_first_thread_in_core(t, c, p)) { outp += sprintf(outp, "core: %d\n", c->core_id); outp += sprintf(outp, "c3: %016llX\n", c->c3); outp += sprintf(outp, "c6: %016llX\n", c->c6); @@ -1687,12 +1689,14 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "Joules: %0X\n", c->core_energy); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, c->counter[i]); + outp += + sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + c->counter[i], mp->path); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } - if (p) { + if (p && is_cpu_first_core_in_package(t, c, p)) { outp += sprintf(outp, "package: %d\n", p->package_id); outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0); @@ -1721,7 +1725,9 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, p->counter[i]); + outp += + sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, + p->counter[i], mp->path); } } -- cgit v1.2.3 From 3e4048466c396cff52c6d435156dbcd0571e4381 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 11 Jan 2024 15:48:09 +0100 Subject: tools/power turbostat: Add --no-msr option Add --no-msr option to allow users to run turbostat without accessing MSRs via the MSR driver. Signed-off-by: Patryk Wlazlyn Reviewed-by: Len Brown Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 + tools/power/x86/turbostat/turbostat.c | 205 ++++++++++++++++++++++++---------- 2 files changed, 151 insertions(+), 56 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 1ba6340d3b3d..28be452fbfe2 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -67,6 +67,8 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--quiet\fP Do not decode and print the system configuration header information. .PP ++\fB--no-msr\fP Disable all the uses of the MSR driver. ++.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 60432753fe6a..4d5437a9725b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -36,6 +36,7 @@ #include #include #include +#include #define UNUSED(x) (void)(x) @@ -265,6 +266,7 @@ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; int ignore_stdin; +bool no_msr; int get_msr(int cpu, off_t offset, unsigned long long *msr); @@ -1282,13 +1284,36 @@ int get_msr_fd(int cpu) sprintf(pathname, "/dev/cpu/%d/msr", cpu); fd = open(pathname, O_RDONLY); if (fd < 0) - err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, or run as root", pathname); + err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, " + "or run with --no-msr, or run as root", pathname); fd_percpu[cpu] = fd; return fd; } +static void bic_disable_msr_access(void) +{ + const unsigned long bic_msrs = + BIC_Avg_MHz | + BIC_Busy | + BIC_Bzy_MHz | + BIC_SMI | + BIC_CPU_c1 | + BIC_CPU_c3 | + BIC_CPU_c6 | + BIC_CPU_c7 | + BIC_Mod_c6 | + BIC_CoreTmp | + BIC_Totl_c0 | + BIC_Any_c0 | + BIC_GFX_c0 | + BIC_CPUGFX | + BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; + + bic_enabled &= ~bic_msrs; +} + static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); @@ -1328,6 +1353,8 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) { ssize_t retval; + assert(!no_msr); + retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset); if (retval != sizeof *msr) @@ -1371,6 +1398,7 @@ void help(void) " Override default 5-second measurement interval\n" " -J, --Joules displays energy in Joules instead of Watts\n" " -l, --list list column headers only\n" + " -M, --no-msr Disable all uses of the MSR driver\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" " -N, --header_iterations num\n" @@ -2597,6 +2625,7 @@ unsigned long long snapshot_sysfs_counter(char *path) int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) { if (mp->msr_num != 0) { + assert(!no_msr); if (get_msr(cpu, mp->msr_num, counterp)) return -1; } else { @@ -2646,6 +2675,9 @@ int get_epb(int cpu) return epb; msr_fallback: + if (no_msr) + return -1; + get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr); return msr & 0xf; @@ -2865,7 +2897,7 @@ retry: if (DO_BIC(BIC_CORE_THROT_CNT)) get_core_throt_cnt(cpu, &c->core_throt_cnt); - if (platform->rapl_msrs & RAPL_AMD_F17H) { + if ((platform->rapl_msrs & RAPL_AMD_F17H) && !no_msr) { if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) return -14; c->core_energy = msr & 0xFFFFFFFF; @@ -2930,41 +2962,44 @@ retry: if (DO_BIC(BIC_SYS_LPI)) p->sys_lpi = cpuidle_cur_sys_lpi_us; - if (platform->rapl_msrs & RAPL_PKG) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) - return -13; - p->energy_pkg = msr; - } - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { - if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) - return -14; - p->energy_cores = msr; - } - if (platform->rapl_msrs & RAPL_DRAM) { - if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) - return -15; - p->energy_dram = msr; - } - if (platform->rapl_msrs & RAPL_GFX) { - if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) - return -16; - p->energy_gfx = msr; - } - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) - return -16; - p->rapl_pkg_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) - return -16; - p->rapl_dram_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_AMD_F17H) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) - return -13; - p->energy_pkg = msr; + if (!no_msr) { + if (platform->rapl_msrs & RAPL_PKG) { + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) + return -13; + p->energy_pkg = msr; + } + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { + if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) + return -14; + p->energy_cores = msr; + } + if (platform->rapl_msrs & RAPL_DRAM) { + if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) + return -15; + p->energy_dram = msr; + } + if (platform->rapl_msrs & RAPL_GFX) { + if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) + return -16; + p->energy_gfx = msr; + } + if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { + if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) + return -16; + p->rapl_pkg_perf_status = msr; + } + if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { + if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) + return -16; + p->rapl_dram_perf_status = msr; + } + if (platform->rapl_msrs & RAPL_AMD_F17H) { + if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) + return -13; + p->energy_pkg = msr; + } } + if (DO_BIC(BIC_PkgTmp)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return -17; @@ -3072,7 +3107,7 @@ void probe_cst_limit(void) unsigned long long msr; int *pkg_cstate_limits; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; switch (platform->cst_limit) { @@ -3116,7 +3151,7 @@ static void dump_platform_info(void) unsigned long long msr; unsigned int ratio; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); @@ -3134,7 +3169,7 @@ static void dump_power_ctl(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); @@ -3340,7 +3375,7 @@ static void dump_cst_cfg(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); @@ -3412,7 +3447,7 @@ void print_irtl(void) { unsigned long long msr; - if (!platform->has_irtl_msrs) + if (!platform->has_irtl_msrs || no_msr) return; if (platform->supported_cstates & PC3) { @@ -4193,6 +4228,8 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr) int ret, idx; unsigned long long msr_cur, msr_last; + assert(!no_msr); + if (!per_cpu_msr_sum) return 1; @@ -4221,6 +4258,8 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg UNUSED(c); UNUSED(p); + assert(!no_msr); + for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) { unsigned long long msr_cur, msr_last; off_t offset; @@ -4465,7 +4504,7 @@ void check_permissions(void) sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (euidaccess(pathname, R_OK)) { do_exit++; - warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr"); + warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr, or run with --no-msr"); } /* if all else fails, thell them to be root */ @@ -4482,7 +4521,7 @@ void probe_bclk(void) unsigned long long msr; unsigned int base_ratio; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; if (platform->bclk_freq == BCLK_100MHZ) @@ -4522,7 +4561,7 @@ static void dump_turbo_ratio_info(void) if (!has_turbo) return; - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) return; if (platform->trl_msrs & TRL_LIMIT2) @@ -4845,6 +4884,9 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + if (!has_hwp) return 0; @@ -4931,6 +4973,9 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + cpu = t->cpu_id; /* per-package */ @@ -5264,7 +5309,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) */ void probe_rapl(void) { - if (!platform->rapl_msrs) + if (!platform->rapl_msrs || no_msr) return; if (genuine_intel) @@ -5320,7 +5365,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk } /* Temperature Target MSR is Nehalem and newer only */ - if (!platform->has_nhm_msrs) + if (!platform->has_nhm_msrs || no_msr) goto guess; if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) @@ -5367,6 +5412,9 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p UNUSED(c); UNUSED(p); + if (no_msr) + return 0; + if (!(do_dts || do_ptm)) return 0; @@ -5464,6 +5512,9 @@ void decode_feature_control_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr)) fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n", base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : ""); @@ -5473,6 +5524,9 @@ void decode_misc_enable_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!genuine_intel) return; @@ -5490,6 +5544,9 @@ void decode_misc_feature_control(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_misc_feature_control) return; @@ -5511,6 +5568,9 @@ void decode_misc_pwr_mgmt_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_misc_pwr_mgmt) return; @@ -5530,6 +5590,9 @@ void decode_c6_demotion_policy_msr(void) { unsigned long long msr; + if (no_msr) + return; + if (!platform->has_msr_c6_demotion_policy_config) return; @@ -5626,7 +5689,7 @@ void probe_cstates(void) if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); - if (platform->has_ext_cst_msrs) { + if (platform->has_ext_cst_msrs && !no_msr) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); BIC_PRESENT(BIC_GFX_c0); @@ -5714,10 +5777,12 @@ void process_cpuid() ecx_flags = ecx; edx_flags = edx; - if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) - warnx("get_msr(UCODE)"); - else - ucode_patch_valid = true; + if (!no_msr) { + if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) + warnx("get_msr(UCODE)"); + else + ucode_patch_valid = true; + } /* * check max extended function levels of CPUID. @@ -5892,7 +5957,7 @@ void probe_pm_features(void) probe_thermal(); - if (platform->has_nhm_msrs) + if (platform->has_nhm_msrs && !no_msr) BIC_PRESENT(BIC_SMI); if (!quiet) @@ -6252,8 +6317,10 @@ void turbostat_init() { setup_all_buffers(true); set_base_cpu(); - check_dev_msr(); - check_permissions(); + if (!no_msr) { + check_dev_msr(); + check_permissions(); + } process_cpuid(); probe_pm_features(); linux_perf_init(); @@ -6370,6 +6437,9 @@ int add_counter(unsigned int msr_num, char *path, char *name, { struct msr_counter *msrp; + if (no_msr && msr_num) + errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); + msrp = calloc(1, sizeof(struct msr_counter)); if (msrp == NULL) { perror("calloc"); @@ -6674,6 +6744,7 @@ void cmdline(int argc, char **argv) { "list", no_argument, 0, 'l' }, { "out", required_argument, 0, 'o' }, { "quiet", no_argument, 0, 'q' }, + { "no-msr", no_argument, 0, 'M' }, { "show", required_argument, 0, 's' }, { "Summary", no_argument, 0, 'S' }, { "TCC", required_argument, 0, 'T' }, @@ -6683,7 +6754,22 @@ void cmdline(int argc, char **argv) progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) { + /* + * Parse some options early, because they may make other options invalid, + * like adding the MSR counter with --add and at the same time using --no-msr. + */ + while ((opt = getopt_long_only(argc, argv, "M", long_options, &option_index)) != -1) { + switch (opt) { + case 'M': + no_msr = 1; + break; + default: + break; + } + } + optind = 0; + + while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) { switch (opt) { case 'a': parse_add_command(optarg); @@ -6741,6 +6827,9 @@ void cmdline(int argc, char **argv) case 'q': quiet = 1; break; + case 'M': + /* Parsed earlier */ + break; case 'n': num_iterations = strtod(optarg, NULL); @@ -6817,6 +6906,9 @@ skip_cgroup_setting: outf = stderr; cmdline(argc, argv); + if (no_msr) + bic_disable_msr_access(); + if (!quiet) { print_version(); print_bootcmd(); @@ -6829,7 +6921,8 @@ skip_cgroup_setting: turbostat_init(); - msr_sum_record(); + if (!no_msr) + msr_sum_record(); /* dump counters and exit */ if (dump_only) -- cgit v1.2.3 From a0e86c90b83c118985260e36490583b5a38d4359 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 11 Jan 2024 15:58:02 +0100 Subject: tools/power turbostat: Add --no-perf option Add the --no-perf option to allow users to run turbostat without accessing perf. Signed-off-by: Patryk Wlazlyn Reviewed-by: Len Brown Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 ++ tools/power/x86/turbostat/turbostat.c | 25 ++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 28be452fbfe2..567327b004e6 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -69,6 +69,8 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP +\fB--no-msr\fP Disable all the uses of the MSR driver. +.PP ++\fB--no-perf\fP Disable all the uses of the perf API. ++.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4d5437a9725b..bad2fec7f342 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -267,6 +267,7 @@ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; int ignore_stdin; bool no_msr; +bool no_perf; int get_msr(int cpu, off_t offset, unsigned long long *msr); @@ -1314,8 +1315,17 @@ static void bic_disable_msr_access(void) bic_enabled &= ~bic_msrs; } +static void bic_disable_perf_access(void) +{ + const unsigned long bic_perf = BIC_IPC; + + bic_enabled &= ~bic_perf; +} + static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { + assert(!no_perf); + return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } @@ -1332,8 +1342,8 @@ static int perf_instr_count_open(int cpu_num) /* counter for cpu_num, including user + kernel and all processes */ fd = perf_event_open(&pea, -1, cpu_num, -1, 0); if (fd == -1) { - warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"", progname); - BIC_NOT_PRESENT(BIC_IPC); + warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\" or use --no-perf", progname); + bic_disable_perf_access(); } return fd; @@ -1399,6 +1409,7 @@ void help(void) " -J, --Joules displays energy in Joules instead of Watts\n" " -l, --list list column headers only\n" " -M, --no-msr Disable all uses of the MSR driver\n" + " -P, --no-perf Disable all uses of the perf API\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" " -N, --header_iterations num\n" @@ -6745,6 +6756,7 @@ void cmdline(int argc, char **argv) { "out", required_argument, 0, 'o' }, { "quiet", no_argument, 0, 'q' }, { "no-msr", no_argument, 0, 'M' }, + { "no-perf", no_argument, 0, 'P' }, { "show", required_argument, 0, 's' }, { "Summary", no_argument, 0, 'S' }, { "TCC", required_argument, 0, 'T' }, @@ -6758,11 +6770,14 @@ void cmdline(int argc, char **argv) * Parse some options early, because they may make other options invalid, * like adding the MSR counter with --add and at the same time using --no-msr. */ - while ((opt = getopt_long_only(argc, argv, "M", long_options, &option_index)) != -1) { + while ((opt = getopt_long_only(argc, argv, "MP", long_options, &option_index)) != -1) { switch (opt) { case 'M': no_msr = 1; break; + case 'P': + no_perf = 1; + break; default: break; } @@ -6828,6 +6843,7 @@ void cmdline(int argc, char **argv) quiet = 1; break; case 'M': + case 'P': /* Parsed earlier */ break; case 'n': @@ -6909,6 +6925,9 @@ skip_cgroup_setting: if (no_msr) bic_disable_msr_access(); + if (no_perf) + bic_disable_perf_access(); + if (!quiet) { print_version(); print_bootcmd(); -- cgit v1.2.3 From e48934c9f1048ed4640b60321baf1986d1a470e1 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 11 Jan 2024 16:42:22 +0100 Subject: tools/power turbostat: Add reading aperf and mperf via perf API By using the perf API we spend less time in between the reads of the counters, resulting in more accurate calculations of the dependent metrics. Using perf API is also usually faster overall, although cache miss, if we get one, is more costly when using perf vs MSR driver. We would fallback to the msr reads if the sysfs isn't there or when in --no-perf mode. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 374 +++++++++++++++++++++++++++------- 1 file changed, 301 insertions(+), 73 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bad2fec7f342..1294c46c2170 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -59,6 +59,7 @@ enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; +enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; struct msr_counter { unsigned int msr_num; @@ -207,10 +208,13 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) +struct amperf_group_fd; + char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; +struct amperf_group_fd *fd_amperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. */ struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; @@ -268,6 +272,7 @@ unsigned int first_counter_read = 1; int ignore_stdin; bool no_msr; bool no_perf; +enum amperf_source amperf_source; int get_msr(int cpu, off_t offset, unsigned long long *msr); @@ -1296,9 +1301,6 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { const unsigned long bic_msrs = - BIC_Avg_MHz | - BIC_Busy | - BIC_Bzy_MHz | BIC_SMI | BIC_CPU_c1 | BIC_CPU_c3 | @@ -1329,21 +1331,30 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } -static int perf_instr_count_open(int cpu_num) +static long open_perf_counter_or_fail(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format) { - struct perf_event_attr pea; - int fd; + struct perf_event_attr attr; + const pid_t pid = -1; + const unsigned long flags = 0; + + memset(&attr, 0, sizeof(struct perf_event_attr)); - memset(&pea, 0, sizeof(struct perf_event_attr)); - pea.type = PERF_TYPE_HARDWARE; - pea.size = sizeof(struct perf_event_attr); - pea.config = PERF_COUNT_HW_INSTRUCTIONS; + attr.type = type; + attr.size = sizeof(struct perf_event_attr); + attr.config = config; + attr.disabled = 0; + attr.sample_type = PERF_SAMPLE_IDENTIFIER; + attr.read_format = read_format; - /* counter for cpu_num, including user + kernel and all processes */ - fd = perf_event_open(&pea, -1, cpu_num, -1, 0); + const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags); if (fd == -1) { - warnx("capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\" or use --no-perf", progname); - bic_disable_perf_access(); + if (errno == EACCES) { + errx(1, "capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"" + " or use --no-perf or run as root", progname); + } else { + perror("perf_event_open"); + errx(1, "use --no-perf or run as root"); + } } return fd; @@ -1354,7 +1365,8 @@ int get_instr_count_fd(int cpu) if (fd_instr_count_percpu[cpu]) return fd_instr_count_percpu[cpu]; - fd_instr_count_percpu[cpu] = perf_instr_count_open(cpu); + fd_instr_count_percpu[cpu] = + open_perf_counter_or_fail(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); return fd_instr_count_percpu[cpu]; } @@ -2762,6 +2774,175 @@ int get_core_throt_cnt(int cpu, unsigned long long *cnt) return 0; } +struct amperf_group_fd { + int aperf; /* Also the group descriptor */ + int mperf; +}; + +static unsigned int read_perf_counter_info(const char *const path, const char *const parse_format) +{ + int fdmt; + char buf[16]; + unsigned int v; + + fdmt = open(path, O_RDONLY, 0); + if (fdmt == -1) + errx(1, "Failed to read perf counter info %s\n", path); + + if (read(fdmt, buf, sizeof(buf)) <= 0) + return 0; + + buf[sizeof(buf) - 1] = '\0'; + + if (sscanf(buf, parse_format, &v) != 1) + errx(1, "Failed to parse perf counter info %s\n", path); + + close(fdmt); + + return v; +} + +static unsigned read_msr_type(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/type"; + const char *const format = "%u"; + + return read_perf_counter_info(path, format); +} + +static unsigned read_aperf_config(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; + const char *const format = "event=%x"; + + return read_perf_counter_info(path, format); +} + +static unsigned read_mperf_config(void) +{ + const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; + const char *const format = "event=%x"; + + return read_perf_counter_info(path, format); +} + +static struct amperf_group_fd open_amperf_fd(int cpu) +{ + const unsigned int msr_type = read_msr_type(); + const unsigned int aperf_config = read_aperf_config(); + const unsigned int mperf_config = read_mperf_config(); + struct amperf_group_fd fds = {.aperf = -1,.mperf = -1 }; + + fds.aperf = open_perf_counter_or_fail(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); + fds.mperf = open_perf_counter_or_fail(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); + + return fds; +} + +static int get_amperf_fd(int cpu) +{ + assert(fd_amperf_percpu); + + if (fd_amperf_percpu[cpu].aperf) + return fd_amperf_percpu[cpu].aperf; + + fd_amperf_percpu[cpu] = open_amperf_fd(cpu); + + return fd_amperf_percpu[cpu].aperf; +} + +/* Read APERF, MPERF and TSC using the perf API. */ +static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) +{ + union { + struct { + unsigned long nr_entries; + unsigned long aperf; + unsigned long mperf; + }; + + unsigned long as_array[3]; + } cnt; + + const int fd_amperf = get_amperf_fd(cpu); + + /* + * Read the TSC with rdtsc, because we want the absolute value and not + * the offset from the start of the counter. + */ + t->tsc = rdtsc(); + + const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); + if (n != sizeof(cnt.as_array)) + return -2; + + t->aperf = cnt.aperf * aperf_mperf_multiplier; + t->mperf = cnt.mperf * aperf_mperf_multiplier; + + return 0; +} + +/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */ +static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu) +{ + unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; + int aperf_mperf_retry_count = 0; + + /* + * The TSC, APERF and MPERF must be read together for + * APERF/MPERF and MPERF/TSC to give accurate results. + * + * Unfortunately, APERF and MPERF are read by + * individual system call, so delays may occur + * between them. If the time to read them + * varies by a large amount, we re-read them. + */ + + /* + * This initial dummy APERF read has been seen to + * reduce jitter in the subsequent reads. + */ + + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) + return -3; + +retry: + t->tsc = rdtsc(); /* re-read close to APERF */ + + tsc_before = t->tsc; + + if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) + return -3; + + tsc_between = rdtsc(); + + if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) + return -4; + + tsc_after = rdtsc(); + + aperf_time = tsc_between - tsc_before; + mperf_time = tsc_after - tsc_between; + + /* + * If the system call latency to read APERF and MPERF + * differ by more than 2x, then try again. + */ + if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { + aperf_mperf_retry_count++; + if (aperf_mperf_retry_count < 5) + goto retry; + else + warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); + } + aperf_mperf_retry_count = 0; + + t->aperf = t->aperf * aperf_mperf_multiplier; + t->mperf = t->mperf * aperf_mperf_multiplier; + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -2771,7 +2952,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int cpu = t->cpu_id; unsigned long long msr; - int aperf_mperf_retry_count = 0; struct msr_counter *mp; int i; @@ -2784,63 +2964,26 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (first_counter_read) get_apic_id(t); -retry: + t->tsc = rdtsc(); /* we are running on local CPU of interest */ if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) || soft_c1_residency_display(BIC_Avg_MHz)) { - unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; - - /* - * The TSC, APERF and MPERF must be read together for - * APERF/MPERF and MPERF/TSC to give accurate results. - * - * Unfortunately, APERF and MPERF are read by - * individual system call, so delays may occur - * between them. If the time to read them - * varies by a large amount, we re-read them. - */ - - /* - * This initial dummy APERF read has been seen to - * reduce jitter in the subsequent reads. - */ - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - t->tsc = rdtsc(); /* re-read close to APERF */ - - tsc_before = t->tsc; - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - tsc_between = rdtsc(); - - if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) - return -4; + int status = -1; - tsc_after = rdtsc(); + assert(!no_perf || !no_msr); - aperf_time = tsc_between - tsc_before; - mperf_time = tsc_after - tsc_between; - - /* - * If the system call latency to read APERF and MPERF - * differ by more than 2x, then try again. - */ - if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { - aperf_mperf_retry_count++; - if (aperf_mperf_retry_count < 5) - goto retry; - else - warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); + switch (amperf_source) { + case AMPERF_SOURCE_PERF: + status = read_aperf_mperf_tsc_perf(t, cpu); + break; + case AMPERF_SOURCE_MSR: + status = read_aperf_mperf_tsc_msr(t, cpu); + break; } - aperf_mperf_retry_count = 0; - t->aperf = t->aperf * aperf_mperf_multiplier; - t->mperf = t->mperf * aperf_mperf_multiplier; + if (status != 0) + return status; } if (DO_BIC(BIC_IPC)) @@ -3516,6 +3659,21 @@ void free_fd_percpu(void) free(fd_percpu); } +void free_fd_amperf_percpu(void) +{ + int i; + + for (i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_amperf_percpu[i].mperf != 0) + close(fd_amperf_percpu[i].mperf); + + if (fd_amperf_percpu[i].aperf != 0) + close(fd_amperf_percpu[i].aperf); + } + + free(fd_amperf_percpu); +} + void free_all_buffers(void) { int i; @@ -3557,6 +3715,7 @@ void free_all_buffers(void) outp = NULL; free_fd_percpu(); + free_fd_amperf_percpu(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -5647,17 +5806,62 @@ void print_dev_latency(void) */ void linux_perf_init(void) { - if (!BIC_IS_ENABLED(BIC_IPC)) - return; - if (access("/proc/sys/kernel/perf_event_paranoid", F_OK)) return; - fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); - if (fd_instr_count_percpu == NULL) - err(-1, "calloc fd_instr_count_percpu"); + if (BIC_IS_ENABLED(BIC_IPC) && has_aperf) { + fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); + if (fd_instr_count_percpu == NULL) + err(-1, "calloc fd_instr_count_percpu"); + } - BIC_PRESENT(BIC_IPC); + const bool aperf_required = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || + BIC_IS_ENABLED(BIC_Bzy_MHz) || BIC_IS_ENABLED(BIC_IPC); + if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { + fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); + if (fd_amperf_percpu == NULL) + err(-1, "calloc fd_amperf_percpu"); + } +} + +static int has_amperf_access_via_msr(void) +{ + const int cpu = sched_getcpu(); + unsigned long long dummy; + + if (get_msr(cpu, MSR_IA32_APERF, &dummy)) + return 0; + + if (get_msr(cpu, MSR_IA32_MPERF, &dummy)) + return 0; + + return 1; +} + +static int has_amperf_access_via_perf(void) +{ + if (access("/sys/bus/event_source/devices/msr/type", F_OK)) + return 0; + + if (access("/sys/bus/event_source/devices/msr/events/aperf", F_OK)) + return 0; + + if (access("/sys/bus/event_source/devices/msr/events/mperf", F_OK)) + return 0; + + return 1; +} + +/* Check if we can access APERF and MPERF */ +static int has_amperf_access(void) +{ + if (!no_msr && has_amperf_access_via_msr()) + return 1; + + if (!no_perf && has_amperf_access_via_perf()) + return 1; + + return 0; } void probe_cstates(void) @@ -5845,10 +6049,11 @@ void process_cpuid() __cpuid(0x6, eax, ebx, ecx, edx); has_aperf = ecx & (1 << 0); - if (has_aperf) { + if (has_aperf && has_amperf_access()) { BIC_PRESENT(BIC_Avg_MHz); BIC_PRESENT(BIC_Busy); BIC_PRESENT(BIC_Bzy_MHz); + BIC_PRESENT(BIC_IPC); } do_dts = eax & (1 << 0); if (do_dts) @@ -6324,6 +6529,19 @@ void set_base_cpu(void) err(-ENODEV, "No valid cpus found"); } +static void set_amperf_source(void) +{ + amperf_source = AMPERF_SOURCE_PERF; + + if (no_perf || !has_amperf_access_via_perf()) + amperf_source = AMPERF_SOURCE_MSR; + + if (quiet || !debug) + return; + + fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); +} + void turbostat_init() { setup_all_buffers(true); @@ -6334,6 +6552,7 @@ void turbostat_init() } process_cpuid(); probe_pm_features(); + set_amperf_source(); linux_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); @@ -6341,6 +6560,15 @@ void turbostat_init() if (DO_BIC(BIC_IPC)) (void)get_instr_count_fd(base_cpu); + + /* + * If TSC tweak is needed, but couldn't get it, + * disable more BICs, since it can't be reported accurately. + */ + if (platform->enable_tsc_tweak && !has_base_hz) { + bic_enabled &= ~BIC_Busy; + bic_enabled &= ~BIC_Bzy_MHz; + } } int fork_it(char **argv) -- cgit v1.2.3 From 5088741ec805cd249e27c7176ed09bdab164960e Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 15 Jan 2024 19:04:21 +0100 Subject: tools/power turbostat: detect and disable unavailable BICs at runtime To allow unprivileged user to run turbostat seamlessly. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 188 ++++++++++++++++++++++------------ 1 file changed, 125 insertions(+), 63 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1294c46c2170..30db6e92193a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1317,13 +1317,6 @@ static void bic_disable_msr_access(void) bic_enabled &= ~bic_msrs; } -static void bic_disable_perf_access(void) -{ - const unsigned long bic_perf = BIC_IPC; - - bic_enabled &= ~bic_perf; -} - static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { assert(!no_perf); @@ -1331,12 +1324,14 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } -static long open_perf_counter_or_fail(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format) +static long open_perf_counter(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format) { struct perf_event_attr attr; const pid_t pid = -1; const unsigned long flags = 0; + assert(!no_perf); + memset(&attr, 0, sizeof(struct perf_event_attr)); attr.type = type; @@ -1347,15 +1342,6 @@ static long open_perf_counter_or_fail(int cpu, unsigned int type, unsigned int c attr.read_format = read_format; const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags); - if (fd == -1) { - if (errno == EACCES) { - errx(1, "capget(CAP_PERFMON) failed, try \"# setcap cap_sys_admin=ep %s\"" - " or use --no-perf or run as root", progname); - } else { - perror("perf_event_open"); - errx(1, "use --no-perf or run as root"); - } - } return fd; } @@ -1365,8 +1351,7 @@ int get_instr_count_fd(int cpu) if (fd_instr_count_percpu[cpu]) return fd_instr_count_percpu[cpu]; - fd_instr_count_percpu[cpu] = - open_perf_counter_or_fail(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); + fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); return fd_instr_count_percpu[cpu]; } @@ -2833,8 +2818,8 @@ static struct amperf_group_fd open_amperf_fd(int cpu) const unsigned int mperf_config = read_mperf_config(); struct amperf_group_fd fds = {.aperf = -1,.mperf = -1 }; - fds.aperf = open_perf_counter_or_fail(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); - fds.mperf = open_perf_counter_or_fail(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); + fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); + fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); return fds; } @@ -4509,7 +4494,8 @@ release_msr: /* * set_my_sched_priority(pri) - * return previous + * return previous priority on success + * return value < -20 on failure */ int set_my_sched_priority(int priority) { @@ -4519,16 +4505,16 @@ int set_my_sched_priority(int priority) errno = 0; original_priority = getpriority(PRIO_PROCESS, 0); if (errno && (original_priority == -1)) - err(errno, "getpriority"); + return -21; retval = setpriority(PRIO_PROCESS, 0, priority); if (retval) - errx(retval, "capget(CAP_SYS_NICE) failed,try \"# setcap cap_sys_nice=ep %s\"", progname); + return -21; errno = 0; retval = getpriority(PRIO_PROCESS, 0); if (retval != priority) - err(retval, "getpriority(%d) != setpriority(%d)", retval, priority); + return -21; return original_priority; } @@ -4543,6 +4529,9 @@ void turbostat_loop() /* * elevate own priority for interval mode + * + * ignore on error - we probably don't have permission to set it, but + * it's not a big deal */ set_my_sched_priority(-20); @@ -4628,10 +4617,13 @@ void check_dev_msr() struct stat sb; char pathname[32]; + if (no_msr) + return; + sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (stat(pathname, &sb)) if (system("/sbin/modprobe msr > /dev/null 2>&1")) - err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" "); + no_msr = 1; } /* @@ -4643,47 +4635,51 @@ int check_for_cap_sys_rawio(void) { cap_t caps; cap_flag_value_t cap_flag_value; + int ret = 0; caps = cap_get_proc(); if (caps == NULL) - err(-6, "cap_get_proc\n"); + return 1; - if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) - err(-6, "cap_get\n"); + if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) { + ret = 1; + goto free_and_exit; + } if (cap_flag_value != CAP_SET) { - warnx("capget(CAP_SYS_RAWIO) failed," " try \"# setcap cap_sys_rawio=ep %s\"", progname); - return 1; + ret = 1; + goto free_and_exit; } +free_and_exit: if (cap_free(caps) == -1) err(-6, "cap_free\n"); - return 0; + return ret; } -void check_permissions(void) +void check_msr_permission(void) { - int do_exit = 0; + int failed = 0; char pathname[32]; + if (no_msr) + return; + /* check for CAP_SYS_RAWIO */ - do_exit += check_for_cap_sys_rawio(); + failed += check_for_cap_sys_rawio(); /* test file permissions */ sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (euidaccess(pathname, R_OK)) { - do_exit++; - warn("/dev/cpu/0/msr open failed, try chown or chmod +r /dev/cpu/*/msr, or run with --no-msr"); + failed++; } - /* if all else fails, thell them to be root */ - if (do_exit) - if (getuid() != 0) - warnx("... or simply run as root"); - - if (do_exit) - exit(-6); + if (failed) { + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", pathname, "--no-msr"); + no_msr = 1; + } } void probe_bclk(void) @@ -5800,6 +5796,28 @@ void print_dev_latency(void) close(fd); } +static int has_instr_count_access(void) +{ + int fd; + int has_access; + + if (no_perf) + return 0; + + fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0); + has_access = fd != -1; + + if (fd != -1) + close(fd); + + if (!has_access) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "instructions retired perf counter", "--no-perf"); + + return has_access; +} + /* * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover @@ -5826,13 +5844,15 @@ void linux_perf_init(void) static int has_amperf_access_via_msr(void) { - const int cpu = sched_getcpu(); unsigned long long dummy; - if (get_msr(cpu, MSR_IA32_APERF, &dummy)) + if (no_msr) + return 0; + + if (get_msr(base_cpu, MSR_IA32_APERF, &dummy)) return 0; - if (get_msr(cpu, MSR_IA32_MPERF, &dummy)) + if (get_msr(base_cpu, MSR_IA32_MPERF, &dummy)) return 0; return 1; @@ -5840,16 +5860,44 @@ static int has_amperf_access_via_msr(void) static int has_amperf_access_via_perf(void) { - if (access("/sys/bus/event_source/devices/msr/type", F_OK)) - return 0; + struct amperf_group_fd fds; - if (access("/sys/bus/event_source/devices/msr/events/aperf", F_OK)) - return 0; + /* + * Cache the last result, so we don't warn the user multiple times + * + * Negative means cached, no access + * Zero means not cached + * Positive means cached, has access + */ + static int has_access_cached; - if (access("/sys/bus/event_source/devices/msr/events/mperf", F_OK)) + if (no_perf) return 0; - return 1; + if (has_access_cached != 0) + return has_access_cached > 0; + + fds = open_amperf_fd(base_cpu); + has_access_cached = (fds.aperf != -1) && (fds.mperf != -1); + + if (fds.aperf == -1) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "APERF perf counter", "--no-perf"); + else + close(fds.aperf); + + if (fds.mperf == -1) + warnx("Failed to access %s. Some of the counters may not be available\n" + "\tRun as root to enable them or use %s to disable the access explicitly", + "MPERF perf counter", "--no-perf"); + else + close(fds.mperf); + + if (has_access_cached == 0) + has_access_cached = -1; + + return has_access_cached > 0; } /* Check if we can access APERF and MPERF */ @@ -6542,14 +6590,34 @@ static void set_amperf_source(void) fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); } +void check_msr_access(void) +{ + check_dev_msr(); + check_msr_permission(); + + if (no_msr) + bic_disable_msr_access(); +} + +void check_perf_access(void) +{ + if (no_perf || !has_instr_count_access()) + bic_enabled &= ~BIC_IPC; + + if (!has_amperf_access()) { + bic_enabled &= ~BIC_Avg_MHz; + bic_enabled &= ~BIC_Busy; + bic_enabled &= ~BIC_Bzy_MHz; + bic_enabled &= ~BIC_IPC; + } +} + void turbostat_init() { setup_all_buffers(true); set_base_cpu(); - if (!no_msr) { - check_dev_msr(); - check_permissions(); - } + check_msr_access(); + check_perf_access(); process_cpuid(); probe_pm_features(); set_amperf_source(); @@ -7150,12 +7218,6 @@ skip_cgroup_setting: outf = stderr; cmdline(argc, argv); - if (no_msr) - bic_disable_msr_access(); - - if (no_perf) - bic_disable_perf_access(); - if (!quiet) { print_version(); print_bootcmd(); -- cgit v1.2.3 From aed48c48fa65abdd584e14f7d0273711bc10d223 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 30 Jan 2024 23:57:07 +0100 Subject: tools/power turbostat: add early exits for permission checks Checking early if the permissions are even needed gets rid of the warnings about some of them missing. Earlier we issued a warning in case of missing MSR and/or perf permissions, even when user never asked for counters that require those. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 66 ++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 30db6e92193a..e5e01b58992e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5818,6 +5818,14 @@ static int has_instr_count_access(void) return has_access; } +bool is_aperf_access_required(void) +{ + return BIC_IS_ENABLED(BIC_Avg_MHz) + || BIC_IS_ENABLED(BIC_Busy) + || BIC_IS_ENABLED(BIC_Bzy_MHz) + || BIC_IS_ENABLED(BIC_IPC); +} + /* * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover @@ -5833,8 +5841,7 @@ void linux_perf_init(void) err(-1, "calloc fd_instr_count_percpu"); } - const bool aperf_required = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || - BIC_IS_ENABLED(BIC_Bzy_MHz) || BIC_IS_ENABLED(BIC_IPC); + const bool aperf_required = is_aperf_access_required(); if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); if (fd_amperf_percpu == NULL) @@ -5903,6 +5910,9 @@ static int has_amperf_access_via_perf(void) /* Check if we can access APERF and MPERF */ static int has_amperf_access(void) { + if (!is_aperf_access_required()) + return 0; + if (!no_msr && has_amperf_access_via_msr()) return 1; @@ -6581,7 +6591,8 @@ static void set_amperf_source(void) { amperf_source = AMPERF_SOURCE_PERF; - if (no_perf || !has_amperf_access_via_perf()) + const bool aperf_required = is_aperf_access_required(); + if (no_perf || !aperf_required || !has_amperf_access_via_perf()) amperf_source = AMPERF_SOURCE_MSR; if (quiet || !debug) @@ -6590,8 +6601,51 @@ static void set_amperf_source(void) fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); } +bool is_msr_access_required(void) +{ + /* TODO: add detection for dynamic counters from add_counter() */ + if (no_msr) + return false; + + return BIC_IS_ENABLED(BIC_SMI) + || BIC_IS_ENABLED(BIC_CPU_c1) + || BIC_IS_ENABLED(BIC_CPU_c3) + || BIC_IS_ENABLED(BIC_CPU_c6) + || BIC_IS_ENABLED(BIC_CPU_c7) + || BIC_IS_ENABLED(BIC_Mod_c6) + || BIC_IS_ENABLED(BIC_CoreTmp) + || BIC_IS_ENABLED(BIC_Totl_c0) + || BIC_IS_ENABLED(BIC_Any_c0) + || BIC_IS_ENABLED(BIC_GFX_c0) + || BIC_IS_ENABLED(BIC_CPUGFX) + || BIC_IS_ENABLED(BIC_Pkgpc3) + || BIC_IS_ENABLED(BIC_Pkgpc6) + || BIC_IS_ENABLED(BIC_Pkgpc2) + || BIC_IS_ENABLED(BIC_Pkgpc7) + || BIC_IS_ENABLED(BIC_Pkgpc8) + || BIC_IS_ENABLED(BIC_Pkgpc9) + || BIC_IS_ENABLED(BIC_Pkgpc10) + || BIC_IS_ENABLED(BIC_CorWatt) + || BIC_IS_ENABLED(BIC_Cor_J) + || BIC_IS_ENABLED(BIC_PkgWatt) + || BIC_IS_ENABLED(BIC_CorWatt) + || BIC_IS_ENABLED(BIC_GFXWatt) + || BIC_IS_ENABLED(BIC_RAMWatt) + || BIC_IS_ENABLED(BIC_Pkg_J) + || BIC_IS_ENABLED(BIC_Cor_J) + || BIC_IS_ENABLED(BIC_GFX_J) + || BIC_IS_ENABLED(BIC_RAM_J) + || BIC_IS_ENABLED(BIC_PKG__) + || BIC_IS_ENABLED(BIC_RAM__) + || BIC_IS_ENABLED(BIC_PkgTmp) + || (is_aperf_access_required() && !has_amperf_access_via_perf()); +} + void check_msr_access(void) { + if (!is_msr_access_required()) + no_msr = 1; + check_dev_msr(); check_msr_permission(); @@ -6601,10 +6655,12 @@ void check_msr_access(void) void check_perf_access(void) { - if (no_perf || !has_instr_count_access()) + const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC); + if (no_perf || !intrcount_required || !has_instr_count_access()) bic_enabled &= ~BIC_IPC; - if (!has_amperf_access()) { + const bool aperf_required = is_aperf_access_required(); + if (!aperf_required || !has_amperf_access()) { bic_enabled &= ~BIC_Avg_MHz; bic_enabled &= ~BIC_Busy; bic_enabled &= ~BIC_Bzy_MHz; -- cgit v1.2.3 From 4a1bb4dad5d16669e841410944e7bc84ef7263fc Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 14 Mar 2024 11:36:55 +0100 Subject: tools/power turbostat: Clear added counters when in no-msr mode If user request --no-msr or is not able to access the MSRs, turbostat should clear all the counters added with --add. Because MSR access permission checks are done after the cmdline is parsed, the decision has to be defered up until the transition into no-msr mode happen. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 47 ++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e5e01b58992e..b4a892bf22bf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1160,6 +1160,37 @@ struct sys_counters { struct msr_counter *pp; } sys; +void free_sys_counters(void) +{ + struct msr_counter *p = sys.tp, *pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + p = sys.cp, pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + p = sys.pp, pnext = NULL; + while (p) { + pnext = p->next; + free(p); + p = pnext; + } + + sys.added_thread_counters = 0; + sys.added_core_counters = 0; + sys.added_package_counters = 0; + sys.tp = NULL; + sys.cp = NULL; + sys.pp = NULL; +} + struct system_summary { struct thread_data threads; struct core_data cores; @@ -1315,6 +1346,8 @@ static void bic_disable_msr_access(void) BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_PkgTmp; bic_enabled &= ~bic_msrs; + + free_sys_counters(); } static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) @@ -6601,12 +6634,24 @@ static void set_amperf_source(void) fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); } +bool has_added_counters(void) +{ + /* + * It only makes sense to call this after the command line is parsed, + * otherwise sys structure is not populated. + */ + + return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters; +} + bool is_msr_access_required(void) { - /* TODO: add detection for dynamic counters from add_counter() */ if (no_msr) return false; + if (has_added_counters()) + return true; + return BIC_IS_ENABLED(BIC_SMI) || BIC_IS_ENABLED(BIC_CPU_c1) || BIC_IS_ENABLED(BIC_CPU_c3) -- cgit v1.2.3 From ebf8449caba1df2eb6ba0b465fe15dc06d3b9135 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 15 Feb 2024 12:50:19 +0100 Subject: tools/power turbostat: Add proper re-initialization for perf file descriptors Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b4a892bf22bf..a380829c5890 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3669,18 +3669,25 @@ void free_fd_percpu(void) { int i; + if (!fd_percpu) + return; + for (i = 0; i < topo.max_cpu_num + 1; ++i) { if (fd_percpu[i] != 0) close(fd_percpu[i]); } free(fd_percpu); + fd_percpu = NULL; } void free_fd_amperf_percpu(void) { int i; + if (!fd_amperf_percpu) + return; + for (i = 0; i < topo.max_cpu_num + 1; ++i) { if (fd_amperf_percpu[i].mperf != 0) close(fd_amperf_percpu[i].mperf); @@ -3690,6 +3697,21 @@ void free_fd_amperf_percpu(void) } free(fd_amperf_percpu); + fd_amperf_percpu = NULL; +} + +void free_fd_instr_count_percpu(void) +{ + if (!fd_instr_count_percpu) + return; + + for (int i = 0; i < topo.max_cpu_num + 1; ++i) { + if (fd_instr_count_percpu[i] != 0) + close(fd_instr_count_percpu[i]); + } + + free(fd_instr_count_percpu); + fd_instr_count_percpu = NULL; } void free_all_buffers(void) @@ -3733,6 +3755,7 @@ void free_all_buffers(void) outp = NULL; free_fd_percpu(); + free_fd_instr_count_percpu(); free_fd_amperf_percpu(); free(irq_column_2_cpu); @@ -4067,10 +4090,13 @@ static void update_effective_set(bool startup) err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str); } +void linux_perf_init(void); + void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); + linux_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } -- cgit v1.2.3 From 141fb8cd206ace23c02cd2791c6da52c1d77d42a Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 19 Mar 2024 10:54:22 -0700 Subject: btrfs: qgroup: correctly model root qgroup rsv in convert We use add_root_meta_rsv and sub_root_meta_rsv to track prealloc and pertrans reservations for subvolumes when quotas are enabled. The convert function does not properly increment pertrans after decrementing prealloc, so the count is not accurate. Note: we check that the fs is not read-only to mirror the logic in qgroup_convert_meta, which checks that before adding to the pertrans rsv. Fixes: 8287475a2055 ("btrfs: qgroup: Use root::qgroup_meta_rsv_* to record qgroup meta reserved space") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5f90f0605b12..cf8820ce7aa2 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4495,6 +4495,8 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); + if (!sb_rdonly(fs_info->sb)) + add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } /* -- cgit v1.2.3 From 74e97958121aa1f5854da6effba70143f051b0cd Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Mar 2024 10:02:04 -0700 Subject: btrfs: qgroup: fix qgroup prealloc rsv leak in subvolume operations Create subvolume, create snapshot and delete subvolume all use btrfs_subvolume_reserve_metadata() to reserve metadata for the changes done to the parent subvolume's fs tree, which cannot be mediated in the normal way via start_transaction. When quota groups (squota or qgroups) are enabled, this reserves qgroup metadata of type PREALLOC. Once the operation is associated to a transaction, we convert PREALLOC to PERTRANS, which gets cleared in bulk at the end of the transaction. However, the error paths of these three operations were not implementing this lifecycle correctly. They unconditionally converted the PREALLOC to PERTRANS in a generic cleanup step regardless of errors or whether the operation was fully associated to a transaction or not. This resulted in error paths occasionally converting this rsv to PERTRANS without calling record_root_in_trans successfully, which meant that unless that root got recorded in the transaction by some other thread, the end of the transaction would not free that root's PERTRANS, leaking it. Ultimately, this resulted in hitting a WARN in CONFIG_BTRFS_DEBUG builds at unmount for the leaked reservation. The fix is to ensure that every qgroup PREALLOC reservation observes the following properties: 1. any failure before record_root_in_trans is called successfully results in freeing the PREALLOC reservation. 2. after record_root_in_trans, we convert to PERTRANS, and now the transaction owns freeing the reservation. This patch enforces those properties on the three operations. Without it, generic/269 with squotas enabled at mkfs time would fail in ~5-10 runs on my system. With this patch, it ran successfully 1000 times in a row. Fixes: e85fde5162bf ("btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 13 ++++++++++++- fs/btrfs/ioctl.c | 37 ++++++++++++++++++++++++++++--------- fs/btrfs/root-tree.c | 10 ---------- fs/btrfs/root-tree.h | 2 -- 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 37701531eeb1..1f9e74c4161d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4503,6 +4503,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; u64 root_flags; + u64 qgroup_reserved = 0; int ret; down_write(&fs_info->subvol_sem); @@ -4547,12 +4548,20 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) goto out_undead; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release; } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -4611,7 +4620,9 @@ out_end_trans: ret = btrfs_end_transaction(trans); inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(root, &block_rsv); + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_undead: if (ret) { spin_lock(&dest->root_item_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 38459a89b27c..5a507237c4fa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -613,6 +613,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, int ret; dev_t anon_dev; u64 objectid; + u64 qgroup_reserved = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -650,13 +651,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap, trans_num_items, false); if (ret) goto out_new_inode_args; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(root, &block_rsv); - goto out_new_inode_args; + goto out_release_rsv; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) + goto out; + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; /* Tree log can't currently deal with an inode which is a new root. */ @@ -767,9 +773,11 @@ static noinline int create_subvol(struct mnt_idmap *idmap, out: trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(root, &block_rsv); - btrfs_end_transaction(trans); +out_release_rsv: + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: @@ -791,6 +799,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv *block_rsv; + u64 qgroup_reserved = 0; int ret; /* We do not support snapshotting right now. */ @@ -827,19 +837,19 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto free_pending; } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, - BTRFS_BLOCK_RSV_TEMP); + block_rsv = &pending_snapshot->block_rsv; + btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * 1 to add dir item * 1 to add dir index * 1 to update parent inode item */ trans_num_items = create_subvol_num_items(inherit) + 3; - ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv, trans_num_items, false); if (ret) goto free_pending; + qgroup_reserved = block_rsv->qgroup_rsv_reserved; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -852,6 +862,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = PTR_ERR(trans); goto fail; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) { + btrfs_end_transaction(trans); + goto fail; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->pending_snapshot = pending_snapshot; @@ -881,7 +898,9 @@ fail: if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); + btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 4bb538a372ce..7007f9e0c972 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -548,13 +548,3 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, } return ret; } - -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - u64 qgroup_to_release; - - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); - btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); -} diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index 6f929cf3bd49..8f5739e732b9 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -18,8 +18,6 @@ struct btrfs_trans_handle; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const struct fscrypt_str *name); -- cgit v1.2.3 From 71537e35c324ea6fbd68377a4f26bb93a831ae35 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Mar 2024 10:14:24 -0700 Subject: btrfs: record delayed inode root in transaction When running delayed inode updates, we do not record the inode's root in the transaction, but we do allocate PREALLOC and thus converted PERTRANS space for it. To be sure we free that PERTRANS meta rsv, we must ensure that we record the root in the transaction. Fixes: 4f5427ccce5d ("btrfs: delayed-inode: Use new qgroup meta rsv for delayed inode and item") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index dd6f566a383f..121ab890bd05 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1133,6 +1133,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_record_root_in_trans(trans, node->root); + if (ret) + return ret; ret = btrfs_update_delayed_inode(trans, node->root, path, node); return ret; } -- cgit v1.2.3 From 211de93367304ab395357f8cb12568a4d1e20701 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Mar 2024 10:18:39 -0700 Subject: btrfs: qgroup: convert PREALLOC to PERTRANS after record_root_in_trans The transaction is only able to free PERTRANS reservations for a root once that root has been recorded with the TRANS tag on the roots radix tree. Therefore, until we are sure that this root will get tagged, it isn't safe to convert. Generally, this is not an issue as *some* transaction will likely tag the root before long and this reservation will get freed in that transaction, but technically it could stick around until unmount and result in a warning about leaked metadata reservation space. This path is most exercised by running the generic/269 fstest with CONFIG_BTRFS_DEBUG. Fixes: a6496849671a ("btrfs: fix start transaction qgroup rsv double free") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 46e8426adf4f..1f10082d35d9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -745,14 +745,6 @@ again: h->reloc_reserved = reloc_reserved; } - /* - * Now that we have found a transaction to be a part of, convert the - * qgroup reservation from prealloc to pertrans. A different transaction - * can't race in and free our pertrans out from under us. - */ - if (qgroup_reserved) - btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); - got_it: if (!current->journal_info) current->journal_info = h; @@ -786,8 +778,15 @@ got_it: * not just freed. */ btrfs_end_transaction(h); - return ERR_PTR(ret); + goto reserve_fail; } + /* + * Now that we have found a transaction to be a part of, convert the + * qgroup reservation from prealloc to pertrans. A different transaction + * can't race in and free our pertrans out from under us. + */ + if (qgroup_reserved) + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); return h; -- cgit v1.2.3 From 3c6f0c5ecc8910d4ffb0dfe85609ebc0c91c8f34 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 26 Mar 2024 11:55:22 -0700 Subject: btrfs: make btrfs_clear_delalloc_extent() free delalloc reserve Currently, this call site in btrfs_clear_delalloc_extent() only converts the reservation. We are marking it not delalloc, so I don't think it makes sense to keep the rsv around. This is a path where we are not sure to join a transaction, so it leads to incorrect free-ing during umount. Helps with the pass rate of generic/269 and generic/475. Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f9e74c4161d..c65fe5de4022 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2533,7 +2533,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) - btrfs_delalloc_release_metadata(inode, len, false); + btrfs_delalloc_release_metadata(inode, len, true); /* For sanity tests. */ if (btrfs_is_testing(fs_info)) -- cgit v1.2.3 From 6e68de0bb0ed59e0554a0c15ede7308c47351e2d Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 26 Mar 2024 12:01:28 -0700 Subject: btrfs: always clear PERTRANS metadata during commit It is possible to clear a root's IN_TRANS tag from the radix tree, but not clear its PERTRANS, if there is some error in between. Eliminate that possibility by moving the free up to where we clear the tag. Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1f10082d35d9..85f359e0e0a7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1494,6 +1494,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + btrfs_qgroup_free_meta_all_pertrans(root); spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); @@ -1518,7 +1519,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) if (ret2) return ret2; spin_lock(&fs_info->fs_roots_radix_lock); - btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); -- cgit v1.2.3 From 97ca7c1f93bbac6982717a7055cd727813c45e61 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 25 Feb 2024 08:29:25 -0800 Subject: mean_and_variance: Drop always failing tests mean_and_variance_test_2 and mean_and_variance_test_4 always fail. The input parameters to those tests are identical to the input parameters to tests 1 and 3, yet the expected result for tests 2 and 4 is different for the mean and stddev tests. That will always fail. Expected mean_and_variance_get_mean(mv) == mean[i], but mean_and_variance_get_mean(mv) == 22 (0x16) mean[i] == 10 (0xa) Drop the bad tests. Fixes: 65bc41090720 ("mean and variance: More tests") Closes: https://lore.kernel.org/lkml/065b94eb-6a24-4248-b7d7-d3212efb4787@roeck-us.net/ Cc: Kent Overstreet Signed-off-by: Guenter Roeck Signed-off-by: Kent Overstreet --- fs/bcachefs/mean_and_variance_test.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index db63b3f3b338..4c298e74723d 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -136,20 +136,8 @@ static void mean_and_variance_test_1(struct kunit *test) d, mean, stddev, weighted_mean, weighted_stddev); } -static void mean_and_variance_test_2(struct kunit *test) -{ - s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; - s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 }; - s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 }; - s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; - s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - /* Test behaviour where we switch from one steady state to another: */ -static void mean_and_variance_test_3(struct kunit *test) +static void mean_and_variance_test_2(struct kunit *test) { s64 d[] = { 100, 100, 100, 100, 100 }; s64 mean[] = { 22, 32, 40, 46, 50 }; @@ -161,18 +149,6 @@ static void mean_and_variance_test_3(struct kunit *test) d, mean, stddev, weighted_mean, weighted_stddev); } -static void mean_and_variance_test_4(struct kunit *test) -{ - s64 d[] = { 100, 100, 100, 100, 100 }; - s64 mean[] = { 10, 11, 12, 13, 14 }; - s64 stddev[] = { 9, 13, 15, 17, 19 }; - s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; - s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - static void mean_and_variance_fast_divpow2(struct kunit *test) { s64 i; @@ -230,8 +206,6 @@ static struct kunit_case mean_and_variance_test_cases[] = { KUNIT_CASE(mean_and_variance_weighted_advanced_test), KUNIT_CASE(mean_and_variance_test_1), KUNIT_CASE(mean_and_variance_test_2), - KUNIT_CASE(mean_and_variance_test_3), - KUNIT_CASE(mean_and_variance_test_4), {} }; -- cgit v1.2.3 From 8a4ff5452dd0cdcc35940460bb777d836bece11c Mon Sep 17 00:00:00 2001 From: Stephen Horvath Date: Sun, 31 Mar 2024 18:37:06 +1000 Subject: ACPI: thermal: Register thermal zones without valid trip points Some laptops where the thermal control is handled by the EC may provide trip points that fail the kernels new validation, but still have working temperature sensors. An example of this is the Framework 13 AMD. This patch allows the thermal zone to still be registered without trip points if the trip points fail validation, allowing the temperature sensor to be viewed and used by the user. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218586 Fixes: 9c8647224e9f ("ACPI: thermal: Use library functions to obtain trip point temperature values") Signed-off-by: Stephen Horvath [ rjw: Subject edits, remove redundant braces ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/thermal.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 302dce0b2b50..d67881b50bca 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -662,14 +662,15 @@ static int acpi_thermal_register_thermal_zone(struct acpi_thermal *tz, { int result; - tz->thermal_zone = thermal_zone_device_register_with_trips("acpitz", - trip_table, - trip_count, - tz, - &acpi_thermal_zone_ops, - NULL, - passive_delay, - tz->polling_frequency * 100); + if (trip_count) + tz->thermal_zone = thermal_zone_device_register_with_trips( + "acpitz", trip_table, trip_count, tz, + &acpi_thermal_zone_ops, NULL, passive_delay, + tz->polling_frequency * 100); + else + tz->thermal_zone = thermal_tripless_zone_device_register( + "acpitz", tz, &acpi_thermal_zone_ops, NULL); + if (IS_ERR(tz->thermal_zone)) return PTR_ERR(tz->thermal_zone); @@ -901,11 +902,8 @@ static int acpi_thermal_add(struct acpi_device *device) trip++; } - if (trip == trip_table) { + if (trip == trip_table) pr_warn(FW_BUG "No valid trip points!\n"); - result = -ENODEV; - goto free_memory; - } result = acpi_thermal_register_thermal_zone(tz, trip_table, trip - trip_table, -- cgit v1.2.3 From e0319af2b6cdfa7c39edf73dcb813b7ff1261fa5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Apr 2024 16:42:27 -0400 Subject: bcachefs: Improve bch2_btree_update_to_text() Print out the mode as a string, and also print out the btree and watermark. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 41 +++++++++++++++++++++++++------------ fs/bcachefs/btree_update_interior.h | 24 ++++++++++++++-------- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 9fd2dd0f4682..2b46e6ad248a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -26,6 +26,13 @@ #include +const char * const bch2_btree_update_modes[] = { +#define x(t) #t, + BCH_WATERMARKS() +#undef x + NULL +}; + static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); @@ -846,11 +853,11 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); BUG_ON(!btree_node_dirty(b)); BUG_ON(!b->c.level); - as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->mode = BTREE_UPDATE_node; as->b = b; set_btree_node_write_blocked(b); @@ -873,7 +880,7 @@ static void btree_update_reparent(struct btree_update *as, lockdep_assert_held(&c->btree_interior_update_lock); child->b = NULL; - child->mode = BTREE_INTERIOR_UPDATING_AS; + child->mode = BTREE_UPDATE_update; bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, bch2_update_reparent_journal_pin_flush); @@ -884,7 +891,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) struct bkey_i *insert = &b->key; struct bch_fs *c = as->c; - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ARRAY_SIZE(as->journal_entries)); @@ -898,7 +905,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - as->mode = BTREE_INTERIOR_UPDATING_ROOT; + as->mode = BTREE_UPDATE_root; mutex_unlock(&c->btree_interior_update_lock); } @@ -1076,7 +1083,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * struct bch_fs *c = as->c; u64 start_time = as->start_time; - BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode == BTREE_UPDATE_none); if (as->took_gc_lock) up_read(&as->c->gc_lock); @@ -1172,7 +1179,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->c = c; as->start_time = start_time; as->ip_started = _RET_IP_; - as->mode = BTREE_INTERIOR_NO_UPDATE; + as->mode = BTREE_UPDATE_none; + as->watermark = watermark; as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level = update_level; @@ -2509,18 +2517,25 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); } +static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) +{ + prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + (void *) as->ip_started, + bch2_btree_id_str(as->btree_id), + bch2_watermarks[as->watermark], + bch2_btree_update_modes[as->mode], + as->nodes_written, + closure_nr_remaining(&as->cl), + as->journal.seq); +} + void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_update *as; mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - (void *) as->ip_started, - as->mode, - as->nodes_written, - closure_nr_remaining(&as->cl), - as->journal.seq); + bch2_btree_update_to_text(out, as); mutex_unlock(&c->btree_interior_update_lock); } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7ac3e17b84fd..1d41d3f4b107 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -12,6 +12,18 @@ int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); +#define BTREE_UPDATE_MODES() \ + x(none) \ + x(node) \ + x(root) \ + x(update) + +enum btree_update_mode { +#define x(n) BTREE_UPDATE_##n, + BTREE_UPDATE_MODES() +#undef x +}; + /* * Tracks an in progress split/rewrite of a btree node and the update to the * parent node: @@ -39,14 +51,8 @@ struct btree_update { struct list_head list; struct list_head unwritten_list; - /* What kind of update are we doing? */ - enum { - BTREE_INTERIOR_NO_UPDATE, - BTREE_INTERIOR_UPDATING_NODE, - BTREE_INTERIOR_UPDATING_ROOT, - BTREE_INTERIOR_UPDATING_AS, - } mode; - + enum btree_update_mode mode; + enum bch_watermark watermark; unsigned nodes_written:1; unsigned took_gc_lock:1; @@ -56,7 +62,7 @@ struct btree_update { struct disk_reservation disk_res; /* - * BTREE_INTERIOR_UPDATING_NODE: + * BTREE_UPDATE_node: * The update that made the new nodes visible was a regular update to an * existing interior node - @b. We can't write out the update to @b * until the new nodes we created are finished writing, so we block @b -- cgit v1.2.3 From 7ee88737ab802ac832f978d6e6258571fe08d870 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Apr 2024 18:30:14 -0400 Subject: bcachefs: Check for bad needs_discard before doing discard In the discard worker, we were failing to validate the bucket state - meaning a corrupt needs_discard btree could cause us to discard a bucket that we shouldn't. If check_alloc_info hasn't run yet we just want to bail out, otherwise it's a filesystem inconsistent error. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 47 +++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 893e38f9db80..4ff56fa4d539 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1713,34 +1713,37 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { - a->v.gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); - goto write; - } - - if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - bch2_trans_inconsistent(trans, - "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" - "%s", - a->v.journal_seq, - c->journal.flushed_seq_ondisk, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (a->v.dirty_sectors) { + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "attempting to discard bucket with dirty data\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = -EIO; - } goto out; } if (a->v.data_type != BCH_DATA_need_discard) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - bch2_trans_inconsistent(trans, - "bucket incorrectly set in need_discard btree\n" - "%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; + if (data_type_is_empty(a->v.data_type) && + BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { + a->v.gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + goto write; } + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "bucket incorrectly set in need_discard btree\n" + "%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = -EIO; + goto out; + } + + if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", + a->v.journal_seq, + c->journal.flushed_seq_ondisk, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = -EIO; goto out; } @@ -1835,6 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo if (ret) goto err; + BUG_ON(a->v.dirty_sectors); SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); a->v.data_type = alloc_data_type(a->v, a->v.data_type); @@ -1942,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, goto out; BUG_ON(a->v.data_type != BCH_DATA_cached); + BUG_ON(a->v.dirty_sectors); if (!a->v.cached_sectors) bch_err(c, "invalidating empty bucket, confused"); -- cgit v1.2.3 From fa14b50460baba40c8b1138c517fb8cf04464292 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Apr 2024 18:57:05 -0400 Subject: bcachefs: ratelimit informational fsck errors Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index cbb8b43e419f..b704fb0dda95 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1131,8 +1131,8 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal i->count = count2; if (i->count != count2) { - bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); return -BCH_ERR_internal_fsck_err; } @@ -1587,8 +1587,8 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ return count2; if (i->count != count2) { - bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", - i->count, count2); + bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); i->count = count2; if (i->inode.bi_nlink == i->count) continue; -- cgit v1.2.3 From 135f218255b28c5bbf71e9e32a49e5c734cabbe5 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 28 Mar 2024 12:19:54 -0300 Subject: ARM: dts: imx7s-warp: Pass OV2680 link-frequencies Since commit 63b0cd30b78e ("media: ov2680: Add bus-cfg / endpoint property verification") the ov2680 no longer probes on a imx7s-warp7: ov2680 1-0036: error -EINVAL: supported link freq 330000000 not found ov2680 1-0036: probe with driver ov2680 failed with error -22 Fix it by passing the required 'link-frequencies' property as recommended by: https://www.kernel.org/doc/html/v6.9-rc1/driver-api/media/camera-sensor.html#handling-clocks Cc: stable@vger.kernel.org Fixes: 63b0cd30b78e ("media: ov2680: Add bus-cfg / endpoint property verification") Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx7s-warp.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts b/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts index ba7231b364bb..7bab113ca6da 100644 --- a/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts +++ b/arch/arm/boot/dts/nxp/imx/imx7s-warp.dts @@ -210,6 +210,7 @@ remote-endpoint = <&mipi_from_sensor>; clock-lanes = <0>; data-lanes = <1>; + link-frequencies = /bits/ 64 <330000000>; }; }; }; -- cgit v1.2.3 From fd819ad3ecf6f3c232a06b27423ce9ed8c20da89 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 29 Mar 2024 09:50:23 +0800 Subject: ax25: fix use-after-free bugs caused by ax25_ds_del_timer When the ax25 device is detaching, the ax25_dev_device_down() calls ax25_ds_del_timer() to cleanup the slave_timer. When the timer handler is running, the ax25_ds_del_timer() that calls del_timer() in it will return directly. As a result, the use-after-free bugs could happen, one of the scenarios is shown below: (Thread 1) | (Thread 2) | ax25_ds_timeout() ax25_dev_device_down() | ax25_ds_del_timer() | del_timer() | ax25_dev_put() //FREE | | ax25_dev-> //USE In order to mitigate bugs, when the device is detaching, use timer_shutdown_sync() to stop the timer. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Duoming Zhou Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240329015023.9223-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski --- net/ax25/ax25_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index c5462486dbca..282ec581c072 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -105,7 +105,7 @@ void ax25_dev_device_down(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); #ifdef CONFIG_AX25_DAMA_SLAVE - ax25_ds_del_timer(ax25_dev); + timer_shutdown_sync(&ax25_dev->dama.slave_timer); #endif /* -- cgit v1.2.3 From b32a09ea7c38849ff925489a6bf5bd8914bc45df Mon Sep 17 00:00:00 2001 From: Marco Pinna Date: Fri, 29 Mar 2024 17:12:59 +0100 Subject: vsock/virtio: fix packet delivery to tap device Commit 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") added virtio_transport_deliver_tap_pkt() for handing packets to the vsockmon device. However, in virtio_transport_send_pkt_work(), the function is called before actually sending the packet (i.e. before placing it in the virtqueue with virtqueue_add_sgs() and checking whether it returned successfully). Queuing the packet in the virtqueue can fail even multiple times. However, in virtio_transport_deliver_tap_pkt() we deliver the packet to the monitoring tap interface only the first time we call it. This certainly avoids seeing the same packet replicated multiple times in the monitoring interface, but it can show the packet sent with the wrong timestamp or even before we succeed to queue it in the virtqueue. Move virtio_transport_deliver_tap_pkt() after calling virtqueue_add_sgs() and making sure it returned successfully. Fixes: 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") Cc: stable@vge.kernel.org Signed-off-by: Marco Pinna Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20240329161259.411751-1-marco.pinn95@gmail.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 1748268e0694..ee5d306a96d0 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -120,7 +120,6 @@ virtio_transport_send_pkt_work(struct work_struct *work) if (!skb) break; - virtio_transport_deliver_tap_pkt(skb); reply = virtio_vsock_skb_reply(skb); sgs = vsock->out_sgs; sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), @@ -170,6 +169,8 @@ virtio_transport_send_pkt_work(struct work_struct *work) break; } + virtio_transport_deliver_tap_pkt(skb); + if (reply) { struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; int val; -- cgit v1.2.3 From 09ab7eff38202159271534d2f5ad45526168f2a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Mar 2024 10:45:07 -0600 Subject: io_uring/kbuf: get rid of lower BGID lists Just rely on the xarray for any kind of bgid. This simplifies things, and it really doesn't bring us much, if anything. Cc: stable@vger.kernel.org # v6.4+ Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - io_uring/io_uring.c | 2 -- io_uring/kbuf.c | 70 +++++------------------------------------- 3 files changed, 8 insertions(+), 65 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e24893625085..05df0e399d7c 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -294,7 +294,6 @@ struct io_ring_ctx { struct io_submit_state submit_state; - struct io_buffer_list *io_bl; struct xarray io_bl_xa; struct io_hash_table cancel_table_locked; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d1defb99b89e..bc730f59265f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -351,7 +351,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; @@ -2932,7 +2931,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_napi_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 693c26da4ee1..8bf0121f00af 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -17,8 +17,6 @@ #define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) -#define BGID_ARRAY 64 - /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) @@ -40,13 +38,9 @@ struct io_buf_free { int inuse; }; -static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, - unsigned int bgid) +static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, + unsigned int bgid) { - if (bl && bgid < BGID_ARRAY) - return &bl[bgid]; - return xa_load(&ctx->io_bl_xa, bgid); } @@ -55,7 +49,7 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, { lockdep_assert_held(&ctx->uring_lock); - return __io_buffer_get_list(ctx, ctx->io_bl, bgid); + return __io_buffer_get_list(ctx, bgid); } static int io_buffer_add_list(struct io_ring_ctx *ctx, @@ -68,10 +62,6 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, */ bl->bgid = bgid; smp_store_release(&bl->is_ready, 1); - - if (bgid < BGID_ARRAY) - return 0; - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -208,24 +198,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ret; } -static __cold int io_init_bl_list(struct io_ring_ctx *ctx) -{ - struct io_buffer_list *bl; - int i; - - bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL); - if (!bl) - return -ENOMEM; - - for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&bl[i].buf_list); - bl[i].bgid = i; - } - - smp_store_release(&ctx->io_bl, bl); - return 0; -} - /* * Mark the given mapped range as free for reuse */ @@ -300,13 +272,6 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) struct list_head *item, *tmp; struct io_buffer *buf; unsigned long index; - int i; - - for (i = 0; i < BGID_ARRAY; i++) { - if (!ctx->io_bl) - break; - __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); - } xa_for_each(&ctx->io_bl_xa, index, bl) { xa_erase(&ctx->io_bl_xa, bl->bgid); @@ -489,12 +454,6 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_lock(ctx, issue_flags); - if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { - ret = io_init_bl_list(ctx); - if (ret) - goto err; - } - bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); @@ -507,14 +466,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) if (ret) { /* * Doesn't need rcu free as it was never visible, but - * let's keep it consistent throughout. Also can't - * be a lower indexed array group, as adding one - * where lookup failed cannot happen. + * let's keep it consistent throughout. */ - if (p->bgid >= BGID_ARRAY) - kfree_rcu(bl, rcu); - else - WARN_ON_ONCE(1); + kfree_rcu(bl, rcu); goto err; } } @@ -679,12 +633,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.ring_entries >= 65536) return -EINVAL; - if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { - int ret = io_init_bl_list(ctx); - if (ret) - return ret; - } - bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ @@ -734,10 +682,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -EINVAL; __io_remove_buffers(ctx, bl, -1U); - if (bl->bgid >= BGID_ARRAY) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree_rcu(bl, rcu); - } + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree_rcu(bl, rcu); return 0; } @@ -771,7 +717,7 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) { struct io_buffer_list *bl; - bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid); + bl = __io_buffer_get_list(ctx, bgid); if (!bl || !bl->is_mmap) return NULL; -- cgit v1.2.3 From 3b80cff5a4d117c53d38ce805823084eaeffbde6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Mar 2024 10:46:40 -0600 Subject: io_uring/kbuf: get rid of bl->is_ready Now that xarray is being exclusively used for the buffer_list lookup, this check is no longer needed. Get rid of it and the is_ready member. Cc: stable@vger.kernel.org # v6.4+ Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 8 -------- io_uring/kbuf.h | 2 -- 2 files changed, 10 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 8bf0121f00af..011280d873e7 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -61,7 +61,6 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, * always under the ->uring_lock, but the RCU lookup from mmap does. */ bl->bgid = bgid; - smp_store_release(&bl->is_ready, 1); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -721,13 +720,6 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) if (!bl || !bl->is_mmap) return NULL; - /* - * Ensure the list is fully setup. Only strictly needed for RCU lookup - * via mmap, and in that case only for the array indexed groups. For - * the xarray lookups, it's either visible and ready, or not at all. - */ - if (!smp_load_acquire(&bl->is_ready)) - return NULL; return bl->buf_ring; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 1c7b654ee726..fdbb10449513 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -29,8 +29,6 @@ struct io_buffer_list { __u8 is_buf_ring; /* ring mapped provided buffers, but mmap'ed by application */ __u8 is_mmap; - /* bl is visible from an RCU point of view for lookup */ - __u8 is_ready; }; struct io_buffer { -- cgit v1.2.3 From 6b69c4ab4f685327d9e10caf0d84217ba23a8c4b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Mar 2024 16:12:51 -0600 Subject: io_uring/kbuf: protect io_buffer_list teardown with a reference No functional changes in this patch, just in preparation for being able to keep the buffer list alive outside of the ctx->uring_lock. Cc: stable@vger.kernel.org # v6.4+ Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 15 +++++++++++---- io_uring/kbuf.h | 2 ++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 011280d873e7..2edc6854f6f3 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -61,6 +61,7 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, * always under the ->uring_lock, but the RCU lookup from mmap does. */ bl->bgid = bgid; + atomic_set(&bl->refs, 1); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -265,6 +266,14 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, return i; } +static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +{ + if (atomic_dec_and_test(&bl->refs)) { + __io_remove_buffers(ctx, bl, -1U); + kfree_rcu(bl, rcu); + } +} + void io_destroy_buffers(struct io_ring_ctx *ctx) { struct io_buffer_list *bl; @@ -274,8 +283,7 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) xa_for_each(&ctx->io_bl_xa, index, bl) { xa_erase(&ctx->io_bl_xa, bl->bgid); - __io_remove_buffers(ctx, bl, -1U); - kfree_rcu(bl, rcu); + io_put_bl(ctx, bl); } /* @@ -680,9 +688,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!bl->is_buf_ring) return -EINVAL; - __io_remove_buffers(ctx, bl, -1U); xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree_rcu(bl, rcu); + io_put_bl(ctx, bl); return 0; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index fdbb10449513..8b868a1744e2 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -25,6 +25,8 @@ struct io_buffer_list { __u16 head; __u16 mask; + atomic_t refs; + /* ring mapped provided buffers */ __u8 is_buf_ring; /* ring mapped provided buffers, but mmap'ed by application */ -- cgit v1.2.3 From 561e4f9451d65fc2f7eef564e0064373e3019793 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 Apr 2024 16:16:03 -0600 Subject: io_uring/kbuf: hold io_buffer_list reference over mmap If we look up the kbuf, ensure that it doesn't get unregistered until after we're done with it. Since we're inside mmap, we cannot safely use the io_uring lock. Rely on the fact that we can lookup the buffer list under RCU now and grab a reference to it, preventing it from being unregistered until we're done with it. The lookup returns the io_buffer_list directly with it referenced. Cc: stable@vger.kernel.org # v6.4+ Fixes: 5cf4f52e6d8a ("io_uring: free io_buffer_list entries via RCU") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 ++++++----- io_uring/kbuf.c | 35 +++++++++++++++++++++++++++-------- io_uring/kbuf.h | 4 +++- 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bc730f59265f..4521c2b66b98 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3447,14 +3447,15 @@ static void *io_uring_validate_mmap_request(struct file *file, ptr = ctx->sq_sqes; break; case IORING_OFF_PBUF_RING: { + struct io_buffer_list *bl; unsigned int bgid; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - rcu_read_lock(); - ptr = io_pbuf_get_address(ctx, bgid); - rcu_read_unlock(); - if (!ptr) - return ERR_PTR(-EINVAL); + bl = io_pbuf_get_bl(ctx, bgid); + if (IS_ERR(bl)) + return bl; + ptr = bl->buf_ring; + io_put_bl(ctx, bl); break; } default: diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 2edc6854f6f3..3aa16e27f509 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -266,7 +266,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, return i; } -static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { if (atomic_dec_and_test(&bl->refs)) { __io_remove_buffers(ctx, bl, -1U); @@ -719,16 +719,35 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) return 0; } -void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) +struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, + unsigned long bgid) { struct io_buffer_list *bl; + bool ret; - bl = __io_buffer_get_list(ctx, bgid); - - if (!bl || !bl->is_mmap) - return NULL; - - return bl->buf_ring; + /* + * We have to be a bit careful here - we're inside mmap and cannot grab + * the uring_lock. This means the buffer_list could be simultaneously + * going away, if someone is trying to be sneaky. Look it up under rcu + * so we know it's not going away, and attempt to grab a reference to + * it. If the ref is already zero, then fail the mapping. If successful, + * the caller will call io_put_bl() to drop the the reference at at the + * end. This may then safely free the buffer_list (and drop the pages) + * at that point, vm_insert_pages() would've already grabbed the + * necessary vma references. + */ + rcu_read_lock(); + bl = xa_load(&ctx->io_bl_xa, bgid); + /* must be a mmap'able buffer ring and have pages */ + ret = false; + if (bl && bl->is_mmap) + ret = atomic_inc_not_zero(&bl->refs); + rcu_read_unlock(); + + if (ret) + return bl; + + return ERR_PTR(-EINVAL); } /* diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 8b868a1744e2..df365b8860cf 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -61,7 +61,9 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); -void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid); +void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); +struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, + unsigned long bgid); static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) { -- cgit v1.2.3 From 5d872c9f46bd2ea3524af3c2420a364a13667135 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 30 Mar 2024 12:49:02 +0100 Subject: r8169: fix issue caused by buggy BIOS on certain boards with RTL8168d On some boards with this chip version the BIOS is buggy and misses to reset the PHY page selector. This results in the PHY ID read accessing registers on a different page, returning a more or less random value. Fix this by resetting the page selector first. Fixes: f1e911d5d0df ("r8169: add basic phylib support") Cc: stable@vger.kernel.org Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/64f2055e-98b8-45ec-8568-665e3d54d4e6@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4ac444eb269f..6f1e6f386b7b 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5164,6 +5164,15 @@ static int r8169_mdio_register(struct rtl8169_private *tp) struct mii_bus *new_bus; int ret; + /* On some boards with this chip version the BIOS is buggy and misses + * to reset the PHY page selector. This results in the PHY ID read + * accessing registers on a different page, returning a more or + * less random value. Fix this by resetting the page selector first. + */ + if (tp->mac_version == RTL_GIGA_MAC_VER_25 || + tp->mac_version == RTL_GIGA_MAC_VER_26) + r8169_mdio_write(tp, 0x1f, 0); + new_bus = devm_mdiobus_alloc(&pdev->dev); if (!new_bus) return -ENOMEM; -- cgit v1.2.3 From d21d40605bca7bd5fc23ef03d4c1ca1f48bc2cae Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 1 Apr 2024 14:10:04 -0700 Subject: ipv6: Fix infinite recursion in fib6_dump_done(). syzkaller reported infinite recursive calls of fib6_dump_done() during netlink socket destruction. [1] From the log, syzkaller sent an AF_UNSPEC RTM_GETROUTE message, and then the response was generated. The following recvmmsg() resumed the dump for IPv6, but the first call of inet6_dump_fib() failed at kzalloc() due to the fault injection. [0] 12:01:34 executing program 3: r0 = socket$nl_route(0x10, 0x3, 0x0) sendmsg$nl_route(r0, ... snip ...) recvmmsg(r0, ... snip ...) (fail_nth: 8) Here, fib6_dump_done() was set to nlk_sk(sk)->cb.done, and the next call of inet6_dump_fib() set it to nlk_sk(sk)->cb.args[3]. syzkaller stopped receiving the response halfway through, and finally netlink_sock_destruct() called nlk_sk(sk)->cb.done(). fib6_dump_done() calls fib6_dump_end() and nlk_sk(sk)->cb.done() if it is still not NULL. fib6_dump_end() rewrites nlk_sk(sk)->cb.done() by nlk_sk(sk)->cb.args[3], but it has the same function, not NULL, calling itself recursively and hitting the stack guard page. To avoid the issue, let's set the destructor after kzalloc(). [0]: FAULT_INJECTION: forcing a failure. name failslab, interval 1, probability 0, space 0, times 0 CPU: 1 PID: 432110 Comm: syz-executor.3 Not tainted 6.8.0-12821-g537c2e91d354-dirty #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:117) should_fail_ex (lib/fault-inject.c:52 lib/fault-inject.c:153) should_failslab (mm/slub.c:3733) kmalloc_trace (mm/slub.c:3748 mm/slub.c:3827 mm/slub.c:3992) inet6_dump_fib (./include/linux/slab.h:628 ./include/linux/slab.h:749 net/ipv6/ip6_fib.c:662) rtnl_dump_all (net/core/rtnetlink.c:4029) netlink_dump (net/netlink/af_netlink.c:2269) netlink_recvmsg (net/netlink/af_netlink.c:1988) ____sys_recvmsg (net/socket.c:1046 net/socket.c:2801) ___sys_recvmsg (net/socket.c:2846) do_recvmmsg (net/socket.c:2943) __x64_sys_recvmmsg (net/socket.c:3041 net/socket.c:3034 net/socket.c:3034) [1]: BUG: TASK stack guard page was hit at 00000000f2fa9af1 (stack is 00000000b7912430..000000009a436beb) stack guard page: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 223719 Comm: kworker/1:3 Not tainted 6.8.0-12821-g537c2e91d354-dirty #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Workqueue: events netlink_sock_destruct_work RIP: 0010:fib6_dump_done (net/ipv6/ip6_fib.c:570) Code: 3c 24 e8 f3 e9 51 fd e9 28 fd ff ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 41 57 41 56 41 55 41 54 55 48 89 fd <53> 48 8d 5d 60 e8 b6 4d 07 fd 48 89 da 48 b8 00 00 00 00 00 fc ff RSP: 0018:ffffc9000d980000 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffffffff84405990 RCX: ffffffff844059d3 RDX: ffff8881028e0000 RSI: ffffffff84405ac2 RDI: ffff88810c02f358 RBP: ffff88810c02f358 R08: 0000000000000007 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000224 R12: 0000000000000000 R13: ffff888007c82c78 R14: ffff888007c82c68 R15: ffff888007c82c68 FS: 0000000000000000(0000) GS:ffff88811b100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffc9000d97fff8 CR3: 0000000102309002 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: <#DF> fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) ... fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) netlink_sock_destruct (net/netlink/af_netlink.c:401) __sk_destruct (net/core/sock.c:2177 (discriminator 2)) sk_destruct (net/core/sock.c:2224) __sk_free (net/core/sock.c:2235) sk_free (net/core/sock.c:2246) process_one_work (kernel/workqueue.c:3259) worker_thread (kernel/workqueue.c:3329 kernel/workqueue.c:3416) kthread (kernel/kthread.c:388) ret_from_fork (arch/x86/kernel/process.c:153) ret_from_fork_asm (arch/x86/entry/entry_64.S:256) Modules linked in: Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240401211003.25274-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5c558dc1c683..7209419cfb0e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -651,19 +651,19 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (!w) { /* New dump: * - * 1. hook callback destructor. - */ - cb->args[3] = (long)cb->done; - cb->done = fib6_dump_done; - - /* - * 2. allocate and initialize walker. + * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; + + /* 2. hook callback destructor. + */ + cb->args[3] = (long)cb->done; + cb->done = fib6_dump_done; + } arg.skb = skb; -- cgit v1.2.3 From c53fe72cb5fffd69f2fff104b0119d6e271759c5 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 1 Apr 2024 21:43:47 +0300 Subject: MAINTAINERS: mlx5: Add Tariq Toukan Add myself as mlx5 core and EN maintainer. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Acked-by: Saeed Mahameed Link: https://lore.kernel.org/r/20240401184347.53884-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7b8f97bf7975..de969a9c8ad2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14014,6 +14014,7 @@ F: drivers/net/ethernet/mellanox/mlx4/en_* MELLANOX ETHERNET DRIVER (mlx5e) M: Saeed Mahameed +M: Tariq Toukan L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com @@ -14081,6 +14082,7 @@ F: include/uapi/rdma/mlx4-abi.h MELLANOX MLX5 core VPI driver M: Saeed Mahameed M: Leon Romanovsky +M: Tariq Toukan L: netdev@vger.kernel.org L: linux-rdma@vger.kernel.org S: Supported -- cgit v1.2.3 From 5bd31ab5f79eb6e3bdfa0ca0b57650f9d1604062 Mon Sep 17 00:00:00 2001 From: Shivaprasad G Bhat Date: Thu, 15 Feb 2024 07:52:32 -0600 Subject: powerpc/iommu: Refactor spapr_tce_platform_iommu_attach_dev() The patch makes the iommu_group_get() call only when using it thereby avoiding the unnecessary get & put for domain already being set case. Reviewed-by: Jason Gunthorpe Signed-off-by: Shivaprasad G Bhat Signed-off-by: Michael Ellerman Link: https://msgid.link/170800513841.2411.13524607664262048895.stgit@linux.ibm.com --- arch/powerpc/kernel/iommu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 1185efebf032..29a8c8e18585 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1285,15 +1285,14 @@ spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, struct device *dev) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct iommu_group *grp = iommu_group_get(dev); struct iommu_table_group *table_group; + struct iommu_group *grp; /* At first attach the ownership is already set */ - if (!domain) { - iommu_group_put(grp); + if (!domain) return 0; - } + grp = iommu_group_get(dev); table_group = iommu_group_get_iommudata(grp); /* * The domain being set to PLATFORM from earlier -- cgit v1.2.3 From c3eeb1ffc6a88af9b002e22be0f70851759be03a Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 1 Apr 2024 11:16:39 -0700 Subject: x86/resctrl: Fix uninitialized memory read when last CPU of domain goes offline Tony encountered this OOPS when the last CPU of a domain goes offline while running a kernel built with CONFIG_NO_HZ_FULL: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 Oops: 0000 [#1] PREEMPT SMP NOPTI ... RIP: 0010:__find_nth_andnot_bit+0x66/0x110 ... Call Trace: ? __die() ? page_fault_oops() ? exc_page_fault() ? asm_exc_page_fault() cpumask_any_housekeeping() mbm_setup_overflow_handler() resctrl_offline_cpu() resctrl_arch_offline_cpu() cpuhp_invoke_callback() cpuhp_thread_fun() smpboot_thread_fn() kthread() ret_from_fork() ret_from_fork_asm() The NULL pointer dereference is encountered while searching for another online CPU in the domain (of which there are none) that can be used to run the MBM overflow handler. Because the kernel is configured with CONFIG_NO_HZ_FULL the search for another CPU (in its effort to prefer those CPUs that aren't marked nohz_full) consults the mask representing the nohz_full CPUs, tick_nohz_full_mask. On a kernel with CONFIG_CPUMASK_OFFSTACK=y tick_nohz_full_mask is not allocated unless the kernel is booted with the "nohz_full=" parameter and because of that any access to tick_nohz_full_mask needs to be guarded with tick_nohz_full_enabled(). Replace the IS_ENABLED(CONFIG_NO_HZ_FULL) with tick_nohz_full_enabled(). The latter ensures tick_nohz_full_mask can be accessed safely and can be used whether kernel is built with CONFIG_NO_HZ_FULL enabled or not. [ Use Ingo's suggestion that combines the two NO_HZ checks into one. ] Fixes: a4846aaf3945 ("x86/resctrl: Add cpumask_any_housekeeping() for limbo/overflow") Reported-by: Tony Luck Signed-off-by: Reinette Chatre Signed-off-by: Ingo Molnar Reviewed-by: Babu Moger Link: https://lore.kernel.org/r/ff8dfc8d3dcb04b236d523d1e0de13d2ef585223.1711993956.git.reinette.chatre@intel.com Closes: https://lore.kernel.org/lkml/ZgIFT5gZgIQ9A9G7@agluck-desk3/ --- arch/x86/kernel/cpu/resctrl/internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index c99f26ebe7a6..1a8687f8073a 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -78,7 +78,8 @@ cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) else cpu = cpumask_any_but(mask, exclude_cpu); - if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) + /* Only continue if tick_nohz_full_mask has been initialized. */ + if (!tick_nohz_full_enabled()) return cpu; /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ -- cgit v1.2.3 From 312be9fc2234c8acfb8148a9f4c358b70d358dee Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 1 Apr 2024 06:33:20 -0700 Subject: perf/x86/intel/ds: Don't clear ->pebs_data_cfg for the last PEBS event The MSR_PEBS_DATA_CFG MSR register is used to configure which data groups should be generated into a PEBS record, and it's shared among all counters. If there are different configurations among counters, perf combines all the configurations. The first perf command as below requires a complete PEBS record (including memory info, GPRs, XMMs, and LBRs). The second perf command only requires a basic group. However, after the second perf command is running, the MSR_PEBS_DATA_CFG register is cleared. Only a basic group is generated in a PEBS record, which is wrong. The required information for the first perf command is missed. $ perf record --intr-regs=AX,SP,XMM0 -a -C 8 -b -W -d -c 100000003 -o /dev/null -e cpu/event=0xd0,umask=0x81/upp & $ sleep 5 $ perf record --per-thread -c 1 -e cycles:pp --no-timestamp --no-tid taskset -c 8 ./noploop 1000 The first PEBS event is a system-wide PEBS event. The second PEBS event is a per-thread event. When the thread is scheduled out, the intel_pmu_pebs_del() function is invoked to update the PEBS state. Since the system-wide event is still available, the cpuc->n_pebs is 1. The cpuc->pebs_data_cfg is cleared. The data configuration for the system-wide PEBS event is lost. The (cpuc->n_pebs == 1) check was introduced in commit: b6a32f023fcc ("perf/x86: Fix PEBS threshold initialization") At that time, it indeed didn't hurt whether the state was updated during the removal, because only the threshold is updated. The calculation of the threshold takes the last PEBS event into account. However, since commit: b752ea0c28e3 ("perf/x86/intel/ds: Flush PEBS DS when changing PEBS_DATA_CFG") we delay the threshold update, and clear the PEBS data config, which triggers the bug. The PEBS data config update scope should not be shrunk during removal. [ mingo: Improved the changelog & comments. ] Fixes: b752ea0c28e3 ("perf/x86/intel/ds: Flush PEBS DS when changing PEBS_DATA_CFG") Reported-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240401133320.703971-1-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 2641ba620f12..e010bfed8417 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1237,11 +1237,11 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu = event->pmu; /* - * Make sure we get updated with the first PEBS - * event. It will trigger also during removal, but - * that does not hurt: + * Make sure we get updated with the first PEBS event. + * During removal, ->pebs_data_cfg is still valid for + * the last PEBS event. Don't clear it. */ - if (cpuc->n_pebs == 1) + if ((cpuc->n_pebs == 1) && add) cpuc->pebs_data_cfg = PEBS_UPDATE_DS_SW; if (needed_cb != pebs_needs_sched_cb(cpuc)) { -- cgit v1.2.3 From ef15ddeeb6bee87c044bf7754fac524545bf71e8 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Thu, 28 Mar 2024 19:55:05 +0300 Subject: octeontx2-af: Add array index check In rvu_map_cgx_lmac_pf() the 'iter', which is used as an array index, can reach value (up to 14) that exceed the size (MAX_LMAC_COUNT = 8) of the array. Fix this bug by adding 'iter' value check. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 91c6945ea1f9 ("octeontx2-af: cn10k: Add RPM MAC support") Signed-off-by: Aleksandr Mishin Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index 72e060cf6b61..e9bf9231b018 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -160,6 +160,8 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu) continue; lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu)); for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) { + if (iter >= MAX_LMAC_COUNT) + continue; lmac = cgx_get_lmacid(rvu_cgx_pdata(cgx, rvu), iter); rvu->pf2cgxlmac_map[pf] = cgxlmac_id_to_bmap(cgx, lmac); -- cgit v1.2.3 From bff892acf79cec531da6cb21c50980a584ce1476 Mon Sep 17 00:00:00 2001 From: Carlos Song Date: Wed, 3 Apr 2024 16:40:29 +0800 Subject: spi: spi-fsl-lpspi: remove redundant spi_controller_put call devm_spi_alloc_controller will allocate an SPI controller and automatically release a reference on it when dev is unbound from its driver. It doesn't need to call spi_controller_put explicitly to put the reference when lpspi driver failed initialization. Fixes: 2ae0ab0143fc ("spi: lpspi: Avoid potential use-after-free in probe()") Signed-off-by: Carlos Song Reviewed-by: Alexander Sverdlin Link: https://msgid.link/r/20240403084029.2000544-1-carlos.song@nxp.com Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-lpspi.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c index 079035db7dd8..92a662d1b55c 100644 --- a/drivers/spi/spi-fsl-lpspi.c +++ b/drivers/spi/spi-fsl-lpspi.c @@ -852,39 +852,39 @@ static int fsl_lpspi_probe(struct platform_device *pdev) fsl_lpspi->base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(fsl_lpspi->base)) { ret = PTR_ERR(fsl_lpspi->base); - goto out_controller_put; + return ret; } fsl_lpspi->base_phys = res->start; irq = platform_get_irq(pdev, 0); if (irq < 0) { ret = irq; - goto out_controller_put; + return ret; } ret = devm_request_irq(&pdev->dev, irq, fsl_lpspi_isr, 0, dev_name(&pdev->dev), fsl_lpspi); if (ret) { dev_err(&pdev->dev, "can't get irq%d: %d\n", irq, ret); - goto out_controller_put; + return ret; } fsl_lpspi->clk_per = devm_clk_get(&pdev->dev, "per"); if (IS_ERR(fsl_lpspi->clk_per)) { ret = PTR_ERR(fsl_lpspi->clk_per); - goto out_controller_put; + return ret; } fsl_lpspi->clk_ipg = devm_clk_get(&pdev->dev, "ipg"); if (IS_ERR(fsl_lpspi->clk_ipg)) { ret = PTR_ERR(fsl_lpspi->clk_ipg); - goto out_controller_put; + return ret; } /* enable the clock */ ret = fsl_lpspi_init_rpm(fsl_lpspi); if (ret) - goto out_controller_put; + return ret; ret = pm_runtime_get_sync(fsl_lpspi->dev); if (ret < 0) { @@ -945,8 +945,6 @@ out_pm_get: pm_runtime_dont_use_autosuspend(fsl_lpspi->dev); pm_runtime_put_sync(fsl_lpspi->dev); pm_runtime_disable(fsl_lpspi->dev); -out_controller_put: - spi_controller_put(controller); return ret; } -- cgit v1.2.3 From 1f886a7bfb3faf4c1021e73f045538008ce7634e Mon Sep 17 00:00:00 2001 From: Huai-Yuan Liu Date: Wed, 3 Apr 2024 09:42:21 +0800 Subject: spi: mchp-pci1xxx: Fix a possible null pointer dereference in pci1xxx_spi_probe In function pci1xxxx_spi_probe, there is a potential null pointer that may be caused by a failed memory allocation by the function devm_kzalloc. Hence, a null pointer check needs to be added to prevent null pointer dereferencing later in the code. To fix this issue, spi_bus->spi_int[iter] should be checked. The memory allocated by devm_kzalloc will be automatically released, so just directly return -ENOMEM without worrying about memory leaks. Fixes: 1cc0cbea7167 ("spi: microchip: pci1xxxx: Add driver for SPI controller of PCI1XXXX PCIe switch") Signed-off-by: Huai-Yuan Liu Link: https://msgid.link/r/20240403014221.969801-1-qq810974084@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-pci1xxxx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi-pci1xxxx.c b/drivers/spi/spi-pci1xxxx.c index 969965d7bc98..cc18d320370f 100644 --- a/drivers/spi/spi-pci1xxxx.c +++ b/drivers/spi/spi-pci1xxxx.c @@ -725,6 +725,8 @@ static int pci1xxxx_spi_probe(struct pci_dev *pdev, const struct pci_device_id * spi_bus->spi_int[iter] = devm_kzalloc(&pdev->dev, sizeof(struct pci1xxxx_spi_internal), GFP_KERNEL); + if (!spi_bus->spi_int[iter]) + return -ENOMEM; spi_sub_ptr = spi_bus->spi_int[iter]; spi_sub_ptr->spi_host = devm_spi_alloc_host(dev, sizeof(struct spi_controller)); if (!spi_sub_ptr->spi_host) -- cgit v1.2.3 From 0a6380cb4c6b5c1d6dad226ba3130f9090f0ccea Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Mon, 1 Apr 2024 13:09:33 +0200 Subject: net: bcmgenet: Reset RBUF on first open If the RBUF logic is not reset when the kernel starts then there may be some data left over from any network boot loader. If the 64-byte packet headers are enabled then this can be fatal. Extend bcmgenet_dma_disable to do perform the reset, but not when called from bcmgenet_resume in order to preserve a wake packet. N.B. This different handling of resume is just based on a hunch - why else wouldn't one reset the RBUF as well as the TBUF? If this isn't the case then it's easy to change the patch to make the RBUF reset unconditional. See: https://github.com/raspberrypi/linux/issues/3850 See: https://github.com/raspberrypi/firmware/issues/1882 Signed-off-by: Phil Elwell Signed-off-by: Maarten Vanraes Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 7396e2823e32..b1f84b37032a 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3280,7 +3280,7 @@ static void bcmgenet_get_hw_addr(struct bcmgenet_priv *priv, } /* Returns a reusable dma control register value */ -static u32 bcmgenet_dma_disable(struct bcmgenet_priv *priv) +static u32 bcmgenet_dma_disable(struct bcmgenet_priv *priv, bool flush_rx) { unsigned int i; u32 reg; @@ -3305,6 +3305,14 @@ static u32 bcmgenet_dma_disable(struct bcmgenet_priv *priv) udelay(10); bcmgenet_umac_writel(priv, 0, UMAC_TX_FLUSH); + if (flush_rx) { + reg = bcmgenet_rbuf_ctrl_get(priv); + bcmgenet_rbuf_ctrl_set(priv, reg | BIT(0)); + udelay(10); + bcmgenet_rbuf_ctrl_set(priv, reg); + udelay(10); + } + return dma_ctrl; } @@ -3368,8 +3376,8 @@ static int bcmgenet_open(struct net_device *dev) bcmgenet_set_hw_addr(priv, dev->dev_addr); - /* Disable RX/TX DMA and flush TX queues */ - dma_ctrl = bcmgenet_dma_disable(priv); + /* Disable RX/TX DMA and flush TX and RX queues */ + dma_ctrl = bcmgenet_dma_disable(priv, true); /* Reinitialize TDMA and RDMA and SW housekeeping */ ret = bcmgenet_init_dma(priv); @@ -4235,7 +4243,7 @@ static int bcmgenet_resume(struct device *d) bcmgenet_hfb_create_rxnfc_filter(priv, rule); /* Disable RX/TX DMA and flush TX queues */ - dma_ctrl = bcmgenet_dma_disable(priv); + dma_ctrl = bcmgenet_dma_disable(priv, false); /* Reinitialize TDMA and RDMA and SW housekeeping */ ret = bcmgenet_init_dma(priv); -- cgit v1.2.3 From e8acd2d209a387f2358c2c83fe894b444db9ea46 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 2 Apr 2024 18:43:45 +0200 Subject: gpiolib: Fix triggering "kobject: 'gpiochipX' is not initialized, yet" kobject_get() errors When a gpiochip gets added by loading a module, then another driver may be waiting for that gpiochip to load on the deferred-probe list. If the deferred-probe for the consumer of gpiochip then triggers between the gpiodev_add_to_list_unlocked() calls which makes gpio_device_find() see the chip and the gpiochip_setup_dev() later then gpio_device_find() does a kobject_get() on an uninitialized kobject since the kobject is initialized by gpiochip_setup_dev() calling device_initialize(): arizona spi-10WM5102:00: cannot find GPIO chip arizona, deferring arizona spi-10WM5102:00: cannot find GPIO chip arizona, deferring ------------[ cut here ]------------ kobject: 'gpiochip5' (00000000241466f2): is not initialized, yet kobject_get() is being called. WARNING: CPU: 3 PID: 42 at lib/kobject.c:640 kobject_get+0x43/0x70 Call Trace: kobject_get gpio_device_find gpiod_find_and_request gpiod_get snd_byt_wm5102_mc_probe Not only is the device not initialized yet, but when the gpio-device is added to the list things like the irqchip also have not been initialized yet. So gpio_device_find() should really ignore the gpio-device until gpiochip_add_data_with_key() is fully done. Add a device_is_registered() check to gpio_device_find() to ignore gpio-devices on the list which are not yet fully initialized. Fixes: aab5c6f20023 ("gpio: set device type for GPIO chips") Suggested-by: Bartosz Golaszewski Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko [Bartosz: fix a typo in commit message] Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 59ccf9a3e153..94903fc1c145 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1175,6 +1175,9 @@ struct gpio_device *gpio_device_find(const void *data, list_for_each_entry_srcu(gdev, &gpio_devices, list, srcu_read_lock_held(&gpio_devices_srcu)) { + if (!device_is_registered(&gdev->dev)) + continue; + guard(srcu)(&gdev->srcu); gc = srcu_dereference(gdev->chip, &gdev->srcu); -- cgit v1.2.3 From 1d86c2b3946e69d6b0b93568d312aae6247847c0 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:03 -0400 Subject: arm64: dts: imx8-ss-lsio: fix pwm lpcg indices lpcg's arg0 should use clock indices instead of index. pwm0_lpcg: clock-controller@5d400000 { ... // Col1 Col2 clocks = <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>, // 0 0 <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>, // 1 1 <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>, // 2 4 <&lsio_bus_clk>, // 3 5 <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>; // 4 6 clock-indices = , , , , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. pwm1 { .... clocks = <&pwm1_lpcg 4>, <&pwm1_lpcg 1>; ^^ ^^ should be: clocks = <&pwm1_lpcg IMX_LPCG_CLK_6>, <&pwm1_lpcg IMX_LPCG_CLK_1>; }; Arg0 is divided by 4 in lpcg driver, so index 0 and 1 will be get by pwm driver, which are same as IMX_LPCG_CLK_6 and IMX_LPCG_CLK_1. Even it can work, but code logic is wrong. Fixed it by use correct indices. Cc: stable@vger.kernel.org Fixes: 23fa99b205ea ("arm64: dts: freescale: imx8-ss-lsio: add support for lsio_pwm0-3") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi index 7e510b21bbac..764c1a08e3b1 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-lsio.dtsi @@ -25,8 +25,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d000000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm0_lpcg 4>, - <&pwm0_lpcg 1>; + clocks = <&pwm0_lpcg IMX_LPCG_CLK_6>, + <&pwm0_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; @@ -38,8 +38,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d010000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm1_lpcg 4>, - <&pwm1_lpcg 1>; + clocks = <&pwm1_lpcg IMX_LPCG_CLK_6>, + <&pwm1_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_1 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; @@ -51,8 +51,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d020000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm2_lpcg 4>, - <&pwm2_lpcg 1>; + clocks = <&pwm2_lpcg IMX_LPCG_CLK_6>, + <&pwm2_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_2 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; @@ -64,8 +64,8 @@ lsio_subsys: bus@5d000000 { compatible = "fsl,imx27-pwm"; reg = <0x5d030000 0x10000>; clock-names = "ipg", "per"; - clocks = <&pwm3_lpcg 4>, - <&pwm3_lpcg 1>; + clocks = <&pwm3_lpcg IMX_LPCG_CLK_6>, + <&pwm3_lpcg IMX_LPCG_CLK_1>; assigned-clocks = <&clk IMX_SC_R_PWM_3 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; #pwm-cells = <3>; -- cgit v1.2.3 From 808e7716edcdb39d3498b9f567ef6017858b49aa Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:04 -0400 Subject: arm64: dts: imx8-ss-conn: fix usb lpcg indices usb2_lpcg: clock-controller@5b270000 { ... Col1 Col2 clocks = <&conn_ahb_clk>, <&conn_ipg_clk>; // 0 6 clock-indices = , ; // 0 7 ... }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. usbotg1: usb@5b0d0000 { ... clocks = <&usb2_lpcg 0>; ^^ Should be: clocks = <&usb2_lpcg IMX_LPCG_CLK_6>; }; usbphy1: usbphy@5b100000 { clocks = <&usb2_lpcg 1>; ^^ SHould be: clocks = <&usb2_lpcg IMX_LPCG_CLK_7>; }; Arg0 is divided by 4 in lpcg driver. So lpcg will do dummy enable. Fix it by use correct clock indices. Cc: stable@vger.kernel.org Fixes: 8065fc937f0f ("arm64: dts: imx8dxl: add usb1 and usb2 support") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi index af2259e99796..4aaf5a0c1ed8 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi @@ -41,7 +41,7 @@ conn_subsys: bus@5b000000 { interrupts = ; fsl,usbphy = <&usbphy1>; fsl,usbmisc = <&usbmisc1 0>; - clocks = <&usb2_lpcg 0>; + clocks = <&usb2_lpcg IMX_LPCG_CLK_6>; ahb-burst-config = <0x0>; tx-burst-size-dword = <0x10>; rx-burst-size-dword = <0x10>; @@ -58,7 +58,7 @@ conn_subsys: bus@5b000000 { usbphy1: usbphy@5b100000 { compatible = "fsl,imx7ulp-usbphy"; reg = <0x5b100000 0x1000>; - clocks = <&usb2_lpcg 1>; + clocks = <&usb2_lpcg IMX_LPCG_CLK_7>; power-domains = <&pd IMX_SC_R_USB_0_PHY>; status = "disabled"; }; -- cgit v1.2.3 From f72b544a514c07d34a0d9d5380f5905b3731e647 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:05 -0400 Subject: arm64: dts: imx8-ss-dma: fix spi lpcg indices spi0_lpcg: clock-controller@5a400000 { ... Col0 Col1 clocks = <&clk IMX_SC_R_SPI_0 IMX_SC_PM_CLK_PER>,// 0 1 <&dma_ipg_clk>; // 1 4 clock-indices = , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. lpspi0: spi@5a000000 { ... clocks = <&spi0_lpcg 0>, <&spi0_lpcg 1>; ^ ^ Should be: clocks = <&spi0_lpcg IMX_LPCG_CLK_0>, <&spi0_lpcg IMX_LPCG_CLK_4>; }; Arg0 is divided by 4 in lpcg driver. <&spi0_lpcg 0> and <&spi0_lpcg 1> are IMX_SC_PM_CLK_PER. Although code can work, code logic is wrong. It should use IMX_LPCG_CLK_0 and IMX_LPCG_CLK_4 for lpcg arg0. Cc: stable@vger.kernel.org Fixes: c4098885e790 ("arm64: dts: imx8dxl: add lpspi support") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index cab3468b1875..169cf885c744 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -28,8 +28,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi0_lpcg 0>, - <&spi0_lpcg 1>; + clocks = <&spi0_lpcg IMX_LPCG_CLK_0>, + <&spi0_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; @@ -44,8 +44,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi1_lpcg 0>, - <&spi1_lpcg 1>; + clocks = <&spi1_lpcg IMX_LPCG_CLK_0>, + <&spi1_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_1 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; @@ -60,8 +60,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi2_lpcg 0>, - <&spi2_lpcg 1>; + clocks = <&spi2_lpcg IMX_LPCG_CLK_0>, + <&spi2_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_2 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; @@ -76,8 +76,8 @@ dma_subsys: bus@5a000000 { #size-cells = <0>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&spi3_lpcg 0>, - <&spi3_lpcg 1>; + clocks = <&spi3_lpcg IMX_LPCG_CLK_0>, + <&spi3_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_SPI_3 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <60000000>; -- cgit v1.2.3 From 9055d87bce7276234173fa90e9702af31b3f5353 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:06 -0400 Subject: arm64: dts: imx8-ss-dma: fix pwm lpcg indices adma_pwm_lpcg: clock-controller@5a590000 { ... col1 col2 clocks = <&clk IMX_SC_R_LCD_0_PWM_0 IMX_SC_PM_CLK_PER>,// 0 0 <&dma_ipg_clk>; // 1 4 clock-indices = , ; ... }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. adma_pwm: pwm@5a190000 { ... clocks = <&adma_pwm_lpcg 1>, <&adma_pwm_lpcg 0>; ^^ ^^ Should be clocks = <&adma_pwm_lpcg IMX_LPCG_CLK_4>, <&adma_pwm_lpcg IMX_LPCG_CLK_0>; }; Arg0 will be divided by 4 in lcpg driver, so pwm will get IMX_SC_PM_CLK_PER by <&adma_pwm_lpcg 1>, <&adma_pwm_lpcg 0>. Although function can work, code logic is wrong. Fix it by use correct indices. Cc: stable@vger.kernel.org Fixes: f1d6a6b991ef ("arm64: dts: imx8qxp: add adma_pwm in adma") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index 169cf885c744..e3e911e57790 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -145,8 +145,8 @@ dma_subsys: bus@5a000000 { compatible = "fsl,imx8qxp-pwm", "fsl,imx27-pwm"; reg = <0x5a190000 0x1000>; interrupts = ; - clocks = <&adma_pwm_lpcg 1>, - <&adma_pwm_lpcg 0>; + clocks = <&adma_pwm_lpcg IMX_LPCG_CLK_4>, + <&adma_pwm_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_LCD_0_PWM_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; -- cgit v1.2.3 From 81975080f14167610976e968e8016e92d836266f Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:07 -0400 Subject: arm64: dts: imx8-ss-dma: fix adc lpcg indices adc0_lpcg: clock-controller@5ac80000 { ... Col1 Col2 clocks = <&clk IMX_SC_R_ADC_0 IMX_SC_PM_CLK_PER>, // 0 0 <&dma_ipg_clk>; // 1 4 clock-indices = , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. adc0: adc@5a880000 { clocks = <&adc0_lpcg 0>, <&adc0_lpcg 1>; ^^ ^^ clocks = <&adc0_lpcg IMX_LPCG_CLK_0>, <&adc0_lpcg IMX_LPCG_CLK_4>; Arg0 is divided by 4 in lpcg driver. So adc get IMX_SC_PM_CLK_PER by <&adc0_lpcg 0>, <&adc0_lpcg 1>. Although function can work, code logic is wrong. Fix it by using correct indices. Cc: stable@vger.kernel.org Fixes: 1db044b25d2e ("arm64: dts: imx8dxl: add adc0 support") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index e3e911e57790..23d59b34bc4c 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -355,8 +355,8 @@ dma_subsys: bus@5a000000 { reg = <0x5a880000 0x10000>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&adc0_lpcg 0>, - <&adc0_lpcg 1>; + clocks = <&adc0_lpcg IMX_LPCG_CLK_0>, + <&adc0_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_ADC_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; @@ -370,8 +370,8 @@ dma_subsys: bus@5a000000 { reg = <0x5a890000 0x10000>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&adc1_lpcg 0>, - <&adc1_lpcg 1>; + clocks = <&adc1_lpcg IMX_LPCG_CLK_0>, + <&adc1_lpcg IMX_LPCG_CLK_4>; clock-names = "per", "ipg"; assigned-clocks = <&clk IMX_SC_R_ADC_1 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <24000000>; -- cgit v1.2.3 From 0893392334b5dffdf616a53679c6a2942c46391b Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:08 -0400 Subject: arm64: dts: imx8-ss-dma: fix can lpcg indices can0_lpcg: clock-controller@5acd0000 { ... Col1 Col2 clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>, // 0 0 <&dma_ipg_clk>, // 1 4 <&dma_ipg_clk>; // 2 5 clock-indices = , , ; } Col1: index, which existing dts try to get. Col2: actual index in lpcg driver. flexcan1: can@5a8d0000 { clocks = <&can0_lpcg 1>, <&can0_lpcg 0>; ^^ ^^ Should be: clocks = <&can0_lpcg IMX_LPCG_CLK_4>, <&can0_lpcg IMX_LPCG_CLK_0>; }; Arg0 is divided by 4 in lpcg driver. flexcan driver get IMX_SC_PM_CLK_PER by <&can0_lpcg 1> and <&can0_lpcg 0>. Although function can work, code logic is wrong. Fix it by using correct clock indices. Cc: stable@vger.kernel.org Fixes: 5e7d5b023e03 ("arm64: dts: imx8qxp: add flexcan in adma") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi index 23d59b34bc4c..f7a91d43a0ff 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-dma.dtsi @@ -384,8 +384,8 @@ dma_subsys: bus@5a000000 { reg = <0x5a8d0000 0x10000>; interrupts = ; interrupt-parent = <&gic>; - clocks = <&can0_lpcg 1>, - <&can0_lpcg 0>; + clocks = <&can0_lpcg IMX_LPCG_CLK_4>, + <&can0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <40000000>; @@ -405,8 +405,8 @@ dma_subsys: bus@5a000000 { * CAN1 shares CAN0's clock and to enable CAN0's clock it * has to be powered on. */ - clocks = <&can0_lpcg 1>, - <&can0_lpcg 0>; + clocks = <&can0_lpcg IMX_LPCG_CLK_4>, + <&can0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <40000000>; @@ -426,8 +426,8 @@ dma_subsys: bus@5a000000 { * CAN2 shares CAN0's clock and to enable CAN0's clock it * has to be powered on. */ - clocks = <&can0_lpcg 1>, - <&can0_lpcg 0>; + clocks = <&can0_lpcg IMX_LPCG_CLK_4>, + <&can0_lpcg IMX_LPCG_CLK_0>; clock-names = "ipg", "per"; assigned-clocks = <&clk IMX_SC_R_CAN_0 IMX_SC_PM_CLK_PER>; assigned-clock-rates = <40000000>; -- cgit v1.2.3 From 00b436182138310bb8d362b912b12a9df8f72ca3 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 1 Apr 2024 18:25:09 -0400 Subject: arm64: dts: imx8qm-ss-dma: fix can lpcg indices can1_lpcg: clock-controller@5ace0000 { ... Col1 Col2 clocks = <&clk IMX_SC_R_CAN_1 IMX_SC_PM_CLK_PER>,// 0 0 <&dma_ipg_clk>, // 1 4 <&dma_ipg_clk>; // 2 5 clock-indices = , , ; }; Col1: index, which existing dts try to get. Col2: actual index in lpcg driver &flexcan2 { clocks = <&can1_lpcg 1>, <&can1_lpcg 0>; ^^ ^^ Should be: clocks = <&can1_lpcg IMX_LPCG_CLK_4>, <&can1_lpcg IMX_LPCG_CLK_0>; }; Arg0 is divided by 4 in lpcg driver. So flexcan get IMX_SC_PM_CLK_PER by <&can1_lpcg 1> and <&can1_lpcg 0>. Although function work, code logic is wrong. Fix it by using correct clock indices. Cc: stable@vger.kernel.org Fixes: be85831de020 ("arm64: dts: imx8qm: add can node in devicetree") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi b/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi index 11626fae5f97..aa9f28c4431d 100644 --- a/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8qm-ss-dma.dtsi @@ -153,15 +153,15 @@ }; &flexcan2 { - clocks = <&can1_lpcg 1>, - <&can1_lpcg 0>; + clocks = <&can1_lpcg IMX_LPCG_CLK_4>, + <&can1_lpcg IMX_LPCG_CLK_0>; assigned-clocks = <&clk IMX_SC_R_CAN_1 IMX_SC_PM_CLK_PER>; fsl,clk-source = /bits/ 8 <1>; }; &flexcan3 { - clocks = <&can2_lpcg 1>, - <&can2_lpcg 0>; + clocks = <&can2_lpcg IMX_LPCG_CLK_4>, + <&can2_lpcg IMX_LPCG_CLK_0>; assigned-clocks = <&clk IMX_SC_R_CAN_2 IMX_SC_PM_CLK_PER>; fsl,clk-source = /bits/ 8 <1>; }; -- cgit v1.2.3 From e6ec07dc6dd498415bc8cc49437d5ec9e09cc48e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 20 Mar 2024 10:38:58 +0100 Subject: s390/mm: fix NULL pointer dereference The recently added check to figure out if a fault happened on gmap ASCE dereferences the gmap pointer in lowcore without checking that it is not NULL. For all non-KVM processes the pointer is NULL, so that some value from lowcore will be read. With the current layouts of struct gmap and struct lowcore the read value (aka ASCE) is zero, so that this doesn't lead to any observable bug; at least currently. Fix this by adding the missing NULL pointer check. Fixes: 64c3431808bd ("s390/entry: compare gmap asce to determine guest/host fault") Acked-by: Sven Schnelle Reviewed-by: Claudio Imbrenda Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index c421dd44ffbe..0c66b32e0f9f 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -75,7 +75,7 @@ static enum fault_type get_fault_type(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_PGSTE)) return KERNEL_FAULT; gmap = (struct gmap *)S390_lowcore.gmap; - if (regs->cr1 == gmap->asce) + if (gmap && gmap->asce == regs->cr1) return GMAP_FAULT; return KERNEL_FAULT; } -- cgit v1.2.3 From 01cac82ae02b43983173ea8e475a1c999edd25a6 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 20 Mar 2024 23:47:48 +0100 Subject: s390/atomic: mark all functions __always_inline Atomic functions are quite ubiquitous and may be called by noinstr ones, introducing unwanted instrumentation. They are very small, so there are no significant downsides to force-inlining them. Signed-off-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20240320230007.4782-2-iii@linux.ibm.com Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/atomic.h | 44 +++++++++++++++++++------------------- arch/s390/include/asm/atomic_ops.h | 22 +++++++++---------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 7138d189cc42..0c4cad7d5a5b 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -15,31 +15,31 @@ #include #include -static inline int arch_atomic_read(const atomic_t *v) +static __always_inline int arch_atomic_read(const atomic_t *v) { return __atomic_read(v); } #define arch_atomic_read arch_atomic_read -static inline void arch_atomic_set(atomic_t *v, int i) +static __always_inline void arch_atomic_set(atomic_t *v, int i) { __atomic_set(v, i); } #define arch_atomic_set arch_atomic_set -static inline int arch_atomic_add_return(int i, atomic_t *v) +static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return __atomic_add_barrier(i, &v->counter) + i; } #define arch_atomic_add_return arch_atomic_add_return -static inline int arch_atomic_fetch_add(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return __atomic_add_barrier(i, &v->counter); } #define arch_atomic_fetch_add arch_atomic_fetch_add -static inline void arch_atomic_add(int i, atomic_t *v) +static __always_inline void arch_atomic_add(int i, atomic_t *v) { __atomic_add(i, &v->counter); } @@ -50,11 +50,11 @@ static inline void arch_atomic_add(int i, atomic_t *v) #define arch_atomic_fetch_sub(_i, _v) arch_atomic_fetch_add(-(int)(_i), _v) #define ATOMIC_OPS(op) \ -static inline void arch_atomic_##op(int i, atomic_t *v) \ +static __always_inline void arch_atomic_##op(int i, atomic_t *v) \ { \ __atomic_##op(i, &v->counter); \ } \ -static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ +static __always_inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ { \ return __atomic_##op##_barrier(i, &v->counter); \ } @@ -74,7 +74,7 @@ ATOMIC_OPS(xor) #define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new)) -static inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) +static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { return __atomic_cmpxchg(&v->counter, old, new); } @@ -82,31 +82,31 @@ static inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) #define ATOMIC64_INIT(i) { (i) } -static inline s64 arch_atomic64_read(const atomic64_t *v) +static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { return __atomic64_read(v); } #define arch_atomic64_read arch_atomic64_read -static inline void arch_atomic64_set(atomic64_t *v, s64 i) +static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { __atomic64_set(v, i); } #define arch_atomic64_set arch_atomic64_set -static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return __atomic64_add_barrier(i, (long *)&v->counter) + i; } #define arch_atomic64_add_return arch_atomic64_add_return -static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return __atomic64_add_barrier(i, (long *)&v->counter); } #define arch_atomic64_fetch_add arch_atomic64_fetch_add -static inline void arch_atomic64_add(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { __atomic64_add(i, (long *)&v->counter); } @@ -114,20 +114,20 @@ static inline void arch_atomic64_add(s64 i, atomic64_t *v) #define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new)) -static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return __atomic64_cmpxchg((long *)&v->counter, old, new); } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg -#define ATOMIC64_OPS(op) \ -static inline void arch_atomic64_##op(s64 i, atomic64_t *v) \ -{ \ - __atomic64_##op(i, (long *)&v->counter); \ -} \ -static inline long arch_atomic64_fetch_##op(s64 i, atomic64_t *v) \ -{ \ - return __atomic64_##op##_barrier(i, (long *)&v->counter); \ +#define ATOMIC64_OPS(op) \ +static __always_inline void arch_atomic64_##op(s64 i, atomic64_t *v) \ +{ \ + __atomic64_##op(i, (long *)&v->counter); \ +} \ +static __always_inline long arch_atomic64_fetch_##op(s64 i, atomic64_t *v) \ +{ \ + return __atomic64_##op##_barrier(i, (long *)&v->counter); \ } ATOMIC64_OPS(and) diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h index 50510e08b893..7fa5f96a553a 100644 --- a/arch/s390/include/asm/atomic_ops.h +++ b/arch/s390/include/asm/atomic_ops.h @@ -8,7 +8,7 @@ #ifndef __ARCH_S390_ATOMIC_OPS__ #define __ARCH_S390_ATOMIC_OPS__ -static inline int __atomic_read(const atomic_t *v) +static __always_inline int __atomic_read(const atomic_t *v) { int c; @@ -18,14 +18,14 @@ static inline int __atomic_read(const atomic_t *v) return c; } -static inline void __atomic_set(atomic_t *v, int i) +static __always_inline void __atomic_set(atomic_t *v, int i) { asm volatile( " st %1,%0\n" : "=R" (v->counter) : "d" (i)); } -static inline s64 __atomic64_read(const atomic64_t *v) +static __always_inline s64 __atomic64_read(const atomic64_t *v) { s64 c; @@ -35,7 +35,7 @@ static inline s64 __atomic64_read(const atomic64_t *v) return c; } -static inline void __atomic64_set(atomic64_t *v, s64 i) +static __always_inline void __atomic64_set(atomic64_t *v, s64 i) { asm volatile( " stg %1,%0\n" @@ -45,7 +45,7 @@ static inline void __atomic64_set(atomic64_t *v, s64 i) #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES #define __ATOMIC_OP(op_name, op_type, op_string, op_barrier) \ -static inline op_type op_name(op_type val, op_type *ptr) \ +static __always_inline op_type op_name(op_type val, op_type *ptr) \ { \ op_type old; \ \ @@ -96,7 +96,7 @@ __ATOMIC_CONST_OPS(__atomic64_add_const, long, "agsi") #else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ #define __ATOMIC_OP(op_name, op_string) \ -static inline int op_name(int val, int *ptr) \ +static __always_inline int op_name(int val, int *ptr) \ { \ int old, new; \ \ @@ -122,7 +122,7 @@ __ATOMIC_OPS(__atomic_xor, "xr") #undef __ATOMIC_OPS #define __ATOMIC64_OP(op_name, op_string) \ -static inline long op_name(long val, long *ptr) \ +static __always_inline long op_name(long val, long *ptr) \ { \ long old, new; \ \ @@ -154,7 +154,7 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr") #endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ -static inline int __atomic_cmpxchg(int *ptr, int old, int new) +static __always_inline int __atomic_cmpxchg(int *ptr, int old, int new) { asm volatile( " cs %[old],%[new],%[ptr]" @@ -164,7 +164,7 @@ static inline int __atomic_cmpxchg(int *ptr, int old, int new) return old; } -static inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new) +static __always_inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new) { int old_expected = old; @@ -176,7 +176,7 @@ static inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new) return old == old_expected; } -static inline long __atomic64_cmpxchg(long *ptr, long old, long new) +static __always_inline long __atomic64_cmpxchg(long *ptr, long old, long new) { asm volatile( " csg %[old],%[new],%[ptr]" @@ -186,7 +186,7 @@ static inline long __atomic64_cmpxchg(long *ptr, long old, long new) return old; } -static inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new) +static __always_inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new) { long old_expected = old; -- cgit v1.2.3 From c9c260681f521e4ad9f9f4cc71fe35b978e06222 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 20 Mar 2024 23:47:49 +0100 Subject: s390/preempt: mark all functions __always_inline preempt_count-related functions are quite ubiquitous and may be called by noinstr ones, introducing unwanted instrumentation. Here is one example call chain: irqentry_nmi_enter() # noinstr lockdep_hardirqs_enabled() this_cpu_read() __pcpu_size_call_return() this_cpu_read_*() this_cpu_generic_read() __this_cpu_generic_read_nopreempt() preempt_disable_notrace() __preempt_count_inc() __preempt_count_add() They are very small, so there are no significant downsides to force-inlining them. Signed-off-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20240320230007.4782-3-iii@linux.ibm.com Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/preempt.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h index bf15da0fedbc..0e3da500e98c 100644 --- a/arch/s390/include/asm/preempt.h +++ b/arch/s390/include/asm/preempt.h @@ -12,12 +12,12 @@ #define PREEMPT_NEED_RESCHED 0x80000000 #define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED) -static inline int preempt_count(void) +static __always_inline int preempt_count(void) { return READ_ONCE(S390_lowcore.preempt_count) & ~PREEMPT_NEED_RESCHED; } -static inline void preempt_count_set(int pc) +static __always_inline void preempt_count_set(int pc) { int old, new; @@ -29,22 +29,22 @@ static inline void preempt_count_set(int pc) old, new) != old); } -static inline void set_preempt_need_resched(void) +static __always_inline void set_preempt_need_resched(void) { __atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); } -static inline void clear_preempt_need_resched(void) +static __always_inline void clear_preempt_need_resched(void) { __atomic_or(PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); } -static inline bool test_preempt_need_resched(void) +static __always_inline bool test_preempt_need_resched(void) { return !(READ_ONCE(S390_lowcore.preempt_count) & PREEMPT_NEED_RESCHED); } -static inline void __preempt_count_add(int val) +static __always_inline void __preempt_count_add(int val) { /* * With some obscure config options and CONFIG_PROFILE_ALL_BRANCHES @@ -59,17 +59,17 @@ static inline void __preempt_count_add(int val) __atomic_add(val, &S390_lowcore.preempt_count); } -static inline void __preempt_count_sub(int val) +static __always_inline void __preempt_count_sub(int val) { __preempt_count_add(-val); } -static inline bool __preempt_count_dec_and_test(void) +static __always_inline bool __preempt_count_dec_and_test(void) { return __atomic_add(-1, &S390_lowcore.preempt_count) == 1; } -static inline bool should_resched(int preempt_offset) +static __always_inline bool should_resched(int preempt_offset) { return unlikely(READ_ONCE(S390_lowcore.preempt_count) == preempt_offset); @@ -79,45 +79,45 @@ static inline bool should_resched(int preempt_offset) #define PREEMPT_ENABLED (0) -static inline int preempt_count(void) +static __always_inline int preempt_count(void) { return READ_ONCE(S390_lowcore.preempt_count); } -static inline void preempt_count_set(int pc) +static __always_inline void preempt_count_set(int pc) { S390_lowcore.preempt_count = pc; } -static inline void set_preempt_need_resched(void) +static __always_inline void set_preempt_need_resched(void) { } -static inline void clear_preempt_need_resched(void) +static __always_inline void clear_preempt_need_resched(void) { } -static inline bool test_preempt_need_resched(void) +static __always_inline bool test_preempt_need_resched(void) { return false; } -static inline void __preempt_count_add(int val) +static __always_inline void __preempt_count_add(int val) { S390_lowcore.preempt_count += val; } -static inline void __preempt_count_sub(int val) +static __always_inline void __preempt_count_sub(int val) { S390_lowcore.preempt_count -= val; } -static inline bool __preempt_count_dec_and_test(void) +static __always_inline bool __preempt_count_dec_and_test(void) { return !--S390_lowcore.preempt_count && tif_need_resched(); } -static inline bool should_resched(int preempt_offset) +static __always_inline bool should_resched(int preempt_offset) { return unlikely(preempt_count() == preempt_offset && tif_need_resched()); -- cgit v1.2.3 From e9f3af02f63909f41b43c28330434cc437639c5c Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 29 Feb 2024 15:00:28 +0100 Subject: s390/pai: fix sampling event removal for PMU device driver In case of a sampling event, the PAI PMU device drivers need a reference to this event. Currently to PMU device driver reference is removed when a sampling event is destroyed. This may lead to situations where the reference of the PMU device driver is removed while being used by a different sampling event. Reset the event reference pointer of the PMU device driver when a sampling event is deleted and before the next one might be added. Fixes: 39d62336f5c1 ("s390/pai: add support for cryptography counters") Signed-off-by: Thomas Richter Acked-by: Sumanth Korikkar Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_pai_crypto.c | 10 +++++++--- arch/s390/kernel/perf_pai_ext.c | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index 823d652e3917..4ad472d130a3 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -90,7 +90,6 @@ static void paicrypt_event_destroy(struct perf_event *event) event->cpu); struct paicrypt_map *cpump = mp->mapptr; - cpump->event = NULL; static_branch_dec(&pai_key); mutex_lock(&pai_reserve_mutex); debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d" @@ -356,10 +355,15 @@ static int paicrypt_add(struct perf_event *event, int flags) static void paicrypt_stop(struct perf_event *event, int flags) { - if (!event->attr.sample_period) /* Counting */ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + + if (!event->attr.sample_period) { /* Counting */ paicrypt_read(event); - else /* Sampling */ + } else { /* Sampling */ perf_sched_cb_dec(event->pmu); + cpump->event = NULL; + } event->hw.state = PERF_HES_STOPPED; } diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index 616a25606cd6..a6da7e0cc7a6 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -122,7 +122,6 @@ static void paiext_event_destroy(struct perf_event *event) free_page(PAI_SAVE_AREA(event)); mutex_lock(&paiext_reserve_mutex); - cpump->event = NULL; if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */ paiext_free(mp); paiext_root_free(); @@ -362,10 +361,15 @@ static int paiext_add(struct perf_event *event, int flags) static void paiext_stop(struct perf_event *event, int flags) { - if (!event->attr.sample_period) /* Counting */ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + + if (!event->attr.sample_period) { /* Counting */ paiext_read(event); - else /* Sampling */ + } else { /* Sampling */ perf_sched_cb_dec(event->pmu); + cpump->event = NULL; + } event->hw.state = PERF_HES_STOPPED; } -- cgit v1.2.3 From 378ca2d2ad410a1cd5690d06b46c5e2297f4c8c0 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Tue, 26 Mar 2024 18:12:13 +0100 Subject: s390/entry: align system call table on 8 bytes Align system call table on 8 bytes. With sys_call_table entry size of 8 bytes that eliminates the possibility of a system call pointer crossing cache line boundary. Cc: stable@kernel.org Suggested-by: Ulrich Weigand Reviewed-by: Alexander Gordeev Signed-off-by: Sumanth Korikkar Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 787394978bc0..3dc85638bc63 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -635,6 +635,7 @@ SYM_DATA_START_LOCAL(daton_psw) SYM_DATA_END(daton_psw) .section .rodata, "a" + .balign 8 #define SYSCALL(esame,emu) .quad __s390x_ ## esame SYM_DATA_START(sys_call_table) #include "asm/syscall_table.h" -- cgit v1.2.3 From 438d3fc46f0deba24da7ded046c818e7bf434d24 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 24 Feb 2024 10:12:34 +0100 Subject: dt-bindings: clock: keystone: remove unstable remark Keystone clock controller bindings were marked as work-in-progress / unstable in 2013 in commit b9e0d40c0d83 ("clk: keystone: add Keystone PLL clock driver") and commit 7affe5685c96 ("clk: keystone: Add gate control clock driver") Almost eleven years is enough, so drop the "unstable" remark and expect usual ABI rules. Signed-off-by: Krzysztof Kozlowski Acked-by: Stephen Boyd Acked-by: Rob Herring Link: https://lore.kernel.org/r/20240224091236.10146-1-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/clock/keystone-gate.txt | 2 -- Documentation/devicetree/bindings/clock/keystone-pll.txt | 2 -- 2 files changed, 4 deletions(-) diff --git a/Documentation/devicetree/bindings/clock/keystone-gate.txt b/Documentation/devicetree/bindings/clock/keystone-gate.txt index c5aa187026e3..43f6fb6c9392 100644 --- a/Documentation/devicetree/bindings/clock/keystone-gate.txt +++ b/Documentation/devicetree/bindings/clock/keystone-gate.txt @@ -1,5 +1,3 @@ -Status: Unstable - ABI compatibility may be broken in the future - Binding for Keystone gate control driver which uses PSC controller IP. This binding uses the common clock binding[1]. diff --git a/Documentation/devicetree/bindings/clock/keystone-pll.txt b/Documentation/devicetree/bindings/clock/keystone-pll.txt index 9a3fbc665606..69b0eb7c03c9 100644 --- a/Documentation/devicetree/bindings/clock/keystone-pll.txt +++ b/Documentation/devicetree/bindings/clock/keystone-pll.txt @@ -1,5 +1,3 @@ -Status: Unstable - ABI compatibility may be broken in the future - Binding for keystone PLLs. The main PLL IP typically has a multiplier, a divider and a post divider. The additional PLL IPs like ARMPLL, DDRPLL and PAPLL are controlled by the memory mapped register where as the Main -- cgit v1.2.3 From 63fd4d7dc45db58a348624fd46ed74509c458054 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 24 Feb 2024 10:12:35 +0100 Subject: dt-bindings: clock: ti: remove unstable remark Several TI SoC clock bindings were marked as work-in-progress / unstable between 2013-2016, for example in commit f60b1ea5ea7a ("CLK: TI: add support for gate clock"). It was enough of time to consider them stable and expect usual ABI rules. Signed-off-by: Krzysztof Kozlowski Acked-by: Stephen Boyd Acked-by: Rob Herring Acked-by: Tony Lindgren Link: https://lore.kernel.org/r/20240224091236.10146-2-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/clock/ti/adpll.txt | 2 -- Documentation/devicetree/bindings/clock/ti/apll.txt | 2 -- Documentation/devicetree/bindings/clock/ti/autoidle.txt | 2 -- Documentation/devicetree/bindings/clock/ti/clockdomain.txt | 2 -- Documentation/devicetree/bindings/clock/ti/composite.txt | 2 -- Documentation/devicetree/bindings/clock/ti/divider.txt | 2 -- Documentation/devicetree/bindings/clock/ti/dpll.txt | 2 -- Documentation/devicetree/bindings/clock/ti/fapll.txt | 2 -- Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt | 2 -- Documentation/devicetree/bindings/clock/ti/gate.txt | 2 -- Documentation/devicetree/bindings/clock/ti/interface.txt | 2 -- Documentation/devicetree/bindings/clock/ti/mux.txt | 2 -- 12 files changed, 24 deletions(-) diff --git a/Documentation/devicetree/bindings/clock/ti/adpll.txt b/Documentation/devicetree/bindings/clock/ti/adpll.txt index 4c8a2ce2cd70..3122360adcf3 100644 --- a/Documentation/devicetree/bindings/clock/ti/adpll.txt +++ b/Documentation/devicetree/bindings/clock/ti/adpll.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments ADPLL clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped ADPLL with two to three selectable input clocks and three to four children. diff --git a/Documentation/devicetree/bindings/clock/ti/apll.txt b/Documentation/devicetree/bindings/clock/ti/apll.txt index ade4dd4c30f0..bbd505c1199d 100644 --- a/Documentation/devicetree/bindings/clock/ti/apll.txt +++ b/Documentation/devicetree/bindings/clock/ti/apll.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments APLL clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped APLL with usually two selectable input clocks (reference clock and bypass clock), with analog phase locked diff --git a/Documentation/devicetree/bindings/clock/ti/autoidle.txt b/Documentation/devicetree/bindings/clock/ti/autoidle.txt index 7c735dde9fe9..05645a10a9e3 100644 --- a/Documentation/devicetree/bindings/clock/ti/autoidle.txt +++ b/Documentation/devicetree/bindings/clock/ti/autoidle.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments autoidle clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register mapped clock which can be put to idle automatically by hardware based on the usage and a configuration bit setting. Autoidle clock is never an individual diff --git a/Documentation/devicetree/bindings/clock/ti/clockdomain.txt b/Documentation/devicetree/bindings/clock/ti/clockdomain.txt index 9c6199249ce5..edf0b5d42768 100644 --- a/Documentation/devicetree/bindings/clock/ti/clockdomain.txt +++ b/Documentation/devicetree/bindings/clock/ti/clockdomain.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments clockdomain. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1] in consumer role. Every clock on TI SoC belongs to one clockdomain, but software only needs this information for specific clocks which require diff --git a/Documentation/devicetree/bindings/clock/ti/composite.txt b/Documentation/devicetree/bindings/clock/ti/composite.txt index 33ac7c9ad053..6f7e1331b546 100644 --- a/Documentation/devicetree/bindings/clock/ti/composite.txt +++ b/Documentation/devicetree/bindings/clock/ti/composite.txt @@ -1,7 +1,5 @@ Binding for TI composite clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped composite clock with multiple different sub-types; diff --git a/Documentation/devicetree/bindings/clock/ti/divider.txt b/Documentation/devicetree/bindings/clock/ti/divider.txt index 9b13b32974f9..4d7c76f0b356 100644 --- a/Documentation/devicetree/bindings/clock/ti/divider.txt +++ b/Documentation/devicetree/bindings/clock/ti/divider.txt @@ -1,7 +1,5 @@ Binding for TI divider clock -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped adjustable clock rate divider that does not gate and has only one input clock or parent. By default the value programmed into diff --git a/Documentation/devicetree/bindings/clock/ti/dpll.txt b/Documentation/devicetree/bindings/clock/ti/dpll.txt index 37a7cb6ad07d..14a1b72c2e71 100644 --- a/Documentation/devicetree/bindings/clock/ti/dpll.txt +++ b/Documentation/devicetree/bindings/clock/ti/dpll.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments DPLL clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped DPLL with usually two selectable input clocks (reference clock and bypass clock), with digital phase locked diff --git a/Documentation/devicetree/bindings/clock/ti/fapll.txt b/Documentation/devicetree/bindings/clock/ti/fapll.txt index c19b3f253b8c..88986ef39ddd 100644 --- a/Documentation/devicetree/bindings/clock/ti/fapll.txt +++ b/Documentation/devicetree/bindings/clock/ti/fapll.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments FAPLL clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped FAPLL with usually two selectable input clocks (reference clock and bypass clock), and one or more child diff --git a/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt b/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt index 518e3c142276..dc69477b6e98 100644 --- a/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt +++ b/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt @@ -1,7 +1,5 @@ Binding for TI fixed factor rate clock sources. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1], and also uses the autoidle support from TI autoidle clock [2]. diff --git a/Documentation/devicetree/bindings/clock/ti/gate.txt b/Documentation/devicetree/bindings/clock/ti/gate.txt index 4982615c01b9..a8e0335b006a 100644 --- a/Documentation/devicetree/bindings/clock/ti/gate.txt +++ b/Documentation/devicetree/bindings/clock/ti/gate.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments gate clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. This clock is quite much similar to the basic gate-clock [2], however, it supports a number of additional features. If no register diff --git a/Documentation/devicetree/bindings/clock/ti/interface.txt b/Documentation/devicetree/bindings/clock/ti/interface.txt index d3eb5ca92a7f..85fb1f2d2d28 100644 --- a/Documentation/devicetree/bindings/clock/ti/interface.txt +++ b/Documentation/devicetree/bindings/clock/ti/interface.txt @@ -1,7 +1,5 @@ Binding for Texas Instruments interface clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. This clock is quite much similar to the basic gate-clock [2], however, it supports a number of additional features, including diff --git a/Documentation/devicetree/bindings/clock/ti/mux.txt b/Documentation/devicetree/bindings/clock/ti/mux.txt index b33f641f1043..cd56d3c1c09f 100644 --- a/Documentation/devicetree/bindings/clock/ti/mux.txt +++ b/Documentation/devicetree/bindings/clock/ti/mux.txt @@ -1,7 +1,5 @@ Binding for TI mux clock. -Binding status: Unstable - ABI compatibility may be broken in the future - This binding uses the common clock binding[1]. It assumes a register-mapped multiplexer with multiple input clock signals or parents, one of which can be selected as output. This clock does not -- cgit v1.2.3 From 9117a64403e66b295a875e172954b758477e5f57 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 24 Feb 2024 10:12:36 +0100 Subject: dt-bindings: remoteproc: ti,davinci: remove unstable remark TI Davinci remoteproc bindings were marked as work-in-progress / unstable in 2017 in commit ae67b8007816 ("dt-bindings: remoteproc: Add bindings for Davinci DSP processors"). Almost seven years is enough, so drop the "unstable" remark and expect usual ABI rules. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Mathieu Poirier Acked-by: Rob Herring Link: https://lore.kernel.org/r/20240224091236.10146-3-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/remoteproc/ti,davinci-rproc.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/Documentation/devicetree/bindings/remoteproc/ti,davinci-rproc.txt b/Documentation/devicetree/bindings/remoteproc/ti,davinci-rproc.txt index 25f8658e216f..48a49c516b62 100644 --- a/Documentation/devicetree/bindings/remoteproc/ti,davinci-rproc.txt +++ b/Documentation/devicetree/bindings/remoteproc/ti,davinci-rproc.txt @@ -1,9 +1,6 @@ TI Davinci DSP devices ======================= -Binding status: Unstable - Subject to changes for DT representation of clocks - and resets - The TI Davinci family of SoCs usually contains a TI DSP Core sub-system that is used to offload some of the processor-intensive tasks or algorithms, for achieving various system level goals. -- cgit v1.2.3 From 6fad9df49b40fdb7d8458167ebbde46a8681f729 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 25 Mar 2024 11:48:32 +0100 Subject: dt-bindings: soc: fsl: narrow regex for unit address to hex numbers Regular expression used to match the unit address part should not allow non-hex numbers. Acked-by: Rob Herring Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240325104833.33372-1-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-dcfg.yaml | 2 +- Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-scfg.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-dcfg.yaml b/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-dcfg.yaml index 397f75909b20..ce1a6505eb51 100644 --- a/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-dcfg.yaml +++ b/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-dcfg.yaml @@ -51,7 +51,7 @@ properties: ranges: true patternProperties: - "^clock-controller@[0-9a-z]+$": + "^clock-controller@[0-9a-f]+$": $ref: /schemas/clock/fsl,flexspi-clock.yaml# required: diff --git a/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-scfg.yaml b/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-scfg.yaml index 8d088b5fe823..a6a511b00a12 100644 --- a/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-scfg.yaml +++ b/Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-scfg.yaml @@ -41,7 +41,7 @@ properties: ranges: true patternProperties: - "^interrupt-controller@[a-z0-9]+$": + "^interrupt-controller@[a-f0-9]+$": $ref: /schemas/interrupt-controller/fsl,ls-extirq.yaml# required: -- cgit v1.2.3 From 500b42091c1dd878b7a8a59ef89aba85e0054b7b Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 25 Mar 2024 11:48:33 +0100 Subject: dt-bindings: timer: narrow regex for unit address to hex numbers Regular expression used to match the unit address part should not allow non-hex numbers. Expect at least one hex digit as well. Acked-by: Rob Herring Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240325104833.33372-2-krzysztof.kozlowski@linaro.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/timer/arm,arch_timer_mmio.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/timer/arm,arch_timer_mmio.yaml b/Documentation/devicetree/bindings/timer/arm,arch_timer_mmio.yaml index 7a4a6ab85970..ab8f28993139 100644 --- a/Documentation/devicetree/bindings/timer/arm,arch_timer_mmio.yaml +++ b/Documentation/devicetree/bindings/timer/arm,arch_timer_mmio.yaml @@ -60,7 +60,7 @@ properties: be implemented in an always-on power domain." patternProperties: - '^frame@[0-9a-z]*$': + '^frame@[0-9a-f]+$': type: object additionalProperties: false description: A timer node has up to 8 frame sub-nodes, each with the following properties. -- cgit v1.2.3 From 0200ceed3042222a1f78b3e79ec71a5a52977e3a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 29 Feb 2024 22:51:38 +0000 Subject: vboxsf: remove redundant variable out_len The variable out_len is being used to accumulate the number of bytes but it is not being used for any other purpose. The variable is redundant and can be removed. Cleans up clang scan build warning: fs/vboxsf/utils.c:443:9: warning: variable 'out_len' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20240229225138.351909-1-colin.i.king@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- fs/vboxsf/utils.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index 72ac9320e6a3..9515bbf0b54c 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -440,7 +440,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len, { const char *in; char *out; - size_t out_len; size_t out_bound_len; size_t in_bound_len; @@ -448,7 +447,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len, in_bound_len = utf8_len; out = name; - out_len = 0; /* Reserve space for terminating 0 */ out_bound_len = name_bound_len - 1; @@ -469,7 +467,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len, out += nb; out_bound_len -= nb; - out_len += nb; } *out = 0; -- cgit v1.2.3 From b017a0cea627fcbe158fc2c214fe893e18c4d0c4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 25 Mar 2024 16:35:21 +0000 Subject: arm64/ptrace: Use saved floating point state type to determine SVE layout The SVE register sets have two different formats, one of which is a wrapped version of the standard FPSIMD register set and another with actual SVE register data. At present we check TIF_SVE to see if full SVE register state should be provided when reading the SVE regset but if we were in a syscall we may have saved only floating point registers even though that is set. Fix this and simplify the logic by checking and using the format which we recorded when deciding if we should use FPSIMD or SVE format. Fixes: 8c845e273104 ("arm64/sve: Leave SVE enabled on syscall if we don't context switch") Cc: # 6.2.x Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240325-arm64-ptrace-fp-type-v1-1-8dc846caf11f@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/ptrace.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 162b030ab9da..0d022599eb61 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -761,7 +761,6 @@ static void sve_init_header_from_task(struct user_sve_header *header, { unsigned int vq; bool active; - bool fpsimd_only; enum vec_type task_type; memset(header, 0, sizeof(*header)); @@ -777,12 +776,10 @@ static void sve_init_header_from_task(struct user_sve_header *header, case ARM64_VEC_SVE: if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; - fpsimd_only = !test_tsk_thread_flag(target, TIF_SVE); break; case ARM64_VEC_SME: if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; - fpsimd_only = false; break; default: WARN_ON_ONCE(1); @@ -790,7 +787,7 @@ static void sve_init_header_from_task(struct user_sve_header *header, } if (active) { - if (fpsimd_only) { + if (target->thread.fp_type == FP_STATE_FPSIMD) { header->flags |= SVE_PT_REGS_FPSIMD; } else { header->flags |= SVE_PT_REGS_SVE; -- cgit v1.2.3 From de3f64b738af57e2732b91a0774facc675b75b54 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 1 Nov 2023 11:49:48 +0100 Subject: vboxsf: Avoid an spurious warning if load_nls_xxx() fails If an load_nls_xxx() function fails a few lines above, the 'sbi->bdi_id' is still 0. So, in the error handling path, we will call ida_simple_remove(..., 0) which is not allocated yet. In order to prevent a spurious "ida_free called for id=0 which is not allocated." message, tweak the error handling path and add a new label. Fixes: 0fd169576648 ("fs: Add VirtualBox guest shared folder (vboxsf) support") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/d09eaaa4e2e08206c58a1a27ca9b3e81dc168773.1698835730.git.christophe.jaillet@wanadoo.fr Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- fs/vboxsf/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index cabe8ac4fefc..1ed6b1a76bf9 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -151,7 +151,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) if (!sbi->nls) { vbg_err("vboxsf: Count not load '%s' nls\n", nls_name); err = -EINVAL; - goto fail_free; + goto fail_destroy_idr; } } @@ -224,6 +224,7 @@ fail_free: ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id); if (sbi->nls) unload_nls(sbi->nls); +fail_destroy_idr: idr_destroy(&sbi->ino_idr); kfree(sbi); return err; -- cgit v1.2.3 From 0141d68f86d78953fb4c3983d666e92f7df4a43d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 1 Nov 2023 11:49:49 +0100 Subject: vboxsf: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/b3c057c86b73f0309a6362031d21f4d7ebb60587.1698835730.git.christophe.jaillet@wanadoo.fr Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- fs/vboxsf/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 1ed6b1a76bf9..ffb1d565da39 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -155,7 +155,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) } } - sbi->bdi_id = ida_simple_get(&vboxsf_bdi_ida, 0, 0, GFP_KERNEL); + sbi->bdi_id = ida_alloc(&vboxsf_bdi_ida, GFP_KERNEL); if (sbi->bdi_id < 0) { err = sbi->bdi_id; goto fail_free; @@ -221,7 +221,7 @@ fail_unmap: vboxsf_unmap_folder(sbi->root); fail_free: if (sbi->bdi_id >= 0) - ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id); + ida_free(&vboxsf_bdi_ida, sbi->bdi_id); if (sbi->nls) unload_nls(sbi->nls); fail_destroy_idr: @@ -269,7 +269,7 @@ static void vboxsf_put_super(struct super_block *sb) vboxsf_unmap_folder(sbi->root); if (sbi->bdi_id >= 0) - ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id); + ida_free(&vboxsf_bdi_ida, sbi->bdi_id); if (sbi->nls) unload_nls(sbi->nls); -- cgit v1.2.3 From 1ece2c43b88660ddbdf8ecb772e9c41ed9cda3dd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 19 Mar 2024 12:32:04 -0400 Subject: vboxsf: explicitly deny setlease attempts vboxsf does not break leases on its own, so it can't properly handle the case where the hypervisor changes the data. Don't allow file leases on vboxsf. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240319-setlease-v1-1-5997d67e04b3@kernel.org Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- fs/vboxsf/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index 2307f8037efc..118dedef8ebe 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -218,6 +218,7 @@ const struct file_operations vboxsf_reg_fops = { .release = vboxsf_file_release, .fsync = noop_fsync, .splice_read = filemap_splice_read, + .setlease = simple_nosetlease, }; const struct inode_operations vboxsf_reg_iops = { -- cgit v1.2.3 From 1057c4c36ef8b236a2e28edef301da0801338c5f Mon Sep 17 00:00:00 2001 From: Nikita Travkin Date: Wed, 3 Apr 2024 16:31:40 +0500 Subject: thermal: gov_power_allocator: Allow binding without cooling devices IPA was recently refactored to split out memory allocation into a separate funciton. That funciton was made to return -EINVAL if there is zero power_actors and thus no memory to allocate. This causes IPA to fail probing when the thermal zone has no attached cooling devices. Since cooling devices can attach after the thermal zone is created and the governer is attached to it, failing probe due to the lack of cooling devices is incorrect. Change the allocate_actors_buffer() to return success when there is no cooling devices present. Fixes: 912e97c67cc3 ("thermal: gov_power_allocator: Move memory allocation out of throttle()") Signed-off-by: Nikita Travkin Reviewed-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 1b17dc4c219c..ec637071ef1f 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -606,7 +606,7 @@ static int allocate_actors_buffer(struct power_allocator_params *params, /* There might be no cooling devices yet. */ if (!num_actors) { - ret = -EINVAL; + ret = 0; goto clean_state; } -- cgit v1.2.3 From da781936e7c301e6197eb6513775748e79fb2575 Mon Sep 17 00:00:00 2001 From: Nikita Travkin Date: Wed, 3 Apr 2024 16:31:41 +0500 Subject: thermal: gov_power_allocator: Allow binding without trip points IPA probe function was recently refactored to perform extra error checks and make sure the thermal zone has trip points necessary for the IPA operation. With this change, if a thermal zone is probed such that it has no trip points that IPA can use, IPA will fail and the TZ won't be created. This is the case if a platform defines a TZ without cooling devices and only with "hot"/"critical" trip points, often found on some Qualcomm devices [1]. Documentation across IPA code (notably get_governor_trips() kerneldoc) suggests that IPA is supposed to handle such TZ even if it won't actually do anything. This commit partially reverts the previous change to allow IPA to bind to such "empty" thermal zones. Fixes: e83747c2f8e3 ("thermal: gov_power_allocator: Set up trip points earlier") Link: arch/arm64/boot/dts/qcom/sc7180.dtsi#n4776 # [1] Signed-off-by: Nikita Travkin Reviewed-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index ec637071ef1f..e25e48d76aa7 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -679,11 +679,6 @@ static int power_allocator_bind(struct thermal_zone_device *tz) return -ENOMEM; get_governor_trips(tz, params); - if (!params->trip_max) { - dev_warn(&tz->device, "power_allocator: missing trip_max\n"); - kfree(params); - return -EINVAL; - } ret = check_power_actors(tz, params); if (ret < 0) { @@ -714,9 +709,10 @@ static int power_allocator_bind(struct thermal_zone_device *tz) else params->sustainable_power = tz->tzp->sustainable_power; - estimate_pid_constants(tz, tz->tzp->sustainable_power, - params->trip_switch_on, - params->trip_max->temperature); + if (params->trip_max) + estimate_pid_constants(tz, tz->tzp->sustainable_power, + params->trip_switch_on, + params->trip_max->temperature); reset_pid_controller(params); -- cgit v1.2.3 From 90ca6956d3834db4060f87700e2fcbb699c4e4fd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 21 Mar 2024 17:42:12 +0300 Subject: ice: Fix freeing uninitialized pointers Automatically cleaned up pointers need to be initialized before exiting their scope. In this case, they need to be initialized to NULL before any return statement. Fixes: 90f821d72e11 ("ice: avoid unnecessary devm_ usage") Signed-off-by: Dan Carpenter Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_common.c | 10 +++++----- drivers/net/ethernet/intel/ice/ice_ethtool.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index db4b2844e1f7..d9f6cc71d900 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1002,8 +1002,8 @@ static void ice_get_itr_intrl_gran(struct ice_hw *hw) */ int ice_init_hw(struct ice_hw *hw) { - struct ice_aqc_get_phy_caps_data *pcaps __free(kfree); - void *mac_buf __free(kfree); + struct ice_aqc_get_phy_caps_data *pcaps __free(kfree) = NULL; + void *mac_buf __free(kfree) = NULL; u16 mac_buf_len; int status; @@ -3272,7 +3272,7 @@ int ice_update_link_info(struct ice_port_info *pi) return status; if (li->link_info & ICE_AQ_MEDIA_AVAILABLE) { - struct ice_aqc_get_phy_caps_data *pcaps __free(kfree); + struct ice_aqc_get_phy_caps_data *pcaps __free(kfree) = NULL; pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL); if (!pcaps) @@ -3420,7 +3420,7 @@ ice_cfg_phy_fc(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, int ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool ena_auto_link_update) { - struct ice_aqc_get_phy_caps_data *pcaps __free(kfree); + struct ice_aqc_get_phy_caps_data *pcaps __free(kfree) = NULL; struct ice_aqc_set_phy_cfg_data cfg = { 0 }; struct ice_hw *hw; int status; @@ -3561,7 +3561,7 @@ int ice_cfg_phy_fec(struct ice_port_info *pi, struct ice_aqc_set_phy_cfg_data *cfg, enum ice_fec_mode fec) { - struct ice_aqc_get_phy_caps_data *pcaps __free(kfree); + struct ice_aqc_get_phy_caps_data *pcaps __free(kfree) = NULL; struct ice_hw *hw; int status; diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 255a9c8151b4..78b833b3e1d7 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -941,11 +941,11 @@ static u64 ice_loopback_test(struct net_device *netdev) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *orig_vsi = np->vsi, *test_vsi; struct ice_pf *pf = orig_vsi->back; + u8 *tx_frame __free(kfree) = NULL; u8 broadcast[ETH_ALEN], ret = 0; int num_frames, valid_frames; struct ice_tx_ring *tx_ring; struct ice_rx_ring *rx_ring; - u8 *tx_frame __free(kfree); int i; netdev_info(netdev, "loopback test\n"); -- cgit v1.2.3 From 8edfc7a40e3300fc6c5fa7a3228a24d5bcd86ba5 Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 25 Mar 2024 21:19:01 +0100 Subject: ice: fix enabling RX VLAN filtering ice_port_vlan_on/off() was introduced in commit 2946204b3fa8 ("ice: implement bridge port vlan"). But ice_port_vlan_on() incorrectly assigns ena_rx_filtering to inner_vlan_ops in DVM mode. This causes an error when rx_filtering cannot be enabled in legacy mode. Reproducer: echo 1 > /sys/class/net/$PF/device/sriov_numvfs ip link set $PF vf 0 spoofchk off trust on vlan 3 dmesg: ice 0000:41:00.0: failed to enable Rx VLAN filtering for VF 0 VSI 9 during VF rebuild, error -95 Fixes: 2946204b3fa8 ("ice: implement bridge port vlan") Signed-off-by: Petr Oros Reviewed-by: Michal Swiatkowski Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c index 80dc4bcdd3a4..b3e1bdcb80f8 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c @@ -26,24 +26,22 @@ static void ice_port_vlan_on(struct ice_vsi *vsi) struct ice_vsi_vlan_ops *vlan_ops; struct ice_pf *pf = vsi->back; - if (ice_is_dvm_ena(&pf->hw)) { - vlan_ops = &vsi->outer_vlan_ops; - - /* setup outer VLAN ops */ - vlan_ops->set_port_vlan = ice_vsi_set_outer_port_vlan; - vlan_ops->clear_port_vlan = ice_vsi_clear_outer_port_vlan; + /* setup inner VLAN ops */ + vlan_ops = &vsi->inner_vlan_ops; - /* setup inner VLAN ops */ - vlan_ops = &vsi->inner_vlan_ops; + if (ice_is_dvm_ena(&pf->hw)) { vlan_ops->add_vlan = noop_vlan_arg; vlan_ops->del_vlan = noop_vlan_arg; vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping; vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping; vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion; - } else { - vlan_ops = &vsi->inner_vlan_ops; + /* setup outer VLAN ops */ + vlan_ops = &vsi->outer_vlan_ops; + vlan_ops->set_port_vlan = ice_vsi_set_outer_port_vlan; + vlan_ops->clear_port_vlan = ice_vsi_clear_outer_port_vlan; + } else { vlan_ops->set_port_vlan = ice_vsi_set_inner_port_vlan; vlan_ops->clear_port_vlan = ice_vsi_clear_inner_port_vlan; } -- cgit v1.2.3 From 0e110732473e14d6520e49d75d2c88ef7d46fe67 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Tue, 2 Apr 2024 16:05:49 +0200 Subject: x86/retpoline: Do the necessary fixup to the Zen3/4 srso return thunk for !SRSO The srso_alias_untrain_ret() dummy thunk in the !CONFIG_MITIGATION_SRSO case is there only for the altenative in CALL_UNTRAIN_RET to have a symbol to resolve. However, testing with kernels which don't have CONFIG_MITIGATION_SRSO enabled, leads to the warning in patch_return() to fire: missing return thunk: srso_alias_untrain_ret+0x0/0x10-0x0: eb 0e 66 66 2e WARNING: CPU: 0 PID: 0 at arch/x86/kernel/alternative.c:826 apply_returns (arch/x86/kernel/alternative.c:826 Put in a plain "ret" there so that gcc doesn't put a return thunk in in its place which special and gets checked. In addition: ERROR: modpost: "srso_alias_untrain_ret" [arch/x86/kvm/kvm-amd.ko] undefined! make[2]: *** [scripts/Makefile.modpost:145: Module.symvers] Chyba 1 make[1]: *** [/usr/src/linux-6.8.3/Makefile:1873: modpost] Chyba 2 make: *** [Makefile:240: __sub-make] Chyba 2 since !SRSO builds would use the dummy return thunk as reported by petr.pisar@atlas.cz, https://bugzilla.kernel.org/show_bug.cgi?id=218679. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202404020901.da75a60f-oliver.sang@intel.com Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/all/202404020901.da75a60f-oliver.sang@intel.com/ Signed-off-by: Linus Torvalds --- arch/x86/lib/retpoline.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 02cde194a99e..0795b3464058 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -228,8 +228,11 @@ SYM_CODE_END(srso_return_thunk) #else /* !CONFIG_MITIGATION_SRSO */ /* Dummy for the alternative in CALL_UNTRAIN_RET. */ SYM_CODE_START(srso_alias_untrain_ret) - RET + ANNOTATE_UNRET_SAFE + ret + int3 SYM_FUNC_END(srso_alias_untrain_ret) +__EXPORT_THUNK(srso_alias_untrain_ret) #define JMP_SRSO_UNTRAIN_RET "ud2" #endif /* CONFIG_MITIGATION_SRSO */ -- cgit v1.2.3 From 701b38995e5bdd2a293936c55782140423827fb1 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 3 Apr 2024 09:57:29 +0200 Subject: security: Place security_path_post_mknod() where the original IMA call was Commit 08abce60d63f ("security: Introduce path_post_mknod hook") introduced security_path_post_mknod(), to replace the IMA-specific call to ima_post_path_mknod(). For symmetry with security_path_mknod(), security_path_post_mknod() was called after a successful mknod operation, for any file type, rather than only for regular files at the time there was the IMA call. However, as reported by VFS maintainers, successful mknod operation does not mean that the dentry always has an inode attached to it (for example, not for FIFOs on a SAMBA mount). If that condition happens, the kernel crashes when security_path_post_mknod() attempts to verify if the inode associated to the dentry is private. Move security_path_post_mknod() where the ima_post_path_mknod() call was, which is obviously correct from IMA/EVM perspective. IMA/EVM are the only in-kernel users, and only need to inspect regular files. Reported-by: Steve French Closes: https://lore.kernel.org/linux-kernel/CAH2r5msAVzxCUHHG8VKrMPUKQHmBpE6K9_vjhgDa1uAvwx4ppw@mail.gmail.com/ Suggested-by: Al Viro Fixes: 08abce60d63f ("security: Introduce path_post_mknod hook") Signed-off-by: Roberto Sassu Reviewed-by: Christian Brauner Reviewed-by: Mimi Zohar Acked-by: Paul Moore Signed-off-by: Linus Torvalds --- fs/namei.c | 7 ++----- security/security.c | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index ceb9ddf8dfdd..c5b2a25be7d0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4050,6 +4050,8 @@ retry: case 0: case S_IFREG: error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); + if (!error) + security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, @@ -4060,11 +4062,6 @@ retry: dentry, mode, 0); break; } - - if (error) - goto out2; - - security_path_post_mknod(idmap, dentry); out2: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { diff --git a/security/security.c b/security/security.c index 7e118858b545..0a9a0ac3f266 100644 --- a/security/security.c +++ b/security/security.c @@ -1793,11 +1793,11 @@ int security_path_mknod(const struct path *dir, struct dentry *dentry, EXPORT_SYMBOL(security_path_mknod); /** - * security_path_post_mknod() - Update inode security field after file creation + * security_path_post_mknod() - Update inode security after reg file creation * @idmap: idmap of the mount * @dentry: new file * - * Update inode security field after a file has been created. + * Update inode security field after a regular file has been created. */ void security_path_post_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { -- cgit v1.2.3 From 3f5eb32513e75eb321919a703800d4e13e9d3ba8 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 3 Apr 2024 14:18:39 +0300 Subject: ASoC: SOF: Intel: lnl: Disable DMIC/SSP offload on remove During probe the DMIC/SSP offload is enabled and it is not reversed on remove. Add a remove wrapper for LNL to disable the offload for DMIC and SSP similarly to what is done during probe. Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Link: https://msgid.link/r/20240403111839.27259-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/lnl.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/sound/soc/sof/intel/lnl.c b/sound/soc/sof/intel/lnl.c index d1c73d407e68..aeb4350cce6b 100644 --- a/sound/soc/sof/intel/lnl.c +++ b/sound/soc/sof/intel/lnl.c @@ -29,15 +29,17 @@ static const struct snd_sof_debugfs_map lnl_dsp_debugfs[] = { }; /* this helps allows the DSP to setup DMIC/SSP */ -static int hdac_bus_offload_dmic_ssp(struct hdac_bus *bus) +static int hdac_bus_offload_dmic_ssp(struct hdac_bus *bus, bool enable) { int ret; - ret = hdac_bus_eml_enable_offload(bus, true, AZX_REG_ML_LEPTR_ID_INTEL_SSP, true); + ret = hdac_bus_eml_enable_offload(bus, true, + AZX_REG_ML_LEPTR_ID_INTEL_SSP, enable); if (ret < 0) return ret; - ret = hdac_bus_eml_enable_offload(bus, true, AZX_REG_ML_LEPTR_ID_INTEL_DMIC, true); + ret = hdac_bus_eml_enable_offload(bus, true, + AZX_REG_ML_LEPTR_ID_INTEL_DMIC, enable); if (ret < 0) return ret; @@ -52,7 +54,19 @@ static int lnl_hda_dsp_probe(struct snd_sof_dev *sdev) if (ret < 0) return ret; - return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev)); + return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev), true); +} + +static void lnl_hda_dsp_remove(struct snd_sof_dev *sdev) +{ + int ret; + + ret = hdac_bus_offload_dmic_ssp(sof_to_bus(sdev), false); + if (ret < 0) + dev_warn(sdev->dev, + "Failed to disable offload for DMIC/SSP: %d\n", ret); + + hda_dsp_remove(sdev); } static int lnl_hda_dsp_resume(struct snd_sof_dev *sdev) @@ -63,7 +77,7 @@ static int lnl_hda_dsp_resume(struct snd_sof_dev *sdev) if (ret < 0) return ret; - return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev)); + return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev), true); } static int lnl_hda_dsp_runtime_resume(struct snd_sof_dev *sdev) @@ -74,7 +88,7 @@ static int lnl_hda_dsp_runtime_resume(struct snd_sof_dev *sdev) if (ret < 0) return ret; - return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev)); + return hdac_bus_offload_dmic_ssp(sof_to_bus(sdev), true); } static int lnl_dsp_post_fw_run(struct snd_sof_dev *sdev) @@ -97,9 +111,11 @@ int sof_lnl_ops_init(struct snd_sof_dev *sdev) /* common defaults */ memcpy(&sof_lnl_ops, &sof_hda_common_ops, sizeof(struct snd_sof_dsp_ops)); - /* probe */ - if (!sdev->dspless_mode_selected) + /* probe/remove */ + if (!sdev->dspless_mode_selected) { sof_lnl_ops.probe = lnl_hda_dsp_probe; + sof_lnl_ops.remove = lnl_hda_dsp_remove; + } /* shutdown */ sof_lnl_ops.shutdown = hda_dsp_shutdown; -- cgit v1.2.3 From 64d845f651267deb62bcf013ce37e2360161fdf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20H=C3=B6gander?= Date: Tue, 19 Mar 2024 14:33:23 +0200 Subject: drm/i915/psr: Calculate PIPE_SRCSZ_ERLY_TPT value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When early transport is enabled we need to write PIPE_SRCSZ_ERLY_TPT on every flip doing selective update. This patch calculates PIPE_SRCSZ_ERLY_TPT same way as is done for PSR2_MAN_TRK_CTL value and stores i in intel_crtc_state->pipe_srcsz_early_tpt to be written later during flip. Bspec: 68927 Signed-off-by: Jouni Högander Reviewed-by: Mika Kahola Link: https://patchwork.freedesktop.org/patch/msgid/20240319123327.1661097-2-jouni.hogander@intel.com (cherry picked from commit f3b899f0b4b17fa0b20e27c23f78604d5686383d) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_display_types.h | 2 ++ drivers/gpu/drm/i915/display/intel_psr.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h index 9104f18753b4..bf3f942e19c3 100644 --- a/drivers/gpu/drm/i915/display/intel_display_types.h +++ b/drivers/gpu/drm/i915/display/intel_display_types.h @@ -1423,6 +1423,8 @@ struct intel_crtc_state { u32 psr2_man_track_ctl; + u32 pipe_srcsz_early_tpt; + struct drm_rect psr2_su_area; /* Variable Refresh Rate state */ diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 6927785fd6ff..2c4978e189a3 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -2051,6 +2051,20 @@ exit: crtc_state->psr2_man_track_ctl = val; } +static u32 psr2_pipe_srcsz_early_tpt_calc(struct intel_crtc_state *crtc_state, + bool full_update) +{ + int width, height; + + if (!crtc_state->enable_psr2_su_region_et || full_update) + return 0; + + width = drm_rect_width(&crtc_state->psr2_su_area); + height = drm_rect_height(&crtc_state->psr2_su_area); + + return PIPESRC_WIDTH(width - 1) | PIPESRC_HEIGHT(height - 1); +} + static void clip_area_update(struct drm_rect *overlap_damage_area, struct drm_rect *damage_area, struct drm_rect *pipe_src) @@ -2338,6 +2352,8 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state, skip_sel_fetch_set_loop: psr2_man_trk_ctl_calc(crtc_state, full_update); + crtc_state->pipe_srcsz_early_tpt = + psr2_pipe_srcsz_early_tpt_calc(crtc_state, full_update); return 0; } -- cgit v1.2.3 From 4e29234353a4378a49e5ee6f5683678d7e101e17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20H=C3=B6gander?= Date: Tue, 19 Mar 2024 14:33:24 +0200 Subject: drm/i915/psr: Move writing early transport pipe src MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently PIPE_SRCSZ_ERLY_TPT is written in intel_display.c:intel_set_pipe_src_size. This doesn't work as intel_set_pipe_src_size is called only on modeset. Bspec: 68927 Fixes: 3291bbb93e16 ("drm/i915/psr: Configure PIPE_SRCSZ_ERLY_TPT for psr2 early transport") Signed-off-by: Jouni Högander Reviewed-by: Mika Kahola Link: https://patchwork.freedesktop.org/patch/msgid/20240319123327.1661097-3-jouni.hogander@intel.com (cherry picked from commit b52c4093b0c9089b00b42823d41986a94d32e341) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_display.c | 9 --------- drivers/gpu/drm/i915/display/intel_psr.c | 7 +++++++ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index ab2f52d21bad..8af9e6128277 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -2709,15 +2709,6 @@ static void intel_set_pipe_src_size(const struct intel_crtc_state *crtc_state) */ intel_de_write(dev_priv, PIPESRC(pipe), PIPESRC_WIDTH(width - 1) | PIPESRC_HEIGHT(height - 1)); - - if (!crtc_state->enable_psr2_su_region_et) - return; - - width = drm_rect_width(&crtc_state->psr2_su_area); - height = drm_rect_height(&crtc_state->psr2_su_area); - - intel_de_write(dev_priv, PIPE_SRCSZ_ERLY_TPT(pipe), - PIPESRC_WIDTH(width - 1) | PIPESRC_HEIGHT(height - 1)); } static bool intel_pipe_is_interlaced(const struct intel_crtc_state *crtc_state) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 2c4978e189a3..47175e4156bd 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -1994,6 +1994,7 @@ static void psr_force_hw_tracking_exit(struct intel_dp *intel_dp) void intel_psr2_program_trans_man_trk_ctl(const struct intel_crtc_state *crtc_state) { + struct intel_crtc *crtc = to_intel_crtc(crtc_state->uapi.crtc); struct drm_i915_private *dev_priv = to_i915(crtc_state->uapi.crtc->dev); enum transcoder cpu_transcoder = crtc_state->cpu_transcoder; struct intel_encoder *encoder; @@ -2013,6 +2014,12 @@ void intel_psr2_program_trans_man_trk_ctl(const struct intel_crtc_state *crtc_st intel_de_write(dev_priv, PSR2_MAN_TRK_CTL(cpu_transcoder), crtc_state->psr2_man_track_ctl); + + if (!crtc_state->enable_psr2_su_region_et) + return; + + intel_de_write(dev_priv, PIPE_SRCSZ_ERLY_TPT(crtc->pipe), + crtc_state->pipe_srcsz_early_tpt); } static void psr2_man_trk_ctl_calc(struct intel_crtc_state *crtc_state, -- cgit v1.2.3 From bf1f6f8d0b193561f213209b902edda634b6c74a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20H=C3=B6gander?= Date: Tue, 19 Mar 2024 14:33:25 +0200 Subject: drm/i915/psr: Fix intel_psr2_sel_fetch_et_alignment usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we are not aligning selective update area to cover cursor fully when cursor is not updated by itself but still in the selective update area. Fix this by checking cursor separately after drm_atomic_add_affected_planes. Bspec: 68927 Fixes: 1bff93b8bc27 ("drm/i915/psr: Extend SU area to cover cursor fully if needed") Signed-off-by: Jouni Högander Reviewed-by: Mika Kahola Link: https://patchwork.freedesktop.org/patch/msgid/20240319123327.1661097-4-jouni.hogander@intel.com (cherry picked from commit d37b3dac68e26669f03f768b3afc9abc094c9ac9) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_psr.c | 55 +++++++++++++++++++------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 47175e4156bd..b6e539f1342c 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -2116,21 +2116,36 @@ static void intel_psr2_sel_fetch_pipe_alignment(struct intel_crtc_state *crtc_st * cursor fully when cursor is in SU area. */ static void -intel_psr2_sel_fetch_et_alignment(struct intel_crtc_state *crtc_state, - struct intel_plane_state *cursor_state) +intel_psr2_sel_fetch_et_alignment(struct intel_atomic_state *state, + struct intel_crtc *crtc) { - struct drm_rect inter; + struct intel_crtc_state *crtc_state = intel_atomic_get_new_crtc_state(state, crtc); + struct intel_plane_state *new_plane_state; + struct intel_plane *plane; + int i; - if (!crtc_state->enable_psr2_su_region_et || - !cursor_state->uapi.visible) + if (!crtc_state->enable_psr2_su_region_et) return; - inter = crtc_state->psr2_su_area; - if (!drm_rect_intersect(&inter, &cursor_state->uapi.dst)) - return; + for_each_new_intel_plane_in_state(state, plane, new_plane_state, i) { + struct drm_rect inter; - clip_area_update(&crtc_state->psr2_su_area, &cursor_state->uapi.dst, - &crtc_state->pipe_src); + if (new_plane_state->uapi.crtc != crtc_state->uapi.crtc) + continue; + + if (plane->id != PLANE_CURSOR) + continue; + + if (!new_plane_state->uapi.visible) + continue; + + inter = crtc_state->psr2_su_area; + if (!drm_rect_intersect(&inter, &new_plane_state->uapi.dst)) + continue; + + clip_area_update(&crtc_state->psr2_su_area, &new_plane_state->uapi.dst, + &crtc_state->pipe_src); + } } /* @@ -2173,8 +2188,7 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state, { struct drm_i915_private *dev_priv = to_i915(state->base.dev); struct intel_crtc_state *crtc_state = intel_atomic_get_new_crtc_state(state, crtc); - struct intel_plane_state *new_plane_state, *old_plane_state, - *cursor_plane_state = NULL; + struct intel_plane_state *new_plane_state, *old_plane_state; struct intel_plane *plane; bool full_update = false; int i, ret; @@ -2259,13 +2273,6 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state, damaged_area.x2 += new_plane_state->uapi.dst.x1 - src.x1; clip_area_update(&crtc_state->psr2_su_area, &damaged_area, &crtc_state->pipe_src); - - /* - * Cursor plane new state is stored to adjust su area to cover - * cursor are fully. - */ - if (plane->id == PLANE_CURSOR) - cursor_plane_state = new_plane_state; } /* @@ -2294,9 +2301,13 @@ int intel_psr2_sel_fetch_update(struct intel_atomic_state *state, if (ret) return ret; - /* Adjust su area to cover cursor fully as necessary */ - if (cursor_plane_state) - intel_psr2_sel_fetch_et_alignment(crtc_state, cursor_plane_state); + /* + * Adjust su area to cover cursor fully as necessary (early + * transport). This needs to be done after + * drm_atomic_add_affected_planes to ensure visible cursor is added into + * affected planes even when cursor is not updated by itself. + */ + intel_psr2_sel_fetch_et_alignment(state, crtc); intel_psr2_sel_fetch_pipe_alignment(crtc_state); -- cgit v1.2.3 From 94bf3e60e1a61973cdb6488af873b8de66250c77 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Wed, 27 Mar 2024 21:05:46 +0100 Subject: drm/i915/gt: Limit the reserved VM space to only the platforms that need it Commit 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm") reduces the available VM space of one page in order to apply Wa_16018031267 and Wa_16018063123. This page was reserved indiscrimitely in all platforms even when not needed. Limit it to DG2 onwards. Fixes: 9bb66c179f50 ("drm/i915: Reserve some kernel space per vm") Signed-off-by: Andi Shyti Cc: Andrzej Hajda Cc: Chris Wilson Cc: Jonathan Cavitt Cc: Nirmoy Das Reviewed-by: Nirmoy Das Acked-by: Michal Mrozek Link: https://patchwork.freedesktop.org/patch/msgid/20240327200546.640108-1-andi.shyti@linux.intel.com (cherry picked from commit 9721634441d5dedba7f9eebb2bf0c9411cbafc4e) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 3 +++ drivers/gpu/drm/i915/gt/intel_gt.c | 6 ++++++ drivers/gpu/drm/i915/gt/intel_gt.h | 9 +++++---- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index fa46d2308b0e..81bf2216371b 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -961,6 +961,9 @@ static int gen8_init_rsvd(struct i915_address_space *vm) struct i915_vma *vma; int ret; + if (!intel_gt_needs_wa_16018031267(vm->gt)) + return 0; + /* The memory will be used only by GPU. */ obj = i915_gem_object_create_lmem(i915, PAGE_SIZE, I915_BO_ALLOC_VOLATILE | diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index a425db5ed3a2..6a2c2718bcc3 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -1024,6 +1024,12 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt, return I915_MAP_WC; } +bool intel_gt_needs_wa_16018031267(struct intel_gt *gt) +{ + /* Wa_16018031267, Wa_16018063123 */ + return IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 55), IP_VER(12, 71)); +} + bool intel_gt_needs_wa_22016122933(struct intel_gt *gt) { return MEDIA_VER_FULL(gt->i915) == IP_VER(13, 0) && gt->type == GT_MEDIA; diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h index 608f5c872928..003eb93b826f 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.h +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -82,17 +82,18 @@ struct drm_printer; ##__VA_ARGS__); \ } while (0) -#define NEEDS_FASTCOLOR_BLT_WABB(engine) ( \ - IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 55), IP_VER(12, 71)) && \ - engine->class == COPY_ENGINE_CLASS && engine->instance == 0) - static inline bool gt_is_root(struct intel_gt *gt) { return !gt->info.id; } +bool intel_gt_needs_wa_16018031267(struct intel_gt *gt); bool intel_gt_needs_wa_22016122933(struct intel_gt *gt); +#define NEEDS_FASTCOLOR_BLT_WABB(engine) ( \ + intel_gt_needs_wa_16018031267(engine->gt) && \ + engine->class == COPY_ENGINE_CLASS && engine->instance == 0) + static inline struct intel_gt *uc_to_gt(struct intel_uc *uc) { return container_of(uc, struct intel_gt, uc); -- cgit v1.2.3 From bc9a1ec01289e6e7259dc5030b413a9c6654a99a Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Thu, 28 Mar 2024 08:34:03 +0100 Subject: drm/i915/gt: Disable HW load balancing for CCS The hardware should not dynamically balance the load between CCS engines. Wa_14019159160 recommends disabling it across all platforms. Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement") Signed-off-by: Andi Shyti Cc: Chris Wilson Cc: Joonas Lahtinen Cc: Matt Roper Cc: # v6.2+ Reviewed-by: Matt Roper Acked-by: Michal Mrozek Link: https://patchwork.freedesktop.org/patch/msgid/20240328073409.674098-2-andi.shyti@linux.intel.com (cherry picked from commit f5d2904cf814f20b79e3e4c1b24a4ccc2411b7e0) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_gt_regs.h | 1 + drivers/gpu/drm/i915/gt/intel_workarounds.c | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h index 50962cfd1353..31b102604e3d 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h @@ -1477,6 +1477,7 @@ #define ECOBITS_PPGTT_CACHE4B (0 << 8) #define GEN12_RCU_MODE _MMIO(0x14800) +#define XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE REG_BIT(1) #define GEN12_RCU_MODE_CCS_ENABLE REG_BIT(0) #define CHV_FUSE_GT _MMIO(VLV_GUNIT_BASE + 0x2168) diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 25413809b9dc..4865eb5ca9c9 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -51,7 +51,8 @@ * registers belonging to BCS, VCS or VECS should be implemented in * xcs_engine_wa_init(). Workarounds for registers not belonging to a specific * engine's MMIO range but that are part of of the common RCS/CCS reset domain - * should be implemented in general_render_compute_wa_init(). + * should be implemented in general_render_compute_wa_init(). The settings + * about the CCS load balancing should be added in ccs_engine_wa_mode(). * * - GT workarounds: the list of these WAs is applied whenever these registers * revert to their default values: on GPU reset, suspend/resume [1]_, etc. @@ -2854,6 +2855,22 @@ add_render_compute_tuning_settings(struct intel_gt *gt, wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC); } +static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal) +{ + struct intel_gt *gt = engine->gt; + + if (!IS_DG2(gt->i915)) + return; + + /* + * Wa_14019159160: This workaround, along with others, leads to + * significant challenges in utilizing load balancing among the + * CCS slices. Consequently, an architectural decision has been + * made to completely disable automatic CCS load balancing. + */ + wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE); +} + /* * The workarounds in this function apply to shared registers in * the general render reset domain that aren't tied to a @@ -3004,8 +3021,10 @@ engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal * to a single RCS/CCS engine's workaround list since * they're reset as part of the general render domain reset. */ - if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) + if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) { general_render_compute_wa_init(engine, wal); + ccs_engine_wa_mode(engine, wal); + } if (engine->class == COMPUTE_CLASS) ccs_engine_wa_init(engine, wal); -- cgit v1.2.3 From ea315f98e5d6d3191b74beb0c3e5fc16081d517c Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Thu, 28 Mar 2024 08:34:04 +0100 Subject: drm/i915/gt: Do not generate the command streamer for all the CCS We want a fixed load CCS balancing consisting in all slices sharing one single user engine. For this reason do not create the intel_engine_cs structure with its dedicated command streamer for CCS slices beyond the first. Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement") Signed-off-by: Andi Shyti Cc: Chris Wilson Cc: Joonas Lahtinen Cc: Matt Roper Cc: # v6.2+ Acked-by: Michal Mrozek Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240328073409.674098-3-andi.shyti@linux.intel.com (cherry picked from commit c7a5aa4e57f88470313a8277eb299b221b86e3b1) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 1ade568ffbfa..7a6dc371c384 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -908,6 +908,23 @@ static intel_engine_mask_t init_engine_mask(struct intel_gt *gt) info->engine_mask &= ~BIT(GSC0); } + /* + * Do not create the command streamer for CCS slices beyond the first. + * All the workload submitted to the first engine will be shared among + * all the slices. + * + * Once the user will be allowed to customize the CCS mode, then this + * check needs to be removed. + */ + if (IS_DG2(gt->i915)) { + u8 first_ccs = __ffs(CCS_MASK(gt)); + + /* Mask off all the CCS engine */ + info->engine_mask &= ~GENMASK(CCS3, CCS0); + /* Put back in the first CCS engine */ + info->engine_mask |= BIT(_CCS(first_ccs)); + } + return info->engine_mask; } -- cgit v1.2.3 From 6db31251bb265813994bfb104eb4b4d0f44d64fb Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Thu, 28 Mar 2024 08:34:05 +0100 Subject: drm/i915/gt: Enable only one CCS for compute workload Enable only one CCS engine by default with all the compute sices allocated to it. While generating the list of UABI engines to be exposed to the user, exclude any additional CCS engines beyond the first instance. This change can be tested with igt i915_query. Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement") Signed-off-by: Andi Shyti Cc: Chris Wilson Cc: Joonas Lahtinen Cc: Matt Roper Cc: # v6.2+ Reviewed-by: Matt Roper Acked-by: Michal Mrozek Link: https://patchwork.freedesktop.org/patch/msgid/20240328073409.674098-4-andi.shyti@linux.intel.com (cherry picked from commit 2bebae0112b117de7e8a7289277a4bd2403b9e17) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/Makefile | 1 + drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c | 39 +++++++++++++++++++++++++++++ drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h | 13 ++++++++++ drivers/gpu/drm/i915/gt/intel_gt_regs.h | 5 ++++ drivers/gpu/drm/i915/gt/intel_workarounds.c | 7 ++++++ 5 files changed, 65 insertions(+) create mode 100644 drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c create mode 100644 drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 4c2f85632391..fba73c38e235 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -118,6 +118,7 @@ gt-y += \ gt/intel_ggtt_fencing.o \ gt/intel_gt.o \ gt/intel_gt_buffer_pool.o \ + gt/intel_gt_ccs_mode.o \ gt/intel_gt_clock_utils.o \ gt/intel_gt_debugfs.o \ gt/intel_gt_engines_debugfs.o \ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c new file mode 100644 index 000000000000..044219c5960a --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2024 Intel Corporation + */ + +#include "i915_drv.h" +#include "intel_gt.h" +#include "intel_gt_ccs_mode.h" +#include "intel_gt_regs.h" + +void intel_gt_apply_ccs_mode(struct intel_gt *gt) +{ + int cslice; + u32 mode = 0; + int first_ccs = __ffs(CCS_MASK(gt)); + + if (!IS_DG2(gt->i915)) + return; + + /* Build the value for the fixed CCS load balancing */ + for (cslice = 0; cslice < I915_MAX_CCS; cslice++) { + if (CCS_MASK(gt) & BIT(cslice)) + /* + * If available, assign the cslice + * to the first available engine... + */ + mode |= XEHP_CCS_MODE_CSLICE(cslice, first_ccs); + + else + /* + * ... otherwise, mark the cslice as + * unavailable if no CCS dispatches here + */ + mode |= XEHP_CCS_MODE_CSLICE(cslice, + XEHP_CCS_MODE_CSLICE_MASK); + } + + intel_uncore_write(gt->uncore, XEHP_CCS_MODE, mode); +} diff --git a/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h new file mode 100644 index 000000000000..9e5549caeb26 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_ccs_mode.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2024 Intel Corporation + */ + +#ifndef __INTEL_GT_CCS_MODE_H__ +#define __INTEL_GT_CCS_MODE_H__ + +struct intel_gt; + +void intel_gt_apply_ccs_mode(struct intel_gt *gt); + +#endif /* __INTEL_GT_CCS_MODE_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h index 31b102604e3d..743fe3566722 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h @@ -1480,6 +1480,11 @@ #define XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE REG_BIT(1) #define GEN12_RCU_MODE_CCS_ENABLE REG_BIT(0) +#define XEHP_CCS_MODE _MMIO(0x14804) +#define XEHP_CCS_MODE_CSLICE_MASK REG_GENMASK(2, 0) /* CCS0-3 + rsvd */ +#define XEHP_CCS_MODE_CSLICE_WIDTH ilog2(XEHP_CCS_MODE_CSLICE_MASK + 1) +#define XEHP_CCS_MODE_CSLICE(cslice, ccs) (ccs << (cslice * XEHP_CCS_MODE_CSLICE_WIDTH)) + #define CHV_FUSE_GT _MMIO(VLV_GUNIT_BASE + 0x2168) #define CHV_FGT_DISABLE_SS0 (1 << 10) #define CHV_FGT_DISABLE_SS1 (1 << 11) diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 4865eb5ca9c9..6ec3582c9735 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -10,6 +10,7 @@ #include "intel_engine_regs.h" #include "intel_gpu_commands.h" #include "intel_gt.h" +#include "intel_gt_ccs_mode.h" #include "intel_gt_mcr.h" #include "intel_gt_print.h" #include "intel_gt_regs.h" @@ -2869,6 +2870,12 @@ static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_li * made to completely disable automatic CCS load balancing. */ wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE); + + /* + * After having disabled automatic load balancing we need to + * assign all slices to a single CCS. We will call it CCS mode 1 + */ + intel_gt_apply_ccs_mode(gt); } /* -- cgit v1.2.3 From f7caddfd558e32db0ae944256e623a259538b357 Mon Sep 17 00:00:00 2001 From: Ankit Nautiyal Date: Tue, 5 Mar 2024 11:14:43 +0530 Subject: drm/i915/dp: Fix the computation for compressed_bpp for DISPLAY < 13 For DISPLAY < 13, compressed bpp is chosen from a list of supported compressed bpps. Fix the condition to choose the appropriate compressed bpp from the list. Fixes: 1c56e9a39833 ("drm/i915/dp: Get optimal link config to have best compressed bpp") Cc: Ankit Nautiyal Cc: Stanislav Lisovskiy Cc: Jani Nikula Cc: # v6.7+ Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10162 Signed-off-by: Ankit Nautiyal Reviewed-by: Suraj Kandpal Link: https://patchwork.freedesktop.org/patch/msgid/20240305054443.2489895-1-ankit.k.nautiyal@intel.com (cherry picked from commit 5a1da42b50f3594e18738885c2f23ed36629dd00) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 4016cf6b6c61..36afbb68d87d 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -1917,8 +1917,9 @@ icl_dsc_compute_link_config(struct intel_dp *intel_dp, dsc_max_bpp = min(dsc_max_bpp, pipe_bpp - 1); for (i = 0; i < ARRAY_SIZE(valid_dsc_bpp); i++) { - if (valid_dsc_bpp[i] < dsc_min_bpp || - valid_dsc_bpp[i] > dsc_max_bpp) + if (valid_dsc_bpp[i] < dsc_min_bpp) + continue; + if (valid_dsc_bpp[i] > dsc_max_bpp) break; ret = dsc_compute_link_config(intel_dp, -- cgit v1.2.3 From 51bc63392e96ca45d7be98bc43c180b174ffca09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 2 Apr 2024 16:51:46 +0300 Subject: drm/i915/mst: Limit MST+DSC to TGL+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MST code currently assumes that glk+ already supports MST+DSC, which is incorrect. We need to check for TGL+ actually. ICL does support SST+DSC, but supposedly it can't do MST+FEC which will also rule out MST+DSC. Note that a straight TGL+ check doesn't work here because DSC support can get fused out, so we do need to also check 'has_dsc'. Cc: stable@vger.kernel.org Fixes: d51f25eb479a ("drm/i915: Add DSC support to MST path") Reviewed-by: Uma Shankar Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240402135148.23011-6-ville.syrjala@linux.intel.com (cherry picked from commit c9c92f286dbdf872390ef3e74dbe5f0641e46f55) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_display_device.h | 1 + drivers/gpu/drm/i915/display/intel_dp_mst.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_display_device.h b/drivers/gpu/drm/i915/display/intel_display_device.h index fe4268813786..9b1bce2624b9 100644 --- a/drivers/gpu/drm/i915/display/intel_display_device.h +++ b/drivers/gpu/drm/i915/display/intel_display_device.h @@ -47,6 +47,7 @@ struct drm_printer; #define HAS_DPT(i915) (DISPLAY_VER(i915) >= 13) #define HAS_DSB(i915) (DISPLAY_INFO(i915)->has_dsb) #define HAS_DSC(__i915) (DISPLAY_RUNTIME_INFO(__i915)->has_dsc) +#define HAS_DSC_MST(__i915) (DISPLAY_VER(__i915) >= 12 && HAS_DSC(__i915)) #define HAS_FBC(i915) (DISPLAY_RUNTIME_INFO(i915)->fbc_mask != 0) #define HAS_FPGA_DBG_UNCLAIMED(i915) (DISPLAY_INFO(i915)->has_fpga_dbg) #define HAS_FW_BLC(i915) (DISPLAY_VER(i915) >= 3) diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index 53aec023ce92..b651c990af85 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -1355,7 +1355,7 @@ intel_dp_mst_mode_valid_ctx(struct drm_connector *connector, return 0; } - if (DISPLAY_VER(dev_priv) >= 10 && + if (HAS_DSC_MST(dev_priv) && drm_dp_sink_supports_dsc(intel_connector->dp.dsc_dpcd)) { /* * TBD pass the connector BPC, -- cgit v1.2.3 From 99f855082f228cdcecd6ab768d3b8b505e0eb028 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 2 Apr 2024 16:51:47 +0300 Subject: drm/i915/mst: Reject FEC+MST on ICL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ICL supposedly doesn't support FEC on MST. Reject it. Cc: stable@vger.kernel.org Fixes: d51f25eb479a ("drm/i915: Add DSC support to MST path") Reviewed-by: Uma Shankar Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240402135148.23011-7-ville.syrjala@linux.intel.com (cherry picked from commit b648ce2a28ba83c4fa67c61fcc5983e15e9d4afb) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 36afbb68d87d..abd62bebc46d 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -1422,7 +1422,8 @@ static bool intel_dp_source_supports_fec(struct intel_dp *intel_dp, if (DISPLAY_VER(dev_priv) >= 12) return true; - if (DISPLAY_VER(dev_priv) == 11 && encoder->port != PORT_A) + if (DISPLAY_VER(dev_priv) == 11 && encoder->port != PORT_A && + !intel_crtc_has_type(pipe_config, INTEL_OUTPUT_DP_MST)) return true; return false; -- cgit v1.2.3 From 27fcec6c27caec05a512d2f1be7f855d8899cb8d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Mar 2024 22:25:45 -0400 Subject: bcachefs: Clear recovery_passes_required as they complete without errors Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 4 +--- fs/bcachefs/recovery_passes.c | 41 ++++++++++++++++++++++++++++++++--------- fs/bcachefs/util.h | 10 ++++++++++ 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e8b434009293..88b794d5dbd0 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -838,9 +838,7 @@ use_clean: } if (!test_bit(BCH_FS_error, &c->flags) && - (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { - memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); + !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) { memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); write_sb = true; } diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 0089d9456b10..b066c5155b74 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -59,18 +59,23 @@ static struct recovery_pass_fn recovery_pass_fns[] = { #undef x }; -u64 bch2_recovery_passes_to_stable(u64 v) -{ - static const u8 map[] = { +static const u8 passes_to_stable_map[] = { #define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, BCH_RECOVERY_PASSES() #undef x - }; +}; + +static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) +{ + return passes_to_stable_map[pass]; +} +u64 bch2_recovery_passes_to_stable(u64 v) +{ u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); + ret |= BIT_ULL(passes_to_stable_map[i]); return ret; } @@ -116,13 +121,13 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, enum bch_recovery_pass pass) { - __le64 s = cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(pass))); + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - if (!(ext->recovery_passes_required[0] & s)) { - ext->recovery_passes_required[0] |= s; + if (!test_bit_le64(s, ext->recovery_passes_required)) { + __set_bit_le64(s, ext->recovery_passes_required); bch2_write_super(c); } mutex_unlock(&c->sb_lock); @@ -130,6 +135,21 @@ int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, return bch2_run_explicit_recovery_pass(c, pass); } +static void bch2_clear_recovery_pass_required(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (test_bit_le64(s, ext->recovery_passes_required)) { + __clear_bit_le64(s, ext->recovery_passes_required); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); +} + u64 bch2_fsck_recovery_passes(void) { u64 ret = 0; @@ -218,6 +238,9 @@ int bch2_run_recovery_passes(struct bch_fs *c) c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + if (!test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, c->curr_recovery_pass); + c->curr_recovery_pass++; } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 175aee3074c7..4577c0789a3a 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -797,4 +797,14 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr) addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); } +static inline void __clear_bit_le64(size_t bit, __le64 *addr) +{ + addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64)); +} + +static inline bool test_bit_le64(size_t bit, __le64 *addr) +{ + return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; +} + #endif /* _BCACHEFS_UTIL_H */ -- cgit v1.2.3 From bdbf953b3c9036e284e28c9484dda5c81b2a45fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 Mar 2024 18:56:26 -0400 Subject: bcachefs: bch2_shoot_down_journal_keys() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 19 +++++++++++++++++++ fs/bcachefs/btree_journal_iter.h | 4 ++++ fs/bcachefs/recovery.c | 22 ++++++++++++---------- 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 1f588264575d..5cbcbfe85235 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -567,3 +567,22 @@ int bch2_journal_keys_sort(struct bch_fs *c) bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); return 0; } + +void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, + unsigned level_min, unsigned level_max, + struct bpos start, struct bpos end) +{ + struct journal_keys *keys = &c->journal_keys; + size_t dst = 0; + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) + if (!(i->btree_id == btree && + i->level >= level_min && + i->level <= level_max && + bpos_ge(i->k->k.p, start) && + bpos_le(i->k->k.p, end))) + keys->data[dst++] = *i; + keys->nr = keys->gap = dst; +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 09cd53091a05..af25046ebcaa 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -66,4 +66,8 @@ void bch2_journal_entries_free(struct bch_fs *); int bch2_journal_keys_sort(struct bch_fs *); +void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, + unsigned, unsigned, + struct bpos, struct bpos); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 88b794d5dbd0..7cbf30e3ef73 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -47,7 +47,7 @@ static bool btree_id_is_alloc(enum btree_id id) } /* for -o reconstruct_alloc: */ -static void do_reconstruct_alloc(struct bch_fs *c) +static void bch2_reconstruct_alloc(struct bch_fs *c) { bch2_journal_log_msg(c, "dropping alloc info"); bch_info(c, "dropping and reconstructing all alloc info"); @@ -82,15 +82,17 @@ static void do_reconstruct_alloc(struct bch_fs *c) c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - struct journal_keys *keys = &c->journal_keys; - size_t src, dst; - - move_gap(keys, keys->nr); - for (src = 0, dst = 0; src < keys->nr; src++) - if (!btree_id_is_alloc(keys->data[src].btree_id)) - keys->data[dst++] = keys->data[src]; - keys->nr = keys->gap = dst; + bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); } /* @@ -731,7 +733,7 @@ use_clean: c->journal_replay_seq_end = blacklist_seq - 1; if (c->opts.reconstruct_alloc) - do_reconstruct_alloc(c); + bch2_reconstruct_alloc(c); zero_out_btree_mem_ptr(&c->journal_keys); -- cgit v1.2.3 From ca1e02f7e9a1352b3f7b04de821ae74c9e07df74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2024 19:26:33 -0400 Subject: bcachefs: Etyzinger cleanups Pull out eytzinger.c and kill eytzinger_cmp_fn. We now provide eytzinger0_sort and eytzinger0_sort_r, which use the standard cmp_func_t and cmp_r_func_t callbacks. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/eytzinger.c | 234 ++++++++++++++++++++++++++++++++++++ fs/bcachefs/eytzinger.h | 63 ++++++---- fs/bcachefs/journal_seq_blacklist.c | 3 +- fs/bcachefs/replicas.c | 19 +-- fs/bcachefs/util.c | 143 ---------------------- fs/bcachefs/util.h | 4 - 7 files changed, 285 insertions(+), 182 deletions(-) create mode 100644 fs/bcachefs/eytzinger.c diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index f6d86eb4b976..0933493a322c 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -37,6 +37,7 @@ bcachefs-y := \ error.o \ extents.o \ extent_update.o \ + eytzinger.o \ fs.o \ fs-common.o \ fs-ioctl.o \ diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c new file mode 100644 index 000000000000..4ce5e957a6e9 --- /dev/null +++ b/fs/bcachefs/eytzinger.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "eytzinger.h" + +/** + * is_aligned - is this pointer & size okay for word-wide copying? + * @base: pointer to data + * @size: size of each element + * @align: required alignment (typically 4 or 8) + * + * Returns true if elements can be copied using word loads and stores. + * The size must be a multiple of the alignment, and the base address must + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + * + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" + * to "if ((a | b) & mask)", so we do that by hand. + */ +__attribute_const__ __always_inline +static bool is_aligned(const void *base, size_t size, unsigned char align) +{ + unsigned char lsbits = (unsigned char)size; + + (void)base; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + lsbits |= (unsigned char)(uintptr_t)base; +#endif + return (lsbits & (align - 1)) == 0; +} + +/** + * swap_words_32 - swap two elements in 32-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 4) + * + * Exchange the two objects in memory. This exploits base+index addressing, + * which basically all CPUs have, to minimize loop overhead computations. + * + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the + * bottom of the loop, even though the zero flag is still valid from the + * subtract (since the intervening mov instructions don't alter the flags). + * Gcc 8.1.0 doesn't have that problem. + */ +static void swap_words_32(void *a, void *b, size_t n) +{ + do { + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + } while (n); +} + +/** + * swap_words_64 - swap two elements in 64-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 8) + * + * Exchange the two objects in memory. This exploits base+index + * addressing, which basically all CPUs have, to minimize loop overhead + * computations. + * + * We'd like to use 64-bit loads if possible. If they're not, emulating + * one requires base+index+4 addressing which x86 has but most other + * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, + * but it's possible to have 64-bit loads without 64-bit pointers (e.g. + * x32 ABI). Are there any cases the kernel needs to worry about? + */ +static void swap_words_64(void *a, void *b, size_t n) +{ + do { +#ifdef CONFIG_64BIT + u64 t = *(u64 *)(a + (n -= 8)); + *(u64 *)(a + n) = *(u64 *)(b + n); + *(u64 *)(b + n) = t; +#else + /* Use two 32-bit transfers to avoid base+index+4 addressing */ + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + + t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; +#endif + } while (n); +} + +/** + * swap_bytes - swap two elements a byte at a time + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size + * + * This is the fallback if alignment doesn't allow using larger chunks. + */ +static void swap_bytes(void *a, void *b, size_t n) +{ + do { + char t = ((char *)a)[--n]; + ((char *)a)[n] = ((char *)b)[n]; + ((char *)b)[n] = t; + } while (n); +} + +/* + * The values are arbitrary as long as they can't be confused with + * a pointer, but small integers make for the smallest compare + * instructions. + */ +#define SWAP_WORDS_64 (swap_r_func_t)0 +#define SWAP_WORDS_32 (swap_r_func_t)1 +#define SWAP_BYTES (swap_r_func_t)2 +#define SWAP_WRAPPER (swap_r_func_t)3 + +struct wrapper { + cmp_func_t cmp; + swap_func_t swap; +}; + +/* + * The function pointer is last to make tail calls most efficient if the + * compiler decides not to inline this function. + */ +static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) +{ + if (swap_func == SWAP_WRAPPER) { + ((const struct wrapper *)priv)->swap(a, b, (int)size); + return; + } + + if (swap_func == SWAP_WORDS_64) + swap_words_64(a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32(a, b, size); + else if (swap_func == SWAP_BYTES) + swap_bytes(a, b, size); + else + swap_func(a, b, (int)size, priv); +} + +#define _CMP_WRAPPER ((cmp_r_func_t)0L) + +static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) +{ + if (cmp == _CMP_WRAPPER) + return ((const struct wrapper *)priv)->cmp(a, b); + return cmp(a, b, priv); +} + +static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, const void *priv, + size_t l, size_t r) +{ + return do_cmp(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + cmp_func, priv); +} + +static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, + swap_r_func_t swap_func, const void *priv, + size_t l, size_t r) +{ + do_swap(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size, swap_func, priv); +} + +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + int i, c, r; + + /* called from 'sort' without swap function, let's pick the default */ + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) + swap_func = NULL; + + if (!swap_func) { + if (is_aligned(base, size, 8)) + swap_func = SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + swap_func = SWAP_WORDS_32; + else + swap_func = SWAP_BYTES; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + + if (c + 1 < n && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + + for (r = 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + + if (c + 1 < i && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } +} + +void eytzinger0_sort(void *base, size_t n, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func) +{ + struct wrapper w = { + .cmp = cmp_func, + .swap = swap_func, + }; + + return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); +} diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index b04750dbf870..ee0e2df33322 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -5,23 +5,33 @@ #include #include -#include "util.h" +#ifdef EYTZINGER_DEBUG +#define EYTZINGER_BUG_ON(cond) BUG_ON(cond) +#else +#define EYTZINGER_BUG_ON(cond) +#endif /* * Traversal for trees in eytzinger layout - a full binary tree layed out in an - * array - */ - -/* - * One based indexing version: + * array. * - * With one based indexing each level of the tree starts at a power of two - - * good for cacheline alignment: + * Consider using an eytzinger tree any time you would otherwise be doing binary + * search over an array. Binary search is a worst case scenario for branch + * prediction and prefetching, but in an eytzinger tree every node's children + * are adjacent in memory, thus we can prefetch children before knowing the + * result of the comparison, assuming multiple nodes fit on a cacheline. + * + * Two variants are provided, for one based indexing and zero based indexing. + * + * Zero based indexing is more convenient, but one based indexing has better + * alignment and thus better performance because each new level of the tree + * starts at a power of two, and thus if element 0 was cacheline aligned, each + * new level will be as well. */ static inline unsigned eytzinger1_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + child; } @@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); @@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; @@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, unsigned shift = __fls(size) - b; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); i ^= 1U << b; i <<= 1; @@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, unsigned shift; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); /* * sign bit trick: @@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) static inline unsigned eytzinger0_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + 1 + child; } @@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) -typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); - /* return greatest node <= @search, or -1 if not found */ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - eytzinger_cmp_fn cmp, const void *search) + cmp_func_t cmp, const void *search) { unsigned i, n = 0; @@ -244,21 +252,24 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, do { i = n; - n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); } while (n < nr); if (n & 1) { /* @i was greater than @search, return previous node: */ - - if (i == eytzinger0_first(nr)) - return -1; - return eytzinger0_prev(i, nr); } else { return i; } } +static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) +{ + ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + return eytzinger0_next(idx, size); +} + #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ void *_base = (base); \ @@ -269,13 +280,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, int _res; \ \ while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size, _size))) \ + (_res = _cmp(_search, _base + _i * _size))) \ _i = eytzinger0_child(_i, _res > 0); \ _i; \ }) -void eytzinger0_sort(void *, size_t, size_t, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); +void eytzinger0_sort_r(void *, size_t, size_t, + cmp_r_func_t, swap_r_func_t, const void *); +void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); #endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index b5303874fc35..37a024e034d4 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -95,8 +95,7 @@ out: return ret ?: bch2_blacklist_table_initialize(c); } -static int journal_seq_blacklist_table_cmp(const void *_l, - const void *_r, size_t size) +static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) { const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index cc2672c12031..678b9c20e251 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -6,12 +6,15 @@ #include "replicas.h" #include "super-io.h" +#include + static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, size_t size) +static int bch2_memcmp(const void *l, const void *r, const void *priv) { + size_t size = (size_t) priv; return memcmp(l, r, size); } @@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); + eytzinger0_sort_r(r->entries, r->nr, r->entry_size, + bch2_memcmp, NULL, (void *)(size_t)r->entry_size); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, verify_replicas_entry(search); -#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) +#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) idx = eytzinger0_find(r->entries, r->nr, r->entry_size, entry_cmp, search); #undef entry_cmp @@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, { unsigned i; - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - bch2_memcmp, NULL); + sort_r(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + bch2_memcmp, NULL, + (void *)(size_t)cpu_r->entry_size); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 216fadf16928..92c6ad75e702 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -707,149 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -static int alignment_ok(const void *base, size_t align) -{ - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || - ((unsigned long)base & (align - 1)) == 0; -} - -static void u32_swap(void *a, void *b, size_t size) -{ - u32 t = *(u32 *)a; - *(u32 *)a = *(u32 *)b; - *(u32 *)b = t; -} - -static void u64_swap(void *a, void *b, size_t size) -{ - u64 t = *(u64 *)a; - *(u64 *)a = *(u64 *)b; - *(u64 *)b = t; -} - -static void generic_swap(void *a, void *b, size_t size) -{ - char t; - - do { - t = *(char *)a; - *(char *)a++ = *(char *)b; - *(char *)b++ = t; - } while (--size > 0); -} - -static inline int do_cmp(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - size_t l, size_t r) -{ - return cmp_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -static inline void do_swap(void *base, size_t n, size_t size, - void (*swap_func)(void *, void *, size_t), - size_t l, size_t r) -{ - swap_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -void eytzinger0_sort(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)) -{ - int i, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; - - if (c + 1 < n && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } - - /* sort */ - for (i = n - 1; i > 0; --i) { - do_swap(base, n, size, swap_func, 0, i); - - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; - - if (c + 1 < i && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } -} - -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t size)) -{ - /* pre-scale counters for performance */ - int i = (num/2 - 1) * size, n = num * size, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for ( ; i >= 0; i -= size) { - for (r = i; r * 2 + size < n; r = c) { - c = r * 2 + size; - if (c < n - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } - - /* sort */ - for (i = n - size; i > 0; i -= size) { - swap_func(base, base + i, size); - for (r = 0; r * 2 + size < i; r = c) { - c = r * 2 + size; - if (c < i - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } -} - #if 0 void eytzinger1_test(void) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 4577c0789a3a..b7e7c29278fc 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -631,10 +631,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); - /* just the memmove, doesn't update @_nr */ #define __array_insert_item(_array, _nr, _pos) \ memmove(&(_array)[(_pos) + 1], \ -- cgit v1.2.3 From f2f61f4192de536fa36bebe49dc2c241213b53ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Mar 2024 22:17:40 -0400 Subject: bcachefs: bch2_btree_root_alloc() -> bch2_btree_root_alloc_fake() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 8 ++++---- fs/bcachefs/btree_update_interior.h | 2 +- fs/bcachefs/recovery.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2b46e6ad248a..32397b99752f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2466,7 +2466,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) +static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level) { struct bch_fs *c = trans->c; struct closure cl; @@ -2485,7 +2485,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) set_btree_node_fake(b); set_btree_node_need_rewrite(b); - b->c.level = 0; + b->c.level = level; b->c.btree_id = id; bkey_btree_ptr_init(&b->key); @@ -2512,9 +2512,9 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) return 0; } -void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); + bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level)); } static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 1d41d3f4b107..88dcf5a22a3b 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -171,7 +171,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, unsigned, bool); void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); +void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned); static inline unsigned btree_update_reserve_required(struct bch_fs *c, struct btree *b) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7cbf30e3ef73..f234de0ac834 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -476,7 +476,7 @@ static int read_btree_roots(struct bch_fs *c) if (!r->b) { r->alive = false; r->level = 0; - bch2_btree_root_alloc(c, i); + bch2_btree_root_alloc_fake(c, i, 0); } } fsck_err: @@ -929,7 +929,7 @@ int bch2_fs_initialize(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); for (unsigned i = 0; i < BTREE_ID_NR; i++) - bch2_btree_root_alloc(c, i); + bch2_btree_root_alloc_fake(c, i, 0); for_each_member_device(c, ca) bch2_dev_usage_init(ca); -- cgit v1.2.3 From b268aa4e7fb8be3c50e25a09008fb2feed2cd345 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 10 Mar 2024 16:18:41 -0400 Subject: bcachefs: Don't skip fake btree roots in fsck When a btree root is unreadable, we might still have keys fro the journal to walk and mark. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e5d2c6daa663..9db9c8a5beaa 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -931,9 +931,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans, b = bch2_btree_id_root(c, btree_id)->b; - if (btree_node_fake(b)) - return 0; - six_lock_read(&b->c.lock, NULL, NULL); printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); -- cgit v1.2.3 From 4409b8081d1624af814a9cda781ad9cdda3973cb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Mar 2024 23:11:46 -0400 Subject: bcachefs: Repair pass for scanning for btree nodes If a btree root or interior btree node goes bad, we're going to lose a lot of data, unless we can recover the nodes that it pointed to by scanning. Fortunately btree node headers are fully self describing, and additionally the magic number is xored with the filesytem UUID, so we can do so safely. This implements the scanning - next patch will rework topology repair to make use of the found nodes. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 3 + fs/bcachefs/btree_node_scan.c | 495 ++++++++++++++++++++++++++++++++++++ fs/bcachefs/btree_node_scan.h | 11 + fs/bcachefs/btree_node_scan_types.h | 30 +++ fs/bcachefs/extents.c | 52 ++-- fs/bcachefs/extents.h | 1 + fs/bcachefs/opts.h | 4 +- fs/bcachefs/recovery.c | 54 ++-- fs/bcachefs/recovery_passes.c | 1 + fs/bcachefs/recovery_passes_types.h | 1 + fs/bcachefs/super.c | 3 + 12 files changed, 605 insertions(+), 51 deletions(-) create mode 100644 fs/bcachefs/btree_node_scan.c create mode 100644 fs/bcachefs/btree_node_scan.h create mode 100644 fs/bcachefs/btree_node_scan_types.h diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 0933493a322c..66ca0bbee639 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -17,6 +17,7 @@ bcachefs-y := \ btree_journal_iter.o \ btree_key_cache.o \ btree_locking.o \ + btree_node_scan.o \ btree_trans_commit.o \ btree_update.o \ btree_update_interior.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 963162a627cd..93a61dbaa3d8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -456,6 +456,7 @@ enum bch_time_stats { #include "alloc_types.h" #include "btree_types.h" +#include "btree_node_scan_types.h" #include "btree_write_buffer_types.h" #include "buckets_types.h" #include "buckets_waiting_for_journal_types.h" @@ -1103,6 +1104,8 @@ struct bch_fs { struct journal_keys journal_keys; struct list_head journal_iters; + struct find_btree_nodes found_btree_nodes; + u64 last_bucket_seq_cleanup; u64 counters_on_mount[BCH_COUNTER_NR]; diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c new file mode 100644 index 000000000000..3f33be7e5e5c --- /dev/null +++ b/fs/bcachefs/btree_node_scan.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_journal_iter.h" +#include "btree_node_scan.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "error.h" +#include "journal_io.h" +#include "recovery_passes.h" + +#include +#include + +struct find_btree_nodes_worker { + struct closure *cl; + struct find_btree_nodes *f; + struct bch_dev *ca; +}; + +static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) +{ + prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie); + bch2_bpos_to_text(out, n->min_key); + prt_str(out, "-"); + bch2_bpos_to_text(out, n->max_key); + + if (n->range_updated) + prt_str(out, " range updated"); + if (n->overwritten) + prt_str(out, " overwritten"); + + for (unsigned i = 0; i < n->nr_ptrs; i++) { + prt_char(out, ' '); + bch2_extent_ptr_to_text(out, c, n->ptrs + i); + } +} + +static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) +{ + printbuf_indent_add(out, 2); + darray_for_each(nodes, i) { + found_btree_node_to_text(out, c, i); + prt_newline(out); + } + printbuf_indent_sub(out, 2); +} + +static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) +{ + struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); + + set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); + bp->k.p = f->max_key; + bp->v.seq = cpu_to_le64(f->cookie); + bp->v.sectors_written = 0; + bp->v.flags = 0; + bp->v.min_key = f->min_key; + SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); + memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); +} + +static bool found_btree_node_is_readable(struct btree_trans *trans, + const struct found_btree_node *f) +{ + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k; + + found_btree_node_to_key(&k.k, f); + + struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); + bool ret = !IS_ERR_OR_NULL(b); + if (ret) + six_unlock_read(&b->c.lock); + + /* + * We might update this node's range; if that happens, we need the node + * to be re-read so the read path can trim keys that are no longer in + * this node + */ + if (b != btree_node_root(trans->c, b)) + bch2_btree_node_evict(trans, &k.k); + return ret; +} + +static int found_btree_node_cmp_cookie(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: + cmp_int(l->cookie, r->cookie); +} + +/* + * Given two found btree nodes, if their sequence numbers are equal, take the + * one that's readable: + */ +static int found_btree_node_cmp_time(const struct found_btree_node *l, + const struct found_btree_node *r) +{ + return cmp_int(l->seq, r->seq); +} + +static int found_btree_node_cmp_pos(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->min_key, r->min_key) ?: + -found_btree_node_cmp_time(l, r); +} + +static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, + struct bio *bio, struct btree_node *bn, u64 offset) +{ + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, bn, PAGE_SIZE); + + submit_bio_wait(bio); + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + "IO error in try_read_btree_node() at %llu: %s", + offset, bch2_blk_status_to_str(bio->bi_status))) + return; + + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return; + + rcu_read_lock(); + struct found_btree_node n = { + .btree_id = BTREE_NODE_ID(bn), + .level = BTREE_NODE_LEVEL(bn), + .seq = BTREE_NODE_SEQ(bn), + .cookie = le64_to_cpu(bn->keys.seq), + .min_key = bn->min_key, + .max_key = bn->max_key, + .nr_ptrs = 1, + .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, + .ptrs[0].offset = offset, + .ptrs[0].dev = ca->dev_idx, + .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)), + }; + rcu_read_unlock(); + + if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { + mutex_lock(&f->lock); + if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { + bch_err(c, "try_read_btree_node() can't handle endian conversion"); + f->ret = -EINVAL; + goto unlock; + } + + if (darray_push(&f->nodes, n)) + f->ret = -ENOMEM; +unlock: + mutex_unlock(&f->lock); + } +} + +static int read_btree_nodes_worker(void *p) +{ + struct find_btree_nodes_worker *w = p; + struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); + struct bch_dev *ca = w->ca; + void *buf = (void *) __get_free_page(GFP_KERNEL); + struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); + unsigned long last_print = jiffies; + + if (!buf || !bio) { + bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); + w->f->ret = -ENOMEM; + goto err; + } + + for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) + for (unsigned bucket_offset = 0; + bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; + bucket_offset += btree_sectors(c)) { + if (time_after(jiffies, last_print + HZ * 30)) { + u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; + u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; + + bch_info(ca, "%s: %2u%% done", __func__, + (unsigned) div64_u64(cur_sector * 100, end_sector)); + last_print = jiffies; + } + + try_read_btree_node(w->f, ca, bio, buf, + bucket * ca->mi.bucket_size + bucket_offset); + } +err: + bio_put(bio); + free_page((unsigned long) buf); + percpu_ref_get(&ca->io_ref); + closure_put(w->cl); + kfree(w); + return 0; +} + +static int read_btree_nodes(struct find_btree_nodes *f) +{ + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + struct closure cl; + int ret = 0; + + closure_init_stack(&cl); + + for_each_online_member(c, ca) { + struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); + struct task_struct *t; + + if (!w) { + percpu_ref_put(&ca->io_ref); + ret = -ENOMEM; + goto err; + } + + percpu_ref_get(&ca->io_ref); + closure_get(&cl); + w->cl = &cl; + w->f = f; + w->ca = ca; + + t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); + ret = IS_ERR_OR_NULL(t); + if (ret) { + percpu_ref_put(&ca->io_ref); + closure_put(&cl); + f->ret = ret; + bch_err(c, "error starting kthread: %i", ret); + break; + } + } +err: + closure_sync(&cl); + return f->ret ?: ret; +} + +static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) +{ + while (n + 1 < end && + found_btree_node_cmp_pos(n, n + 1) > 0) { + swap(n[0], n[1]); + n++; + } +} + +static int handle_overwrites(struct bch_fs *c, + struct found_btree_node *start, + struct found_btree_node *end) +{ + struct found_btree_node *n; +again: + for (n = start + 1; + n < end && + n->btree_id == start->btree_id && + n->level == start->level && + bpos_lt(n->min_key, start->max_key); + n++) { + int cmp = found_btree_node_cmp_time(start, n); + + if (cmp > 0) { + if (bpos_cmp(start->max_key, n->max_key) >= 0) + n->overwritten = true; + else { + n->range_updated = true; + n->min_key = bpos_successor(start->max_key); + n->range_updated = true; + bubble_up(n, end); + goto again; + } + } else if (cmp < 0) { + BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); + + start->max_key = bpos_predecessor(n->min_key); + start->range_updated = true; + } else { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "overlapping btree nodes with same seq! halting\n "); + found_btree_node_to_text(&buf, c, start); + prt_str(&buf, "\n "); + found_btree_node_to_text(&buf, c, n); + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + return -1; + } + } + + return 0; +} + +int bch2_scan_for_btree_nodes(struct bch_fs *c) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + struct printbuf buf = PRINTBUF; + size_t dst; + int ret = 0; + + if (f->nodes.nr) + return 0; + + mutex_init(&f->lock); + + ret = read_btree_nodes(f); + if (ret) + return ret; + + if (!f->nodes.nr) { + bch_err(c, "%s: no btree nodes found", __func__); + ret = -EINVAL; + goto err; + } + + if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes found:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); + + dst = 0; + darray_for_each(f->nodes, i) { + struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL; + + if (prev && + prev->cookie == i->cookie) { + if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { + bch_err(c, "%s: found too many replicas for btree node", __func__); + ret = -EINVAL; + goto err; + } + prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; + } else { + f->nodes.data[dst++] = *i; + } + } + f->nodes.nr = dst; + + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); + + if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + dst = 0; + darray_for_each(f->nodes, i) { + if (i->overwritten) + continue; + + ret = handle_overwrites(c, i, &darray_top(f->nodes)); + if (ret) + goto err; + + BUG_ON(i->overwritten); + f->nodes.data[dst++] = *i; + } + f->nodes.nr = dst; + + if (c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); +err: + printbuf_exit(&buf); + return ret; +} + +static int found_btree_node_range_start_cmp(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->max_key, r->min_key); +} + +#define for_each_found_btree_node_in_range(_f, _search, _idx) \ + for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ + sizeof((_f)->nodes.data[0]), \ + found_btree_node_range_start_cmp, &search); \ + _idx < (_f)->nodes.nr && \ + (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ + (_f)->nodes.data[_idx].level == _search.level && \ + bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ + _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) + +bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + + struct found_btree_node search = { + .btree_id = b->c.btree_id, + .level = b->c.level, + .min_key = b->data->min_key, + .max_key = b->key.k.p, + }; + + for_each_found_btree_node_in_range(f, search, idx) + if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) + return true; + return false; +} + +bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) +{ + struct found_btree_node search = { + .btree_id = btree, + .level = 0, + .min_key = POS_MIN, + .max_key = SPOS_MAX, + }; + + for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) + return true; + return false; +} + +int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos node_min, struct bpos node_max) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + + int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + return ret; + + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); + bch2_bpos_to_text(&buf, node_min); + prt_str(&buf, " - "); + bch2_bpos_to_text(&buf, node_max); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + + struct found_btree_node search = { + .btree_id = btree, + .level = level, + .min_key = node_min, + .max_key = node_max, + }; + + for_each_found_btree_node_in_range(f, search, idx) { + struct found_btree_node n = f->nodes.data[idx]; + + n.range_updated |= bpos_lt(n.min_key, node_min); + n.min_key = bpos_max(n.min_key, node_min); + + n.range_updated |= bpos_gt(n.max_key, node_max); + n.max_key = bpos_min(n.max_key, node_max); + + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; + + found_btree_node_to_key(&tmp.k, &n); + + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); + bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); + printbuf_exit(&buf); + + BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL)); + + ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); + if (ret) + return ret; + } + + return 0; +} + +void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) +{ + darray_exit(&f->nodes); +} diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h new file mode 100644 index 000000000000..08687b209787 --- /dev/null +++ b/fs/bcachefs/btree_node_scan.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_NODE_SCAN_H +#define _BCACHEFS_BTREE_NODE_SCAN_H + +int bch2_scan_for_btree_nodes(struct bch_fs *); +bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); +bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); +int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); +void bch2_find_btree_nodes_exit(struct find_btree_nodes *); + +#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */ diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h new file mode 100644 index 000000000000..abb7b27d556a --- /dev/null +++ b/fs/bcachefs/btree_node_scan_types.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H +#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H + +#include "darray.h" + +struct found_btree_node { + bool range_updated:1; + bool overwritten:1; + u8 btree_id; + u8 level; + u32 seq; + u64 cookie; + + struct bpos min_key; + struct bpos max_key; + + unsigned nr_ptrs; + struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; +}; + +typedef DARRAY(struct found_btree_node) found_btree_nodes; + +struct find_btree_nodes { + int ret; + struct mutex lock; + found_btree_nodes nodes; +}; + +#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index b2432d88cda6..0e3ca99fbd2d 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -978,6 +978,31 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) return bkey_deleted(k.k); } +void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) +{ + struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); + if (ca && ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } +} + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -993,31 +1018,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " "); switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *ptr = entry_to_ptr(entry); - struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; - - if (!ca) { - prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : ""); - } else { - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - - prt_printf(out, "ptr: %u:%llu:%u gen %u", - ptr->dev, b, offset, ptr->gen); - if (ptr->cached) - prt_str(out, " cached"); - if (ptr->unwritten) - prt_str(out, " unwritten"); - if (ca && ptr_stale(ca, ptr)) - prt_printf(out, " stale"); - } + case BCH_EXTENT_ENTRY_ptr: + bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); break; - } + case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 3fd0169b98c1..528e817eacbd 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -676,6 +676,7 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 084247c13bd8..1ac4135cca1c 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -368,11 +368,11 @@ enum fsck_err_opts { OPT_STR_NOLIMIT(bch2_recovery_passes), \ BCH2_NO_SB_OPT, 0, \ NULL, "Exit recovery after specified pass") \ - x(keep_journal, u8, \ + x(retain_recovery_info, u8, \ 0, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ - NULL, "Don't free journal entries/keys after startup")\ + NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\ x(read_entire_journal, u8, \ 0, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f234de0ac834..24671020f22b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -4,6 +4,7 @@ #include "alloc_background.h" #include "bkey_buf.h" #include "btree_journal_iter.h" +#include "btree_node_scan.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -271,7 +272,7 @@ int bch2_journal_replay(struct bch_fs *c) bch2_trans_put(trans); trans = NULL; - if (!c->opts.keep_journal && + if (!c->opts.retain_recovery_info && c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); @@ -435,10 +436,9 @@ static int journal_replay_early(struct bch_fs *c, static int read_btree_roots(struct bch_fs *c) { - unsigned i; int ret = 0; - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (!r->alive) @@ -447,33 +447,36 @@ static int read_btree_roots(struct bch_fs *c) if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) continue; - if (r->error) { - __fsck_err(c, - btree_id_is_alloc(i) - ? FSCK_CAN_IGNORE : 0, - btree_root_bkey_invalid, - "invalid btree root %s", - bch2_btree_id_str(i)); - if (i == BTREE_ID_alloc) + if (mustfix_fsck_err_on((ret = r->error), + c, btree_root_bkey_invalid, + "invalid btree root %s", + bch2_btree_id_str(i)) || + mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), + c, btree_root_read_error, + "error reading btree root %s l=%u: %s", + bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { + if (btree_id_is_alloc(i)) { + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - } + r->error = 0; + } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { + bch_info(c, "will run btree node scan"); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + } - ret = bch2_btree_root_read(c, i, &r->key, r->level); - if (ret) { - fsck_err(c, - btree_root_read_error, - "error reading btree root %s", - bch2_btree_id_str(i)); - if (btree_id_is_alloc(i)) - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ret = 0; } } - for (i = 0; i < BTREE_ID_NR; i++) { + for (unsigned i = 0; i < BTREE_ID_NR; i++) { struct btree_root *r = bch2_btree_id_root(c, i); - if (!r->b) { + if (!r->b && !r->error) { r->alive = false; r->level = 0; bch2_btree_root_alloc_fake(c, i, 0); @@ -653,7 +656,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) { struct genradix_iter iter; struct journal_replay **i; @@ -883,9 +886,10 @@ use_clean: out: bch2_flush_fsck_errs(c); - if (!c->opts.keep_journal && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + if (!c->opts.retain_recovery_info) { bch2_journal_keys_put_initial(c); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); + } kfree(clean); if (!ret && diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index b066c5155b74..cb501460d615 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -4,6 +4,7 @@ #include "alloc_background.h" #include "backpointers.h" #include "btree_gc.h" +#include "btree_node_scan.h" #include "ec.h" #include "fsck.h" #include "inode.h" diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index f30521285706..840542cfd65b 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -13,6 +13,7 @@ * must never change: */ #define BCH_RECOVERY_PASSES() \ + x(scan_for_btree_nodes, 37, 0) \ x(check_topology, 4, 0) \ x(alloc_read, 0, PASS_ALWAYS) \ x(stripes_read, 1, PASS_ALWAYS) \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index bc026a77eb99..ed63018f21be 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -15,6 +15,7 @@ #include "btree_gc.h" #include "btree_journal_iter.h" #include "btree_key_cache.h" +#include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_write_buffer.h" @@ -536,6 +537,7 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); @@ -560,6 +562,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); bch2_journal_keys_put_initial(c); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); -- cgit v1.2.3 From 43f5ea4646b2271a9a5af3729dfdf644d69b3282 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Mar 2024 22:45:30 -0400 Subject: bcachefs: Topology repair now uses nodes found by scanning to fill holes With the new btree node scan code, we can now recover from corrupt btree roots - simply create a new fake root at depth 1, and then insert all the leaves we found. If the root wasn't corrupt but there's corruption elsewhere in the btree, we can fill in holes as needed with the newest version of a given node(s) from the scan; we also check if a given btree node is older than what we found from the scan. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 303 +++++++++++++++++++++++++++--------------- fs/bcachefs/sb-errors_types.h | 3 +- 2 files changed, 199 insertions(+), 107 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9db9c8a5beaa..6280da1244b5 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -13,6 +13,7 @@ #include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" +#include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_gc.h" @@ -41,6 +42,7 @@ #define DROP_THIS_NODE 10 #define DROP_PREV_NODE 11 +#define DID_FILL_FROM_SCAN 12 static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) { @@ -129,6 +131,17 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) struct bkey_i_btree_ptr_v2 *new; int ret; + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_min); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) return -BCH_ERR_ENOMEM_gc_repair_key; @@ -154,6 +167,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) struct bkey_i_btree_ptr_v2 *new; int ret; + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_max); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); if (ret) return ret; @@ -185,127 +209,138 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) return 0; } -static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, - struct btree *prev, struct btree *cur) +static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b, + struct btree *prev, struct btree *cur, + struct bpos *pulled_from_scan) { struct bpos expected_start = !prev ? b->data->min_key : bpos_successor(prev->key.k.p); - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; int ret = 0; - if (!prev) { - prt_printf(&buf1, "start of node: "); - bch2_bpos_to_text(&buf1, b->data->min_key); - } else { - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); + + if (bpos_eq(expected_start, cur->data->min_key)) + return 0; + + prt_printf(&buf, " at btree %s level %u:\n parent: ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + if (prev) { + prt_printf(&buf, "\n prev: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key)); } - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); - - if (prev && - bpos_gt(expected_start, cur->data->min_key) && - BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { - /* cur overwrites prev: */ - - if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, - cur->data->min_key), c, - btree_node_topology_overwritten_by_next_node, - "btree node overwritten by next node at btree %s level %u:\n" - " node %s\n" - " next %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = DROP_PREV_NODE; - goto out; - } + prt_str(&buf, "\n next: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key)); - if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, - bpos_predecessor(cur->data->min_key)), c, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " node %s\n" - " next %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) - ret = set_node_max(c, prev, - bpos_predecessor(cur->data->min_key)); - } else { - /* prev overwrites cur: */ - - if (mustfix_fsck_err_on(bpos_ge(expected_start, - cur->data->max_key), c, - btree_node_topology_overwritten_by_prev_node, - "btree node overwritten by prev node at btree %s level %u:\n" - " prev %s\n" - " node %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = DROP_THIS_NODE; - goto out; - } + if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */ + if (b->c.level == 1 && + bpos_lt(*pulled_from_scan, cur->data->min_key)) { + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, + expected_start, + bpos_predecessor(cur->data->min_key)); + if (ret) + goto err; - if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, - btree_node_topology_bad_min_key, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " node %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) - ret = set_node_min(c, cur, expected_start); + *pulled_from_scan = cur->data->min_key; + ret = DID_FILL_FROM_SCAN; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) + ret = set_node_min(c, cur, expected_start); + } + } else { /* overlap */ + if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */ + if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */ + if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node, + "btree node overwritten by next node%s", buf.buf)) + ret = DROP_PREV_NODE; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + "btree node with incorrect max_key%s", buf.buf)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); + } + } else { + if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */ + if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node, + "btree node overwritten by prev node%s", buf.buf)) + ret = DROP_THIS_NODE; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) + ret = set_node_min(c, cur, expected_start); + } + } } -out: +err: fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); + printbuf_exit(&buf); return ret; } static int btree_repair_node_end(struct bch_fs *c, struct btree *b, - struct btree *child) + struct btree *child, struct bpos *pulled_from_scan) { - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; int ret = 0; - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); - bch2_bpos_to_text(&buf2, b->key.k.p); + if (bpos_eq(child->key.k.p, b->key.k.p)) + return 0; - if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " %s\n" - " expected %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = set_node_max(c, child, b->key.k.p); - if (ret) - goto err; + prt_printf(&buf, "at btree %s level %u:\n parent: ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + prt_str(&buf, "\n child: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); + + if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + "btree node with incorrect max_key%s", buf.buf)) { + if (b->c.level == 1 && + bpos_lt(*pulled_from_scan, b->key.k.p)) { + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, + bpos_successor(child->key.k.p), b->key.k.p); + if (ret) + goto err; + + *pulled_from_scan = b->key.k.p; + ret = DID_FILL_FROM_SCAN; + } else { + ret = set_node_max(c, child, b->key.k.p); + } } err: fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); + printbuf_exit(&buf); return ret; } -static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) +static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b, + struct bpos *pulled_from_scan) { struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; - bool have_child, dropped_children = false; + bool have_child, new_pass = false; struct printbuf buf = PRINTBUF; int ret = 0; if (!b->c.level) return 0; -again: - prev = NULL; - have_child = dropped_children = false; + bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); +again: + cur = prev = NULL; + have_child = new_pass = false; bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); iter.prefetch = true; @@ -332,9 +367,10 @@ again: b->c.level - 1, buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); cur = NULL; + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: + bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); if (ret) break; continue; @@ -344,7 +380,23 @@ again: if (ret) break; - ret = btree_repair_node_boundaries(c, b, prev, cur); + if (bch2_btree_node_is_stale(c, cur)) { + bch_info(c, "btree node %s older than nodes found by scanning", buf.buf); + six_unlock_read(&cur->c.lock); + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + cur = NULL; + if (ret) + break; + continue; + } + + ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan); + if (ret == DID_FILL_FROM_SCAN) { + new_pass = true; + ret = 0; + } if (ret == DROP_THIS_NODE) { six_unlock_read(&cur->c.lock); @@ -370,8 +422,6 @@ again: break; bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); goto again; } else if (ret) break; @@ -383,7 +433,11 @@ again: if (!ret && !IS_ERR_OR_NULL(prev)) { BUG_ON(cur); - ret = btree_repair_node_end(c, b, prev); + ret = btree_repair_node_end(c, b, prev, pulled_from_scan); + if (ret == DID_FILL_FROM_SCAN) { + new_pass = true; + ret = 0; + } } if (!IS_ERR_OR_NULL(prev)) @@ -397,6 +451,10 @@ again: goto err; bch2_btree_and_journal_iter_exit(&iter); + + if (new_pass) + goto again; + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); iter.prefetch = true; @@ -413,7 +471,7 @@ again: if (ret) goto err; - ret = bch2_btree_repair_topology_recurse(trans, cur); + ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan); six_unlock_read(&cur->c.lock); cur = NULL; @@ -421,7 +479,7 @@ again: bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); - dropped_children = true; + new_pass = true; } if (ret) @@ -448,12 +506,14 @@ fsck_err: six_unlock_read(&cur->c.lock); bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); - if (!ret && dropped_children) + if (!ret && new_pass) goto again; + BUG_ON(!ret && bch2_btree_node_check_topology(trans, b)); + + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); printbuf_exit(&buf); return ret; } @@ -461,32 +521,63 @@ fsck_err: int bch2_check_topology(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree *b; - unsigned i; + struct bpos pulled_from_scan = POS_MIN; int ret = 0; - for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { struct btree_root *r = bch2_btree_id_root(c, i); + bool reconstructed_root = false; - if (!r->alive) - continue; + if (r->error) { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + break; +reconstruct_root: + bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i)); - b = r->b; - if (btree_node_fake(b)) - continue; + r->alive = false; + r->error = 0; + + if (!bch2_btree_has_scanned_nodes(c, i)) { + mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing, + "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); + bch2_btree_root_alloc_fake(c, i, 0); + } else { + bch2_btree_root_alloc_fake(c, i, 1); + ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); + if (ret) + break; + } + + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + reconstructed_root = true; + } + + struct btree *b = r->b; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(trans, b); + ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { - bch_err(c, "empty btree root - repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + r->b = NULL; + + if (!reconstructed_root) + goto reconstruct_root; + + bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); + bch2_btree_root_alloc_fake(c, i, 0); + r->alive = false; + ret = 0; } } - +fsck_err: bch2_trans_put(trans); - return ret; } diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 73e9634df8ff..2fec03a24c95 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -267,7 +267,8 @@ x(subvol_unreachable, 259) \ x(btree_node_bkey_bad_u64s, 260) \ x(btree_node_topology_empty_interior_node, 261) \ - x(btree_ptr_v2_min_key_bad, 262) + x(btree_ptr_v2_min_key_bad, 262) \ + x(btree_root_unreadable_and_scan_found_nothing, 263) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From 55936afe11077a84d9e1c5068169af328bbf2811 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Mar 2024 23:03:42 -0400 Subject: bcachefs: Flag btrees with missing data We need this to know when we should attempt to reconstruct the snapshots btree Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/btree_io.c | 13 +++++++++---- fs/bcachefs/recovery.c | 23 +++++++++++++++++++++++ fs/bcachefs/recovery.h | 2 ++ fs/bcachefs/super-io.c | 9 ++++++++- 6 files changed, 44 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 93a61dbaa3d8..d1a0e5478503 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -797,6 +797,7 @@ struct bch_fs { u64 features; u64 compat; unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; + u64 btrees_lost_data; } sb; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index bff8750ac0d7..63102992d955 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -818,6 +818,7 @@ struct bch_sb_field_ext { struct bch_sb_field field; __le64 recovery_passes_required[2]; __le64 errors_silent[8]; + __le64 btrees_lost_data; }; struct bch_sb_field_downgrade_entry { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index f3f27bb85a5b..d7de82ac3893 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1264,10 +1264,12 @@ out: return retry_read; fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || - ret == -BCH_ERR_btree_node_read_err_must_retry) + ret == -BCH_ERR_btree_node_read_err_must_retry) { retry_read = 1; - else + } else { set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); + } goto out; } @@ -1328,6 +1330,7 @@ start: if (!can_retry) { set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); break; } } @@ -1527,9 +1530,10 @@ fsck_err: ret = -1; } - if (ret) + if (ret) { set_btree_node_read_error(b); - else if (*saw_error) + bch2_btree_lost_data(c, b->c.btree_id); + } else if (*saw_error) bch2_btree_node_rewrite_async(c, b); for (i = 0; i < ra->nr; i++) { @@ -1665,6 +1669,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bch2_fatal_error(c); set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); printbuf_exit(&buf); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 24671020f22b..b3c67ae3d3b2 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -33,6 +33,20 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) +{ + u64 b = BIT_ULL(btree); + + if (!(c->sb.btrees_lost_data & b)) { + bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); + + mutex_lock(&c->sb_lock); + bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } +} + static bool btree_id_is_alloc(enum btree_id id) { switch (id) { @@ -470,6 +484,7 @@ static int read_btree_roots(struct bch_fs *c) } ret = 0; + bch2_btree_lost_data(c, i); } } @@ -848,6 +863,14 @@ use_clean: write_sb = true; } + if (c->opts.fsck && + !test_bit(BCH_FS_error, &c->flags) && + c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && + ext->btrees_lost_data) { + ext->btrees_lost_data = 0; + write_sb = true; + } + if (c->opts.fsck && !test_bit(BCH_FS_error, &c->flags) && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 3962fd87b50d..4bf818de1f2f 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H +void bch2_btree_lost_data(struct bch_fs *, enum btree_id); + int bch2_journal_replay(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e15f8b1f30c2..e0aa3655b63b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -527,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c) memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); - if (ext) + if (ext) { le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, sizeof(c->sb.errors_silent) * 8); + c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); + } for_each_member_device(c, ca) { struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); @@ -1162,6 +1164,11 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, kfree(errors_silent); } + + prt_printf(out, "Btrees with missing data:"); + prt_tab(out); + prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); + prt_newline(out); } static const struct bch_sb_field_ops bch_sb_field_ops_ext = { -- cgit v1.2.3 From a292be3b68f3fdad6cff50c716174f51b119efd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Mar 2024 22:50:19 -0400 Subject: bcachefs: Reconstruct missing snapshot nodes When the snapshots btree is going, we'll have to delete huge amounts of data - unless we can reconstruct it by looking at the keys that refer to it. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/recovery.c | 1 + fs/bcachefs/recovery_passes_types.h | 1 + fs/bcachefs/sb-errors_types.h | 3 +- fs/bcachefs/snapshot.c | 173 +++++++++++++++++++++++++++++++++++- fs/bcachefs/snapshot.h | 26 +++++- 6 files changed, 199 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d1a0e5478503..a31a5f706929 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -615,6 +615,7 @@ struct bch_dev { */ #define BCH_FS_FLAGS() \ + x(new_fs) \ x(started) \ x(may_go_rw) \ x(rw) \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b3c67ae3d3b2..b76c16152579 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -938,6 +938,7 @@ int bch2_fs_initialize(struct bch_fs *c) int ret; bch_notice(c, "initializing new filesystem"); + set_bit(BCH_FS_new_fs, &c->flags); mutex_lock(&c->sb_lock); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 840542cfd65b..773aea9a0080 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -32,6 +32,7 @@ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ x(bucket_gens_init, 17, 0) \ + x(reconstruct_snapshots, 38, 0) \ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 2fec03a24c95..2f8f4d2388b0 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -268,7 +268,8 @@ x(btree_node_bkey_bad_u64s, 260) \ x(btree_node_topology_empty_interior_node, 261) \ x(btree_ptr_v2_min_key_bad, 262) \ - x(btree_root_unreadable_and_scan_found_nothing, 263) + x(btree_root_unreadable_and_scan_found_nothing, 263) \ + x(snapshot_node_missing, 264) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 4577ee7939a2..0e806f04f3d7 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -8,6 +8,7 @@ #include "errcode.h" #include "error.h" #include "fs.h" +#include "recovery_passes.h" #include "snapshot.h" #include @@ -574,6 +575,13 @@ static int check_snapshot_tree(struct btree_trans *trans, u32 subvol_id; ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + bch_err_fn(c, ret); + + if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ + ret = 0; + goto err; + } + if (ret) goto err; @@ -731,7 +739,6 @@ static int check_snapshot(struct btree_trans *trans, u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); u32 real_depth; struct printbuf buf = PRINTBUF; - bool should_have_subvol; u32 i, id; int ret = 0; @@ -777,7 +784,7 @@ static int check_snapshot(struct btree_trans *trans, } } - should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && !BCH_SNAPSHOT_DELETED(&s); if (should_have_subvol) { @@ -879,6 +886,154 @@ int bch2_check_snapshots(struct bch_fs *c) return ret; } +static int check_snapshot_exists(struct btree_trans *trans, u32 id) +{ + struct bch_fs *c = trans->c; + + if (bch2_snapshot_equiv(c, id)) + return 0; + + u32 tree_id; + int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + if (ret) + return ret; + + struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); + ret = PTR_ERR_OR_ZERO(snapshot); + if (ret) + return ret; + + bkey_snapshot_init(&snapshot->k_i); + snapshot->k.p = POS(0, id); + snapshot->v.tree = cpu_to_le32(tree_id); + snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + + return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: + bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i)); +} + +/* Figure out which snapshot nodes belong in the same tree: */ +struct snapshot_tree_reconstruct { + enum btree_id btree; + struct bpos cur_pos; + snapshot_id_list cur_ids; + DARRAY(snapshot_id_list) trees; +}; + +static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r) +{ + darray_for_each(r->trees, i) + darray_exit(i); + darray_exit(&r->trees); + darray_exit(&r->cur_ids); +} + +static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos) +{ + return r->btree == BTREE_ID_inodes + ? r->cur_pos.offset == pos.offset + : r->cur_pos.inode == pos.inode; +} + +static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) +{ + darray_for_each(*l, i) + if (snapshot_list_has_id(r, *i)) + return true; + return false; +} + +static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) +{ + bool first = true; + darray_for_each(*s, i) { + if (!first) + prt_char(out, ' '); + first = false; + prt_printf(out, "%u", *i); + } +} + +static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r) +{ + if (r->cur_ids.nr) { + darray_for_each(r->trees, i) + if (snapshot_id_lists_have_common(i, &r->cur_ids)) { + int ret = snapshot_list_merge(c, i, &r->cur_ids); + if (ret) + return ret; + goto out; + } + darray_push(&r->trees, r->cur_ids); + darray_init(&r->cur_ids); + } +out: + r->cur_ids.nr = 0; + return 0; +} + +static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos) +{ + if (!same_snapshot(r, pos)) + snapshot_tree_reconstruct_next(c, r); + r->cur_pos = pos; + return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot); +} + +int bch2_reconstruct_snapshots(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; + struct snapshot_tree_reconstruct r = {}; + int ret = 0; + + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { + if (btree_type_has_snapshots(btree)) { + r.btree = btree; + + ret = for_each_btree_key(trans, iter, btree, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({ + get_snapshot_trees(c, &r, k.k->p); + })); + if (ret) + goto err; + + snapshot_tree_reconstruct_next(c, &r); + } + } + + darray_for_each(r.trees, t) { + printbuf_reset(&buf); + snapshot_id_list_to_text(&buf, t); + + darray_for_each(*t, id) { + if (fsck_err_on(!bch2_snapshot_equiv(c, *id), + c, snapshot_node_missing, + "snapshot node %u from tree %s missing", *id, buf.buf)) { + if (t->nr > 1) { + bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; + } + + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_snapshot_exists(trans, *id)); + if (ret) + goto err; + } + } + } +fsck_err: +err: + bch2_trans_put(trans); + snapshot_tree_reconstruct_exit(&r); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ @@ -1689,6 +1844,20 @@ int bch2_snapshots_read(struct bch_fs *c) POS_MIN, 0, k, (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); bch_err_fn(c, ret); + + /* + * It's important that we check if we need to reconstruct snapshots + * before going RW, so we mark that pass as required in the superblock - + * otherwise, we could end up deleting keys with missing snapshot nodes + * instead + */ + BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && + test_bit(BCH_FS_may_go_rw, &c->flags)); + + if (bch2_err_matches(ret, EIO) || + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))) + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots); + return ret; } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 331f20fd8d03..b7d2fed37c4f 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -209,15 +209,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) { - int ret; - BUG_ON(snapshot_list_has_id(s, id)); - ret = darray_push(s, id); + int ret = darray_push(s, id); if (ret) bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); return ret; } +static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + int ret = snapshot_list_has_id(s, id) + ? 0 + : darray_push(s, id); + if (ret) + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); + return ret; +} + +static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src) +{ + darray_for_each(*src, i) { + int ret = snapshot_list_add_nodup(c, dst, *i); + if (ret) + return ret; + } + + return 0; +} + int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s); int bch2_snapshot_get_subvol(struct btree_trans *, u32, @@ -229,6 +248,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); +int bch2_reconstruct_snapshots(struct bch_fs *); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); void bch2_delete_dead_snapshots_work(struct work_struct *); -- cgit v1.2.3 From 4c02e63dadc7f5a92732ce3e267d0511749f60fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Mar 2024 18:43:00 -0400 Subject: bcachefs: Check for extents that point to same space In backpointer repair, if we get a missing backpointer - but there's already a backpointer that points to an existing extent - we've got multiple extents that point to the same space and need to decide which to keep. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 173 ++++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/sb-errors_types.h | 3 +- 2 files changed, 168 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 762c8ddfc5e7..114328acde72 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -8,6 +8,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "checksum.h" #include "error.h" #include @@ -418,6 +419,84 @@ struct extents_to_bp_state { struct bkey_buf last_flushed; }; +static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, + struct bkey_s_c extent, unsigned dev) +{ + struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bch2_bkey_drop_device(bkey_i_to_s(n), dev); + return bch2_btree_insert_trans(trans, btree, n, 0); +} + +static int check_extent_checksum(struct btree_trans *trans, + enum btree_id btree, struct bkey_s_c extent, + enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct printbuf buf = PRINTBUF; + void *data_buf = NULL; + struct bio *bio = NULL; + size_t bytes; + int ret = 0; + + if (bkey_is_btree_ptr(extent.k)) + return false; + + bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) + if (p.ptr.dev == dev) + goto found; + BUG(); +found: + if (!p.crc.csum_type) + return false; + + bytes = p.crc.compressed_size << 9; + + struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + if (!bch2_dev_get_ioref(ca, READ)) + return false; + + data_buf = kvmalloc(bytes, GFP_KERNEL); + if (!data_buf) { + ret = -ENOMEM; + goto err; + } + + bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL); + bio->bi_iter.bi_sector = p.ptr.offset; + bch2_bio_map(bio, data_buf, bytes); + ret = submit_bio_wait(bio); + if (ret) + goto err; + + prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); + prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree)); + bch2_bkey_val_to_text(&buf, c, extent); + prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); + bch2_bkey_val_to_text(&buf, c, extent2); + + struct nonce nonce = extent_nonce(extent.k->version, p.crc); + struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); + if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), + c, dup_backpointer_to_bad_csum_extent, + "%s", buf.buf)) + ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; +fsck_err: +err: + if (bio) + bio_put(bio); + kvfree(data_buf); + percpu_ref_put(&ca->io_ref); + printbuf_exit(&buf); + return ret; +} + static int check_bp_exists(struct btree_trans *trans, struct extents_to_bp_state *s, struct bpos bucket, @@ -425,7 +504,8 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = { NULL }; + struct btree_iter bp_iter = {}; + struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; struct bkey_buf tmp; @@ -433,13 +513,19 @@ static int check_bp_exists(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); + if (!bch2_dev_bucket_exists(c, bucket)) { + prt_str(&buf, "extent for nonexistent device:bucket "); + bch2_bpos_to_text(&buf, bucket); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, orig_k); + bch_err(c, "%s", buf.buf); + return -BCH_ERR_fsck_repair_unimplemented; + } + if (bpos_lt(bucket, s->bucket_start) || bpos_gt(bucket, s->bucket_end)) return 0; - if (!bch2_dev_bucket_exists(c, bucket)) - goto missing; - bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bucket_pos_to_bp(c, bucket, bp.bucket_offset), 0); @@ -465,21 +551,94 @@ static int check_bp_exists(struct btree_trans *trans, ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } - goto missing; + + goto check_existing_bp; } out: err: fsck_err: + bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); bch2_bkey_buf_exit(&tmp, c); printbuf_exit(&buf); return ret; +check_existing_bp: + /* Do we have a backpointer for a different extent? */ + if (bp_k.k->type != KEY_TYPE_backpointer) + goto missing; + + struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v; + + struct bkey_s_c other_extent = + bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0); + ret = bkey_err(other_extent); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + ret = 0; + if (ret) + goto err; + + if (!other_extent.k) + goto missing; + + if (bch2_extents_match(orig_k, other_extent)) { + printbuf_reset(&buf); + prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n "); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, other_extent); + bch_err(c, "%s", buf.buf); + + if (other_extent.k->size <= orig_k.k->size) { + ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode); + if (ret) + goto err; + goto out; + } else { + ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode); + if (ret) + goto err; + goto missing; + } + } + + ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); + if (ret < 0) + goto err; + if (ret) { + ret = 0; + goto missing; + } + + ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode); + if (ret < 0) + goto err; + if (ret) { + ret = 0; + goto out; + } + + printbuf_reset(&buf); + prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, other_extent); + bch_err(c, "%s", buf.buf); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; missing: + printbuf_reset(&buf); prt_printf(&buf, "missing backpointer for btree=%s l=%u ", bch2_btree_id_str(bp.btree_id), bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\nbp pos "); - bch2_bpos_to_text(&buf, bp_iter.pos); + prt_printf(&buf, "\n got: "); + bch2_bkey_val_to_text(&buf, c, bp_k); + + struct bkey_i_backpointer n_bp_k; + bkey_backpointer_init(&n_bp_k.k_i); + n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + n_bp_k.v = bp; + prt_printf(&buf, "\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 2f8f4d2388b0..d7d609131030 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -269,7 +269,8 @@ x(btree_node_topology_empty_interior_node, 261) \ x(btree_ptr_v2_min_key_bad, 262) \ x(btree_root_unreadable_and_scan_found_nothing, 263) \ - x(snapshot_node_missing, 264) + x(snapshot_node_missing, 264) \ + x(dup_backpointer_to_bad_csum_extent, 265) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, -- cgit v1.2.3 From cc0532900bcf1896a81dcdd30873ffa6c4f6926b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 31 Mar 2024 02:03:03 -0400 Subject: bcachefs: Subvolume reconstruction We can now recreate missing subvolumes from dirents and/or inodes. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 148 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b704fb0dda95..c1edc5647ba0 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, u32 *snapshot, u64 *inum) { struct bch_subvolume s; - int ret; - - ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); *snapshot = le32_to_cpu(s.snapshot); *inum = le64_to_cpu(s.inode); @@ -170,7 +168,8 @@ err: /* Get lost+found, create if it doesn't exist: */ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - struct bch_inode_unpacked *lostfound) + struct bch_inode_unpacked *lostfound, + u64 reattaching_inum) { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); @@ -185,19 +184,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, return ret; subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; - u32 subvol_snapshot; - ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol), - &subvol_snapshot, &root_inum.inum); - bch_err_msg(c, ret, "looking up root subvol"); + struct bch_subvolume subvol; + ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), + false, 0, &subvol); + bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", + le32_to_cpu(st.master_subvol), snapshot); if (ret) return ret; + if (!subvol.inode) { + struct btree_iter iter; + struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(subvol); + if (ret) + return ret; + + subvol->v.inode = cpu_to_le64(reattaching_inum); + bch2_trans_iter_exit(trans, &iter); + } + + root_inum.inum = le64_to_cpu(subvol.inode); + struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; u32 root_inode_snapshot = snapshot; ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); - bch_err_msg(c, ret, "looking up root inode"); + bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", + root_inum.inum, le32_to_cpu(st.master_subvol)); if (ret) return ret; @@ -293,7 +309,7 @@ static int reattach_inode(struct btree_trans *trans, snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); } - ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); if (ret) return ret; @@ -364,6 +380,85 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume return ret; } +static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) +{ + struct bch_fs *c = trans->c; + + if (!bch2_snapshot_is_leaf(c, snapshotid)) { + bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); + return -BCH_ERR_fsck_repair_unimplemented; + } + + /* + * If inum isn't set, that means we're being called from check_dirents, + * not check_inodes - the root of this subvolume doesn't exist or we + * would have found it there: + */ + if (!inum) { + struct btree_iter inode_iter = {}; + struct bch_inode_unpacked new_inode; + u64 cpu = raw_smp_processor_id(); + + bch2_inode_init_early(c, &new_inode); + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); + + new_inode.bi_subvol = subvolid; + + int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: + bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, &new_inode); + bch2_trans_iter_exit(trans, &inode_iter); + if (ret) + return ret; + + inum = new_inode.bi_inum; + } + + bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); + + struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); + int ret = PTR_ERR_OR_ZERO(new_subvol); + if (ret) + return ret; + + bkey_subvolume_init(&new_subvol->k_i); + new_subvol->k.p.offset = subvolid; + new_subvol->v.snapshot = cpu_to_le32(snapshotid); + new_subvol->v.inode = cpu_to_le64(inum); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); + if (ret) + return ret; + + struct btree_iter iter; + struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, snapshotid), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); + bch_err_msg(c, ret, "getting snapshot %u", snapshotid); + if (ret) + return ret; + + u32 snapshot_tree = le32_to_cpu(s->v.tree); + + s->v.subvol = cpu_to_le32(subvolid); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); + bch2_trans_iter_exit(trans, &iter); + + struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshot_trees, POS(0, snapshot_tree), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(st); + bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); + if (ret) + return ret; + + if (!st->v.master_subvol) + st->v.master_subvol = cpu_to_le32(subvolid); + + bch2_trans_iter_exit(trans, &iter); + return 0; +} + struct snapshots_seen_entry { u32 id; u32 equiv; @@ -1065,6 +1160,11 @@ static int check_inode(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) goto err; + if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { + ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); + goto do_update; + } + if (fsck_err_on(ret, c, inode_bi_subvol_missing, "inode %llu:%u bi_subvol points to missing subvolume %u", @@ -1082,7 +1182,7 @@ static int check_inode(struct btree_trans *trans, do_update = true; } } - +do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); @@ -1785,6 +1885,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); u32 parent_snapshot; + u32 new_parent_subvol = 0; u64 parent_inum; struct printbuf buf = PRINTBUF; int ret = 0; @@ -1793,6 +1894,27 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret && !bch2_err_matches(ret, ENOENT)) return ret; + if (ret || + (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { + int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); + if (ret2 && !bch2_err_matches(ret, ENOENT)) + return ret2; + } + + if (ret && + !new_parent_subvol && + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { + /* + * Couldn't find a subvol for dirent's snapshot - but we lost + * subvols, so we need to reconstruct: + */ + ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); + if (ret) + return ret; + + parent_snapshot = d.k->p.snapshot; + } + if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, "dirent parent_subvol points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || @@ -1801,10 +1923,10 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", parent_snapshot, (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - u32 new_parent_subvol; - ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); - if (ret) - goto err; + if (!new_parent_subvol) { + bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); + return -BCH_ERR_fsck_repair_unimplemented; + } struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); ret = PTR_ERR_OR_ZERO(new_dirent); @@ -1850,9 +1972,16 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; + goto err; - if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, + if (ret) { + bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); + ret = -BCH_ERR_fsck_repair_unimplemented; + ret = 0; + goto err; + } + + if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, c, inode_bi_parent_wrong, "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", target_inum, @@ -1860,13 +1989,13 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * subvol_root.bi_parent_subvol = parent_subvol; ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; } ret = check_dirent_target(trans, iter, d, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; out: err: fsck_err: -- cgit v1.2.3 From 09d4c2acbf4c864fef0f520bbcba256c9a19102e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Apr 2024 00:00:56 -0400 Subject: bcachefs: reconstruct_inode() If an inode is missing, but corresponding extents and dirent still exist, it's well worth recreating it - this does so. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c1edc5647ba0..8e2010212cc3 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -459,6 +459,33 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub return 0; } +static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked new_inode; + + bch2_inode_init_early(c, &new_inode); + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL); + new_inode.bi_size = size; + new_inode.bi_inum = inum; + + return __bch2_fsck_write_inode(trans, &new_inode, snapshot); +} + +static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum) +{ + struct btree_iter iter = {}; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); + struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); + bch2_trans_iter_exit(trans, &iter); + int ret = bkey_err(k); + if (ret) + return ret; + + return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG); +} + struct snapshots_seen_entry { u32 id; u32 equiv; @@ -1535,6 +1562,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; if (k.k->type != KEY_TYPE_whiteout) { + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + inode->last_pos.inode--; + ret = -BCH_ERR_transaction_restart_nested; + goto err; + } + if (fsck_err_on(!i, c, extent_in_missing_inode, "extent in missing inode:\n %s", (printbuf_reset(&buf), @@ -2012,7 +2050,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct snapshots_seen *s) { struct bch_fs *c = trans->c; - struct bkey_s_c_dirent d; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; struct bpos equiv; @@ -2051,6 +2088,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); dir->first_this_inode = false; + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + dir->last_pos.inode--; + ret = -BCH_ERR_transaction_restart_nested; + goto err; + } + if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, "dirent in nonexisting directory:\n%s", (printbuf_reset(&buf), @@ -2085,7 +2133,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type != KEY_TYPE_dirent) goto out; - d = bkey_s_c_to_dirent(k); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { ret = check_dirent_to_subvol(trans, iter, d); -- cgit v1.2.3 From de164a7f19248fb03229a4af9b0db333d9591e55 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 21 Mar 2024 23:54:19 -0700 Subject: nios2: Only use built-in devicetree blob if configured to do so Starting with commit 7b937cc243e5 ("of: Create of_root if no dtb provided by firmware"), attempts to boot nios2 images with an external devicetree blob result in a crash. Kernel panic - not syncing: early_init_dt_alloc_memory_arch: Failed to allocate 72 bytes align=0x40 For nios2, a built-in devicetree blob always overrides devicetree blobs provided by ROMMON/BIOS. This includes the new dummy devicetree blob. Result is that the dummy devicetree blob is used even if an external devicetree blob is provided. Since the dummy devicetree blob does not include any memory information, memory allocations fail, resulting in the crash. To fix the problem, only use the built-in devicetree blob if CONFIG_NIOS2_DTB_SOURCE_BOOL is enabled. Fixes: 7b937cc243e5 ("of: Create of_root if no dtb provided by firmware") Cc: Frank Rowand Cc: Stephen Boyd Cc: Rob Herring Signed-off-by: Guenter Roeck Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240322065419.162416-1-linux@roeck-us.net Signed-off-by: Rob Herring --- arch/nios2/kernel/prom.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/nios2/kernel/prom.c b/arch/nios2/kernel/prom.c index 8d98af5c7201..9a8393e6b4a8 100644 --- a/arch/nios2/kernel/prom.c +++ b/arch/nios2/kernel/prom.c @@ -21,7 +21,8 @@ void __init early_init_devtree(void *params) { - __be32 *dtb = (u32 *)__dtb_start; + __be32 __maybe_unused *dtb = (u32 *)__dtb_start; + #if defined(CONFIG_NIOS2_DTB_AT_PHYS_ADDR) if (be32_to_cpup((__be32 *)CONFIG_NIOS2_DTB_PHYS_ADDR) == OF_DT_HEADER) { @@ -30,8 +31,11 @@ void __init early_init_devtree(void *params) return; } #endif + +#ifdef CONFIG_NIOS2_DTB_SOURCE_BOOL if (be32_to_cpu((__be32) *dtb) == OF_DT_HEADER) params = (void *)__dtb_start; +#endif early_init_dt_scan(params); } -- cgit v1.2.3 From 173217bd73365867378b5e75a86f0049e1069ee8 Mon Sep 17 00:00:00 2001 From: Ritvik Budhiraja Date: Tue, 2 Apr 2024 14:01:28 -0500 Subject: smb3: retrying on failed server close In the current implementation, CIFS close sends a close to the server and does not check for the success of the server close. This patch adds functionality to check for server close return status and retries in case of an EBUSY or EAGAIN error. This can help avoid handle leaks Cc: stable@vger.kernel.org Signed-off-by: Ritvik Budhiraja Signed-off-by: Steve French --- fs/smb/client/cached_dir.c | 6 +++-- fs/smb/client/cifsfs.c | 11 ++++++++ fs/smb/client/cifsglob.h | 7 ++++-- fs/smb/client/file.c | 63 +++++++++++++++++++++++++++++++++++++++++----- fs/smb/client/smb1ops.c | 4 +-- fs/smb/client/smb2ops.c | 9 ++++--- fs/smb/client/smb2pdu.c | 2 +- 7 files changed, 85 insertions(+), 17 deletions(-) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index a0017724d523..13a9d7acf8f8 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -417,6 +417,7 @@ smb2_close_cached_fid(struct kref *ref) { struct cached_fid *cfid = container_of(ref, struct cached_fid, refcount); + int rc; spin_lock(&cfid->cfids->cfid_list_lock); if (cfid->on_list) { @@ -430,9 +431,10 @@ smb2_close_cached_fid(struct kref *ref) cfid->dentry = NULL; if (cfid->is_open) { - SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, + rc = SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, cfid->fid.volatile_fid); - atomic_dec(&cfid->tcon->num_remote_opens); + if (rc != -EBUSY && rc != -EAGAIN) + atomic_dec(&cfid->tcon->num_remote_opens); } free_cached_dir(cfid); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index aa6f1ecb7c0e..d41eedbff674 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -156,6 +156,7 @@ struct workqueue_struct *decrypt_wq; struct workqueue_struct *fileinfo_put_wq; struct workqueue_struct *cifsoplockd_wq; struct workqueue_struct *deferredclose_wq; +struct workqueue_struct *serverclose_wq; __u32 cifs_lock_secret; /* @@ -1888,6 +1889,13 @@ init_cifs(void) goto out_destroy_cifsoplockd_wq; } + serverclose_wq = alloc_workqueue("serverclose", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!serverclose_wq) { + rc = -ENOMEM; + goto out_destroy_serverclose_wq; + } + rc = cifs_init_inodecache(); if (rc) goto out_destroy_deferredclose_wq; @@ -1962,6 +1970,8 @@ out_destroy_decrypt_wq: destroy_workqueue(decrypt_wq); out_destroy_cifsiod_wq: destroy_workqueue(cifsiod_wq); +out_destroy_serverclose_wq: + destroy_workqueue(serverclose_wq); out_clean_proc: cifs_proc_clean(); return rc; @@ -1991,6 +2001,7 @@ exit_cifs(void) destroy_workqueue(cifsoplockd_wq); destroy_workqueue(decrypt_wq); destroy_workqueue(fileinfo_put_wq); + destroy_workqueue(serverclose_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 286afbe346be..77ca7861a2cc 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -442,10 +442,10 @@ struct smb_version_operations { /* set fid protocol-specific info */ void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32); /* close a file */ - void (*close)(const unsigned int, struct cifs_tcon *, + int (*close)(const unsigned int, struct cifs_tcon *, struct cifs_fid *); /* close a file, returning file attributes and timestamps */ - void (*close_getattr)(const unsigned int xid, struct cifs_tcon *tcon, + int (*close_getattr)(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *pfile_info); /* send a flush request to the server */ int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *); @@ -1439,6 +1439,7 @@ struct cifsFileInfo { bool swapfile:1; bool oplock_break_cancelled:1; bool status_file_deleted:1; /* file has been deleted */ + bool offload:1; /* offload final part of _put to a wq */ unsigned int oplock_epoch; /* epoch from the lease break */ __u32 oplock_level; /* oplock/lease level from the lease break */ int count; @@ -1447,6 +1448,7 @@ struct cifsFileInfo { struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ struct work_struct put; /* work for the final part of _put */ + struct work_struct serverclose; /* work for serverclose */ struct delayed_work deferred; bool deferred_close_scheduled; /* Flag to indicate close is scheduled */ char *symlink_target; @@ -2103,6 +2105,7 @@ extern struct workqueue_struct *decrypt_wq; extern struct workqueue_struct *fileinfo_put_wq; extern struct workqueue_struct *cifsoplockd_wq; extern struct workqueue_struct *deferredclose_wq; +extern struct workqueue_struct *serverclose_wq; extern __u32 cifs_lock_secret; extern mempool_t *cifs_sm_req_poolp; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index ab536ce8a04a..9be37d0fe724 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -459,6 +459,7 @@ cifs_down_write(struct rw_semaphore *sem) } static void cifsFileInfo_put_work(struct work_struct *work); +void serverclose_work(struct work_struct *work); struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock, @@ -505,6 +506,7 @@ struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); INIT_WORK(&cfile->put, cifsFileInfo_put_work); + INIT_WORK(&cfile->serverclose, serverclose_work); INIT_DELAYED_WORK(&cfile->deferred, smb2_deferred_work_close); mutex_init(&cfile->fh_mutex); spin_lock_init(&cfile->file_info_lock); @@ -596,6 +598,40 @@ static void cifsFileInfo_put_work(struct work_struct *work) cifsFileInfo_put_final(cifs_file); } +void serverclose_work(struct work_struct *work) +{ + struct cifsFileInfo *cifs_file = container_of(work, + struct cifsFileInfo, serverclose); + + struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); + + struct TCP_Server_Info *server = tcon->ses->server; + int rc = 0; + int retries = 0; + int MAX_RETRIES = 4; + + do { + if (server->ops->close_getattr) + rc = server->ops->close_getattr(0, tcon, cifs_file); + else if (server->ops->close) + rc = server->ops->close(0, tcon, &cifs_file->fid); + + if (rc == -EBUSY || rc == -EAGAIN) { + retries++; + msleep(250); + } + } while ((rc == -EBUSY || rc == -EAGAIN) && (retries < MAX_RETRIES) + ); + + if (retries == MAX_RETRIES) + pr_warn("Serverclose failed %d times, giving up\n", MAX_RETRIES); + + if (cifs_file->offload) + queue_work(fileinfo_put_wq, &cifs_file->put); + else + cifsFileInfo_put_final(cifs_file); +} + /** * cifsFileInfo_put - release a reference of file priv data * @@ -636,10 +672,13 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, struct cifs_fid fid = {}; struct cifs_pending_open open; bool oplock_break_cancelled; + bool serverclose_offloaded = false; spin_lock(&tcon->open_file_lock); spin_lock(&cifsi->open_file_lock); spin_lock(&cifs_file->file_info_lock); + + cifs_file->offload = offload; if (--cifs_file->count > 0) { spin_unlock(&cifs_file->file_info_lock); spin_unlock(&cifsi->open_file_lock); @@ -681,13 +720,20 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, if (!tcon->need_reconnect && !cifs_file->invalidHandle) { struct TCP_Server_Info *server = tcon->ses->server; unsigned int xid; + int rc = 0; xid = get_xid(); if (server->ops->close_getattr) - server->ops->close_getattr(xid, tcon, cifs_file); + rc = server->ops->close_getattr(xid, tcon, cifs_file); else if (server->ops->close) - server->ops->close(xid, tcon, &cifs_file->fid); + rc = server->ops->close(xid, tcon, &cifs_file->fid); _free_xid(xid); + + if (rc == -EBUSY || rc == -EAGAIN) { + // Server close failed, hence offloading it as an async op + queue_work(serverclose_wq, &cifs_file->serverclose); + serverclose_offloaded = true; + } } if (oplock_break_cancelled) @@ -695,10 +741,15 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, cifs_del_pending_open(&open); - if (offload) - queue_work(fileinfo_put_wq, &cifs_file->put); - else - cifsFileInfo_put_final(cifs_file); + // if serverclose has been offloaded to wq (on failure), it will + // handle offloading put as well. If serverclose not offloaded, + // we need to handle offloading put here. + if (!serverclose_offloaded) { + if (offload) + queue_work(fileinfo_put_wq, &cifs_file->put); + else + cifsFileInfo_put_final(cifs_file); + } } int cifs_open(struct inode *inode, struct file *file) diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index a9eaba8083b0..212ec6f66ec6 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -753,11 +753,11 @@ cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); } -static void +static int cifs_close_file(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *fid) { - CIFSSMBClose(xid, tcon, fid->netfid); + return CIFSSMBClose(xid, tcon, fid->netfid); } static int diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 35bf7eb315cd..87b63f6ad2e2 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1412,14 +1412,14 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) memcpy(cfile->fid.create_guid, fid->create_guid, 16); } -static void +static int smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *fid) { - SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); + return SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); } -static void +static int smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile) { @@ -1430,7 +1430,7 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon, rc = __SMB2_close(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, &file_inf); if (rc) - return; + return rc; inode = d_inode(cfile->dentry); @@ -1459,6 +1459,7 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon, /* End of file and Attributes should not have to be updated on close */ spin_unlock(&inode->i_lock); + return rc; } static int diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 3ea688558e6c..c0c4933af5fc 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -3628,9 +3628,9 @@ replay_again: memcpy(&pbuf->network_open_info, &rsp->network_open_info, sizeof(pbuf->network_open_info)); + atomic_dec(&tcon->num_remote_opens); } - atomic_dec(&tcon->num_remote_opens); close_exit: SMB2_close_free(&rqst); free_rsp_buf(resp_buftype, rsp); -- cgit v1.2.3 From ca545b7f0823f19db0f1148d59bc5e1a56634502 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:53 -0300 Subject: smb: client: fix potential UAF in cifs_debug_files_proc_show() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 ++ fs/smb/client/cifsglob.h | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 226d4835c92d..c9aec9a38ad3 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -250,6 +250,8 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); list_for_each_entry(cfile, &tcon->openFileList, tlist) { diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 77ca7861a2cc..f6a302205f89 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -2325,4 +2325,14 @@ struct smb2_compound_vars { struct kvec ea_iov; }; +static inline bool cifs_ses_exiting(struct cifs_ses *ses) +{ + bool ret; + + spin_lock(&ses->ses_lock); + ret = ses->ses_status == SES_EXITING; + spin_unlock(&ses->ses_lock); + return ret; +} + #endif /* _CIFS_GLOB_H */ -- cgit v1.2.3 From 58acd1f497162e7d282077f816faa519487be045 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:54 -0300 Subject: smb: client: fix potential UAF in cifs_dump_full_key() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index c012dfdba80d..855ac5a62edf 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -247,7 +247,9 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server_it, &cifs_tcp_ses_list, tcp_ses_list) { list_for_each_entry(ses_it, &server_it->smb_ses_list, smb_ses_list) { - if (ses_it->Suid == out.session_id) { + spin_lock(&ses_it->ses_lock); + if (ses_it->ses_status != SES_EXITING && + ses_it->Suid == out.session_id) { ses = ses_it; /* * since we are using the session outside the crit @@ -255,9 +257,11 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug * so increment its refcount */ cifs_smb_ses_inc_refcount(ses); + spin_unlock(&ses_it->ses_lock); found = true; goto search_end; } + spin_unlock(&ses_it->ses_lock); } } search_end: -- cgit v1.2.3 From d3da25c5ac84430f89875ca7485a3828150a7e0a Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:55 -0300 Subject: smb: client: fix potential UAF in cifs_stats_proc_write() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index c9aec9a38ad3..8535c9907462 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -678,6 +678,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, } #endif /* CONFIG_CIFS_STATS2 */ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { atomic_set(&tcon->num_smbs_sent, 0); spin_lock(&tcon->stat_lock); -- cgit v1.2.3 From 0865ffefea197b437ba78b5dd8d8e256253efd65 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:56 -0300 Subject: smb: client: fix potential UAF in cifs_stats_proc_show() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 8535c9907462..c71ae5c04306 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -759,6 +759,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) } #endif /* STATS2 */ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { i++; seq_printf(m, "\n%d) %s", i, tcon->tree_name); -- cgit v1.2.3 From 705c76fbf726c7a2f6ff9143d4013b18daaaebf1 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:58 -0300 Subject: smb: client: fix potential UAF in smb2_is_valid_lease_break() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2misc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 82b84a4941dd..ed6d033dce96 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -622,6 +622,8 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); cifs_stats_inc( -- cgit v1.2.3 From 22863485a4626ec6ecf297f4cc0aef709bc862e4 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:33:59 -0300 Subject: smb: client: fix potential UAF in smb2_is_valid_oplock_break() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2misc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index ed6d033dce96..cc72be5a93a9 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -699,6 +699,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); -- cgit v1.2.3 From 69ccf040acddf33a3a85ec0f6b45ef84b0f7ec29 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:34:00 -0300 Subject: smb: client: fix potential UAF in is_valid_oplock_break() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/misc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 1ea22b3955a2..33ac4f8f5050 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -481,6 +481,8 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid != buf->Tid) continue; -- cgit v1.2.3 From 63981561ffd2d4987807df4126f96a11e18b0c1d Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:34:02 -0300 Subject: smb: client: fix potential UAF in smb2_is_network_name_deleted() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 87b63f6ad2e2..b156eefa75d7 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2481,6 +2481,8 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) { spin_lock(&tcon->tc_lock); -- cgit v1.2.3 From e0e50401cc3921c9eaf1b0e667db174519ea939f Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 2 Apr 2024 16:34:04 -0300 Subject: smb: client: fix potential UAF in cifs_signal_cifsd_for_reconnect() Skip sessions that are being teared down (status == SES_EXITING) to avoid UAF. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 95e4bda4fd51..85679ae106fd 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -175,6 +175,8 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { if (!ses->chans[i].server) -- cgit v1.2.3 From cffaefd15a8f423cdee5d8eac15d267bc92de314 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Mar 2024 19:02:15 +0100 Subject: vdso: Use CONFIG_PAGE_SHIFT in vdso/datapage.h Both the vdso rework and the CONFIG_PAGE_SHIFT changes were merged during the v6.9 merge window, so it is now possible to use CONFIG_PAGE_SHIFT instead of including asm/page.h in the vdso. This avoids the workaround for arm64 - commit 8b3843ae3634 ("vdso/datapage: Quick fix - use asm/page-def.h for ARM64") and addresses a build warning for powerpc64: In file included from :4: In file included from /home/arnd/arm-soc/arm-soc/lib/vdso/gettimeofday.c:5: In file included from ../include/vdso/datapage.h:25: arch/powerpc/include/asm/page.h:230:9: error: result of comparison of constant 13835058055282163712 with expression of type 'unsigned long' is always true [-Werror,-Wtautological-constant-out-of-range-compare] 230 | return __pa(kaddr) >> PAGE_SHIFT; | ^~~~~~~~~~~ arch/powerpc/include/asm/page.h:217:37: note: expanded from macro '__pa' 217 | VIRTUAL_WARN_ON((unsigned long)(x) < PAGE_OFFSET); \ | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~ arch/powerpc/include/asm/page.h:202:73: note: expanded from macro 'VIRTUAL_WARN_ON' 202 | #define VIRTUAL_WARN_ON(x) WARN_ON(IS_ENABLED(CONFIG_DEBUG_VIRTUAL) && (x)) | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~ arch/powerpc/include/asm/bug.h:88:25: note: expanded from macro 'WARN_ON' 88 | int __ret_warn_on = !!(x); \ | ^ Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Acked-by: Michael Ellerman (powerpc) Link: https://lore.kernel.org/r/20240320180228.136371-1-arnd@kernel.org --- arch/powerpc/include/asm/vdso/gettimeofday.h | 3 +-- include/vdso/datapage.h | 8 +------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index f0a4cf01e85c..78302f6c2580 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -4,7 +4,6 @@ #ifndef __ASSEMBLY__ -#include #include #include #include @@ -95,7 +94,7 @@ const struct vdso_data *__arch_get_vdso_data(void); static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) { - return (void *)vd + PAGE_SIZE; + return (void *)vd + (1U << CONFIG_PAGE_SHIFT); } #endif diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 5d5c0b8efff2..c71ddb6d4691 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -19,12 +19,6 @@ #include #include -#ifdef CONFIG_ARM64 -#include -#else -#include -#endif - #ifdef CONFIG_ARCH_HAS_VDSO_DATA #include #else @@ -132,7 +126,7 @@ extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden */ union vdso_data_store { struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; + u8 page[1U << CONFIG_PAGE_SHIFT]; }; /* -- cgit v1.2.3 From dd19e827d63ac60debf117676d1126bff884bdb8 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Wed, 20 Mar 2024 17:09:25 -0700 Subject: idpf: fix kernel panic on unknown packet types In the very rare case where a packet type is unknown to the driver, idpf_rx_process_skb_fields would return early without calling eth_type_trans to set the skb protocol / the network layer handler. This is especially problematic if tcpdump is running when such a packet is received, i.e. it would cause a kernel panic. Instead, call eth_type_trans for every single packet, even when the packet type is unknown. Fixes: 3a8845af66ed ("idpf: add RX splitq napi poll support") Reported-by: Balazs Nemeth Signed-off-by: Joshua Hay Reviewed-by: Jesse Brandeburg Reviewed-by: Przemek Kitszel Tested-by: Salvatore Daniele Signed-off-by: Pavan Kumar Linga Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 6dd7a66bb897..f5bc4a278074 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2941,6 +2941,8 @@ static int idpf_rx_process_skb_fields(struct idpf_queue *rxq, rx_ptype = le16_get_bits(rx_desc->ptype_err_fflags0, VIRTCHNL2_RX_FLEX_DESC_ADV_PTYPE_M); + skb->protocol = eth_type_trans(skb, rxq->vport->netdev); + decoded = rxq->vport->rx_ptype_lkup[rx_ptype]; /* If we don't know the ptype we can't do anything else with it. Just * pass it up the stack as-is. @@ -2951,8 +2953,6 @@ static int idpf_rx_process_skb_fields(struct idpf_queue *rxq, /* process RSS/hash */ idpf_rx_hash(rxq, skb, rx_desc, &decoded); - skb->protocol = eth_type_trans(skb, rxq->vport->netdev); - if (le16_get_bits(rx_desc->hdrlen_flags, VIRTCHNL2_RX_FLEX_DESC_ADV_RSC_M)) return idpf_rx_rsc(rxq, skb, rx_desc, &decoded); -- cgit v1.2.3 From f7c52345ccc96343c0a05bdea3121c8ac7b67d5f Mon Sep 17 00:00:00 2001 From: Kwangjin Ko Date: Tue, 2 Apr 2024 17:14:03 +0900 Subject: cxl/core: Fix initialization of mbox_cmd.size_out in get event Since mbox_cmd.size_out is overwritten with the actual output size in the function below, it needs to be initialized every time. cxl_internal_send_cmd -> __cxl_pci_mbox_send_cmd Problem scenario: 1) The size_out variable is initially set to the size of the mailbox. 2) Read an event. - size_out is set to 160 bytes(header 32B + one event 128B). - Two event are created while reading. 3) Read the new *two* events. - size_out is still set to 160 bytes. - Although the value of out_len is 288 bytes, only 160 bytes are copied from the mailbox register to the local variable. - record_count is set to 2. - Accessing records[1] will result in reading incorrect data. Fixes: 6ebe28f9ec72 ("cxl/mem: Read, trace, and clear events on driver load") Tested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Signed-off-by: Kwangjin Ko Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 50146161887d..f0f54aeccc87 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -958,13 +958,14 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, .payload_in = &log_type, .size_in = sizeof(log_type), .payload_out = payload, - .size_out = mds->payload_size, .min_out = struct_size(payload, records, 0), }; do { int rc, i; + mbox_cmd.size_out = mds->payload_size; + rc = cxl_internal_send_cmd(mds, &mbox_cmd); if (rc) { dev_err_ratelimited(dev, -- cgit v1.2.3 From e7d24c0aa8e678f41457d1304e2091cac6fd1a2e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 28 Mar 2024 07:42:57 +0100 Subject: gcc-plugins/stackleak: Avoid .head.text section The .head.text section carries the startup code that runs with the MMU off or with a translation of memory that deviates from the ordinary one. So avoid instrumentation with the stackleak plugin, which already avoids .init.text and .noinstr.text entirely. Fixes: 48204aba801f1b51 ("x86/sme: Move early SME kernel encryption handling into .head.text") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202403221630.2692c998-oliver.sang@intel.com Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240328064256.2358634-2-ardb+git@google.com Signed-off-by: Kees Cook --- scripts/gcc-plugins/stackleak_plugin.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c index c5c2ce113c92..d20c47d21ad8 100644 --- a/scripts/gcc-plugins/stackleak_plugin.c +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -467,6 +467,8 @@ static bool stackleak_gate(void) return false; if (STRING_EQUAL(section, ".entry.text")) return false; + if (STRING_EQUAL(section, ".head.text")) + return false; } return track_frame_size >= 0; -- cgit v1.2.3 From bbda3ba626b9f57ff6063058877eca856f5b734d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 10:06:20 +0200 Subject: ubsan: fix unused variable warning in test module This is one of the drivers with an unused variable that is marked 'const'. Adding a __used annotation here avoids the warning and lets us enable the option by default: lib/test_ubsan.c:137:28: error: unused variable 'skip_ubsan_array' [-Werror,-Wunused-const-variable] Fixes: 4a26f49b7b3d ("ubsan: expand tests and reporting") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240403080702.3509288-3-arnd@kernel.org Signed-off-by: Kees Cook --- lib/test_ubsan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index 276c12140ee2..c288df9372ed 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -134,7 +134,7 @@ static const test_ubsan_fp test_ubsan_array[] = { }; /* Excluded because they Oops the module. */ -static const test_ubsan_fp skip_ubsan_array[] = { +static __used const test_ubsan_fp skip_ubsan_array[] = { test_ubsan_divrem_overflow, }; -- cgit v1.2.3 From 9c573cd313433f6c1f7236fe64b9b743500c1628 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 9 Mar 2024 12:24:48 -0800 Subject: randomize_kstack: Improve entropy diffusion The kstack_offset variable was really only ever using the low bits for kernel stack offset entropy. Add a ror32() to increase bit diffusion. Suggested-by: Arnd Bergmann Fixes: 39218ff4c625 ("stack: Optionally randomize kernel stack offset each syscall") Link: https://lore.kernel.org/r/20240309202445.work.165-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/randomize_kstack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index 5d868505a94e..6d92b68efbf6 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -80,7 +80,7 @@ DECLARE_PER_CPU(u32, kstack_offset); if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ &randomize_kstack_offset)) { \ u32 offset = raw_cpu_read(kstack_offset); \ - offset ^= (rand); \ + offset = ror32(offset, 5) ^ (rand); \ raw_cpu_write(kstack_offset, offset); \ } \ } while (0) -- cgit v1.2.3 From 95197779091166b9ed4b1c630c13600abf94ada7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 10:06:22 +0200 Subject: i2c: pxa: hide unused icr_bits[] variable The function using this is hidden in an #ifdef, so the variable needs the same one for a clean W=1 build: drivers/i2c/busses/i2c-pxa.c:327:26: error: 'icr_bits' defined but not used [-Werror=unused-const-variable=] Fixes: d6a7b5f84b5c ("[ARM] 4827/1: fix two warnings in drivers/i2c/busses/i2c-pxa.c") Signed-off-by: Arnd Bergmann Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-pxa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c index 76f79b68cef8..888ca636f3f3 100644 --- a/drivers/i2c/busses/i2c-pxa.c +++ b/drivers/i2c/busses/i2c-pxa.c @@ -324,6 +324,7 @@ static void decode_ISR(unsigned int val) decode_bits(KERN_DEBUG "ISR", isr_bits, ARRAY_SIZE(isr_bits), val); } +#ifdef CONFIG_I2C_PXA_SLAVE static const struct bits icr_bits[] = { PXA_BIT(ICR_START, "START", NULL), PXA_BIT(ICR_STOP, "STOP", NULL), @@ -342,7 +343,6 @@ static const struct bits icr_bits[] = { PXA_BIT(ICR_UR, "UR", "ur"), }; -#ifdef CONFIG_I2C_PXA_SLAVE static void decode_ICR(unsigned int val) { decode_bits(KERN_DEBUG "ICR", icr_bits, ARRAY_SIZE(icr_bits), val); -- cgit v1.2.3 From c27fa53b858b4ee6552a719aa599c250cf98a586 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 3 Apr 2024 09:26:38 +0200 Subject: riscv: Fix vector state restore in rt_sigreturn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RISC-V Vector specification states in "Appendix D: Calling Convention for Vector State" [1] that "Executing a system call causes all caller-saved vector registers (v0-v31, vl, vtype) and vstart to become unspecified.". In the RISC-V kernel this is called "discarding the vstate". Returning from a signal handler via the rt_sigreturn() syscall, vector discard is also performed. However, this is not an issue since the vector state should be restored from the sigcontext, and therefore not care about the vector discard. The "live state" is the actual vector register in the running context, and the "vstate" is the vector state of the task. A dirty live state, means that the vstate and live state are not in synch. When vectorized user_from_copy() was introduced, an bug sneaked in at the restoration code, related to the discard of the live state. An example when this go wrong: 1. A userland application is executing vector code 2. The application receives a signal, and the signal handler is entered. 3. The application returns from the signal handler, using the rt_sigreturn() syscall. 4. The live vector state is discarded upon entering the rt_sigreturn(), and the live state is marked as "dirty", indicating that the live state need to be synchronized with the current vstate. 5. rt_sigreturn() restores the vstate, except the Vector registers, from the sigcontext 6. rt_sigreturn() restores the Vector registers, from the sigcontext, and now the vectorized user_from_copy() is used. The dirty live state from the discard is saved to the vstate, making the vstate corrupt. 7. rt_sigreturn() returns to the application, which crashes due to corrupted vstate. Note that the vectorized user_from_copy() is invoked depending on the value of CONFIG_RISCV_ISA_V_UCOPY_THRESHOLD. Default is 768, which means that vlen has to be larger than 128b for this bug to trigger. The fix is simply to mark the live state as non-dirty/clean prior performing the vstate restore. Link: https://github.com/riscv/riscv-isa-manual/releases/download/riscv-isa-release-8abdb41-2024-03-26/unpriv-isa-asciidoc.pdf # [1] Reported-by: Charlie Jenkins Reported-by: Vineet Gupta Fixes: c2a658d41924 ("riscv: lib: vectorize copy_to_user/copy_from_user") Signed-off-by: Björn Töpel Reviewed-by: Andy Chiu Tested-by: Vineet Gupta Link: https://lore.kernel.org/r/20240403072638.567446-1-bjorn@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/signal.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 501e66debf69..5a2edd7f027e 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -119,6 +119,13 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) struct __sc_riscv_v_state __user *state = sc_vec; void __user *datap; + /* + * Mark the vstate as clean prior performing the actual copy, + * to avoid getting the vstate incorrectly clobbered by the + * discarded vector state. + */ + riscv_v_vstate_set_restore(current, regs); + /* Copy everything of __sc_riscv_v_state except datap. */ err = __copy_from_user(¤t->thread.vstate, &state->v_state, offsetof(struct __riscv_v_ext_state, datap)); @@ -133,13 +140,7 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) * Copy the whole vector content from user space datap. Use * copy_from_user to prevent information leak. */ - err = copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize); - if (unlikely(err)) - return err; - - riscv_v_vstate_set_restore(current, regs); - - return err; + return copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize); } #else #define save_v_state(task, regs) (0) -- cgit v1.2.3 From e60aa472400b1ff8d0e6c563a2eb05916927f10a Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Thu, 14 Mar 2024 10:02:18 -0600 Subject: bcachefs: create debugfs dir for each btree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This creates a subdirectory for each individual btree under the btrees/ debugfs directory. Directory structure, before: /sys/kernel/debug/bcachefs/$FS_ID/btrees/ ├── alloc ├── alloc-bfloat-failed ├── alloc-formats ├── backpointers ├── backpointers-bfloat-failed ├── backpointers-formats ... Directory structure, after: /sys/kernel/debug/bcachefs/$FS_ID/btrees/ ├── alloc │   ├── bfloat-failed │   ├── formats │   └── keys ├── backpointers │   ├── bfloat-failed │   ├── formats │   └── keys ... Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 208ce6f0fc43..7b681ae91510 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -866,6 +866,20 @@ void bch2_fs_debug_exit(struct bch_fs *c) debugfs_remove_recursive(c->fs_debug_dir); } +static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd) +{ + struct dentry *d; + + d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir); + + debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops); + + debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops); + + debugfs_create_file("bfloat-failed", 0400, d, bd, + &bfloat_failed_debug_ops); +} + void bch2_fs_debug_init(struct bch_fs *c) { struct btree_debug *bd; @@ -902,21 +916,7 @@ void bch2_fs_debug_init(struct bch_fs *c) bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - debugfs_create_file(bch2_btree_id_str(bd->id), - 0400, c->btree_debug_dir, bd, - &btree_debug_ops); - - snprintf(name, sizeof(name), "%s-formats", - bch2_btree_id_str(bd->id)); - - debugfs_create_file(name, 0400, c->btree_debug_dir, bd, - &btree_format_debug_ops); - - snprintf(name, sizeof(name), "%s-bfloat-failed", - bch2_btree_id_str(bd->id)); - - debugfs_create_file(name, 0400, c->btree_debug_dir, bd, - &bfloat_failed_debug_ops); + bch2_fs_debug_btree_init(c, bd); } } -- cgit v1.2.3 From cbc17e7802f5de37c7c262204baadfad3f7f99e5 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 28 Mar 2024 15:59:29 +0000 Subject: net: fec: Set mac_managed_pm during probe Setting mac_managed_pm during interface up is too late. In situations where the link is not brought up yet and the system suspends the regular PHY power management will run. Since the FEC ETHEREN control bit is cleared (automatically) on suspend the controller is off in resume. When the regular PHY power management resume path runs in this context it will write to the MII_DATA register but nothing will be transmitted on the MDIO bus. This can be observed by the following log: fec 5b040000.ethernet eth0: MDIO read timeout Microchip LAN87xx T1 5b040000.ethernet-1:04: PM: dpm_run_callback(): mdio_bus_phy_resume+0x0/0xc8 returns -110 Microchip LAN87xx T1 5b040000.ethernet-1:04: PM: failed to resume: error -110 The data written will however remain in the MII_DATA register. When the link later is set to administrative up it will trigger a call to fec_restart() which will restore the MII_SPEED register. This triggers the quirk explained in f166f890c8f0 ("net: ethernet: fec: Replace interrupt driven MDIO with polled IO") causing an extra MII_EVENT. This extra event desynchronizes all the MDIO register reads, causing them to complete too early. Leading all reads to read as 0 because fec_enet_mdio_wait() returns too early. When a Microchip LAN8700R PHY is connected to the FEC, the 0 reads causes the PHY to be initialized incorrectly and the PHY will not transmit any ethernet signal in this state. It cannot be brought out of this state without a power cycle of the PHY. Fixes: 557d5dc83f68 ("net: fec: use mac-managed PHY PM") Closes: https://lore.kernel.org/netdev/1f45bdbe-eab1-4e59-8f24-add177590d27@actia.se/ Signed-off-by: Wei Fang [jernberg: commit message] Signed-off-by: John Ernberg Link: https://lore.kernel.org/r/20240328155909.59613-2-john.ernberg@actia.se Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index d7693fdf640d..8bd213da8fb6 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2454,8 +2454,6 @@ static int fec_enet_mii_probe(struct net_device *ndev) fep->link = 0; fep->full_duplex = 0; - phy_dev->mac_managed_pm = true; - phy_attached_info(phy_dev); return 0; @@ -2467,10 +2465,12 @@ static int fec_enet_mii_init(struct platform_device *pdev) struct net_device *ndev = platform_get_drvdata(pdev); struct fec_enet_private *fep = netdev_priv(ndev); bool suppress_preamble = false; + struct phy_device *phydev; struct device_node *node; int err = -ENXIO; u32 mii_speed, holdtime; u32 bus_freq; + int addr; /* * The i.MX28 dual fec interfaces are not equal. @@ -2584,6 +2584,13 @@ static int fec_enet_mii_init(struct platform_device *pdev) goto err_out_free_mdiobus; of_node_put(node); + /* find all the PHY devices on the bus and set mac_managed_pm to true */ + for (addr = 0; addr < PHY_MAX_ADDR; addr++) { + phydev = mdiobus_get_phy(fep->mii_bus, addr); + if (phydev) + phydev->mac_managed_pm = true; + } + mii_cnt++; /* save fec0 mii_bus */ -- cgit v1.2.3 From c644920ce9220d83e070f575a4df711741c07f07 Mon Sep 17 00:00:00 2001 From: Duanqiang Wen Date: Tue, 2 Apr 2024 10:18:43 +0800 Subject: net: txgbe: fix i2c dev name cannot match clkdev txgbe clkdev shortened clk_name, so i2c_dev info_name also need to shorten. Otherwise, i2c_dev cannot initialize clock. Fixes: e30cef001da2 ("net: txgbe: fix clk_name exceed MAX_DEV_ID limits") Signed-off-by: Duanqiang Wen Link: https://lore.kernel.org/r/20240402021843.126192-1-duanqiangwen@net-swift.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c index 5b5d5e4310d1..2fa511227eac 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c @@ -20,6 +20,8 @@ #include "txgbe_phy.h" #include "txgbe_hw.h" +#define TXGBE_I2C_CLK_DEV_NAME "i2c_dw" + static int txgbe_swnodes_register(struct txgbe *txgbe) { struct txgbe_nodes *nodes = &txgbe->nodes; @@ -571,8 +573,8 @@ static int txgbe_clock_register(struct txgbe *txgbe) char clk_name[32]; struct clk *clk; - snprintf(clk_name, sizeof(clk_name), "i2c_dw.%d", - pci_dev_id(pdev)); + snprintf(clk_name, sizeof(clk_name), "%s.%d", + TXGBE_I2C_CLK_DEV_NAME, pci_dev_id(pdev)); clk = clk_register_fixed_rate(NULL, clk_name, NULL, 0, 156250000); if (IS_ERR(clk)) @@ -634,7 +636,7 @@ static int txgbe_i2c_register(struct txgbe *txgbe) info.parent = &pdev->dev; info.fwnode = software_node_fwnode(txgbe->nodes.group[SWNODE_I2C]); - info.name = "i2c_designware"; + info.name = TXGBE_I2C_CLK_DEV_NAME; info.id = pci_dev_id(pdev); info.res = &DEFINE_RES_IRQ(pdev->irq); -- cgit v1.2.3 From b3da86d432b7cd65b025a11f68613e333d2483db Mon Sep 17 00:00:00 2001 From: Piotr Wejman Date: Mon, 1 Apr 2024 21:22:39 +0200 Subject: net: stmmac: fix rx queue priority assignment The driver should ensure that same priority is not mapped to multiple rx queues. From DesignWare Cores Ethernet Quality-of-Service Databook, section 17.1.29 MAC_RxQ_Ctrl2: "[...]The software must ensure that the content of this field is mutually exclusive to the PSRQ fields for other queues, that is, the same priority is not mapped to multiple Rx queues[...]" Previously rx_queue_priority() function was: - clearing all priorities from a queue - adding new priorities to that queue After this patch it will: - first assign new priorities to a queue - then remove those priorities from all other queues - keep other priorities previously assigned to that queue Fixes: a8f5102af2a7 ("net: stmmac: TX and RX queue priority configuration") Fixes: 2142754f8b9c ("net: stmmac: Add MAC related callbacks for XGMAC2") Signed-off-by: Piotr Wejman Link: https://lore.kernel.org/r/20240401192239.33942-1-piotrwejman90@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 40 +++++++++++++++++----- .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 38 ++++++++++++++++---- 2 files changed, 62 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 6b6d0de09619..cef25efbdff9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -92,19 +92,41 @@ static void dwmac4_rx_queue_priority(struct mac_device_info *hw, u32 prio, u32 queue) { void __iomem *ioaddr = hw->pcsr; - u32 base_register; - u32 value; + u32 clear_mask = 0; + u32 ctrl2, ctrl3; + int i; - base_register = (queue < 4) ? GMAC_RXQ_CTRL2 : GMAC_RXQ_CTRL3; - if (queue >= 4) - queue -= 4; + ctrl2 = readl(ioaddr + GMAC_RXQ_CTRL2); + ctrl3 = readl(ioaddr + GMAC_RXQ_CTRL3); - value = readl(ioaddr + base_register); + /* The software must ensure that the same priority + * is not mapped to multiple Rx queues + */ + for (i = 0; i < 4; i++) + clear_mask |= ((prio << GMAC_RXQCTRL_PSRQX_SHIFT(i)) & + GMAC_RXQCTRL_PSRQX_MASK(i)); + + ctrl2 &= ~clear_mask; + ctrl3 &= ~clear_mask; + + /* First assign new priorities to a queue, then + * clear them from others queues + */ + if (queue < 4) { + ctrl2 |= (prio << GMAC_RXQCTRL_PSRQX_SHIFT(queue)) & + GMAC_RXQCTRL_PSRQX_MASK(queue); - value &= ~GMAC_RXQCTRL_PSRQX_MASK(queue); - value |= (prio << GMAC_RXQCTRL_PSRQX_SHIFT(queue)) & + writel(ctrl2, ioaddr + GMAC_RXQ_CTRL2); + writel(ctrl3, ioaddr + GMAC_RXQ_CTRL3); + } else { + queue -= 4; + + ctrl3 |= (prio << GMAC_RXQCTRL_PSRQX_SHIFT(queue)) & GMAC_RXQCTRL_PSRQX_MASK(queue); - writel(value, ioaddr + base_register); + + writel(ctrl3, ioaddr + GMAC_RXQ_CTRL3); + writel(ctrl2, ioaddr + GMAC_RXQ_CTRL2); + } } static void dwmac4_tx_queue_priority(struct mac_device_info *hw, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 1af2f89a0504..e841e312077e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -105,17 +105,41 @@ static void dwxgmac2_rx_queue_prio(struct mac_device_info *hw, u32 prio, u32 queue) { void __iomem *ioaddr = hw->pcsr; - u32 value, reg; + u32 clear_mask = 0; + u32 ctrl2, ctrl3; + int i; - reg = (queue < 4) ? XGMAC_RXQ_CTRL2 : XGMAC_RXQ_CTRL3; - if (queue >= 4) + ctrl2 = readl(ioaddr + XGMAC_RXQ_CTRL2); + ctrl3 = readl(ioaddr + XGMAC_RXQ_CTRL3); + + /* The software must ensure that the same priority + * is not mapped to multiple Rx queues + */ + for (i = 0; i < 4; i++) + clear_mask |= ((prio << XGMAC_PSRQ_SHIFT(i)) & + XGMAC_PSRQ(i)); + + ctrl2 &= ~clear_mask; + ctrl3 &= ~clear_mask; + + /* First assign new priorities to a queue, then + * clear them from others queues + */ + if (queue < 4) { + ctrl2 |= (prio << XGMAC_PSRQ_SHIFT(queue)) & + XGMAC_PSRQ(queue); + + writel(ctrl2, ioaddr + XGMAC_RXQ_CTRL2); + writel(ctrl3, ioaddr + XGMAC_RXQ_CTRL3); + } else { queue -= 4; - value = readl(ioaddr + reg); - value &= ~XGMAC_PSRQ(queue); - value |= (prio << XGMAC_PSRQ_SHIFT(queue)) & XGMAC_PSRQ(queue); + ctrl3 |= (prio << XGMAC_PSRQ_SHIFT(queue)) & + XGMAC_PSRQ(queue); - writel(value, ioaddr + reg); + writel(ctrl3, ioaddr + XGMAC_RXQ_CTRL3); + writel(ctrl2, ioaddr + XGMAC_RXQ_CTRL2); + } } static void dwxgmac2_tx_queue_prio(struct mac_device_info *hw, u32 prio, -- cgit v1.2.3 From de99e1ea3a35f23ff83a31d6b08f43d27b2c6345 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 2 Apr 2024 09:16:34 +0200 Subject: net: phy: micrel: lan8814: Fix when enabling/disabling 1-step timestamping There are 2 issues with the blamed commit. 1. When the phy is initialized, it would enable the disabled of UDPv4 checksums. The UDPv6 checksum is already enabled by default. So when 1-step is configured then it would clear these flags. 2. After the 1-step is configured, then if 2-step is configured then the 1-step would be still configured because it is not clearing the flag. So the sync frames will still have origin timestamps set. Fix this by reading first the value of the register and then just change bit 12 as this one determines if the timestamp needs to be inserted in the frame, without changing any other bits. Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy") Signed-off-by: Horatiu Vultur Reviewed-by: Divya Koppera Link: https://lore.kernel.org/r/20240402071634.2483524-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 0f8a8ad7ea0b..ddb50a0e2bc8 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -2431,6 +2431,7 @@ static int lan8814_hwtstamp(struct mii_timestamper *mii_ts, struct lan8814_ptp_rx_ts *rx_ts, *tmp; int txcfg = 0, rxcfg = 0; int pkt_ts_enable; + int tx_mod; ptp_priv->hwts_tx_type = config->tx_type; ptp_priv->rx_filter = config->rx_filter; @@ -2477,9 +2478,14 @@ static int lan8814_hwtstamp(struct mii_timestamper *mii_ts, lanphy_write_page_reg(ptp_priv->phydev, 5, PTP_RX_TIMESTAMP_EN, pkt_ts_enable); lanphy_write_page_reg(ptp_priv->phydev, 5, PTP_TX_TIMESTAMP_EN, pkt_ts_enable); - if (ptp_priv->hwts_tx_type == HWTSTAMP_TX_ONESTEP_SYNC) + tx_mod = lanphy_read_page_reg(ptp_priv->phydev, 5, PTP_TX_MOD); + if (ptp_priv->hwts_tx_type == HWTSTAMP_TX_ONESTEP_SYNC) { lanphy_write_page_reg(ptp_priv->phydev, 5, PTP_TX_MOD, - PTP_TX_MOD_TX_PTP_SYNC_TS_INSERT_); + tx_mod | PTP_TX_MOD_TX_PTP_SYNC_TS_INSERT_); + } else if (ptp_priv->hwts_tx_type == HWTSTAMP_TX_ON) { + lanphy_write_page_reg(ptp_priv->phydev, 5, PTP_TX_MOD, + tx_mod & ~PTP_TX_MOD_TX_PTP_SYNC_TS_INSERT_); + } if (config->rx_filter != HWTSTAMP_FILTER_NONE) lan8814_config_ts_intr(ptp_priv->phydev, true); -- cgit v1.2.3 From 7eb322360b0266481e560d1807ee79e0cef5742b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Apr 2024 13:41:33 +0000 Subject: net/sched: fix lockdep splat in qdisc_tree_reduce_backlog() qdisc_tree_reduce_backlog() is called with the qdisc lock held, not RTNL. We must use qdisc_lookup_rcu() instead of qdisc_lookup() syzbot reported: WARNING: suspicious RCU usage 6.1.74-syzkaller #0 Not tainted ----------------------------- net/sched/sch_api.c:305 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by udevd/1142: #0: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:306 [inline] #0: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:747 [inline] #0: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: net_tx_action+0x64a/0x970 net/core/dev.c:5282 #1: ffff888171861108 (&sch->q.lock){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:350 [inline] #1: ffff888171861108 (&sch->q.lock){+.-.}-{2:2}, at: net_tx_action+0x754/0x970 net/core/dev.c:5297 #2: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:306 [inline] #2: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:747 [inline] #2: ffffffff87c729a0 (rcu_read_lock){....}-{1:2}, at: qdisc_tree_reduce_backlog+0x84/0x580 net/sched/sch_api.c:792 stack backtrace: CPU: 1 PID: 1142 Comm: udevd Not tainted 6.1.74-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 Call Trace: [] __dump_stack lib/dump_stack.c:88 [inline] [] dump_stack_lvl+0x1b1/0x28f lib/dump_stack.c:106 [] dump_stack+0x15/0x1e lib/dump_stack.c:113 [] lockdep_rcu_suspicious+0x1b9/0x260 kernel/locking/lockdep.c:6592 [] qdisc_lookup+0xac/0x6f0 net/sched/sch_api.c:305 [] qdisc_tree_reduce_backlog+0x243/0x580 net/sched/sch_api.c:811 [] pfifo_tail_enqueue+0x32c/0x4b0 net/sched/sch_fifo.c:51 [] qdisc_enqueue include/net/sch_generic.h:833 [inline] [] netem_dequeue+0xeb3/0x15d0 net/sched/sch_netem.c:723 [] dequeue_skb net/sched/sch_generic.c:292 [inline] [] qdisc_restart net/sched/sch_generic.c:397 [inline] [] __qdisc_run+0x249/0x1e60 net/sched/sch_generic.c:415 [] qdisc_run+0xd6/0x260 include/net/pkt_sched.h:125 [] net_tx_action+0x7c9/0x970 net/core/dev.c:5313 [] __do_softirq+0x2bd/0x9bd kernel/softirq.c:616 [] invoke_softirq kernel/softirq.c:447 [inline] [] __irq_exit_rcu+0xca/0x230 kernel/softirq.c:700 [] irq_exit_rcu+0x9/0x20 kernel/softirq.c:712 [] sysvec_apic_timer_interrupt+0x42/0x90 arch/x86/kernel/apic/apic.c:1107 [] asm_sysvec_apic_timer_interrupt+0x1b/0x20 arch/x86/include/asm/idtentry.h:656 Fixes: d636fc5dd692 ("net: sched: add rcu annotations around qdisc->qdisc_sleeping") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240402134133.2352776-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 65e05b0c98e4..60239378d43f 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -809,7 +809,7 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) notify = !sch->q.qlen && !WARN_ON_ONCE(!n && !qdisc_is_offloaded); /* TODO: perform the search on a per txq basis */ - sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); + sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { WARN_ON_ONCE(parentid != TC_H_ROOT); break; -- cgit v1.2.3 From c0de6ab920aafb56feab56058e46b688e694a246 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Tue, 2 Apr 2024 12:48:36 -0700 Subject: net: mana: Fix Rx DMA datasize and skb_over_panic mana_get_rxbuf_cfg() aligns the RX buffer's DMA datasize to be multiple of 64. So a packet slightly bigger than mtu+14, say 1536, can be received and cause skb_over_panic. Sample dmesg: [ 5325.237162] skbuff: skb_over_panic: text:ffffffffc043277a len:1536 put:1536 head:ff1100018b517000 data:ff1100018b517100 tail:0x700 end:0x6ea dev: [ 5325.243689] ------------[ cut here ]------------ [ 5325.245748] kernel BUG at net/core/skbuff.c:192! [ 5325.247838] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 5325.258374] RIP: 0010:skb_panic+0x4f/0x60 [ 5325.302941] Call Trace: [ 5325.304389] [ 5325.315794] ? skb_panic+0x4f/0x60 [ 5325.317457] ? asm_exc_invalid_op+0x1f/0x30 [ 5325.319490] ? skb_panic+0x4f/0x60 [ 5325.321161] skb_put+0x4e/0x50 [ 5325.322670] mana_poll+0x6fa/0xb50 [mana] [ 5325.324578] __napi_poll+0x33/0x1e0 [ 5325.326328] net_rx_action+0x12e/0x280 As discussed internally, this alignment is not necessary. To fix this bug, remove it from the code. So oversized packets will be marked as CQE_RX_TRUNCATED by NIC, and dropped. Cc: stable@vger.kernel.org Fixes: 2fbbd712baf1 ("net: mana: Enable RX path to handle various MTU sizes") Signed-off-by: Haiyang Zhang Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1712087316-20886-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/mana_en.c | 2 +- include/net/mana/mana.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 59287c6e6cee..d8af5e7e15b4 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -601,7 +601,7 @@ static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size, *alloc_size = mtu + MANA_RXBUF_PAD + *headroom; - *datasize = ALIGN(mtu + ETH_HLEN, MANA_RX_DATA_ALIGN); + *datasize = mtu + ETH_HLEN; } static int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu) diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 76147feb0d10..4eeedf14711b 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -39,7 +39,6 @@ enum TRI_STATE { #define COMP_ENTRY_SIZE 64 #define RX_BUFFERS_PER_QUEUE 512 -#define MANA_RX_DATA_ALIGN 64 #define MAX_SEND_BUFFERS_PER_QUEUE 256 -- cgit v1.2.3 From 3137b83a90646917c90951d66489db466b4ae106 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 10:06:48 +0200 Subject: ata: sata_mv: Fix PCI device ID table declaration compilation warning Building with W=1 shows a warning for an unused variable when CONFIG_PCI is diabled: drivers/ata/sata_mv.c:790:35: error: unused variable 'mv_pci_tbl' [-Werror,-Wunused-const-variable] static const struct pci_device_id mv_pci_tbl[] = { Move the table into the same block that containsn the pci_driver definition. Fixes: 7bb3c5290ca0 ("sata_mv: Remove PCI dependency") Signed-off-by: Arnd Bergmann Signed-off-by: Damien Le Moal --- drivers/ata/sata_mv.c | 63 +++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index e82786c63fbd..9bec0aee92e0 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -787,37 +787,6 @@ static const struct ata_port_info mv_port_info[] = { }, }; -static const struct pci_device_id mv_pci_tbl[] = { - { PCI_VDEVICE(MARVELL, 0x5040), chip_504x }, - { PCI_VDEVICE(MARVELL, 0x5041), chip_504x }, - { PCI_VDEVICE(MARVELL, 0x5080), chip_5080 }, - { PCI_VDEVICE(MARVELL, 0x5081), chip_508x }, - /* RocketRAID 1720/174x have different identifiers */ - { PCI_VDEVICE(TTI, 0x1720), chip_6042 }, - { PCI_VDEVICE(TTI, 0x1740), chip_6042 }, - { PCI_VDEVICE(TTI, 0x1742), chip_6042 }, - - { PCI_VDEVICE(MARVELL, 0x6040), chip_604x }, - { PCI_VDEVICE(MARVELL, 0x6041), chip_604x }, - { PCI_VDEVICE(MARVELL, 0x6042), chip_6042 }, - { PCI_VDEVICE(MARVELL, 0x6080), chip_608x }, - { PCI_VDEVICE(MARVELL, 0x6081), chip_608x }, - - { PCI_VDEVICE(ADAPTEC2, 0x0241), chip_604x }, - - /* Adaptec 1430SA */ - { PCI_VDEVICE(ADAPTEC2, 0x0243), chip_7042 }, - - /* Marvell 7042 support */ - { PCI_VDEVICE(MARVELL, 0x7042), chip_7042 }, - - /* Highpoint RocketRAID PCIe series */ - { PCI_VDEVICE(TTI, 0x2300), chip_7042 }, - { PCI_VDEVICE(TTI, 0x2310), chip_7042 }, - - { } /* terminate list */ -}; - static const struct mv_hw_ops mv5xxx_ops = { .phy_errata = mv5_phy_errata, .enable_leds = mv5_enable_leds, @@ -4303,6 +4272,36 @@ static int mv_pci_init_one(struct pci_dev *pdev, static int mv_pci_device_resume(struct pci_dev *pdev); #endif +static const struct pci_device_id mv_pci_tbl[] = { + { PCI_VDEVICE(MARVELL, 0x5040), chip_504x }, + { PCI_VDEVICE(MARVELL, 0x5041), chip_504x }, + { PCI_VDEVICE(MARVELL, 0x5080), chip_5080 }, + { PCI_VDEVICE(MARVELL, 0x5081), chip_508x }, + /* RocketRAID 1720/174x have different identifiers */ + { PCI_VDEVICE(TTI, 0x1720), chip_6042 }, + { PCI_VDEVICE(TTI, 0x1740), chip_6042 }, + { PCI_VDEVICE(TTI, 0x1742), chip_6042 }, + + { PCI_VDEVICE(MARVELL, 0x6040), chip_604x }, + { PCI_VDEVICE(MARVELL, 0x6041), chip_604x }, + { PCI_VDEVICE(MARVELL, 0x6042), chip_6042 }, + { PCI_VDEVICE(MARVELL, 0x6080), chip_608x }, + { PCI_VDEVICE(MARVELL, 0x6081), chip_608x }, + + { PCI_VDEVICE(ADAPTEC2, 0x0241), chip_604x }, + + /* Adaptec 1430SA */ + { PCI_VDEVICE(ADAPTEC2, 0x0243), chip_7042 }, + + /* Marvell 7042 support */ + { PCI_VDEVICE(MARVELL, 0x7042), chip_7042 }, + + /* Highpoint RocketRAID PCIe series */ + { PCI_VDEVICE(TTI, 0x2300), chip_7042 }, + { PCI_VDEVICE(TTI, 0x2310), chip_7042 }, + + { } /* terminate list */ +}; static struct pci_driver mv_pci_driver = { .name = DRV_NAME, @@ -4315,6 +4314,7 @@ static struct pci_driver mv_pci_driver = { #endif }; +MODULE_DEVICE_TABLE(pci, mv_pci_tbl); /** * mv_print_info - Dump key info to kernel log for perusal. @@ -4487,7 +4487,6 @@ static void __exit mv_exit(void) MODULE_AUTHOR("Brett Russ"); MODULE_DESCRIPTION("SCSI low-level driver for Marvell SATA controllers"); MODULE_LICENSE("GPL v2"); -MODULE_DEVICE_TABLE(pci, mv_pci_tbl); MODULE_VERSION(DRV_VERSION); MODULE_ALIAS("platform:" DRV_NAME); -- cgit v1.2.3 From e85006ae7430aef780cc4f0849692e266a102ec0 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 3 Apr 2024 04:33:49 +0000 Subject: ata: sata_gemini: Check clk_enable() result The call to clk_enable() in gemini_sata_start_bridge() can fail. Add a check to detect such failure. Signed-off-by: Chen Ni Signed-off-by: Damien Le Moal --- drivers/ata/sata_gemini.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/ata/sata_gemini.c b/drivers/ata/sata_gemini.c index 400b22ee99c3..4c270999ba3c 100644 --- a/drivers/ata/sata_gemini.c +++ b/drivers/ata/sata_gemini.c @@ -200,7 +200,10 @@ int gemini_sata_start_bridge(struct sata_gemini *sg, unsigned int bridge) pclk = sg->sata0_pclk; else pclk = sg->sata1_pclk; - clk_enable(pclk); + ret = clk_enable(pclk); + if (ret) + return ret; + msleep(10); /* Do not keep clocking a bridge that is not online */ -- cgit v1.2.3 From 9852b1dc6a140365977d7bfb5fa03d413b3417ad Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 22:23:37 +0200 Subject: x86/numa/32: Include missing The __vmalloc_start_set declaration is in a header that is not included in numa_32.c in current linux-next: arch/x86/mm/numa_32.c: In function 'initmem_init': arch/x86/mm/numa_32.c:57:9: error: '__vmalloc_start_set' undeclared (first use in this function) 57 | __vmalloc_start_set = true; | ^~~~~~~~~~~~~~~~~~~ arch/x86/mm/numa_32.c:57:9: note: each undeclared identifier is reported only once for each function it appears in Add an explicit #include. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240403202344.3463169-1-arnd@kernel.org --- arch/x86/mm/numa_32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 104544359d69..025fd7ea5d69 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -24,6 +24,7 @@ #include #include +#include #include "numa_internal.h" -- cgit v1.2.3 From 7d8ed162e6a92268d4b2b84d364a931216102c8e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 2 Apr 2024 13:26:59 +0000 Subject: memblock tests: fix undefined reference to `early_pfn_to_nid' commit 6a9531c3a880 ("memblock: fix crash when reserved memory is not added to memory") introduce the usage of early_pfn_to_nid, which is not defined in memblock tests. The original definition of early_pfn_to_nid is defined in mm.h, so let add this in the corresponding mm.h. Signed-off-by: Wei Yang CC: Yajun Deng CC: Mike Rapoport Link: https://lore.kernel.org/r/20240402132701.29744-2-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- tools/include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index f3c82ab5b14c..7d73da098047 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -37,4 +37,9 @@ static inline void totalram_pages_add(long count) { } +static inline int early_pfn_to_nid(unsigned long pfn) +{ + return 0; +} + #endif -- cgit v1.2.3 From e0f5a8e74be88f2476e58b25d3b49a9521bdc4ec Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 2 Apr 2024 13:27:00 +0000 Subject: memblock tests: fix undefined reference to `panic' commit e96c6b8f212a ("memblock: report failures when memblock_can_resize is not set") introduced the usage of panic, which is not defined in memblock test. Let's define it directly in panic.h to fix it. Signed-off-by: Wei Yang CC: Song Shuai CC: Mike Rapoport Link: https://lore.kernel.org/r/20240402132701.29744-3-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- tools/include/linux/kernel.h | 1 + tools/include/linux/panic.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 tools/include/linux/panic.h diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index 4b0673bf52c2..07cfad817d53 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/tools/include/linux/panic.h b/tools/include/linux/panic.h new file mode 100644 index 000000000000..9c8f17a41ce8 --- /dev/null +++ b/tools/include/linux/panic.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_PANIC_H +#define _TOOLS_LINUX_PANIC_H + +#include +#include +#include + +static inline void panic(const char *fmt, ...) +{ + va_list argp; + + va_start(argp, fmt); + vfprintf(stderr, fmt, argp); + va_end(argp); + exit(-1); +} + +#endif -- cgit v1.2.3 From 592447f6cb3c20d606d6c5d8e6af68e99707b786 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 2 Apr 2024 13:27:01 +0000 Subject: memblock tests: fix undefined reference to `BIT' commit 772dd0342727 ("mm: enumerate all gfp flags") define gfp flags with the help of BIT, while gfp_types.h doesn't include header file for the definition. This through an error on building memblock tests. Let's include linux/bits.h to fix it. Signed-off-by: Wei Yang CC: Suren Baghdasaryan CC: Michal Hocko Link: https://lore.kernel.org/r/20240402132701.29744-4-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- include/linux/gfp_types.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 868c8fb1bbc1..13becafe41df 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -2,6 +2,8 @@ #ifndef __LINUX_GFP_TYPES_H #define __LINUX_GFP_TYPES_H +#include + /* The typedef is in types.h but we want the documentation here */ #if 0 /** -- cgit v1.2.3 From 9ab4ad295622a3481818856762471c1f8c830e18 Mon Sep 17 00:00:00 2001 From: Nikita Kiryushin Date: Mon, 1 Apr 2024 22:14:18 +0300 Subject: tg3: Remove residual error handling in tg3_suspend As of now, tg3_power_down_prepare always ends with success, but the error handling code from former tg3_set_power_state call is still here. This code became unreachable in commit c866b7eac073 ("tg3: Do not use legacy PCI power management"). Remove (now unreachable) error handling code for simplification and change tg3_power_down_prepare to a void function as its result is no more checked. Signed-off-by: Nikita Kiryushin Reviewed-by: Michael Chan Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240401191418.361747-1-kiryushin@ancud.ru Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/tg3.c | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 62ff4381ac83..e6ff3c9bd7e5 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -4019,7 +4019,7 @@ static int tg3_power_up(struct tg3 *tp) static int tg3_setup_phy(struct tg3 *, bool); -static int tg3_power_down_prepare(struct tg3 *tp) +static void tg3_power_down_prepare(struct tg3 *tp) { u32 misc_host_ctrl; bool device_should_wake, do_low_power; @@ -4263,7 +4263,7 @@ static int tg3_power_down_prepare(struct tg3 *tp) tg3_ape_driver_state_change(tp, RESET_KIND_SHUTDOWN); - return 0; + return; } static void tg3_power_down(struct tg3 *tp) @@ -18084,7 +18084,6 @@ static int tg3_suspend(struct device *device) { struct net_device *dev = dev_get_drvdata(device); struct tg3 *tp = netdev_priv(dev); - int err = 0; rtnl_lock(); @@ -18108,32 +18107,11 @@ static int tg3_suspend(struct device *device) tg3_flag_clear(tp, INIT_COMPLETE); tg3_full_unlock(tp); - err = tg3_power_down_prepare(tp); - if (err) { - int err2; - - tg3_full_lock(tp, 0); - - tg3_flag_set(tp, INIT_COMPLETE); - err2 = tg3_restart_hw(tp, true); - if (err2) - goto out; - - tg3_timer_start(tp); - - netif_device_attach(dev); - tg3_netif_start(tp); - -out: - tg3_full_unlock(tp); - - if (!err2) - tg3_phy_start(tp); - } + tg3_power_down_prepare(tp); unlock: rtnl_unlock(); - return err; + return 0; } static int tg3_resume(struct device *device) -- cgit v1.2.3 From 99485c4c026f024e7cb82da84c7951dbe3deb584 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 26 Mar 2024 17:07:35 +0100 Subject: x86/coco: Require seeding RNG with RDRAND on CoCo systems There are few uses of CoCo that don't rely on working cryptography and hence a working RNG. Unfortunately, the CoCo threat model means that the VM host cannot be trusted and may actively work against guests to extract secrets or manipulate computation. Since a malicious host can modify or observe nearly all inputs to guests, the only remaining source of entropy for CoCo guests is RDRAND. If RDRAND is broken -- due to CPU hardware fault -- the RNG as a whole is meant to gracefully continue on gathering entropy from other sources, but since there aren't other sources on CoCo, this is catastrophic. This is mostly a concern at boot time when initially seeding the RNG, as after that the consequences of a broken RDRAND are much more theoretical. So, try at boot to seed the RNG using 256 bits of RDRAND output. If this fails, panic(). This will also trigger if the system is booted without RDRAND, as RDRAND is essential for a safe CoCo boot. Add this deliberately to be "just a CoCo x86 driver feature" and not part of the RNG itself. Many device drivers and platforms have some desire to contribute something to the RNG, and add_device_randomness() is specifically meant for this purpose. Any driver can call it with seed data of any quality, or even garbage quality, and it can only possibly make the quality of the RNG better or have no effect, but can never make it worse. Rather than trying to build something into the core of the RNG, consider the particular CoCo issue just a CoCo issue, and therefore separate it all out into driver (well, arch/platform) code. [ bp: Massage commit message. ] Signed-off-by: Jason A. Donenfeld Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Elena Reshetova Reviewed-by: Kirill A. Shutemov Reviewed-by: Theodore Ts'o Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240326160735.73531-1-Jason@zx2c4.com --- arch/x86/coco/core.c | 41 +++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/coco.h | 2 ++ arch/x86/kernel/setup.c | 2 ++ 3 files changed, 45 insertions(+) diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index d07be9d05cd0..ddd4efdc79d6 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -3,13 +3,17 @@ * Confidential Computing Platform Capability checks * * Copyright (C) 2021 Advanced Micro Devices, Inc. + * Copyright (C) 2024 Jason A. Donenfeld . All Rights Reserved. * * Author: Tom Lendacky */ #include #include +#include +#include +#include #include #include @@ -148,3 +152,40 @@ u64 cc_mkdec(u64 val) } } EXPORT_SYMBOL_GPL(cc_mkdec); + +__init void cc_random_init(void) +{ + /* + * The seed is 32 bytes (in units of longs), which is 256 bits, which + * is the security level that the RNG is targeting. + */ + unsigned long rng_seed[32 / sizeof(long)]; + size_t i, longs; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return; + + /* + * Since the CoCo threat model includes the host, the only reliable + * source of entropy that can be neither observed nor manipulated is + * RDRAND. Usually, RDRAND failure is considered tolerable, but since + * CoCo guests have no other unobservable source of entropy, it's + * important to at least ensure the RNG gets some initial random seeds. + */ + for (i = 0; i < ARRAY_SIZE(rng_seed); i += longs) { + longs = arch_get_random_longs(&rng_seed[i], ARRAY_SIZE(rng_seed) - i); + + /* + * A zero return value means that the guest doesn't have RDRAND + * or the CPU is physically broken, and in both cases that + * means most crypto inside of the CoCo instance will be + * broken, defeating the purpose of CoCo in the first place. So + * just panic here because it's absolutely unsafe to continue + * executing. + */ + if (longs == 0) + panic("RDRAND is defective."); + } + add_device_randomness(rng_seed, sizeof(rng_seed)); + memzero_explicit(rng_seed, sizeof(rng_seed)); +} diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index fb7388bbc212..c086699b0d0c 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -22,6 +22,7 @@ static inline void cc_set_mask(u64 mask) u64 cc_mkenc(u64 val); u64 cc_mkdec(u64 val); +void cc_random_init(void); #else #define cc_vendor (CC_VENDOR_NONE) @@ -34,6 +35,7 @@ static inline u64 cc_mkdec(u64 val) { return val; } +static inline void cc_random_init(void) { } #endif #endif /* _ASM_X86_COCO_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0109e6c510e0..e125e059e2c4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -991,6 +992,7 @@ void __init setup_arch(char **cmdline_p) * memory size. */ mem_encrypt_setup_arch(); + cc_random_init(); efi_fake_memmap(); efi_find_mirror(); -- cgit v1.2.3 From 54f5f47b6055c6b57cbc41a440f8ca8b2ec4275a Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 27 Mar 2024 16:43:15 +0100 Subject: x86/kvm/Kconfig: Have KVM_AMD_SEV select ARCH_HAS_CC_PLATFORM The functionality to load SEV-SNP guests by the host will soon rely on cc_platform* helpers because the cpu_feature* API with the early patching is insufficient when SNP support needs to be disabled late. Therefore, pull that functionality in. Fixes: 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Srikanth Aithal Link: https://lore.kernel.org/r/20240327154317.29909-4-bp@alien8.de --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3aaf7e86a859..0ebdd088f28b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -122,6 +122,7 @@ config KVM_AMD_SEV default y depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) + select ARCH_HAS_CC_PLATFORM help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. -- cgit v1.2.3 From bc6f707fc0feec72acc2f49c312eb31d257363a3 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 27 Mar 2024 16:43:16 +0100 Subject: x86/cc: Add cc_platform_set/_clear() helpers Add functionality to set and/or clear different attributes of the machine as a confidential computing platform. Add the first one too: whether the machine is running as a host for SEV-SNP guests. Fixes: 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Srikanth Aithal Link: https://lore.kernel.org/r/20240327154317.29909-5-bp@alien8.de --- arch/x86/coco/core.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/cc_platform.h | 12 +++++++++++ 2 files changed, 64 insertions(+) diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index ddd4efdc79d6..b31ef2424d19 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -20,6 +20,11 @@ enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE; u64 cc_mask __ro_after_init; +static struct cc_attr_flags { + __u64 host_sev_snp : 1, + __resv : 63; +} cc_flags; + static bool noinstr intel_cc_platform_has(enum cc_attr attr) { switch (attr) { @@ -93,6 +98,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_GUEST_SEV_SNP: return sev_status & MSR_AMD64_SEV_SNP_ENABLED; + case CC_ATTR_HOST_SEV_SNP: + return cc_flags.host_sev_snp; + default: return false; } @@ -153,6 +161,50 @@ u64 cc_mkdec(u64 val) } EXPORT_SYMBOL_GPL(cc_mkdec); +static void amd_cc_platform_clear(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_HOST_SEV_SNP: + cc_flags.host_sev_snp = 0; + break; + default: + break; + } +} + +void cc_platform_clear(enum cc_attr attr) +{ + switch (cc_vendor) { + case CC_VENDOR_AMD: + amd_cc_platform_clear(attr); + break; + default: + break; + } +} + +static void amd_cc_platform_set(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_HOST_SEV_SNP: + cc_flags.host_sev_snp = 1; + break; + default: + break; + } +} + +void cc_platform_set(enum cc_attr attr) +{ + switch (cc_vendor) { + case CC_VENDOR_AMD: + amd_cc_platform_set(attr); + break; + default: + break; + } +} + __init void cc_random_init(void) { /* diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index cb0d6cd1c12f..60693a145894 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -90,6 +90,14 @@ enum cc_attr { * Examples include TDX Guest. */ CC_ATTR_HOTPLUG_DISABLED, + + /** + * @CC_ATTR_HOST_SEV_SNP: AMD SNP enabled on the host. + * + * The host kernel is running with the necessary features + * enabled to run SEV-SNP guests. + */ + CC_ATTR_HOST_SEV_SNP, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM @@ -107,10 +115,14 @@ enum cc_attr { * * FALSE - Specified Confidential Computing attribute is not active */ bool cc_platform_has(enum cc_attr attr); +void cc_platform_set(enum cc_attr attr); +void cc_platform_clear(enum cc_attr attr); #else /* !CONFIG_ARCH_HAS_CC_PLATFORM */ static inline bool cc_platform_has(enum cc_attr attr) { return false; } +static inline void cc_platform_set(enum cc_attr attr) { } +static inline void cc_platform_clear(enum cc_attr attr) { } #endif /* CONFIG_ARCH_HAS_CC_PLATFORM */ -- cgit v1.2.3 From 0ecaefb303de69929dc0036d5021d01cec7ea052 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 27 Mar 2024 16:43:17 +0100 Subject: x86/CPU/AMD: Track SNP host status with cc_platform_*() The host SNP worthiness can determined later, after alternatives have been patched, in snp_rmptable_init() depending on cmdline options like iommu=pt which is incompatible with SNP, for example. Which means that one cannot use X86_FEATURE_SEV_SNP and will need to have a special flag for that control. Use that newly added CC_ATTR_HOST_SEV_SNP in the appropriate places. Move kdump_sev_callback() to its rightful place, while at it. Fixes: 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Srikanth Aithal Link: https://lore.kernel.org/r/20240327154317.29909-6-bp@alien8.de --- arch/x86/include/asm/sev.h | 4 ++-- arch/x86/kernel/cpu/amd.c | 38 +++++++++++++++++++++++--------------- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- arch/x86/kernel/sev.c | 10 ---------- arch/x86/kvm/svm/sev.c | 2 +- arch/x86/virt/svm/sev.c | 26 ++++++++++++++++++-------- drivers/crypto/ccp/sev-dev.c | 2 +- drivers/iommu/amd/init.c | 4 +++- 8 files changed, 49 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 07e125f32528..7f57382afee4 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -228,7 +228,6 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct sn void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); -void kdump_sev_callback(void); void sev_show_status(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } @@ -258,7 +257,6 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } -static inline void kdump_sev_callback(void) { } static inline void sev_show_status(void) { } #endif @@ -270,6 +268,7 @@ int psmash(u64 pfn); int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable); int rmp_make_shared(u64 pfn, enum pg_level level); void snp_leak_pages(u64 pfn, unsigned int npages); +void kdump_sev_callback(void); #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } @@ -282,6 +281,7 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as } static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} +static inline void kdump_sev_callback(void) { } #endif #endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6d8677e80ddb..9bf17c9c29da 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -345,6 +345,28 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } +static void bsp_determine_snp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_ARCH_HAS_CC_PLATFORM + cc_vendor = CC_VENDOR_AMD; + + if (cpu_has(c, X86_FEATURE_SEV_SNP)) { + /* + * RMP table entry format is not architectural and is defined by the + * per-processor PPR. Restrict SNP support on the known CPU models + * for which the RMP table entry format is currently defined for. + */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + c->x86 >= 0x19 && snp_probe_rmptable_info()) { + cc_platform_set(CC_ATTR_HOST_SEV_SNP); + } else { + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); + } + } +#endif +} + static void bsp_init_amd(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -452,21 +474,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) break; } - if (cpu_has(c, X86_FEATURE_SEV_SNP)) { - /* - * RMP table entry format is not architectural and it can vary by processor - * and is defined by the per-processor PPR. Restrict SNP support on the - * known CPU model and family for which the RMP table entry format is - * currently defined for. - */ - if (!boot_cpu_has(X86_FEATURE_ZEN3) && - !boot_cpu_has(X86_FEATURE_ZEN4) && - !boot_cpu_has(X86_FEATURE_ZEN5)) - setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); - else if (!snp_probe_rmptable_info()) - setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); - } - + bsp_determine_snp(c); return; warn: diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 422a4ddc2ab7..7b29ebda024f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -108,7 +108,7 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; - if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return; rdmsr(MSR_AMD64_SYSCFG, lo, hi); diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 7e1e63cc48e6..38ad066179d8 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -2284,16 +2284,6 @@ static int __init snp_init_platform_device(void) } device_initcall(snp_init_platform_device); -void kdump_sev_callback(void) -{ - /* - * Do wbinvd() on remote CPUs when SNP is enabled in order to - * safely do SNP_SHUTDOWN on the local CPU. - */ - if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) - wbinvd(); -} - void sev_show_status(void) { int i; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ae0ac12382b9..3d310b473e05 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3174,7 +3174,7 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) unsigned long pfn; struct page *p; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); /* diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index cffe1157a90a..ab0e8448bb6e 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -77,7 +77,7 @@ static int __mfd_enable(unsigned int cpu) { u64 val; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; rdmsrl(MSR_AMD64_SYSCFG, val); @@ -98,7 +98,7 @@ static int __snp_enable(unsigned int cpu) { u64 val; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; rdmsrl(MSR_AMD64_SYSCFG, val); @@ -174,11 +174,11 @@ static int __init snp_rmptable_init(void) u64 rmptable_size; u64 val; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; if (!amd_iommu_snp_en) - return 0; + goto nosnp; if (!probed_rmp_size) goto nosnp; @@ -225,7 +225,7 @@ skip_enable: return 0; nosnp: - setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); return -ENOSYS; } @@ -246,7 +246,7 @@ static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level) { struct rmpentry *large_entry, *entry; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return ERR_PTR(-ENODEV); entry = get_rmpentry(pfn); @@ -363,7 +363,7 @@ int psmash(u64 pfn) unsigned long paddr = pfn << PAGE_SHIFT; int ret; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return -ENODEV; if (!pfn_valid(pfn)) @@ -472,7 +472,7 @@ static int rmpupdate(u64 pfn, struct rmp_state *state) unsigned long paddr = pfn << PAGE_SHIFT; int ret, level; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return -ENODEV; level = RMP_TO_PG_LEVEL(state->pagesize); @@ -558,3 +558,13 @@ void snp_leak_pages(u64 pfn, unsigned int npages) spin_unlock(&snp_leaked_pages_list_lock); } EXPORT_SYMBOL_GPL(snp_leak_pages); + +void kdump_sev_callback(void) +{ + /* + * Do wbinvd() on remote CPUs when SNP is enabled in order to + * safely do SNP_SHUTDOWN on the local CPU. + */ + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + wbinvd(); +} diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index f44efbb89c34..2102377f727b 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -1090,7 +1090,7 @@ static int __sev_snp_init_locked(int *error) void *arg = &data; int cmd, rc = 0; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return -ENODEV; sev = psp->sev_data; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index e7a44929f0da..33228c1c8980 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3228,7 +3228,7 @@ out: static void iommu_snp_enable(void) { #ifdef CONFIG_KVM_AMD_SEV - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return; /* * The SNP support requires that IOMMU must be enabled, and is @@ -3236,12 +3236,14 @@ static void iommu_snp_enable(void) */ if (no_iommu || iommu_default_passthrough()) { pr_err("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); return; } amd_iommu_snp_en = check_feature(FEATURE_SNP); if (!amd_iommu_snp_en) { pr_err("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); return; } -- cgit v1.2.3 From 72076fc9fe60b9143cd971fd8737718719bc512e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 4 Apr 2024 10:51:01 +0200 Subject: Revert "tg3: Remove residual error handling in tg3_suspend" This reverts commit 9ab4ad295622a3481818856762471c1f8c830e18. I went out of coffee and applied it to the wrong tree. Blame on me. Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/tg3.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index e6ff3c9bd7e5..62ff4381ac83 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -4019,7 +4019,7 @@ static int tg3_power_up(struct tg3 *tp) static int tg3_setup_phy(struct tg3 *, bool); -static void tg3_power_down_prepare(struct tg3 *tp) +static int tg3_power_down_prepare(struct tg3 *tp) { u32 misc_host_ctrl; bool device_should_wake, do_low_power; @@ -4263,7 +4263,7 @@ static void tg3_power_down_prepare(struct tg3 *tp) tg3_ape_driver_state_change(tp, RESET_KIND_SHUTDOWN); - return; + return 0; } static void tg3_power_down(struct tg3 *tp) @@ -18084,6 +18084,7 @@ static int tg3_suspend(struct device *device) { struct net_device *dev = dev_get_drvdata(device); struct tg3 *tp = netdev_priv(dev); + int err = 0; rtnl_lock(); @@ -18107,11 +18108,32 @@ static int tg3_suspend(struct device *device) tg3_flag_clear(tp, INIT_COMPLETE); tg3_full_unlock(tp); - tg3_power_down_prepare(tp); + err = tg3_power_down_prepare(tp); + if (err) { + int err2; + + tg3_full_lock(tp, 0); + + tg3_flag_set(tp, INIT_COMPLETE); + err2 = tg3_restart_hw(tp, true); + if (err2) + goto out; + + tg3_timer_start(tp); + + netif_device_attach(dev); + tg3_netif_start(tp); + +out: + tg3_full_unlock(tp); + + if (!err2) + tg3_phy_start(tp); + } unlock: rtnl_unlock(); - return 0; + return err; } static int tg3_resume(struct device *device) -- cgit v1.2.3 From a45e6889575c2067d3c0212b6bc1022891e65b91 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 28 Mar 2024 13:27:36 +0100 Subject: netfilter: nf_tables: release batch on table validation from abort path Unlike early commit path stage which triggers a call to abort, an explicit release of the batch is required on abort, otherwise mutex is released and commit_list remains in place. Add WARN_ON_ONCE to ensure commit_list is empty from the abort path before releasing the mutex. After this patch, commit_list is always assumed to be empty before grabbing the mutex, therefore 03c1f1ef1584 ("netfilter: Cleanup nft_net->module_list from nf_tables_exit_net()") only needs to release the pending modules for registration. Cc: stable@vger.kernel.org Fixes: c0391b6ab810 ("netfilter: nf_tables: missing validation from the abort path") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd86f2720c9e..ffcd3213c335 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10455,10 +10455,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) struct nft_trans *trans, *next; LIST_HEAD(set_update_list); struct nft_trans_elem *te; + int err = 0; if (action == NFNL_ABORT_VALIDATE && nf_tables_validate(net) < 0) - return -EAGAIN; + err = -EAGAIN; list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { @@ -10655,7 +10656,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) else nf_tables_module_autoload_cleanup(net); - return 0; + return err; } static int nf_tables_abort(struct net *net, struct sk_buff *skb, @@ -10668,6 +10669,9 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, gc_seq = nft_gc_seq_begin(nft_net); ret = __nf_tables_abort(net, action); nft_gc_seq_end(nft_net, gc_seq); + + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + mutex_unlock(&nft_net->commit_mutex); return ret; @@ -11473,9 +11477,10 @@ static void __net_exit nf_tables_exit_net(struct net *net) gc_seq = nft_gc_seq_begin(nft_net); - if (!list_empty(&nft_net->commit_list) || - !list_empty(&nft_net->module_list)) - __nf_tables_abort(net, NFNL_ABORT_NONE); + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + + if (!list_empty(&nft_net->module_list)) + nf_tables_module_autoload_cleanup(net); __nft_release_tables(net); -- cgit v1.2.3 From 0d459e2ffb541841714839e8228b845458ed3b27 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 28 Mar 2024 14:23:55 +0100 Subject: netfilter: nf_tables: release mutex after nft_gc_seq_end from abort path The commit mutex should not be released during the critical section between nft_gc_seq_begin() and nft_gc_seq_end(), otherwise, async GC worker could collect expired objects and get the released commit lock within the same GC sequence. nf_tables_module_autoload() temporarily releases the mutex to load module dependencies, then it goes back to replay the transaction again. Move it at the end of the abort phase after nft_gc_seq_end() is called. Cc: stable@vger.kernel.org Fixes: 720344340fb9 ("netfilter: nf_tables: GC transaction race with abort path") Reported-by: Kuan-Ting Chen Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ffcd3213c335..0d432d0674e1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10651,11 +10651,6 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nf_tables_abort_release(trans); } - if (action == NFNL_ABORT_AUTOLOAD) - nf_tables_module_autoload(net); - else - nf_tables_module_autoload_cleanup(net); - return err; } @@ -10672,6 +10667,14 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + /* module autoload needs to happen after GC sequence update because it + * temporarily releases and grabs mutex again. + */ + if (action == NFNL_ABORT_AUTOLOAD) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + mutex_unlock(&nft_net->commit_mutex); return ret; -- cgit v1.2.3 From 24cea9677025e0de419989ecb692acd4bb34cac2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 2 Apr 2024 18:04:36 +0200 Subject: netfilter: nf_tables: flush pending destroy work before exit_net release Similar to 2c9f0293280e ("netfilter: nf_tables: flush pending destroy work before netlink notifier") to address a race between exit_net and the destroy workqueue. The trace below shows an element to be released via destroy workqueue while exit_net path (triggered via module removal) has already released the set that is used in such transaction. [ 1360.547789] BUG: KASAN: slab-use-after-free in nf_tables_trans_destroy_work+0x3f5/0x590 [nf_tables] [ 1360.547861] Read of size 8 at addr ffff888140500cc0 by task kworker/4:1/152465 [ 1360.547870] CPU: 4 PID: 152465 Comm: kworker/4:1 Not tainted 6.8.0+ #359 [ 1360.547882] Workqueue: events nf_tables_trans_destroy_work [nf_tables] [ 1360.547984] Call Trace: [ 1360.547991] [ 1360.547998] dump_stack_lvl+0x53/0x70 [ 1360.548014] print_report+0xc4/0x610 [ 1360.548026] ? __virt_addr_valid+0xba/0x160 [ 1360.548040] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 1360.548054] ? nf_tables_trans_destroy_work+0x3f5/0x590 [nf_tables] [ 1360.548176] kasan_report+0xae/0xe0 [ 1360.548189] ? nf_tables_trans_destroy_work+0x3f5/0x590 [nf_tables] [ 1360.548312] nf_tables_trans_destroy_work+0x3f5/0x590 [nf_tables] [ 1360.548447] ? __pfx_nf_tables_trans_destroy_work+0x10/0x10 [nf_tables] [ 1360.548577] ? _raw_spin_unlock_irq+0x18/0x30 [ 1360.548591] process_one_work+0x2f1/0x670 [ 1360.548610] worker_thread+0x4d3/0x760 [ 1360.548627] ? __pfx_worker_thread+0x10/0x10 [ 1360.548640] kthread+0x16b/0x1b0 [ 1360.548653] ? __pfx_kthread+0x10/0x10 [ 1360.548665] ret_from_fork+0x2f/0x50 [ 1360.548679] ? __pfx_kthread+0x10/0x10 [ 1360.548690] ret_from_fork_asm+0x1a/0x30 [ 1360.548707] [ 1360.548719] Allocated by task 192061: [ 1360.548726] kasan_save_stack+0x20/0x40 [ 1360.548739] kasan_save_track+0x14/0x30 [ 1360.548750] __kasan_kmalloc+0x8f/0xa0 [ 1360.548760] __kmalloc_node+0x1f1/0x450 [ 1360.548771] nf_tables_newset+0x10c7/0x1b50 [nf_tables] [ 1360.548883] nfnetlink_rcv_batch+0xbc4/0xdc0 [nfnetlink] [ 1360.548909] nfnetlink_rcv+0x1a8/0x1e0 [nfnetlink] [ 1360.548927] netlink_unicast+0x367/0x4f0 [ 1360.548935] netlink_sendmsg+0x34b/0x610 [ 1360.548944] ____sys_sendmsg+0x4d4/0x510 [ 1360.548953] ___sys_sendmsg+0xc9/0x120 [ 1360.548961] __sys_sendmsg+0xbe/0x140 [ 1360.548971] do_syscall_64+0x55/0x120 [ 1360.548982] entry_SYSCALL_64_after_hwframe+0x55/0x5d [ 1360.548994] Freed by task 192222: [ 1360.548999] kasan_save_stack+0x20/0x40 [ 1360.549009] kasan_save_track+0x14/0x30 [ 1360.549019] kasan_save_free_info+0x3b/0x60 [ 1360.549028] poison_slab_object+0x100/0x180 [ 1360.549036] __kasan_slab_free+0x14/0x30 [ 1360.549042] kfree+0xb6/0x260 [ 1360.549049] __nft_release_table+0x473/0x6a0 [nf_tables] [ 1360.549131] nf_tables_exit_net+0x170/0x240 [nf_tables] [ 1360.549221] ops_exit_list+0x50/0xa0 [ 1360.549229] free_exit_list+0x101/0x140 [ 1360.549236] unregister_pernet_operations+0x107/0x160 [ 1360.549245] unregister_pernet_subsys+0x1c/0x30 [ 1360.549254] nf_tables_module_exit+0x43/0x80 [nf_tables] [ 1360.549345] __do_sys_delete_module+0x253/0x370 [ 1360.549352] do_syscall_64+0x55/0x120 [ 1360.549360] entry_SYSCALL_64_after_hwframe+0x55/0x5d (gdb) list *__nft_release_table+0x473 0x1e033 is in __nft_release_table (net/netfilter/nf_tables_api.c:11354). 11349 list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { 11350 list_del(&flowtable->list); 11351 nft_use_dec(&table->use); 11352 nf_tables_flowtable_destroy(flowtable); 11353 } 11354 list_for_each_entry_safe(set, ns, &table->sets, list) { 11355 list_del(&set->list); 11356 nft_use_dec(&table->use); 11357 if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) 11358 nft_map_deactivate(&ctx, set); (gdb) [ 1360.549372] Last potentially related work creation: [ 1360.549376] kasan_save_stack+0x20/0x40 [ 1360.549384] __kasan_record_aux_stack+0x9b/0xb0 [ 1360.549392] __queue_work+0x3fb/0x780 [ 1360.549399] queue_work_on+0x4f/0x60 [ 1360.549407] nft_rhash_remove+0x33b/0x340 [nf_tables] [ 1360.549516] nf_tables_commit+0x1c6a/0x2620 [nf_tables] [ 1360.549625] nfnetlink_rcv_batch+0x728/0xdc0 [nfnetlink] [ 1360.549647] nfnetlink_rcv+0x1a8/0x1e0 [nfnetlink] [ 1360.549671] netlink_unicast+0x367/0x4f0 [ 1360.549680] netlink_sendmsg+0x34b/0x610 [ 1360.549690] ____sys_sendmsg+0x4d4/0x510 [ 1360.549697] ___sys_sendmsg+0xc9/0x120 [ 1360.549706] __sys_sendmsg+0xbe/0x140 [ 1360.549715] do_syscall_64+0x55/0x120 [ 1360.549725] entry_SYSCALL_64_after_hwframe+0x55/0x5d Fixes: 0935d5588400 ("netfilter: nf_tables: asynchronous release") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0d432d0674e1..17bf53cd0e84 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -11575,6 +11575,7 @@ static void __exit nf_tables_module_exit(void) unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); nft_chain_route_fini(); + nf_tables_trans_destroy_flush_work(); unregister_pernet_subsys(&nf_tables_net_ops); cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); -- cgit v1.2.3 From 994209ddf4f430946f6247616b2e33d179243769 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 1 Apr 2024 00:33:02 +0200 Subject: netfilter: nf_tables: reject new basechain after table flag update When dormant flag is toggled, hooks are disabled in the commit phase by iterating over current chains in table (existing and new). The following configuration allows for an inconsistent state: add table x add chain x y { type filter hook input priority 0; } add table x { flags dormant; } add chain x w { type filter hook input priority 1; } which triggers the following warning when trying to unregister chain w which is already unregistered. [ 127.322252] WARNING: CPU: 7 PID: 1211 at net/netfilter/core.c:50 1 __nf_unregister_net_hook+0x21a/0x260 [...] [ 127.322519] Call Trace: [ 127.322521] [ 127.322524] ? __warn+0x9f/0x1a0 [ 127.322531] ? __nf_unregister_net_hook+0x21a/0x260 [ 127.322537] ? report_bug+0x1b1/0x1e0 [ 127.322545] ? handle_bug+0x3c/0x70 [ 127.322552] ? exc_invalid_op+0x17/0x40 [ 127.322556] ? asm_exc_invalid_op+0x1a/0x20 [ 127.322563] ? kasan_save_free_info+0x3b/0x60 [ 127.322570] ? __nf_unregister_net_hook+0x6a/0x260 [ 127.322577] ? __nf_unregister_net_hook+0x21a/0x260 [ 127.322583] ? __nf_unregister_net_hook+0x6a/0x260 [ 127.322590] ? __nf_tables_unregister_hook+0x8a/0xe0 [nf_tables] [ 127.322655] nft_table_disable+0x75/0xf0 [nf_tables] [ 127.322717] nf_tables_commit+0x2571/0x2620 [nf_tables] Fixes: 179d9ba5559a ("netfilter: nf_tables: fix table flag updates") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 17bf53cd0e84..1eb51bf24fc2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2449,6 +2449,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_stats __percpu *stats = NULL; struct nft_chain_hook hook = {}; + if (table->flags & __NFT_TABLE_F_UPDATE) + return -EINVAL; + if (flags & NFT_CHAIN_BINDING) return -EOPNOTSUPP; -- cgit v1.2.3 From 24225011d81b471acc0e1e315b7d9905459a6304 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Wed, 3 Apr 2024 15:22:04 +0800 Subject: netfilter: nf_tables: Fix potential data-race in __nft_flowtable_type_get() nft_unregister_flowtable_type() within nf_flow_inet_module_exit() can concurrent with __nft_flowtable_type_get() within nf_tables_newflowtable(). And thhere is not any protection when iterate over nf_tables_flowtables list in __nft_flowtable_type_get(). Therefore, there is pertential data-race of nf_tables_flowtables list entry. Use list_for_each_entry_rcu() to iterate over nf_tables_flowtables list in __nft_flowtable_type_get(), and use rcu_read_lock() in the caller nft_flowtable_type_get() to protect the entire type query process. Fixes: 3b49e2e94e6e ("netfilter: nf_tables: add flow table netlink frontend") Signed-off-by: Ziyang Xuan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1eb51bf24fc2..e02d0ae4f436 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8296,11 +8296,12 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, return err; } +/* call under rcu_read_lock */ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) { const struct nf_flowtable_type *type; - list_for_each_entry(type, &nf_tables_flowtables, list) { + list_for_each_entry_rcu(type, &nf_tables_flowtables, list) { if (family == type->family) return type; } @@ -8312,9 +8313,13 @@ nft_flowtable_type_get(struct net *net, u8 family) { const struct nf_flowtable_type *type; + rcu_read_lock(); type = __nft_flowtable_type_get(family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES -- cgit v1.2.3 From 1bc83a019bbe268be3526406245ec28c2458a518 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 3 Apr 2024 19:35:30 +0200 Subject: netfilter: nf_tables: discard table flag update with pending basechain deletion Hook unregistration is deferred to the commit phase, same occurs with hook updates triggered by the table dormant flag. When both commands are combined, this results in deleting a basechain while leaving its hook still registered in the core. Fixes: 179d9ba5559a ("netfilter: nf_tables: fix table flag updates") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e02d0ae4f436..d89d77946719 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1209,10 +1209,11 @@ static bool nft_table_pending_update(const struct nft_ctx *ctx) return true; list_for_each_entry(trans, &nft_net->commit_list, list) { - if ((trans->msg_type == NFT_MSG_NEWCHAIN || - trans->msg_type == NFT_MSG_DELCHAIN) && - trans->ctx.table == ctx->table && - nft_trans_chain_update(trans)) + if (trans->ctx.table == ctx->table && + ((trans->msg_type == NFT_MSG_NEWCHAIN && + nft_trans_chain_update(trans)) || + (trans->msg_type == NFT_MSG_DELCHAIN && + nft_is_base_chain(trans->ctx.chain)))) return true; } -- cgit v1.2.3 From 596a4254915f94c927217fe09c33a6828f33fb25 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 2 Apr 2024 15:53:04 +0100 Subject: net: ravb: Always process TX descriptor ring The TX queue should be serviced each time the poll function is called, even if the full RX work budget has been consumed. This prevents starvation of the TX queue when RX bandwidth usage is high. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20240402145305.82148-1-paul.barker.ct@bp.renesas.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index d1be030c8848..48803050abdb 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1324,12 +1324,12 @@ static int ravb_poll(struct napi_struct *napi, int budget) int q = napi - priv->napi; int mask = BIT(q); int quota = budget; + bool unmask; /* Processing RX Descriptor Ring */ /* Clear RX interrupt */ ravb_write(ndev, ~(mask | RIS0_RESERVED), RIS0); - if (ravb_rx(ndev, "a, q)) - goto out; + unmask = !ravb_rx(ndev, "a, q); /* Processing TX Descriptor Ring */ spin_lock_irqsave(&priv->lock, flags); @@ -1339,6 +1339,9 @@ static int ravb_poll(struct napi_struct *napi, int budget) netif_wake_subqueue(ndev, q); spin_unlock_irqrestore(&priv->lock, flags); + if (!unmask) + goto out; + napi_complete(napi); /* Re-enable RX/TX interrupts */ -- cgit v1.2.3 From 101b76418d7163240bc74a7e06867dca0e51183e Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 2 Apr 2024 15:53:05 +0100 Subject: net: ravb: Always update error counters The error statistics should be updated each time the poll function is called, even if the full RX work budget has been consumed. This prevents the counts from becoming stuck when RX bandwidth usage is high. This also ensures that error counters are not updated after we've re-enabled interrupts as that could result in a race condition. Also drop an unnecessary space. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20240402145305.82148-2-paul.barker.ct@bp.renesas.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 48803050abdb..ba01c8cc3c90 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1339,6 +1339,15 @@ static int ravb_poll(struct napi_struct *napi, int budget) netif_wake_subqueue(ndev, q); spin_unlock_irqrestore(&priv->lock, flags); + /* Receive error message handling */ + priv->rx_over_errors = priv->stats[RAVB_BE].rx_over_errors; + if (info->nc_queues) + priv->rx_over_errors += priv->stats[RAVB_NC].rx_over_errors; + if (priv->rx_over_errors != ndev->stats.rx_over_errors) + ndev->stats.rx_over_errors = priv->rx_over_errors; + if (priv->rx_fifo_errors != ndev->stats.rx_fifo_errors) + ndev->stats.rx_fifo_errors = priv->rx_fifo_errors; + if (!unmask) goto out; @@ -1355,14 +1364,6 @@ static int ravb_poll(struct napi_struct *napi, int budget) } spin_unlock_irqrestore(&priv->lock, flags); - /* Receive error message handling */ - priv->rx_over_errors = priv->stats[RAVB_BE].rx_over_errors; - if (info->nc_queues) - priv->rx_over_errors += priv->stats[RAVB_NC].rx_over_errors; - if (priv->rx_over_errors != ndev->stats.rx_over_errors) - ndev->stats.rx_over_errors = priv->rx_over_errors; - if (priv->rx_fifo_errors != ndev->stats.rx_fifo_errors) - ndev->stats.rx_fifo_errors = priv->rx_fifo_errors; out: return budget - quota; } -- cgit v1.2.3 From c120209bce34c49dcaba32f15679574327d09f63 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 2 Apr 2024 20:33:56 +0200 Subject: net: dsa: sja1105: Fix parameters order in sja1110_pcs_mdio_write_c45() The definition and declaration of sja1110_pcs_mdio_write_c45() don't have parameters in the same order. Knowing that sja1110_pcs_mdio_write_c45() is used as a function pointer in 'sja1105_info' structure with .pcs_mdio_write_c45, and that we have: int (*pcs_mdio_write_c45)(struct mii_bus *bus, int phy, int mmd, int reg, u16 val); it is likely that the definition is the one to change. Found with cppcheck, funcArgOrderDifferent. Fixes: ae271547bba6 ("net: dsa: sja1105: C45 only transactions for PCS") Signed-off-by: Christophe JAILLET Reviewed-by: Michael Walle Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/ff2a5af67361988b3581831f7bd1eddebfb4c48f.1712082763.git.christophe.jaillet@wanadoo.fr Signed-off-by: Paolo Abeni --- drivers/net/dsa/sja1105/sja1105_mdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_mdio.c b/drivers/net/dsa/sja1105/sja1105_mdio.c index 833e55e4b961..52ddb4ef259e 100644 --- a/drivers/net/dsa/sja1105/sja1105_mdio.c +++ b/drivers/net/dsa/sja1105/sja1105_mdio.c @@ -94,7 +94,7 @@ int sja1110_pcs_mdio_read_c45(struct mii_bus *bus, int phy, int mmd, int reg) return tmp & 0xffff; } -int sja1110_pcs_mdio_write_c45(struct mii_bus *bus, int phy, int reg, int mmd, +int sja1110_pcs_mdio_write_c45(struct mii_bus *bus, int phy, int mmd, int reg, u16 val) { struct sja1105_mdio_private *mdio_priv = bus->priv; -- cgit v1.2.3 From b9846a386734e73a1414950ebfd50f04919f5e24 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Thu, 4 Apr 2024 09:47:13 +0530 Subject: ASoC: SOF: amd: fix for false dsp interrupts Before ACP firmware loading, DSP interrupts are not expected. Sometimes after reboot, it's observed that before ACP firmware is loaded false DSP interrupt is reported. Registering the interrupt handler before acp initialization causing false interrupts sometimes on reboot as ACP reset is not applied. Correct the sequence by invoking acp initialization sequence prior to registering interrupt handler. Fixes: 738a2b5e2cc9 ("ASoC: SOF: amd: Add IPC support for ACP IP block") Signed-off-by: Vijendar Mukunda Link: https://msgid.link/r/20240404041717.430545-1-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index be7dc1e02284..c12c7f820529 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -704,6 +704,10 @@ int amd_sof_acp_probe(struct snd_sof_dev *sdev) goto unregister_dev; } + ret = acp_init(sdev); + if (ret < 0) + goto free_smn_dev; + sdev->ipc_irq = pci->irq; ret = request_threaded_irq(sdev->ipc_irq, acp_irq_handler, acp_irq_thread, IRQF_SHARED, "AudioDSP", sdev); @@ -713,10 +717,6 @@ int amd_sof_acp_probe(struct snd_sof_dev *sdev) goto free_smn_dev; } - ret = acp_init(sdev); - if (ret < 0) - goto free_ipc_irq; - /* scan SoundWire capabilities exposed by DSDT */ ret = acp_sof_scan_sdw_devices(sdev, chip->sdw_acpi_dev_addr); if (ret < 0) { -- cgit v1.2.3 From 5bfc311dd6c376d350b39028b9000ad766ddc934 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 4 Apr 2024 15:11:05 +0300 Subject: usb: xhci: correct return value in case of STS_HCE If we get STS_HCE we give up on the interrupt, but for the purpose of IRQ handling that still counts as ours. We may return IRQ_NONE only if we are positive that it wasn't ours. Hence correct the default. Fixes: 2a25e66d676d ("xhci: print warning when HCE was set") Cc: stable@vger.kernel.org # v6.2+ Signed-off-by: Oliver Neukum Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240404121106.2842417-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 52278afea94b..575f0fd9c9f1 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -3133,7 +3133,7 @@ static int xhci_handle_events(struct xhci_hcd *xhci, struct xhci_interrupter *ir irqreturn_t xhci_irq(struct usb_hcd *hcd) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); - irqreturn_t ret = IRQ_NONE; + irqreturn_t ret = IRQ_HANDLED; u32 status; spin_lock(&xhci->lock); @@ -3141,12 +3141,13 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd) status = readl(&xhci->op_regs->status); if (status == ~(u32)0) { xhci_hc_died(xhci); - ret = IRQ_HANDLED; goto out; } - if (!(status & STS_EINT)) + if (!(status & STS_EINT)) { + ret = IRQ_NONE; goto out; + } if (status & STS_HCE) { xhci_warn(xhci, "WARNING: Host Controller Error\n"); @@ -3156,7 +3157,6 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd) if (status & STS_FATAL) { xhci_warn(xhci, "WARNING: Host System Error\n"); xhci_halt(xhci); - ret = IRQ_HANDLED; goto out; } @@ -3167,7 +3167,6 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd) */ status |= STS_EINT; writel(status, &xhci->op_regs->status); - ret = IRQ_HANDLED; /* This is the handler of the primary interrupter */ xhci_handle_events(xhci, xhci->interrupters[0]); -- cgit v1.2.3 From dda7e89e53d6ebf27c49df7d87a54e3e1614d332 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 4 Apr 2024 15:11:06 +0300 Subject: xhci: Fix root hub port null pointer dereference in xhci tracepoints The pointer from a xhci usb virt device to its root hub port (vdev->rhub_port) is set later when device is addressed, not while vdev is allocated. Tracepoints dereferenced this rhub_port pointer when freeing the virt device, which causes null pointer dereference if tracing is enabled and device is freed before addressed. This can happen if tracing is enabled and xhci driver is unloaded before a device is fully enumerated, or initial enumeration fails and device is reset and freed before retry. Don't dereference the rhub_port or show port numbers when tracing xhci_free_virt_device(). This info is not very useful anyway. Print the more useful slot id instead Fixes: 06790c19086f ("xhci: replace real & fake port with pointer to root hub port") Reported-by: Thinh Nguyen Closes: https://lore.kernel.org/linux-usb/20240402005007.klv2ij727fkz7rpd@synopsys.com/ Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240404121106.2842417-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-trace.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/usb/host/xhci-trace.h b/drivers/usb/host/xhci-trace.h index 1740000d54c2..5762564b9d73 100644 --- a/drivers/usb/host/xhci-trace.h +++ b/drivers/usb/host/xhci-trace.h @@ -172,8 +172,7 @@ DECLARE_EVENT_CLASS(xhci_log_free_virt_dev, __field(void *, vdev) __field(unsigned long long, out_ctx) __field(unsigned long long, in_ctx) - __field(int, hcd_portnum) - __field(int, hw_portnum) + __field(int, slot_id) __field(u16, current_mel) ), @@ -181,13 +180,12 @@ DECLARE_EVENT_CLASS(xhci_log_free_virt_dev, __entry->vdev = vdev; __entry->in_ctx = (unsigned long long) vdev->in_ctx->dma; __entry->out_ctx = (unsigned long long) vdev->out_ctx->dma; - __entry->hcd_portnum = (int) vdev->rhub_port->hcd_portnum; - __entry->hw_portnum = (int) vdev->rhub_port->hw_portnum; + __entry->slot_id = (int) vdev->slot_id; __entry->current_mel = (u16) vdev->current_mel; ), - TP_printk("vdev %p ctx %llx | %llx hcd_portnum %d hw_portnum %d current_mel %d", - __entry->vdev, __entry->in_ctx, __entry->out_ctx, - __entry->hcd_portnum, __entry->hw_portnum, __entry->current_mel + TP_printk("vdev %p slot %d ctx %llx | %llx current_mel %d", + __entry->vdev, __entry->slot_id, __entry->in_ctx, + __entry->out_ctx, __entry->current_mel ) ); -- cgit v1.2.3 From 69630926011c1f7170a465b7b5c228deb66e9372 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Mar 2024 00:02:00 +1100 Subject: powerpc/crypto/chacha-p10: Fix failure on non Power10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chacha-p10-crypto module provides optimised chacha routines for Power10. It also selects CRYPTO_ARCH_HAVE_LIB_CHACHA which says it provides chacha_crypt_arch() to generic code. Notably the module needs to provide chacha_crypt_arch() regardless of whether it is loaded on Power10 or an older CPU. The implementation of chacha_crypt_arch() already has a fallback to chacha_crypt_generic(), however the module as a whole fails to load on pre-Power10, because of the use of module_cpu_feature_match(). This breaks for example loading wireguard: jostaberry-1:~ # modprobe -v wireguard insmod /lib/modules/6.8.0-lp155.8.g7e0e887-default/kernel/arch/powerpc/crypto/chacha-p10-crypto.ko.zst modprobe: ERROR: could not insert 'wireguard': No such device Fix it by removing module_cpu_feature_match(), and instead check the CPU feature manually. If the CPU feature is not found, the module still loads successfully, but doesn't register the Power10 specific algorithms. That allows chacha_crypt_generic() to remain available for use, fixing the problem. [root@fedora ~]# modprobe -v wireguard insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/net/ipv4/udp_tunnel.ko insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/net/ipv6/ip6_udp_tunnel.ko insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/lib/crypto/libchacha.ko insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/arch/powerpc/crypto/chacha-p10-crypto.ko insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/lib/crypto/libchacha20poly1305.ko insmod /lib/modules/6.8.0-00001-g786a790c4d79/kernel/drivers/net/wireguard/wireguard.ko [ 18.910452][ T721] wireguard: allowedips self-tests: pass [ 18.914999][ T721] wireguard: nonce counter self-tests: pass [ 19.029066][ T721] wireguard: ratelimiter self-tests: pass [ 19.029257][ T721] wireguard: WireGuard 1.0.0 loaded. See www.wireguard.com for information. [ 19.029361][ T721] wireguard: Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. Reported-by: Michal Suchánek Closes: https://lore.kernel.org/all/20240315122005.GG20665@kitsune.suse.cz/ Acked-by: Herbert Xu Signed-off-by: Michael Ellerman Link: https://msgid.link/20240328130200.3041687-1-mpe@ellerman.id.au --- arch/powerpc/crypto/chacha-p10-glue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/crypto/chacha-p10-glue.c b/arch/powerpc/crypto/chacha-p10-glue.c index 74fb86b0d209..7c728755852e 100644 --- a/arch/powerpc/crypto/chacha-p10-glue.c +++ b/arch/powerpc/crypto/chacha-p10-glue.c @@ -197,6 +197,9 @@ static struct skcipher_alg algs[] = { static int __init chacha_p10_init(void) { + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return 0; + static_branch_enable(&have_p10); return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); @@ -204,10 +207,13 @@ static int __init chacha_p10_init(void) static void __exit chacha_p10_exit(void) { + if (!static_branch_likely(&have_p10)) + return; + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } -module_cpu_feature_match(PPC_MODULE_FEATURE_P10, chacha_p10_init); +module_init(chacha_p10_init); module_exit(chacha_p10_exit); MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (P10 accelerated)"); -- cgit v1.2.3 From 3f03e7717c2999473a3ed38475ed0a962ba9f393 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Mar 2024 18:53:08 +0100 Subject: usb: phy: MAINTAINERS: mark Freescale USB PHY as orphaned Emails to the only maintainer bounce: : host nxp-com.mail.protection.outlook.com[52.101.68.39] said: 550 5.4.1 Recipient address rejected: Access denied. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240327175308.520317-1-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d..3c78979fb819 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8752,10 +8752,9 @@ S: Maintained F: drivers/usb/gadget/udc/fsl* FREESCALE USB PHY DRIVER -M: Ran Wang L: linux-usb@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org -S: Maintained +S: Orphan F: drivers/usb/phy/phy-fsl-usb* FREEVXFS FILESYSTEM -- cgit v1.2.3 From ce4c8d21054ae9396cd759fe6e8157b525616dc4 Mon Sep 17 00:00:00 2001 From: "Christian A. Ehrhardt" Date: Mon, 1 Apr 2024 23:05:15 +0200 Subject: usb: typec: ucsi: Fix connector check on init Fix issues when initially checking for a connector change: - Use the correct connector number not the entire CCI. - Call ->read under the PPM lock. - Remove a bogus READ_ONCE. Fixes: 808a8b9e0b87 ("usb: typec: ucsi: Check for notifications after init") Cc: stable@kernel.org Signed-off-by: Christian A. Ehrhardt Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240401210515.1902048-1-lk@c--e.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 31d8a46ae5e7..bd6ae92aa39e 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -1736,11 +1736,13 @@ static int ucsi_init(struct ucsi *ucsi) ucsi->connector = connector; ucsi->ntfy = ntfy; + mutex_lock(&ucsi->ppm_lock); ret = ucsi->ops->read(ucsi, UCSI_CCI, &cci, sizeof(cci)); + mutex_unlock(&ucsi->ppm_lock); if (ret) return ret; - if (UCSI_CCI_CONNECTOR(READ_ONCE(cci))) - ucsi_connector_change(ucsi, cci); + if (UCSI_CCI_CONNECTOR(cci)) + ucsi_connector_change(ucsi, UCSI_CCI_CONNECTOR(cci)); return 0; -- cgit v1.2.3 From 1500a7b2794d46beadd408132ddb5ef669c5c057 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 2 Apr 2024 13:09:50 +0200 Subject: usb: gadget: functionfs: Fix inverted DMA fence direction A "read" fence was installed when the DMABUF was to be written to, and a "write" fence was installed when the DMABUF was to be read from. Besides, dma_resv_usage_rw() should only be used when waiting for fences. Fixes: 7b07a2a7ca02 ("usb: gadget: functionfs: Add DMABUF import interface") Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20240402110951.16376-2-paul@crapouillou.net Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_fs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index bffbc1dc651f..70e8532127ad 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1578,6 +1578,7 @@ static int ffs_dmabuf_transfer(struct file *file, struct ffs_dmabuf_priv *priv; struct ffs_dma_fence *fence; struct usb_request *usb_req; + enum dma_resv_usage resv_dir; struct dma_buf *dmabuf; struct ffs_ep *ep; bool cookie; @@ -1665,8 +1666,9 @@ static int ffs_dmabuf_transfer(struct file *file, dma_fence_init(&fence->base, &ffs_dmabuf_fence_ops, &priv->lock, priv->context, seqno); - dma_resv_add_fence(dmabuf->resv, &fence->base, - dma_resv_usage_rw(epfile->in)); + resv_dir = epfile->in ? DMA_RESV_USAGE_WRITE : DMA_RESV_USAGE_READ; + + dma_resv_add_fence(dmabuf->resv, &fence->base, resv_dir); dma_resv_unlock(dmabuf->resv); /* Now that the dma_fence is in place, queue the transfer. */ -- cgit v1.2.3 From 862b416560c348f66d2091c56e71fd5462510cab Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 2 Apr 2024 13:09:51 +0200 Subject: usb: gadget: functionfs: Wait for fences before enqueueing DMABUF Instead of bailing when fences have already been installed on the DMABUF, wait for them (with a timeout) when doing a blocking operation. This fixes the issue where userspace would submit a DMABUF with fences already installed, with the (correct) expectation that it would just work. Fixes: 7b07a2a7ca02 ("usb: gadget: functionfs: Add DMABUF import interface") Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20240402110951.16376-3-paul@crapouillou.net Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_fs.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 70e8532127ad..f855f1fc8e5e 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -46,6 +46,8 @@ #define FUNCTIONFS_MAGIC 0xa647361 /* Chosen by a honest dice roll ;) */ +#define DMABUF_ENQUEUE_TIMEOUT_MS 5000 + MODULE_IMPORT_NS(DMA_BUF); /* Reference counter handling */ @@ -1580,9 +1582,11 @@ static int ffs_dmabuf_transfer(struct file *file, struct usb_request *usb_req; enum dma_resv_usage resv_dir; struct dma_buf *dmabuf; + unsigned long timeout; struct ffs_ep *ep; bool cookie; u32 seqno; + long retl; int ret; if (req->flags & ~USB_FFS_DMABUF_TRANSFER_MASK) @@ -1616,17 +1620,14 @@ static int ffs_dmabuf_transfer(struct file *file, goto err_attachment_put; /* Make sure we don't have writers */ - if (!dma_resv_test_signaled(dmabuf->resv, DMA_RESV_USAGE_WRITE)) { - pr_vdebug("FFS WRITE fence is not signaled\n"); - ret = -EBUSY; - goto err_resv_unlock; - } - - /* If we're writing to the DMABUF, make sure we don't have readers */ - if (epfile->in && - !dma_resv_test_signaled(dmabuf->resv, DMA_RESV_USAGE_READ)) { - pr_vdebug("FFS READ fence is not signaled\n"); - ret = -EBUSY; + timeout = nonblock ? 0 : msecs_to_jiffies(DMABUF_ENQUEUE_TIMEOUT_MS); + retl = dma_resv_wait_timeout(dmabuf->resv, + dma_resv_usage_rw(epfile->in), + true, timeout); + if (retl == 0) + retl = -EBUSY; + if (retl < 0) { + ret = (int)retl; goto err_resv_unlock; } -- cgit v1.2.3 From 3c88b8f471ee9512bc4ef02bebafdc53fb7c5d9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 27 Mar 2024 10:11:33 +0100 Subject: drm/xe: Use ring ops TLB invalidation for rebinds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For each rebind we insert a GuC TLB invalidation and add a corresponding unordered TLB invalidation fence. This might add a huge number of TLB invalidation fences to wait for so rather than doing that, defer the TLB invalidation to the next ring ops for each affected exec queue. Since the TLB is invalidated on exec_queue switch, we need to invalidate once for each affected exec_queue. v2: - Simplify if-statements around the tlb_flush_seqno. (Matthew Brost) - Add some comments and asserts. Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs") Cc: Matthew Brost Cc: # v6.8+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240327091136.3271-2-thomas.hellstrom@linux.intel.com (cherry picked from commit 4fc4899e86f7afbd09f4bcb899f0fc57e0296e62) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_exec_queue_types.h | 5 +++++ drivers/gpu/drm/xe/xe_pt.c | 6 ++++-- drivers/gpu/drm/xe/xe_ring_ops.c | 11 ++++------- drivers/gpu/drm/xe/xe_sched_job.c | 10 ++++++++++ drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++ drivers/gpu/drm/xe/xe_vm_types.h | 5 +++++ 6 files changed, 30 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 62b3d9d1d7cd..462b33195032 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -148,6 +148,11 @@ struct xe_exec_queue { const struct xe_ring_ops *ring_ops; /** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */ struct drm_sched_entity *entity; + /** + * @tlb_flush_seqno: The seqno of the last rebind tlb flush performed + * Protected by @vm's resv. Unused if @vm == NULL. + */ + u64 tlb_flush_seqno; /** @lrc: logical ring context for this exec queue */ struct xe_lrc lrc[]; }; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 7f54bc3e389d..9fd65f5d3d80 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1254,11 +1254,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue * non-faulting LR, in particular on user-space batch buffer chaining, * it needs to be done here. */ - if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) || - (!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) { + if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) { ifence = kzalloc(sizeof(*ifence), GFP_KERNEL); if (!ifence) return ERR_PTR(-ENOMEM); + } else if (rebind && !xe_vm_in_lr_mode(vm)) { + /* We bump also if batch_invalidate_tlb is true */ + vm->tlb_flush_seqno++; } rfence = kzalloc(sizeof(*rfence), GFP_KERNEL); diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index c4edffcd4a32..5b2b37b59813 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -219,10 +219,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc { u32 dw[MAX_JOB_SIZE_DW], i = 0; u32 ppgtt_flag = get_ppgtt_flag(job); - struct xe_vm *vm = job->q->vm; struct xe_gt *gt = job->q->gt; - if (vm && vm->batch_invalidate_tlb) { + if (job->ring_ops_flush_tlb) { dw[i++] = preparser_disable(true); i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i); @@ -270,7 +269,6 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE; - struct xe_vm *vm = job->q->vm; dw[i++] = preparser_disable(true); @@ -282,13 +280,13 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i); } - if (vm && vm->batch_invalidate_tlb) + if (job->ring_ops_flush_tlb) i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i); dw[i++] = preparser_disable(false); - if (!vm || !vm->batch_invalidate_tlb) + if (!job->ring_ops_flush_tlb) i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, dw, i); @@ -317,7 +315,6 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job, struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK); - struct xe_vm *vm = job->q->vm; u32 mask_flags = 0; dw[i++] = preparser_disable(true); @@ -327,7 +324,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job, mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS; /* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */ - i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i); + i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i); /* hsdes: 1809175790 */ if (has_aux_ccs(xe)) diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c index 8151ddafb940..b0c7fa4693cf 100644 --- a/drivers/gpu/drm/xe/xe_sched_job.c +++ b/drivers/gpu/drm/xe/xe_sched_job.c @@ -250,6 +250,16 @@ bool xe_sched_job_completed(struct xe_sched_job *job) void xe_sched_job_arm(struct xe_sched_job *job) { + struct xe_exec_queue *q = job->q; + struct xe_vm *vm = q->vm; + + if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) && + (vm->batch_invalidate_tlb || vm->tlb_flush_seqno != q->tlb_flush_seqno)) { + xe_vm_assert_held(vm); + q->tlb_flush_seqno = vm->tlb_flush_seqno; + job->ring_ops_flush_tlb = true; + } + drm_sched_job_arm(&job->drm); } diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h index b1d83da50a53..5e12724219fd 100644 --- a/drivers/gpu/drm/xe/xe_sched_job_types.h +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -39,6 +39,8 @@ struct xe_sched_job { } user_fence; /** @migrate_flush_flags: Additional flush flags for migration jobs */ u32 migrate_flush_flags; + /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */ + bool ring_ops_flush_tlb; /** @batch_addr: batch buffer address of job */ u64 batch_addr[]; }; diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index ae5fb565f6bf..5747f136d24d 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -264,6 +264,11 @@ struct xe_vm { bool capture_once; } error_capture; + /** + * @tlb_flush_seqno: Required TLB flush seqno for the next exec. + * protected by the vm resv. + */ + u64 tlb_flush_seqno; /** @batch_invalidate_tlb: Always invalidate TLB before batch start */ bool batch_invalidate_tlb; /** @xef: XE file handle for tracking this VM's drm client */ -- cgit v1.2.3 From a00e7e3fb4b9b30a9f2286a6f892b6e781e560a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 27 Mar 2024 10:11:34 +0100 Subject: drm/xe: Rework rebinding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of handling the vm's rebind fence separately, which is error prone if they are not strictly ordered, attach rebind fences as kernel fences to the vm's resv. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Rodrigo Vivi Cc: Matthew Brost Cc: # v6.8+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240327091136.3271-3-thomas.hellstrom@linux.intel.com (cherry picked from commit 5a091aff50b780ae29c7faf70a7a6c21c98a54c4) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_exec.c | 31 +++---------------------------- drivers/gpu/drm/xe/xe_pt.c | 2 +- drivers/gpu/drm/xe/xe_vm.c | 27 +++++++++------------------ drivers/gpu/drm/xe/xe_vm.h | 2 +- drivers/gpu/drm/xe/xe_vm_types.h | 3 --- 5 files changed, 14 insertions(+), 51 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index 826c8b389672..79469b76a8be 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -152,7 +152,6 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) struct drm_exec *exec = &vm_exec.exec; u32 i, num_syncs = 0, num_ufence = 0; struct xe_sched_job *job; - struct dma_fence *rebind_fence; struct xe_vm *vm; bool write_locked, skip_retry = false; ktime_t end = 0; @@ -294,35 +293,11 @@ retry: * Rebind any invalidated userptr or evicted BOs in the VM, non-compute * VM mode only. */ - rebind_fence = xe_vm_rebind(vm, false); - if (IS_ERR(rebind_fence)) { - err = PTR_ERR(rebind_fence); + err = xe_vm_rebind(vm, false); + if (err) goto err_put_job; - } - - /* - * We store the rebind_fence in the VM so subsequent execs don't get - * scheduled before the rebinds of userptrs / evicted BOs is complete. - */ - if (rebind_fence) { - dma_fence_put(vm->rebind_fence); - vm->rebind_fence = rebind_fence; - } - if (vm->rebind_fence) { - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, - &vm->rebind_fence->flags)) { - dma_fence_put(vm->rebind_fence); - vm->rebind_fence = NULL; - } else { - dma_fence_get(vm->rebind_fence); - err = drm_sched_job_add_dependency(&job->drm, - vm->rebind_fence); - if (err) - goto err_put_job; - } - } - /* Wait behind munmap style rebinds */ + /* Wait behind rebinds */ if (!xe_vm_in_lr_mode(vm)) { err = drm_sched_job_add_resv_dependencies(&job->drm, xe_vm_resv(vm), diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 9fd65f5d3d80..5fc4ad1e4298 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1299,7 +1299,7 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue } /* add shared fence now for pagetable delayed destroy */ - dma_resv_add_fence(xe_vm_resv(vm), fence, !rebind && + dma_resv_add_fence(xe_vm_resv(vm), fence, rebind || last_munmap_rebind ? DMA_RESV_USAGE_KERNEL : DMA_RESV_USAGE_BOOKKEEP); diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index f88faef4142b..3b97e9c92609 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -522,7 +522,6 @@ static void preempt_rebind_work_func(struct work_struct *w) { struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work); struct drm_exec exec; - struct dma_fence *rebind_fence; unsigned int fence_count = 0; LIST_HEAD(preempt_fences); ktime_t end = 0; @@ -568,18 +567,11 @@ retry: if (err) goto out_unlock; - rebind_fence = xe_vm_rebind(vm, true); - if (IS_ERR(rebind_fence)) { - err = PTR_ERR(rebind_fence); + err = xe_vm_rebind(vm, true); + if (err) goto out_unlock; - } - - if (rebind_fence) { - dma_fence_wait(rebind_fence, false); - dma_fence_put(rebind_fence); - } - /* Wait on munmap style VM unbinds */ + /* Wait on rebinds and munmap style VM unbinds */ wait = dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_KERNEL, false, MAX_SCHEDULE_TIMEOUT); @@ -773,14 +765,14 @@ xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q, struct xe_sync_entry *syncs, u32 num_syncs, bool first_op, bool last_op); -struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker) +int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker) { - struct dma_fence *fence = NULL; + struct dma_fence *fence; struct xe_vma *vma, *next; lockdep_assert_held(&vm->lock); if (xe_vm_in_lr_mode(vm) && !rebind_worker) - return NULL; + return 0; xe_vm_assert_held(vm); list_for_each_entry_safe(vma, next, &vm->rebind_list, @@ -788,17 +780,17 @@ struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker) xe_assert(vm->xe, vma->tile_present); list_del_init(&vma->combined_links.rebind); - dma_fence_put(fence); if (rebind_worker) trace_xe_vma_rebind_worker(vma); else trace_xe_vma_rebind_exec(vma); fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false); if (IS_ERR(fence)) - return fence; + return PTR_ERR(fence); + dma_fence_put(fence); } - return fence; + return 0; } static void xe_vma_free(struct xe_vma *vma) @@ -1589,7 +1581,6 @@ static void vm_destroy_work_func(struct work_struct *w) XE_WARN_ON(vm->pt_root[id]); trace_xe_vm_free(vm); - dma_fence_put(vm->rebind_fence); kfree(vm); } diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h index 6df1f1c7f85d..4853354336f2 100644 --- a/drivers/gpu/drm/xe/xe_vm.h +++ b/drivers/gpu/drm/xe/xe_vm.h @@ -207,7 +207,7 @@ int __xe_vm_userptr_needs_repin(struct xe_vm *vm); int xe_vm_userptr_check_repin(struct xe_vm *vm); -struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker); +int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker); int xe_vm_invalidate_vma(struct xe_vma *vma); diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index 5747f136d24d..badf3945083d 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -177,9 +177,6 @@ struct xe_vm { */ struct list_head rebind_list; - /** @rebind_fence: rebind fence from execbuf */ - struct dma_fence *rebind_fence; - /** * @destroy_work: worker to destroy VM, needed as a dma_fence signaling * from an irq context can be last put and the destroy needs to be able -- cgit v1.2.3 From fd1c8085113fb7c803fd81280f7e0bb25a5797ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 27 Mar 2024 10:11:35 +0100 Subject: drm/xe: Make TLB invalidation fences unordered MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit They can actually complete out-of-order, so allocate a unique fence context for each fence. Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs") Cc: Matthew Brost Cc: # v6.8+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240327091136.3271-4-thomas.hellstrom@linux.intel.com (cherry picked from commit 0453f1757501df2e82b66b3183a24bba5a6f8fa3) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 1 - drivers/gpu/drm/xe/xe_gt_types.h | 7 ------- drivers/gpu/drm/xe/xe_pt.c | 3 +-- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c index f03e077f81a0..e598a4363d01 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -61,7 +61,6 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt) INIT_LIST_HEAD(>->tlb_invalidation.pending_fences); spin_lock_init(>->tlb_invalidation.pending_lock); spin_lock_init(>->tlb_invalidation.lock); - gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1); INIT_DELAYED_WORK(>->tlb_invalidation.fence_tdr, xe_gt_tlb_fence_timeout); diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h index 70c615dd1498..07b2f724ec45 100644 --- a/drivers/gpu/drm/xe/xe_gt_types.h +++ b/drivers/gpu/drm/xe/xe_gt_types.h @@ -177,13 +177,6 @@ struct xe_gt { * xe_gt_tlb_fence_timeout after the timeut interval is over. */ struct delayed_work fence_tdr; - /** @tlb_invalidation.fence_context: context for TLB invalidation fences */ - u64 fence_context; - /** - * @tlb_invalidation.fence_seqno: seqno to TLB invalidation fences, protected by - * tlb_invalidation.lock - */ - u32 fence_seqno; /** @tlb_invalidation.lock: protects TLB invalidation fences */ spinlock_t lock; } tlb_invalidation; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 5fc4ad1e4298..29d1a31f9804 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1135,8 +1135,7 @@ static int invalidation_fence_init(struct xe_gt *gt, spin_lock_irq(>->tlb_invalidation.lock); dma_fence_init(&ifence->base.base, &invalidation_fence_ops, >->tlb_invalidation.lock, - gt->tlb_invalidation.fence_context, - ++gt->tlb_invalidation.fence_seqno); + dma_fence_context_alloc(1), 1); spin_unlock_irq(>->tlb_invalidation.lock); INIT_LIST_HEAD(&ifence->base.link); -- cgit v1.2.3 From 3edd52bead30879644bb69fe4aafde67d2cd8512 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 27 Mar 2024 10:11:36 +0100 Subject: drm/xe: Move vma rebinding to the drm_exec locking loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rebinding might allocate page-table bos, causing evictions. To support blocking locking during these evictions, perform the rebinding in the drm_exec locking loop. Also Reserve fence slots where actually needed rather than trying to predict how many fence slots will be needed over a complete wound-wait transaction. v2: - Remove a leftover call to xe_vm_rebind() (Matt Brost) - Add a helper function xe_vm_validate_rebind() (Matt Brost) v3: - Add comments and squash with previous patch (Matt Brost) Fixes: 24f947d58fe5 ("drm/xe: Use DRM GPUVM helpers for external- and evicted objects") Fixes: 29f424eb8702 ("drm/xe/exec: move fence reservation") Cc: Matthew Auld Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240327091136.3271-5-thomas.hellstrom@linux.intel.com (cherry picked from commit 7ee7dd6f301341d5b1204fc19fa620d7f7f7e90d) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_exec.c | 52 +++------------------- drivers/gpu/drm/xe/xe_gt_pagefault.c | 3 +- drivers/gpu/drm/xe/xe_pt.c | 14 ++++++ drivers/gpu/drm/xe/xe_vm.c | 83 +++++++++++++++++++++++++----------- drivers/gpu/drm/xe/xe_vm.h | 6 ++- 5 files changed, 83 insertions(+), 75 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index 79469b76a8be..cc5e0f75de3c 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -94,48 +94,16 @@ * Unlock all */ +/* + * Add validation and rebinding to the drm_exec locking loop, since both can + * trigger eviction which may require sleeping dma_resv locks. + */ static int xe_exec_fn(struct drm_gpuvm_exec *vm_exec) { struct xe_vm *vm = container_of(vm_exec->vm, struct xe_vm, gpuvm); - struct drm_gem_object *obj; - unsigned long index; - int num_fences; - int ret; - - ret = drm_gpuvm_validate(vm_exec->vm, &vm_exec->exec); - if (ret) - return ret; - - /* - * 1 fence slot for the final submit, and 1 more for every per-tile for - * GPU bind and 1 extra for CPU bind. Note that there are potentially - * many vma per object/dma-resv, however the fence slot will just be - * re-used, since they are largely the same timeline and the seqno - * should be in order. In the case of CPU bind there is dummy fence used - * for all CPU binds, so no need to have a per-tile slot for that. - */ - num_fences = 1 + 1 + vm->xe->info.tile_count; - /* - * We don't know upfront exactly how many fence slots we will need at - * the start of the exec, since the TTM bo_validate above can consume - * numerous fence slots. Also due to how the dma_resv_reserve_fences() - * works it only ensures that at least that many fence slots are - * available i.e if there are already 10 slots available and we reserve - * two more, it can just noop without reserving anything. With this it - * is quite possible that TTM steals some of the fence slots and then - * when it comes time to do the vma binding and final exec stage we are - * lacking enough fence slots, leading to some nasty BUG_ON() when - * adding the fences. Hence just add our own fences here, after the - * validate stage. - */ - drm_exec_for_each_locked_object(&vm_exec->exec, index, obj) { - ret = dma_resv_reserve_fences(obj->resv, num_fences); - if (ret) - return ret; - } - - return 0; + /* The fence slot added here is intended for the exec sched job. */ + return xe_vm_validate_rebind(vm, &vm_exec->exec, 1); } int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) @@ -289,14 +257,6 @@ retry: goto err_exec; } - /* - * Rebind any invalidated userptr or evicted BOs in the VM, non-compute - * VM mode only. - */ - err = xe_vm_rebind(vm, false); - if (err) - goto err_put_job; - /* Wait behind rebinds */ if (!xe_vm_in_lr_mode(vm)) { err = drm_sched_job_add_resv_dependencies(&job->drm, diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c index 241c294270d9..fa9e9853c53b 100644 --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c @@ -100,10 +100,9 @@ static int xe_pf_begin(struct drm_exec *exec, struct xe_vma *vma, { struct xe_bo *bo = xe_vma_bo(vma); struct xe_vm *vm = xe_vma_vm(vma); - unsigned int num_shared = 2; /* slots for bind + move */ int err; - err = xe_vm_prepare_vma(exec, vma, num_shared); + err = xe_vm_lock_vma(exec, vma); if (err) return err; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 29d1a31f9804..4efc8c1a3d7a 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1235,6 +1235,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue err = xe_pt_prepare_bind(tile, vma, entries, &num_entries); if (err) goto err; + + err = dma_resv_reserve_fences(xe_vm_resv(vm), 1); + if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm) + err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1); + if (err) + goto err; + xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries)); xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries); @@ -1577,6 +1584,7 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queu struct dma_fence *fence = NULL; struct invalidation_fence *ifence; struct xe_range_fence *rfence; + int err; LLIST_HEAD(deferred); @@ -1594,6 +1602,12 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queu xe_pt_calc_rfence_interval(vma, &unbind_pt_update, entries, num_entries); + err = dma_resv_reserve_fences(xe_vm_resv(vm), 1); + if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm) + err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1); + if (err) + return ERR_PTR(err); + ifence = kzalloc(sizeof(*ifence), GFP_KERNEL); if (!ifence) return ERR_PTR(-ENOMEM); diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 3b97e9c92609..62d1ef8867a8 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -482,17 +482,53 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec) return 0; } +/** + * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas + * @vm: The vm for which we are rebinding. + * @exec: The struct drm_exec with the locked GEM objects. + * @num_fences: The number of fences to reserve for the operation, not + * including rebinds and validations. + * + * Validates all evicted gem objects and rebinds their vmas. Note that + * rebindings may cause evictions and hence the validation-rebind + * sequence is rerun until there are no more objects to validate. + * + * Return: 0 on success, negative error code on error. In particular, + * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if + * the drm_exec transaction needs to be restarted. + */ +int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec, + unsigned int num_fences) +{ + struct drm_gem_object *obj; + unsigned long index; + int ret; + + do { + ret = drm_gpuvm_validate(&vm->gpuvm, exec); + if (ret) + return ret; + + ret = xe_vm_rebind(vm, false); + if (ret) + return ret; + } while (!list_empty(&vm->gpuvm.evict.list)); + + drm_exec_for_each_locked_object(exec, index, obj) { + ret = dma_resv_reserve_fences(obj->resv, num_fences); + if (ret) + return ret; + } + + return 0; +} + static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm, bool *done) { int err; - /* - * 1 fence for each preempt fence plus a fence for each tile from a - * possible rebind - */ - err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, vm->preempt.num_exec_queues + - vm->xe->info.tile_count); + err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0); if (err) return err; @@ -507,7 +543,7 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm, return 0; } - err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, vm->preempt.num_exec_queues); + err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0); if (err) return err; @@ -515,7 +551,13 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm, if (err) return err; - return drm_gpuvm_validate(&vm->gpuvm, exec); + /* + * Add validation and rebinding to the locking loop since both can + * cause evictions which may require blocing dma_resv locks. + * The fence reservation here is intended for the new preempt fences + * we attach at the end of the rebind work. + */ + return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues); } static void preempt_rebind_work_func(struct work_struct *w) @@ -996,35 +1038,26 @@ static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence) } /** - * xe_vm_prepare_vma() - drm_exec utility to lock a vma + * xe_vm_lock_vma() - drm_exec utility to lock a vma * @exec: The drm_exec object we're currently locking for. * @vma: The vma for witch we want to lock the vm resv and any attached * object's resv. - * @num_shared: The number of dma-fence slots to pre-allocate in the - * objects' reservation objects. * * Return: 0 on success, negative error code on error. In particular * may return -EDEADLK on WW transaction contention and -EINTR if * an interruptible wait is terminated by a signal. */ -int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma, - unsigned int num_shared) +int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma) { struct xe_vm *vm = xe_vma_vm(vma); struct xe_bo *bo = xe_vma_bo(vma); int err; XE_WARN_ON(!vm); - if (num_shared) - err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared); - else - err = drm_exec_lock_obj(exec, xe_vm_obj(vm)); - if (!err && bo && !bo->vm) { - if (num_shared) - err = drm_exec_prepare_obj(exec, &bo->ttm.base, num_shared); - else - err = drm_exec_lock_obj(exec, &bo->ttm.base); - } + + err = drm_exec_lock_obj(exec, xe_vm_obj(vm)); + if (!err && bo && !bo->vm) + err = drm_exec_lock_obj(exec, &bo->ttm.base); return err; } @@ -1036,7 +1069,7 @@ static void xe_vma_destroy_unlocked(struct xe_vma *vma) drm_exec_init(&exec, 0, 0); drm_exec_until_all_locked(&exec) { - err = xe_vm_prepare_vma(&exec, vma, 0); + err = xe_vm_lock_vma(&exec, vma); drm_exec_retry_on_contention(&exec); if (XE_WARN_ON(err)) break; @@ -2503,7 +2536,7 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm, lockdep_assert_held_write(&vm->lock); - err = xe_vm_prepare_vma(exec, vma, 1); + err = xe_vm_lock_vma(exec, vma); if (err) return err; diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h index 4853354336f2..306cd0934a19 100644 --- a/drivers/gpu/drm/xe/xe_vm.h +++ b/drivers/gpu/drm/xe/xe_vm.h @@ -242,8 +242,10 @@ bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end); int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id); -int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma, - unsigned int num_shared); +int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma); + +int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec, + unsigned int num_fences); /** * xe_vm_resv() - Return's the vm's reservation object -- cgit v1.2.3 From 77a011012d7d8b98368a763bf74317c6d5ce00f1 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 1 Apr 2024 15:19:11 -0700 Subject: drm/xe: Use ordered wq for preempt fence waiting Preempt fences can sleep waiting for an exec queue suspend operation to complete. If the system_unbound_wq is used for waiting and the number of waiters exceeds max_active this will result in other users of the system_unbound_wq getting starved. Use a device private work queue for preempt fences to avoid starvation of the system_unbound_wq. Even though suspend operations can complete out-of-order, all suspend operations within a VM need to complete before the preempt rebind worker can start. With that, use a device private ordered wq for preempt fence waiting. v2: - Add comment about cleanup on failure (Matt R) - Update commit message (Lucas) Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Brost Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240401221913.139672-2-matthew.brost@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 37c15c4aae1fe3f67efd2641db8d8c25c2d524ab) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_device.c | 11 ++++++++++- drivers/gpu/drm/xe/xe_device_types.h | 3 +++ drivers/gpu/drm/xe/xe_preempt_fence.c | 2 +- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index ca85e81fdb44..d32ff3857e65 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -193,6 +193,9 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy) { struct xe_device *xe = to_xe_device(dev); + if (xe->preempt_fence_wq) + destroy_workqueue(xe->preempt_fence_wq); + if (xe->ordered_wq) destroy_workqueue(xe->ordered_wq); @@ -258,9 +261,15 @@ struct xe_device *xe_device_create(struct pci_dev *pdev, INIT_LIST_HEAD(&xe->pinned.external_vram); INIT_LIST_HEAD(&xe->pinned.evicted); + xe->preempt_fence_wq = alloc_ordered_workqueue("xe-preempt-fence-wq", 0); xe->ordered_wq = alloc_ordered_workqueue("xe-ordered-wq", 0); xe->unordered_wq = alloc_workqueue("xe-unordered-wq", 0, 0); - if (!xe->ordered_wq || !xe->unordered_wq) { + if (!xe->ordered_wq || !xe->unordered_wq || + !xe->preempt_fence_wq) { + /* + * Cleanup done in xe_device_destroy via + * drmm_add_action_or_reset register above + */ drm_err(&xe->drm, "Failed to allocate xe workqueues\n"); err = -ENOMEM; goto err; diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 9785eef2e5a4..8e3a222b41cf 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -363,6 +363,9 @@ struct xe_device { /** @ufence_wq: user fence wait queue */ wait_queue_head_t ufence_wq; + /** @preempt_fence_wq: used to serialize preempt fences */ + struct workqueue_struct *preempt_fence_wq; + /** @ordered_wq: used to serialize compute mode resume */ struct workqueue_struct *ordered_wq; diff --git a/drivers/gpu/drm/xe/xe_preempt_fence.c b/drivers/gpu/drm/xe/xe_preempt_fence.c index 7bce2a332603..7d50c6e89d8e 100644 --- a/drivers/gpu/drm/xe/xe_preempt_fence.c +++ b/drivers/gpu/drm/xe/xe_preempt_fence.c @@ -49,7 +49,7 @@ static bool preempt_fence_enable_signaling(struct dma_fence *fence) struct xe_exec_queue *q = pfence->q; pfence->error = q->ops->suspend(q); - queue_work(system_unbound_wq, &pfence->preempt_work); + queue_work(q->vm->xe->preempt_fence_wq, &pfence->preempt_work); return true; } -- cgit v1.2.3 From 05258a0a69b3c5d2c003f818702c0a52b6fea861 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 3 Apr 2024 10:36:25 -0400 Subject: SUNRPC: Fix a slow server-side memory leak with RPC-over-TCP Jan Schunk reports that his small NFS servers suffer from memory exhaustion after just a few days. A bisect shows that commit e18e157bb5c8 ("SUNRPC: Send RPC message on TCP with a single sock_sendmsg() call") is the first bad commit. That commit assumed that sock_sendmsg() releases all the pages in the underlying bio_vec array, but the reality is that it doesn't. svc_xprt_release() releases the rqst's response pages, but the record marker page fragment isn't one of those, so it is never released. This is a narrow fix that can be applied to stable kernels. A more extensive fix is in the works. Reported-by: Jan Schunk Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218671 Fixes: e18e157bb5c8 ("SUNRPC: Send RPC message on TCP with a single sock_sendmsg() call") Cc: Alexander Duyck Cc: Jakub Kacinski Cc: David Howells Reviewed-by: David Howells Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 545017a3daa4..6b3f01beb294 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1206,15 +1206,6 @@ err_noclose: * MSG_SPLICE_PAGES is used exclusively to reduce the number of * copy operations in this path. Therefore the caller must ensure * that the pages backing @xdr are unchanging. - * - * Note that the send is non-blocking. The caller has incremented - * the reference count on each page backing the RPC message, and - * the network layer will "put" these pages when transmission is - * complete. - * - * This is safe for our RPC services because the memory backing - * the head and tail components is never kmalloc'd. These always - * come from pages in the svc_rqst::rq_pages array. */ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, rpc_fraghdr marker, unsigned int *sentp) @@ -1244,6 +1235,7 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, 1 + count, sizeof(marker) + rqstp->rq_res.len); ret = sock_sendmsg(svsk->sk_sock, &msg); + page_frag_free(buf); if (ret < 0) return ret; *sentp += ret; -- cgit v1.2.3 From c4128304c2169b4664ed6fb6200f228cead2ab70 Mon Sep 17 00:00:00 2001 From: Kyle Tso Date: Thu, 4 Apr 2024 21:35:17 +0800 Subject: usb: typec: tcpm: Correct the PDO counting in pd_set Off-by-one errors happen because nr_snk_pdo and nr_src_pdo are incorrectly added one. The index of the loop is equal to the number of PDOs to be updated when leaving the loop and it doesn't need to be added one. When doing the power negotiation, TCPM relies on the "nr_snk_pdo" as the size of the local sink PDO array to match the Source capabilities of the partner port. If the off-by-one overflow occurs, a wrong RDO might be sent and unexpected power transfer might happen such as over voltage or over current (than expected). "nr_src_pdo" is used to set the Rp level when the port is in Source role. It is also the array size of the local Source capabilities when filling up the buffer which will be sent as the Source PDOs (such as in Power Negotiation). If the off-by-one overflow occurs, a wrong Rp level might be set and wrong Source PDOs will be sent to the partner port. This could potentially cause over current or port resets. Fixes: cd099cde4ed2 ("usb: typec: tcpm: Support multiple capabilities") Cc: stable@vger.kernel.org Signed-off-by: Kyle Tso Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240404133517.2707955-1-kyletso@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index c26fb70c3ec6..ab6ed6111ed0 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -6855,14 +6855,14 @@ static int tcpm_pd_set(struct typec_port *p, struct usb_power_delivery *pd) if (data->sink_desc.pdo[0]) { for (i = 0; i < PDO_MAX_OBJECTS && data->sink_desc.pdo[i]; i++) port->snk_pdo[i] = data->sink_desc.pdo[i]; - port->nr_snk_pdo = i + 1; + port->nr_snk_pdo = i; port->operating_snk_mw = data->operating_snk_mw; } if (data->source_desc.pdo[0]) { for (i = 0; i < PDO_MAX_OBJECTS && data->source_desc.pdo[i]; i++) port->src_pdo[i] = data->source_desc.pdo[i]; - port->nr_src_pdo = i + 1; + port->nr_src_pdo = i; } switch (port->state) { -- cgit v1.2.3 From 6334b8e4553cc69f51e383c9de545082213d785e Mon Sep 17 00:00:00 2001 From: Norihiko Hama Date: Wed, 27 Mar 2024 11:35:50 +0900 Subject: usb: gadget: f_ncm: Fix UAF ncm object at re-bind after usb ep transport error When ncm function is working and then stop usb0 interface for link down, eth_stop() is called. At this piont, accidentally if usb transport error should happen in usb_ep_enable(), 'in_ep' and/or 'out_ep' may not be enabled. After that, ncm_disable() is called to disable for ncm unbind but gether_disconnect() is never called since 'in_ep' is not enabled. As the result, ncm object is released in ncm unbind but 'dev->port_usb' associated to 'ncm->port' is not NULL. And when ncm bind again to recover netdev, ncm object is reallocated but usb0 interface is already associated to previous released ncm object. Therefore, once usb0 interface is up and eth_start_xmit() is called, released ncm object is dereferrenced and it might cause use-after-free memory. [function unlink via configfs] usb0: eth_stop dev->port_usb=ffffff9b179c3200 --> error happens in usb_ep_enable(). NCM: ncm_disable: ncm=ffffff9b179c3200 --> no gether_disconnect() since ncm->port.in_ep->enabled is false. NCM: ncm_unbind: ncm unbind ncm=ffffff9b179c3200 NCM: ncm_free: ncm free ncm=ffffff9b179c3200 <-- released ncm [function link via configfs] NCM: ncm_alloc: ncm alloc ncm=ffffff9ac4f8a000 NCM: ncm_bind: ncm bind ncm=ffffff9ac4f8a000 NCM: ncm_set_alt: ncm=ffffff9ac4f8a000 alt=0 usb0: eth_open dev->port_usb=ffffff9b179c3200 <-- previous released ncm usb0: eth_start dev->port_usb=ffffff9b179c3200 <-- eth_start_xmit() --> dev->wrap() Unable to handle kernel paging request at virtual address dead00000000014f This patch addresses the issue by checking if 'ncm->netdev' is not NULL at ncm_disable() to call gether_disconnect() to deassociate 'dev->port_usb'. It's more reasonable to check 'ncm->netdev' to call gether_connect/disconnect rather than check 'ncm->port.in_ep->enabled' since it might not be enabled but the gether connection might be established. Signed-off-by: Norihiko Hama Cc: stable Link: https://lore.kernel.org/r/20240327023550.51214-1-Norihiko.Hama@alpsalpine.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_ncm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index 28f4e6552e84..0acc32ed9960 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -878,7 +878,7 @@ static int ncm_set_alt(struct usb_function *f, unsigned intf, unsigned alt) if (alt > 1) goto fail; - if (ncm->port.in_ep->enabled) { + if (ncm->netdev) { DBG(cdev, "reset ncm\n"); ncm->netdev = NULL; gether_disconnect(&ncm->port); @@ -1367,7 +1367,7 @@ static void ncm_disable(struct usb_function *f) DBG(cdev, "ncm deactivated\n"); - if (ncm->port.in_ep->enabled) { + if (ncm->netdev) { ncm->netdev = NULL; gether_disconnect(&ncm->port); } -- cgit v1.2.3 From b3b95964590a3d756d69ea8604c856de805479ad Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 4 Apr 2024 11:33:27 +0200 Subject: gpio: cdev: check for NULL labels when sanitizing them for irqs We need to take into account that a line's consumer label may be NULL and not try to kstrdup() it in that case but rather pass the NULL pointer up the stack to the interrupt request function. To that end: let make_irq_label() return NULL as a valid return value and use ERR_PTR() instead to signal an allocation failure to callers. Cc: stable@vger.kernel.org Fixes: b34490879baa ("gpio: cdev: sanitize the label before requesting the interrupt") Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/lkml/20240402093534.212283-1-naresh.kamboju@linaro.org/ Signed-off-by: Bartosz Golaszewski Tested-by: Anders Roxell --- drivers/gpio/gpiolib-cdev.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index fa9635610251..1426cc1c4a28 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -1085,7 +1085,16 @@ static u32 gpio_v2_line_config_debounce_period(struct gpio_v2_line_config *lc, static inline char *make_irq_label(const char *orig) { - return kstrdup_and_replace(orig, '/', ':', GFP_KERNEL); + char *new; + + if (!orig) + return NULL; + + new = kstrdup_and_replace(orig, '/', ':', GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + return new; } static inline void free_irq_label(const char *label) @@ -1158,8 +1167,8 @@ static int edge_detector_setup(struct line *line, irqflags |= IRQF_ONESHOT; label = make_irq_label(line->req->label); - if (!label) - return -ENOMEM; + if (IS_ERR(label)) + return PTR_ERR(label); /* Request a thread to read the events */ ret = request_threaded_irq(irq, edge_irq_handler, edge_irq_thread, @@ -2217,8 +2226,8 @@ static int lineevent_create(struct gpio_device *gdev, void __user *ip) goto out_free_le; label = make_irq_label(le->label); - if (!label) { - ret = -ENOMEM; + if (IS_ERR(label)) { + ret = PTR_ERR(label); goto out_free_le; } -- cgit v1.2.3 From d920a2ed8620be04a3301e1a9c2b7cc1de65f19d Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 5 Mar 2024 14:51:38 +0800 Subject: usb: Disable USB3 LPM at shutdown SanDisks USB3 storage may disapper after system reboot: usb usb2-port3: link state change xhci_hcd 0000:00:14.0: clear port3 link state change, portsc: 0x2c0 usb usb2-port3: do warm reset, port only xhci_hcd 0000:00:14.0: xhci_hub_status_data: stopping usb2 port polling xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x2b0, return 0x2b0 usb usb2-port3: not warm reset yet, waiting 50ms xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x2f0, return 0x2f0 usb usb2-port3: not warm reset yet, waiting 200ms ... xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x6802c0, return 0x7002c0 usb usb2-port3: not warm reset yet, waiting 200ms xhci_hcd 0000:00:14.0: clear port3 reset change, portsc: 0x4802c0 xhci_hcd 0000:00:14.0: clear port3 warm(BH) reset change, portsc: 0x4002c0 xhci_hcd 0000:00:14.0: clear port3 link state change, portsc: 0x2c0 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x2c0, return 0x2c0 usb usb2-port3: not enabled, trying warm reset again... This is due to the USB device still cause port change event after xHCI is shuted down: xhci_hcd 0000:38:00.0: // Setting command ring address to 0xffffe001 xhci_hcd 0000:38:00.0: xhci_resume: starting usb3 port polling. xhci_hcd 0000:38:00.0: xhci_hub_status_data: stopping usb4 port polling xhci_hcd 0000:38:00.0: xhci_hub_status_data: stopping usb3 port polling xhci_hcd 0000:38:00.0: hcd_pci_runtime_resume: 0 xhci_hcd 0000:38:00.0: xhci_shutdown: stopping usb3 port polling. xhci_hcd 0000:38:00.0: // Halt the HC xhci_hcd 0000:38:00.0: xhci_shutdown completed - status = 1 xhci_hcd 0000:00:14.0: xhci_shutdown: stopping usb1 port polling. xhci_hcd 0000:00:14.0: // Halt the HC xhci_hcd 0000:00:14.0: xhci_shutdown completed - status = 1 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x1203, return 0x203 xhci_hcd 0000:00:14.0: set port reset, actual port 2-3 status = 0x1311 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x201203, return 0x100203 xhci_hcd 0000:00:14.0: clear port3 reset change, portsc: 0x1203 xhci_hcd 0000:00:14.0: clear port3 warm(BH) reset change, portsc: 0x1203 xhci_hcd 0000:00:14.0: clear port3 link state change, portsc: 0x1203 xhci_hcd 0000:00:14.0: clear port3 connect change, portsc: 0x1203 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x1203, return 0x203 usb 2-3: device not accepting address 2, error -108 xhci_hcd 0000:00:14.0: xHCI dying or halted, can't queue_command xhci_hcd 0000:00:14.0: Set port 2-3 link state, portsc: 0x1203, write 0x11261 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x1263, return 0x263 xhci_hcd 0000:00:14.0: set port reset, actual port 2-3 status = 0x1271 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x12b1, return 0x2b1 usb usb2-port3: not reset yet, waiting 60ms ACPI: PM: Preparing to enter system sleep state S5 xhci_hcd 0000:00:14.0: Get port status 2-3 read: 0x12f1, return 0x2f1 usb usb2-port3: not reset yet, waiting 200ms reboot: Restarting system The port change event is caused by LPM transition, so disabling LPM at shutdown to make sure the device is in U0 for warmboot. Signed-off-by: Kai-Heng Feng Cc: stable Link: https://lore.kernel.org/r/20240305065140.66801-1-kai.heng.feng@canonical.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/port.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/port.c b/drivers/usb/core/port.c index 686c01af03e6..0e1262a077ae 100644 --- a/drivers/usb/core/port.c +++ b/drivers/usb/core/port.c @@ -449,8 +449,10 @@ static void usb_port_shutdown(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); - if (port_dev->child) + if (port_dev->child) { usb_disable_usb2_hardware_lpm(port_dev->child); + usb_unlocked_disable_lpm(port_dev->child); + } } static const struct dev_pm_ops usb_port_pm_ops = { -- cgit v1.2.3 From 1fc9af813b25e146d3607669247d0f970f5a87c3 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 5 Jan 2024 21:46:11 +0300 Subject: drm/panfrost: Fix the error path in panfrost_mmu_map_fault_addr() Subject: [PATCH] drm/panfrost: Fix the error path in panfrost_mmu_map_fault_addr() If some the pages or sgt allocation failed, we shouldn't release the pages ref we got earlier, otherwise we will end up with unbalanced get/put_pages() calls. We should instead leave everything in place and let the BO release function deal with extra cleanup when the object is destroyed, or let the fault handler try again next time it's called. Fixes: 187d2929206e ("drm/panfrost: Add support for GPU heap allocations") Cc: Reviewed-by: Steven Price Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Boris Brezillon Co-developed-by: Dmitry Osipenko Signed-off-by: Dmitry Osipenko Link: https://patchwork.freedesktop.org/patch/msgid/20240105184624.508603-18-dmitry.osipenko@collabora.com --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index f38385fe76bb..b91019cd5acb 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -502,11 +502,18 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, mapping_set_unevictable(mapping); for (i = page_offset; i < page_offset + NUM_FAULT_PAGES; i++) { + /* Can happen if the last fault only partially filled this + * section of the pages array before failing. In that case + * we skip already filled pages. + */ + if (pages[i]) + continue; + pages[i] = shmem_read_mapping_page(mapping, i); if (IS_ERR(pages[i])) { ret = PTR_ERR(pages[i]); pages[i] = NULL; - goto err_pages; + goto err_unlock; } } @@ -514,7 +521,7 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, ret = sg_alloc_table_from_pages(sgt, pages + page_offset, NUM_FAULT_PAGES, 0, SZ_2M, GFP_KERNEL); if (ret) - goto err_pages; + goto err_unlock; ret = dma_map_sgtable(pfdev->dev, sgt, DMA_BIDIRECTIONAL, 0); if (ret) @@ -537,8 +544,6 @@ out: err_map: sg_free_table(sgt); -err_pages: - drm_gem_shmem_put_pages(&bo->base); err_unlock: dma_resv_unlock(obj->resv); err_bo: -- cgit v1.2.3 From 3ddf944b32f88741c303f0b21459dbb3872b8bc5 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 13 Mar 2024 14:48:27 +0100 Subject: x86/mce: Make sure to grab mce_sysfs_mutex in set_bank() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modifying a MCA bank's MCA_CTL bits which control which error types to be reported is done over /sys/devices/system/machinecheck/ ├── machinecheck0 │   ├── bank0 │   ├── bank1 │   ├── bank10 │   ├── bank11 ... sysfs nodes by writing the new bit mask of events to enable. When the write is accepted, the kernel deletes all current timers and reinits all banks. Doing that in parallel can lead to initializing a timer which is already armed and in the timer wheel, i.e., in use already: ODEBUG: init active (active state 0) object: ffff888063a28000 object type: timer_list hint: mce_timer_fn+0x0/0x240 arch/x86/kernel/cpu/mce/core.c:2642 WARNING: CPU: 0 PID: 8120 at lib/debugobjects.c:514 debug_print_object+0x1a0/0x2a0 lib/debugobjects.c:514 Fix that by grabbing the sysfs mutex as the rest of the MCA sysfs code does. Reported by: Yue Sun Reported by: xingwei lee Signed-off-by: Borislav Petkov (AMD) Cc: Link: https://lore.kernel.org/r/CAEkJfYNiENwQY8yV1LYJ9LjJs%2Bx_-PqMv98gKig55=2vbzffRw@mail.gmail.com --- arch/x86/kernel/cpu/mce/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b5cc557cfc37..84d41be6d06b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2500,12 +2500,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return -EINVAL; b = &per_cpu(mce_banks_array, s->id)[bank]; - if (!b->init) return -ENODEV; b->ctl = new; + + mutex_lock(&mce_sysfs_mutex); mce_restart(); + mutex_unlock(&mce_sysfs_mutex); return size; } -- cgit v1.2.3 From 0551ec93a00d935efd86b45bf70cefdc4e515b65 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Apr 2024 14:47:17 +0200 Subject: nvme: don't create a multipath node for zero capacity devices Apparently there are nvme controllers around that report namespaces in the namespace list which have zero capacity. Return -ENXIO instead of -ENODEV from nvme_update_ns_info_block so we don't create a hidden multipath node for these namespaces but entirely ignore them. Fixes: 46e7422cda84 ("nvme: move common logic into nvme_update_ns_info") Reported-by: Nilay Shroff Signed-off-by: Christoph Hellwig Tested-by: Nilay Shroff Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 504dc352c458..27281a9a8951 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2089,7 +2089,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (id->ncap == 0) { /* namespace not allocated or attached */ info->is_removed = true; - ret = -ENODEV; + ret = -ENXIO; goto out; } lbaf = nvme_lbaf_index(id->flbas); -- cgit v1.2.3 From 95409e277d8343810adf8700d29d4329828d452b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 3 Apr 2024 13:31:14 +0200 Subject: nvmet: implement unique discovery NQN Unique discovery NQNs allow to differentiate between discovery services from (typically physically separate) NVMe-oF subsystems. This is required for establishing secured connections as otherwise the credentials won't be unique and the integrity of the connection cannot be guaranteed. This patch adds a configfs attribute 'discovery_nqn' in the 'nvmet' configfs directory to specify the unique discovery NQN. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 47 ++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 7 +++++++ 2 files changed, 54 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 77a6e817b315..a2325330bf22 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1613,6 +1613,11 @@ static struct config_group *nvmet_subsys_make(struct config_group *group, return ERR_PTR(-EINVAL); } + if (sysfs_streq(name, nvmet_disc_subsys->subsysnqn)) { + pr_err("can't create subsystem using unique discovery NQN\n"); + return ERR_PTR(-EINVAL); + } + subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME); if (IS_ERR(subsys)) return ERR_CAST(subsys); @@ -2159,7 +2164,49 @@ static const struct config_item_type nvmet_hosts_type = { static struct config_group nvmet_hosts_group; +static ssize_t nvmet_root_discovery_nqn_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%s\n", nvmet_disc_subsys->subsysnqn); +} + +static ssize_t nvmet_root_discovery_nqn_store(struct config_item *item, + const char *page, size_t count) +{ + struct list_head *entry; + size_t len; + + len = strcspn(page, "\n"); + if (!len || len > NVMF_NQN_FIELD_LEN - 1) + return -EINVAL; + + down_write(&nvmet_config_sem); + list_for_each(entry, &nvmet_subsystems_group.cg_children) { + struct config_item *item = + container_of(entry, struct config_item, ci_entry); + + if (!strncmp(config_item_name(item), page, len)) { + pr_err("duplicate NQN %s\n", config_item_name(item)); + up_write(&nvmet_config_sem); + return -EINVAL; + } + } + memset(nvmet_disc_subsys->subsysnqn, 0, NVMF_NQN_FIELD_LEN); + memcpy(nvmet_disc_subsys->subsysnqn, page, len); + up_write(&nvmet_config_sem); + + return len; +} + +CONFIGFS_ATTR(nvmet_root_, discovery_nqn); + +static struct configfs_attribute *nvmet_root_attrs[] = { + &nvmet_root_attr_discovery_nqn, + NULL, +}; + static const struct config_item_type nvmet_root_type = { + .ct_attrs = nvmet_root_attrs, .ct_owner = THIS_MODULE, }; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 6bbe4df0166c..8860a3eb71ec 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1541,6 +1541,13 @@ static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, } down_read(&nvmet_config_sem); + if (!strncmp(nvmet_disc_subsys->subsysnqn, subsysnqn, + NVMF_NQN_SIZE)) { + if (kref_get_unless_zero(&nvmet_disc_subsys->ref)) { + up_read(&nvmet_config_sem); + return nvmet_disc_subsys; + } + } list_for_each_entry(p, &port->subsystems, entry) { if (!strncmp(p->subsys->subsysnqn, subsysnqn, NVMF_NQN_SIZE)) { -- cgit v1.2.3 From db67bb39eff0fc5db21f49f8ae93e67ed2c5fe01 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 4 Apr 2024 16:41:30 +0200 Subject: nvmet-fc: move RCU read lock to nvmet_fc_assoc_exists The RCU lock is only needed for the lookup loop and not for list_ad_tail_rcu call. Thus move it down the call chain into nvmet_fc_assoc_exists. While at it also fix the name typo of the function. Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/target/fc.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index fd229f310c93..337ee1cb09ae 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1115,16 +1115,21 @@ nvmet_fc_schedule_delete_assoc(struct nvmet_fc_tgt_assoc *assoc) } static bool -nvmet_fc_assoc_exits(struct nvmet_fc_tgtport *tgtport, u64 association_id) +nvmet_fc_assoc_exists(struct nvmet_fc_tgtport *tgtport, u64 association_id) { struct nvmet_fc_tgt_assoc *a; + bool found = false; + rcu_read_lock(); list_for_each_entry_rcu(a, &tgtport->assoc_list, a_list) { - if (association_id == a->association_id) - return true; + if (association_id == a->association_id) { + found = true; + break; + } } + rcu_read_unlock(); - return false; + return found; } static struct nvmet_fc_tgt_assoc * @@ -1164,13 +1169,11 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) ran = ran << BYTES_FOR_QID_SHIFT; spin_lock_irqsave(&tgtport->lock, flags); - rcu_read_lock(); - if (!nvmet_fc_assoc_exits(tgtport, ran)) { + if (!nvmet_fc_assoc_exists(tgtport, ran)) { assoc->association_id = ran; list_add_tail_rcu(&assoc->a_list, &tgtport->assoc_list); done = true; } - rcu_read_unlock(); spin_unlock_irqrestore(&tgtport->lock, flags); } while (!done); -- cgit v1.2.3 From 205fb5fa6fde1b5b426015eb1ff69f2ff25ef5bb Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 4 Apr 2024 16:41:31 +0200 Subject: nvme-fc: rename free_ctrl callback to match name pattern Rename nvme_fc_nvme_ctrl_freed to nvme_fc_free_ctrl to match the name pattern for the callback. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 68a5d971657b..a5b29e9ad342 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2428,7 +2428,7 @@ nvme_fc_ctrl_get(struct nvme_fc_ctrl *ctrl) * controller. Called after last nvme_put_ctrl() call */ static void -nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl) +nvme_fc_free_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); @@ -3384,7 +3384,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, - .free_ctrl = nvme_fc_nvme_ctrl_freed, + .free_ctrl = nvme_fc_free_ctrl, .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_delete_ctrl, .get_address = nvmf_get_address, -- cgit v1.2.3 From 1a4ea83a6e67f1415a1f17c1af5e9c814c882bb5 Mon Sep 17 00:00:00 2001 From: Yuanhe Shu Date: Mon, 26 Feb 2024 11:18:16 +0800 Subject: selftests/ftrace: Limit length in subsystem-enable tests While sched* events being traced and sched* events continuously happen, "[xx] event tracing - enable/disable with subsystem level files" would not stop as on some slower systems it seems to take forever. Select the first 100 lines of output would be enough to judge whether there are more than 3 types of sched events. Fixes: 815b18ea66d6 ("ftracetest: Add basic event tracing test cases") Cc: stable@vger.kernel.org Signed-off-by: Yuanhe Shu Acked-by: Masami Hiramatsu (Google) Acked-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc index b1ede6249866..b7c8f29c09a9 100644 --- a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc +++ b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc @@ -18,7 +18,7 @@ echo 'sched:*' > set_event yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -29,7 +29,7 @@ echo 1 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -40,7 +40,7 @@ echo 0 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -ne 0 ]; then fail "any of scheduler events should not be recorded" fi -- cgit v1.2.3 From 2e91bb99b9d4f756e92e83c4453f894dda220f09 Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Wed, 3 Apr 2024 15:21:58 +0200 Subject: net: usb: ax88179_178a: avoid the interface always configured as random address After the commit d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets"), reset is not executed from bind operation and mac address is not read from the device registers or the devicetree at that moment. Since the check to configure if the assigned mac address is random or not for the interface, happens after the bind operation from usbnet_probe, the interface keeps configured as random address, although the address is correctly read and set during open operation (the only reset now). In order to keep only one reset for the device and to avoid the interface always configured as random address, after reset, configure correctly the suitable field from the driver, if the mac address is read successfully from the device registers or the devicetree. Take into account if a locally administered address (random) was previously stored. cc: stable@vger.kernel.org # 6.6+ Fixes: d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets") Reported-by: Dave Stevenson Signed-off-by: Jose Ignacio Tornos Martinez Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240403132158.344838-1-jtornosm@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/ax88179_178a.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 88e084534853..a9c418890a1c 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1273,6 +1273,8 @@ static void ax88179_get_mac_addr(struct usbnet *dev) if (is_valid_ether_addr(mac)) { eth_hw_addr_set(dev->net, mac); + if (!is_local_ether_addr(mac)) + dev->net->addr_assign_type = NET_ADDR_PERM; } else { netdev_info(dev->net, "invalid MAC address, using random\n"); eth_hw_addr_random(dev->net); -- cgit v1.2.3 From d313eb8b77557a6d5855f42d2234bd592c7b50dd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Apr 2024 13:09:08 +0000 Subject: net/sched: act_skbmod: prevent kernel-infoleak syzbot found that tcf_skbmod_dump() was copying four bytes from kernel stack to user space [1]. The issue here is that 'struct tc_skbmod' has a four bytes hole. We need to clear the structure before filling fields. [1] BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak in copy_to_user_iter lib/iov_iter.c:24 [inline] BUG: KMSAN: kernel-infoleak in iterate_ubuf include/linux/iov_iter.h:29 [inline] BUG: KMSAN: kernel-infoleak in iterate_and_advance2 include/linux/iov_iter.h:245 [inline] BUG: KMSAN: kernel-infoleak in iterate_and_advance include/linux/iov_iter.h:271 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x366/0x2520 lib/iov_iter.c:185 instrument_copy_to_user include/linux/instrumented.h:114 [inline] copy_to_user_iter lib/iov_iter.c:24 [inline] iterate_ubuf include/linux/iov_iter.h:29 [inline] iterate_and_advance2 include/linux/iov_iter.h:245 [inline] iterate_and_advance include/linux/iov_iter.h:271 [inline] _copy_to_iter+0x366/0x2520 lib/iov_iter.c:185 copy_to_iter include/linux/uio.h:196 [inline] simple_copy_to_iter net/core/datagram.c:532 [inline] __skb_datagram_iter+0x185/0x1000 net/core/datagram.c:420 skb_copy_datagram_iter+0x5c/0x200 net/core/datagram.c:546 skb_copy_datagram_msg include/linux/skbuff.h:4050 [inline] netlink_recvmsg+0x432/0x1610 net/netlink/af_netlink.c:1962 sock_recvmsg_nosec net/socket.c:1046 [inline] sock_recvmsg+0x2c4/0x340 net/socket.c:1068 __sys_recvfrom+0x35a/0x5f0 net/socket.c:2242 __do_sys_recvfrom net/socket.c:2260 [inline] __se_sys_recvfrom net/socket.c:2256 [inline] __x64_sys_recvfrom+0x126/0x1d0 net/socket.c:2256 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was stored to memory at: pskb_expand_head+0x30f/0x19d0 net/core/skbuff.c:2253 netlink_trim+0x2c2/0x330 net/netlink/af_netlink.c:1317 netlink_unicast+0x9f/0x1260 net/netlink/af_netlink.c:1351 nlmsg_unicast include/net/netlink.h:1144 [inline] nlmsg_notify+0x21d/0x2f0 net/netlink/af_netlink.c:2610 rtnetlink_send+0x73/0x90 net/core/rtnetlink.c:741 rtnetlink_maybe_send include/linux/rtnetlink.h:17 [inline] tcf_add_notify net/sched/act_api.c:2048 [inline] tcf_action_add net/sched/act_api.c:2071 [inline] tc_ctl_action+0x146e/0x19d0 net/sched/act_api.c:2119 rtnetlink_rcv_msg+0x1737/0x1900 net/core/rtnetlink.c:6595 netlink_rcv_skb+0x375/0x650 net/netlink/af_netlink.c:2559 rtnetlink_rcv+0x34/0x40 net/core/rtnetlink.c:6613 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0xf4c/0x1260 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x10df/0x11f0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 ____sys_sendmsg+0x877/0xb60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x4a0 net/socket.c:2674 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was stored to memory at: __nla_put lib/nlattr.c:1041 [inline] nla_put+0x1c6/0x230 lib/nlattr.c:1099 tcf_skbmod_dump+0x23f/0xc20 net/sched/act_skbmod.c:256 tcf_action_dump_old net/sched/act_api.c:1191 [inline] tcf_action_dump_1+0x85e/0x970 net/sched/act_api.c:1227 tcf_action_dump+0x1fd/0x460 net/sched/act_api.c:1251 tca_get_fill+0x519/0x7a0 net/sched/act_api.c:1628 tcf_add_notify_msg net/sched/act_api.c:2023 [inline] tcf_add_notify net/sched/act_api.c:2042 [inline] tcf_action_add net/sched/act_api.c:2071 [inline] tc_ctl_action+0x1365/0x19d0 net/sched/act_api.c:2119 rtnetlink_rcv_msg+0x1737/0x1900 net/core/rtnetlink.c:6595 netlink_rcv_skb+0x375/0x650 net/netlink/af_netlink.c:2559 rtnetlink_rcv+0x34/0x40 net/core/rtnetlink.c:6613 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0xf4c/0x1260 net/netlink/af_netlink.c:1361 netlink_sendmsg+0x10df/0x11f0 net/netlink/af_netlink.c:1905 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 ____sys_sendmsg+0x877/0xb60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x4a0 net/socket.c:2674 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Local variable opt created at: tcf_skbmod_dump+0x9d/0xc20 net/sched/act_skbmod.c:244 tcf_action_dump_old net/sched/act_api.c:1191 [inline] tcf_action_dump_1+0x85e/0x970 net/sched/act_api.c:1227 Bytes 188-191 of 248 are uninitialized Memory access of size 248 starts at ffff888117697680 Data copied to user address 00007ffe56d855f0 Fixes: 86da71b57383 ("net_sched: Introduce skbmod action") Signed-off-by: Eric Dumazet Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240403130908.93421-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/act_skbmod.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 39945b139c48..cd0accaf844a 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -241,13 +241,13 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbmod *d = to_skbmod(a); unsigned char *b = skb_tail_pointer(skb); struct tcf_skbmod_params *p; - struct tc_skbmod opt = { - .index = d->tcf_index, - .refcnt = refcount_read(&d->tcf_refcnt) - ref, - .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - }; + struct tc_skbmod opt; struct tcf_t t; + memset(&opt, 0, sizeof(opt)); + opt.index = d->tcf_index; + opt.refcnt = refcount_read(&d->tcf_refcnt) - ref, + opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind; spin_lock_bh(&d->tcf_lock); opt.action = d->tcf_action; p = rcu_dereference_protected(d->skbmod_p, -- cgit v1.2.3 From 0c83842df40f86e529db6842231154772c20edcc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Apr 2024 12:20:51 +0000 Subject: netfilter: validate user input for expected length I got multiple syzbot reports showing old bugs exposed by BPF after commit 20f2505fb436 ("bpf: Try to avoid kzalloc in cgroup/{s,g}etsockopt") setsockopt() @optlen argument should be taken into account before copying data. BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in do_replace net/ipv4/netfilter/ip_tables.c:1111 [inline] BUG: KASAN: slab-out-of-bounds in do_ipt_set_ctl+0x902/0x3dd0 net/ipv4/netfilter/ip_tables.c:1627 Read of size 96 at addr ffff88802cd73da0 by task syz-executor.4/7238 CPU: 1 PID: 7238 Comm: syz-executor.4 Not tainted 6.9.0-rc2-next-20240403-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 kasan_check_range+0x282/0x290 mm/kasan/generic.c:189 __asan_memcpy+0x29/0x70 mm/kasan/shadow.c:105 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] do_replace net/ipv4/netfilter/ip_tables.c:1111 [inline] do_ipt_set_ctl+0x902/0x3dd0 net/ipv4/netfilter/ip_tables.c:1627 nf_setsockopt+0x295/0x2c0 net/netfilter/nf_sockopt.c:101 do_sock_setsockopt+0x3af/0x720 net/socket.c:2311 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x72/0x7a RIP: 0033:0x7fd22067dde9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fd21f9ff0c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00007fd2207abf80 RCX: 00007fd22067dde9 RDX: 0000000000000040 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 00007fd2206ca47a R08: 0000000000000001 R09: 0000000000000000 R10: 0000000020000880 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007fd2207abf80 R15: 00007ffd2d0170d8 Allocated by task 7238: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:370 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:387 kasan_kmalloc include/linux/kasan.h:211 [inline] __do_kmalloc_node mm/slub.c:4069 [inline] __kmalloc_noprof+0x200/0x410 mm/slub.c:4082 kmalloc_noprof include/linux/slab.h:664 [inline] __cgroup_bpf_run_filter_setsockopt+0xd47/0x1050 kernel/bpf/cgroup.c:1869 do_sock_setsockopt+0x6b4/0x720 net/socket.c:2293 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x72/0x7a The buggy address belongs to the object at ffff88802cd73da0 which belongs to the cache kmalloc-8 of size 8 The buggy address is located 0 bytes inside of allocated 1-byte region [ffff88802cd73da0, ffff88802cd73da1) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88802cd73020 pfn:0x2cd73 flags: 0xfff80000000000(node=0|zone=1|lastcpupid=0xfff) page_type: 0xffffefff(slab) raw: 00fff80000000000 ffff888015041280 dead000000000100 dead000000000122 raw: ffff88802cd73020 000000008080007f 00000001ffffefff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x12cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY), pid 5103, tgid 2119833701 (syz-executor.4), ts 5103, free_ts 70804600828 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x1f3/0x230 mm/page_alloc.c:1490 prep_new_page mm/page_alloc.c:1498 [inline] get_page_from_freelist+0x2e7e/0x2f40 mm/page_alloc.c:3454 __alloc_pages_noprof+0x256/0x6c0 mm/page_alloc.c:4712 __alloc_pages_node_noprof include/linux/gfp.h:244 [inline] alloc_pages_node_noprof include/linux/gfp.h:271 [inline] alloc_slab_page+0x5f/0x120 mm/slub.c:2249 allocate_slab+0x5a/0x2e0 mm/slub.c:2412 new_slab mm/slub.c:2465 [inline] ___slab_alloc+0xcd1/0x14b0 mm/slub.c:3615 __slab_alloc+0x58/0xa0 mm/slub.c:3705 __slab_alloc_node mm/slub.c:3758 [inline] slab_alloc_node mm/slub.c:3936 [inline] __do_kmalloc_node mm/slub.c:4068 [inline] kmalloc_node_track_caller_noprof+0x286/0x450 mm/slub.c:4089 kstrdup+0x3a/0x80 mm/util.c:62 device_rename+0xb5/0x1b0 drivers/base/core.c:4558 dev_change_name+0x275/0x860 net/core/dev.c:1232 do_setlink+0xa4b/0x41f0 net/core/rtnetlink.c:2864 __rtnl_newlink net/core/rtnetlink.c:3680 [inline] rtnl_newlink+0x180b/0x20a0 net/core/rtnetlink.c:3727 rtnetlink_rcv_msg+0x89b/0x10d0 net/core/rtnetlink.c:6594 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2559 netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline] netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1361 page last free pid 5146 tgid 5146 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1110 [inline] free_unref_page+0xd3c/0xec0 mm/page_alloc.c:2617 discard_slab mm/slub.c:2511 [inline] __put_partials+0xeb/0x130 mm/slub.c:2980 put_cpu_partial+0x17c/0x250 mm/slub.c:3055 __slab_free+0x2ea/0x3d0 mm/slub.c:4254 qlink_free mm/kasan/quarantine.c:163 [inline] qlist_free_all+0x9e/0x140 mm/kasan/quarantine.c:179 kasan_quarantine_reduce+0x14f/0x170 mm/kasan/quarantine.c:286 __kasan_slab_alloc+0x23/0x80 mm/kasan/common.c:322 kasan_slab_alloc include/linux/kasan.h:201 [inline] slab_post_alloc_hook mm/slub.c:3888 [inline] slab_alloc_node mm/slub.c:3948 [inline] __do_kmalloc_node mm/slub.c:4068 [inline] __kmalloc_node_noprof+0x1d7/0x450 mm/slub.c:4076 kmalloc_node_noprof include/linux/slab.h:681 [inline] kvmalloc_node_noprof+0x72/0x190 mm/util.c:634 bucket_table_alloc lib/rhashtable.c:186 [inline] rhashtable_rehash_alloc+0x9e/0x290 lib/rhashtable.c:367 rht_deferred_worker+0x4e1/0x2440 lib/rhashtable.c:427 process_one_work kernel/workqueue.c:3218 [inline] process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3299 worker_thread+0x86d/0xd70 kernel/workqueue.c:3380 kthread+0x2f0/0x390 kernel/kthread.c:388 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:243 Memory state around the buggy address: ffff88802cd73c80: 07 fc fc fc 05 fc fc fc 05 fc fc fc fa fc fc fc ffff88802cd73d00: fa fc fc fc fa fc fc fc fa fc fc fc fa fc fc fc >ffff88802cd73d80: fa fc fc fc 01 fc fc fc fa fc fc fc fa fc fc fc ^ ffff88802cd73e00: fa fc fc fc fa fc fc fc 05 fc fc fc 07 fc fc fc ffff88802cd73e80: 07 fc fc fc 07 fc fc fc 07 fc fc fc 07 fc fc fc Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Pablo Neira Ayuso Link: https://lore.kernel.org/r/20240404122051.2303764-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/bridge/netfilter/ebtables.c | 6 ++++++ net/ipv4/netfilter/arp_tables.c | 4 ++++ net/ipv4/netfilter/ip_tables.c | 4 ++++ net/ipv6/netfilter/ip6_tables.c | 4 ++++ 4 files changed, 18 insertions(+) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 99d82676f780..cbd0e3586c3f 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1111,6 +1111,8 @@ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) struct ebt_table_info *newinfo; struct ebt_replace tmp; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1423,6 +1425,8 @@ static int update_counters(struct net *net, sockptr_t arg, unsigned int len) { struct ebt_replace hlp; + if (len < sizeof(hlp)) + return -EINVAL; if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; @@ -2352,6 +2356,8 @@ static int compat_update_counters(struct net *net, sockptr_t arg, { struct compat_ebt_replace hlp; + if (len < sizeof(hlp)) + return -EINVAL; if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 2407066b0fec..b150c9929b12 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -956,6 +956,8 @@ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct arpt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1254,6 +1256,8 @@ static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct arpt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 7da1df4997d0..487670759578 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1108,6 +1108,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1492,6 +1494,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index fd9f049d6d41..636b360311c5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1125,6 +1125,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1501,6 +1503,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; -- cgit v1.2.3 From 72d7cb5c190befbb095bae7737e71560ec0fcaa6 Mon Sep 17 00:00:00 2001 From: Shengyu Li Date: Wed, 27 Mar 2024 05:13:15 +0800 Subject: selftests/harness: Prevent infinite loop due to Assert in FIXTURE_TEARDOWN This patch addresses an issue in the selftests/harness where an assertion within FIXTURE_TEARDOWN could trigger an infinite loop. The problem arises because the teardown procedure is meant to execute once, but the presence of failing assertions (ASSERT_EQ(0, 1)) leads to repeated attempts to execute teardown due to the long jump mechanism used by the harness for handling assertions. To resolve this, the patch ensures that the teardown process runs only once, regardless of assertion outcomes, preventing the infinite loop and allowing tests to fail. A simple test demo(test.c): #include "kselftest_harness.h" FIXTURE(f) { int fd; }; FIXTURE_SETUP(f) { self->fd = 0; } FIXTURE_TEARDOWN(f) { TH_LOG("TEARDOWN"); ASSERT_EQ(0, 1); self->fd = -1; } TEST_F(f, open_close) { ASSERT_NE(self->fd, 1); } TEST_HARNESS_MAIN will always output the following output due to a dead loop until timeout: # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) ... But here's what we should and expect to get: TAP version 13 1..1 # Starting 1 tests from 2 test cases. # RUN f.open_close ... # test.c:15:open_close:TEARDOWN # test.c:16:open_close:Expected 0 (0) == 1 (1) # open_close: Test terminated by assertion # FAIL f.open_close not ok 1 f.open_close # FAILED: 0 / 1 tests passed. # Totals: pass:0 fail:1 xfail:0 xpass:0 skip:0 error:0 also this is related to the issue mentioned in this patch https://patchwork.kernel.org/project/linux-kselftest/patch/e2ba3f8c-80e6-477d-9cea-1c9af820e0ed@alu.unizg.hr/ Signed-off-by: Shengyu Li Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_harness.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 4fd735e48ee7..230d62884885 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -383,6 +383,7 @@ FIXTURE_DATA(fixture_name) self; \ pid_t child = 1; \ int status = 0; \ + bool jmp = false; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ if (setjmp(_metadata->env) == 0) { \ /* Use the same _metadata. */ \ @@ -399,8 +400,10 @@ _metadata->exit_code = KSFT_FAIL; \ } \ } \ + else \ + jmp = true; \ if (child == 0) { \ - if (_metadata->setup_completed && !_metadata->teardown_parent) \ + if (_metadata->setup_completed && !_metadata->teardown_parent && !jmp) \ fixture_name##_teardown(_metadata, &self, variant->data); \ _exit(0); \ } \ -- cgit v1.2.3 From 83092341e15d0dfee1caa8dc502f66c815ccd78a Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Thu, 4 Apr 2024 11:33:28 +0200 Subject: gpio: cdev: fix missed label sanitizing in debounce_setup() When adding sanitization of the label, the path through edge_detector_setup() that leads to debounce_setup() was overlooked. A request taking this path does not allocate a new label and the request label is freed twice when the request is released, resulting in memory corruption. Add label sanitization to debounce_setup(). Cc: stable@vger.kernel.org Fixes: b34490879baa ("gpio: cdev: sanitize the label before requesting the interrupt") Signed-off-by: Kent Gibson [Bartosz: rebased on top of the fix for empty GPIO labels] Co-developed-by: Bartosz Golaszewski Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-cdev.c | 49 ++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index 1426cc1c4a28..d09c7d728365 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -728,6 +728,25 @@ static u32 line_event_id(int level) GPIO_V2_LINE_EVENT_FALLING_EDGE; } +static inline char *make_irq_label(const char *orig) +{ + char *new; + + if (!orig) + return NULL; + + new = kstrdup_and_replace(orig, '/', ':', GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + return new; +} + +static inline void free_irq_label(const char *label) +{ + kfree(label); +} + #ifdef CONFIG_HTE static enum hte_return process_hw_ts_thread(void *p) @@ -1015,6 +1034,7 @@ static int debounce_setup(struct line *line, unsigned int debounce_period_us) { unsigned long irqflags; int ret, level, irq; + char *label; /* try hardware */ ret = gpiod_set_debounce(line->desc, debounce_period_us); @@ -1037,11 +1057,17 @@ static int debounce_setup(struct line *line, unsigned int debounce_period_us) if (irq < 0) return -ENXIO; + label = make_irq_label(line->req->label); + if (IS_ERR(label)) + return -ENOMEM; + irqflags = IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING; ret = request_irq(irq, debounce_irq_handler, irqflags, - line->req->label, line); - if (ret) + label, line); + if (ret) { + free_irq_label(label); return ret; + } line->irq = irq; } else { ret = hte_edge_setup(line, GPIO_V2_LINE_FLAG_EDGE_BOTH); @@ -1083,25 +1109,6 @@ static u32 gpio_v2_line_config_debounce_period(struct gpio_v2_line_config *lc, return 0; } -static inline char *make_irq_label(const char *orig) -{ - char *new; - - if (!orig) - return NULL; - - new = kstrdup_and_replace(orig, '/', ':', GFP_KERNEL); - if (!new) - return ERR_PTR(-ENOMEM); - - return new; -} - -static inline void free_irq_label(const char *label) -{ - kfree(label); -} - static void edge_detector_stop(struct line *line) { if (line->irq) { -- cgit v1.2.3 From 8130b05c559d1aa83d0c8971b422ba0da18ef24a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 4 Apr 2024 12:42:00 +0200 Subject: PM: EM: fix wrong utilization estimation in em_cpu_energy() Commit 1b600da51073 ("PM: EM: Optimize em_cpu_energy() and remove division") has added back map_util_perf() in em_cpu_energy() computation which has been removed with the rework of scheduler/cpufreq interface. This is wrong because sugov_effective_cpu_perf() already takes care of mapping the utilization to a performance level. Fixes: 1b600da51073 ("PM: EM: Optimize em_cpu_energy() and remove division") Signed-off-by: Vincent Guittot Reviewed-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 770755df852f..70cd7258cd29 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -245,7 +245,6 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * max utilization to the allowed CPU capacity before calculating * effective performance. */ - max_util = map_util_perf(max_util); max_util = min(max_util, allowed_cpu_cap); /* -- cgit v1.2.3 From 90f8917e7a15f6dd508779048bdf00ce119b6ca0 Mon Sep 17 00:00:00 2001 From: Chaitanya Kumar Borah Date: Thu, 4 Apr 2024 13:48:13 -0500 Subject: ASoC: SOF: Core: Add remove_late() to sof_init_environment failure path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In cases where the sof driver is unable to find the firmware and/or topology file [1], it exits without releasing the i915 runtime pm wakeref [2]. This results in dmesg warnings[3] during suspend/resume or driver unbind. Add remove_late() to the failure path of sof_init_environment so that i915 wakeref is released appropriately [1] [ 8.990366] sof-audio-pci-intel-mtl 0000:00:1f.3: SOF firmware and/or topology file not found. [ 8.990396] sof-audio-pci-intel-mtl 0000:00:1f.3: Supported default profiles [ 8.990398] sof-audio-pci-intel-mtl 0000:00:1f.3: - ipc type 1 (Requested): [ 8.990399] sof-audio-pci-intel-mtl 0000:00:1f.3: Firmware file: intel/sof-ipc4/mtl/sof-mtl.ri [ 8.990401] sof-audio-pci-intel-mtl 0000:00:1f.3: Topology file: intel/sof-ace-tplg/sof-mtl-rt711-2ch.tplg [ 8.990402] sof-audio-pci-intel-mtl 0000:00:1f.3: Check if you have 'sof-firmware' package installed. [ 8.990403] sof-audio-pci-intel-mtl 0000:00:1f.3: Optionally it can be manually downloaded from: [ 8.990404] sof-audio-pci-intel-mtl 0000:00:1f.3: https://github.com/thesofproject/sof-bin/ [ 8.999088] sof-audio-pci-intel-mtl 0000:00:1f.3: error: sof_probe_work failed err: -2 [2] ref_tracker: 0000:00:02.0@ffff9b8511b6a378 has 1/5 users at track_intel_runtime_pm_wakeref.part.0+0x36/0x70 [i915] __intel_runtime_pm_get+0x51/0xb0 [i915] intel_runtime_pm_get+0x17/0x20 [i915] intel_display_power_get+0x2f/0x70 [i915] i915_audio_component_get_power+0x23/0x120 [i915] snd_hdac_display_power+0x89/0x130 [snd_hda_core] hda_codec_i915_init+0x3f/0x50 [snd_sof_intel_hda] hda_dsp_probe_early+0x170/0x250 [snd_sof_intel_hda_common] snd_sof_device_probe+0x224/0x320 [snd_sof] sof_pci_probe+0x15b/0x220 [snd_sof_pci] hda_pci_intel_probe+0x30/0x70 [snd_sof_intel_hda_common] local_pci_probe+0x4c/0xb0 pci_device_probe+0xcc/0x250 really_probe+0x18e/0x420 __driver_probe_device+0x7e/0x170 driver_probe_device+0x23/0xa0 [3] [ 484.105070] ------------[ cut here ]------------ [ 484.108238] thunderbolt 0000:00:0d.2: PM: pci_pm_suspend_late+0x0/0x50 returned 0 after 0 usecs [ 484.117106] i915 0000:00:02.0: i915 raw-wakerefs=1 wakelocks=1 on cleanup [ 484.792005] WARNING: CPU: 2 PID: 2405 at drivers/gpu/drm/i915/intel_runtime_pm.c:444 intel_runtime_pm_driver_release+0x6c/0x80 Tested-by: Rodrigo Vivi Reviewed-by: Rodrigo Vivi Reviewed-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Kai Vehmanen Signed-off-by: Chaitanya Kumar Borah Signed-off-by: Pierre-Louis Bossart Acked-by: Lucas De Marchi Link: https://github.com/thesofproject/linux/pull/4878 Signed-off-by: Rodrigo Vivi Link: https://msgid.link/r/20240404184813.134566-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sound/soc/sof/core.c b/sound/soc/sof/core.c index 9b00ede2a486..cc84d4c81be9 100644 --- a/sound/soc/sof/core.c +++ b/sound/soc/sof/core.c @@ -339,8 +339,7 @@ static int sof_init_environment(struct snd_sof_dev *sdev) ret = snd_sof_probe(sdev); if (ret < 0) { dev_err(sdev->dev, "failed to probe DSP %d\n", ret); - sof_ops_free(sdev); - return ret; + goto err_sof_probe; } /* check machine info */ @@ -358,15 +357,18 @@ static int sof_init_environment(struct snd_sof_dev *sdev) ret = validate_sof_ops(sdev); if (ret < 0) { snd_sof_remove(sdev); + snd_sof_remove_late(sdev); return ret; } } + return 0; + err_machine_check: - if (ret) { - snd_sof_remove(sdev); - sof_ops_free(sdev); - } + snd_sof_remove(sdev); +err_sof_probe: + snd_sof_remove_late(sdev); + sof_ops_free(sdev); return ret; } -- cgit v1.2.3 From dd33e5dc7247041b565014f66286c9566b0e32b6 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 19 Mar 2024 16:40:05 +0100 Subject: riscv: use KERN_INFO in do_trap Print the instruction dump with info instead of emergency level. The unhandled signal message is only for informational purpose. Fixes: b8a03a634129 ("riscv: add userland instruction dump to RISC-V splats") Signed-off-by: Andreas Schwab Reviewed-by: Conor Dooley Reviewed-by: Atish Patra Reviewed-by: Yunhui Cui Link: https://lore.kernel.org/r/mvmy1aegrhm.fsf@suse.de Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 868d6280cf66..05a16b1f0aee 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -122,7 +122,7 @@ void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr) print_vma_addr(KERN_CONT " in ", instruction_pointer(regs)); pr_cont("\n"); __show_regs(regs); - dump_instr(KERN_EMERG, regs); + dump_instr(KERN_INFO, regs); } force_sig_fault(signo, code, (void __user *)addr); -- cgit v1.2.3 From 8a48ea87ce89fb701624f4b9e82556c81f30c7dc Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 26 Mar 2024 21:30:16 +0100 Subject: riscv: Fix warning by declaring arch_cpu_idle() as noinstr The following warning appears when using ftrace: [89855.443413] RCU not on for: arch_cpu_idle+0x0/0x1c [89855.445640] WARNING: CPU: 5 PID: 0 at include/linux/trace_recursion.h:162 arch_ftrace_ops_list_func+0x208/0x228 [89855.445824] Modules linked in: xt_conntrack(E) nft_chain_nat(E) xt_MASQUERADE(E) nf_conntrack_netlink(E) xt_addrtype(E) nft_compat(E) nf_tables(E) nfnetlink(E) br_netfilter(E) cfg80211(E) nls_iso8859_1(E) ofpart(E) redboot(E) cmdlinepart(E) cfi_cmdset_0001(E) virtio_net(E) cfi_probe(E) cfi_util(E) 9pnet_virtio(E) gen_probe(E) net_failover(E) virtio_rng(E) failover(E) 9pnet(E) physmap(E) map_funcs(E) chipreg(E) mtd(E) uio_pdrv_genirq(E) uio(E) dm_multipath(E) scsi_dh_rdac(E) scsi_dh_emc(E) scsi_dh_alua(E) drm(E) efi_pstore(E) backlight(E) ip_tables(E) x_tables(E) raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E) async_xor(E) xor(E) async_tx(E) raid6_pq(E) raid1(E) raid0(E) virtio_blk(E) [89855.451563] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G E 6.8.0-rc6ubuntu-defconfig #2 [89855.451726] Hardware name: riscv-virtio,qemu (DT) [89855.451899] epc : arch_ftrace_ops_list_func+0x208/0x228 [89855.452016] ra : arch_ftrace_ops_list_func+0x208/0x228 [89855.452119] epc : ffffffff8016b216 ra : ffffffff8016b216 sp : ffffaf808090fdb0 [89855.452171] gp : ffffffff827c7680 tp : ffffaf808089ad40 t0 : ffffffff800c0dd8 [89855.452216] t1 : 0000000000000001 t2 : 0000000000000000 s0 : ffffaf808090fe30 [89855.452306] s1 : 0000000000000000 a0 : 0000000000000026 a1 : ffffffff82cd6ac8 [89855.452423] a2 : ffffffff800458c8 a3 : ffffaf80b1870640 a4 : 0000000000000000 [89855.452646] a5 : 0000000000000000 a6 : 00000000ffffffff a7 : ffffffffffffffff [89855.452698] s2 : ffffffff82766872 s3 : ffffffff80004caa s4 : ffffffff80ebea90 [89855.452743] s5 : ffffaf808089bd40 s6 : 8000000a00006e00 s7 : 0000000000000008 [89855.452787] s8 : 0000000000002000 s9 : 0000000080043700 s10: 0000000000000000 [89855.452831] s11: 0000000000000000 t3 : 0000000000100000 t4 : 0000000000000064 [89855.452874] t5 : 000000000000000c t6 : ffffaf80b182dbfc [89855.452929] status: 0000000200000100 badaddr: 0000000000000000 cause: 0000000000000003 [89855.453053] [] arch_ftrace_ops_list_func+0x208/0x228 [89855.453191] [] ftrace_call+0x8/0x22 [89855.453265] [] do_idle+0x24c/0x2ca [89855.453357] [] return_to_handler+0x0/0x26 [89855.453429] [] smp_callin+0x92/0xb6 [89855.453785] ---[ end trace 0000000000000000 ]--- To fix this, mark arch_cpu_idle() as noinstr, like it is done in commit a9cbc1b471d2 ("s390/idle: mark arch_cpu_idle() noinstr"). Reported-by: Evgenii Shatokhin Closes: https://lore.kernel.org/linux-riscv/51f21b87-ebed-4411-afbc-c00d3dea2bab@yadro.com/ Fixes: cfbc4f81c9d0 ("riscv: Select ARCH_WANTS_NO_INSTR") Signed-off-by: Alexandre Ghiti Reviewed-by: Andy Chiu Tested-by: Andy Chiu Acked-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240326203017.310422-2-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 92922dbd5b5c..6abeecbfc51d 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -37,7 +37,7 @@ EXPORT_SYMBOL(__stack_chk_guard); extern asmlinkage void ret_from_fork(void); -void arch_cpu_idle(void) +void noinstr arch_cpu_idle(void) { cpu_do_idle(); } -- cgit v1.2.3 From a370c2419e4680a27382d9231edcf739d5d74efc Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 26 Mar 2024 21:30:17 +0100 Subject: riscv: Disable preemption when using patch_map() patch_map() uses fixmap mappings to circumvent the non-writability of the kernel text mapping. The __set_fixmap() function only flushes the current cpu tlb, it does not emit an IPI so we must make sure that while we use a fixmap mapping, the current task is not migrated on another cpu which could miss the newly introduced fixmap mapping. So in order to avoid any task migration, disable the preemption. Reported-by: Andrea Parri Closes: https://lore.kernel.org/all/ZcS+GAaM25LXsBOl@andrea/ Reported-by: Andy Chiu Closes: https://lore.kernel.org/linux-riscv/CABgGipUMz3Sffu-CkmeUB1dKVwVQ73+7=sgC45-m0AE9RCjOZg@mail.gmail.com/ Fixes: cad539baa48f ("riscv: implement a memset like function for text") Fixes: 0ff7c3b33127 ("riscv: Use text_mutex instead of patch_lock") Co-developed-by: Andy Chiu Signed-off-by: Andy Chiu Signed-off-by: Alexandre Ghiti Acked-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240326203017.310422-3-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/patch.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 37e87fdcf6a0..30e12b310cab 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -80,6 +80,8 @@ static int __patch_insn_set(void *addr, u8 c, size_t len) */ lockdep_assert_held(&text_mutex); + preempt_disable(); + if (across_pages) patch_map(addr + PAGE_SIZE, FIX_TEXT_POKE1); @@ -92,6 +94,8 @@ static int __patch_insn_set(void *addr, u8 c, size_t len) if (across_pages) patch_unmap(FIX_TEXT_POKE1); + preempt_enable(); + return 0; } NOKPROBE_SYMBOL(__patch_insn_set); @@ -122,6 +126,8 @@ static int __patch_insn_write(void *addr, const void *insn, size_t len) if (!riscv_patch_in_stop_machine) lockdep_assert_held(&text_mutex); + preempt_disable(); + if (across_pages) patch_map(addr + PAGE_SIZE, FIX_TEXT_POKE1); @@ -134,6 +140,8 @@ static int __patch_insn_write(void *addr, const void *insn, size_t len) if (across_pages) patch_unmap(FIX_TEXT_POKE1); + preempt_enable(); + return ret; } NOKPROBE_SYMBOL(__patch_insn_write); -- cgit v1.2.3 From d14fa1fcf69db9d070e75f1c4425211fa619dfc8 Mon Sep 17 00:00:00 2001 From: Stefan O'Rear Date: Wed, 27 Mar 2024 02:12:58 -0400 Subject: riscv: process: Fix kernel gp leakage childregs represents the registers which are active for the new thread in user context. For a kernel thread, childregs->gp is never used since the kernel gp is not touched by switch_to. For a user mode helper, the gp value can be observed in user space after execve or possibly by other means. [From the email thread] The /* Kernel thread */ comment is somewhat inaccurate in that it is also used for user_mode_helper threads, which exec a user process, e.g. /sbin/init or when /proc/sys/kernel/core_pattern is a pipe. Such threads do not have PF_KTHREAD set and are valid targets for ptrace etc. even before they exec. childregs is the *user* context during syscall execution and it is observable from userspace in at least five ways: 1. kernel_execve does not currently clear integer registers, so the starting register state for PID 1 and other user processes started by the kernel has sp = user stack, gp = kernel __global_pointer$, all other integer registers zeroed by the memset in the patch comment. This is a bug in its own right, but I'm unwilling to bet that it is the only way to exploit the issue addressed by this patch. 2. ptrace(PTRACE_GETREGSET): you can PTRACE_ATTACH to a user_mode_helper thread before it execs, but ptrace requires SIGSTOP to be delivered which can only happen at user/kernel boundaries. 3. /proc/*/task/*/syscall: this is perfectly happy to read pt_regs for user_mode_helpers before the exec completes, but gp is not one of the registers it returns. 4. PERF_SAMPLE_REGS_USER: LOCKDOWN_PERF normally prevents access to kernel addresses via PERF_SAMPLE_REGS_INTR, but due to this bug kernel addresses are also exposed via PERF_SAMPLE_REGS_USER which is permitted under LOCKDOWN_PERF. I have not attempted to write exploit code. 5. Much of the tracing infrastructure allows access to user registers. I have not attempted to determine which forms of tracing allow access to user registers without already allowing access to kernel registers. Fixes: 7db91e57a0ac ("RISC-V: Task implementation") Cc: stable@vger.kernel.org Signed-off-by: Stefan O'Rear Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240327061258.2370291-1-sorear@fastmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/process.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 6abeecbfc51d..e4bc61c4e58a 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -27,8 +27,6 @@ #include #include -register unsigned long gp_in_global __asm__("gp"); - #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include unsigned long __stack_chk_guard __read_mostly; @@ -207,7 +205,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (unlikely(args->fn)) { /* Kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->gp = gp_in_global; /* Supervisor/Machine, irqs on: */ childregs->status = SR_PP | SR_PIE; -- cgit v1.2.3 From 01e5f4fc0fead3ef19000e1d2fc748e87aac3f02 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Apr 2024 15:50:26 -0400 Subject: bcachefs: Make snapshot_is_ancestor() safe Snapshot table accesses generally need to be checking for invalid snapshot ID now, fix one that was missed. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 0e806f04f3d7..544322d5c251 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -125,6 +125,15 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances return s->parent; } +static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) +{ + const struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return false; + + return test_bit(ancestor - id - 1, s->is_ancestor); +} + bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { bool ret; @@ -140,13 +149,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); - if (id && id < ancestor) { - ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); + ret = id && id < ancestor + ? test_ancestor_bitmap(t, id, ancestor) + : id == ancestor; - EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); - } else { - ret = id == ancestor; - } + EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); out: rcu_read_unlock(); -- cgit v1.2.3 From be42e4a621fee05e3299169fbb1068b473e779c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Apr 2024 16:51:40 -0400 Subject: bcachefs: Bump limit in btree_trans_too_many_iters() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 2 +- fs/bcachefs/btree_types.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 24772538e4cc..1d58d447b386 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -642,7 +642,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *); static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8) + if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8) return __bch2_btree_trans_too_many_iters(trans); return 0; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9404d96c38f3..e0c982a4195c 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -364,7 +364,21 @@ struct btree_insert_entry { unsigned long ip_allocated; }; +/* Number of btree paths we preallocate, usually enough */ #define BTREE_ITER_INITIAL 64 +/* + * Lmiit for btree_trans_too_many_iters(); this is enough that almost all code + * paths should run inside this limit, and if they don't it usually indicates a + * bug (leaking/duplicated btree paths). + * + * exception: some fsck paths + * + * bugs with excessive path usage seem to have possibly been eliminated now, so + * we might consider eliminating this (and btree_trans_too_many_iter()) at some + * point. + */ +#define BTREE_ITER_NORMAL_LIMIT 256 +/* never exceed limit */ #define BTREE_ITER_MAX (1U << 10) struct btree_trans_commit_hook; -- cgit v1.2.3 From 9fb3036fe3d9414ae32a97d01d7ccf7550e168a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Apr 2024 19:15:53 -0400 Subject: bcachefs: Move btree_updates to debugfs sysfs is limited to PAGE_SIZE, and when we're debugging strange deadlocks/priority inversions we need to see the full list. Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- fs/bcachefs/sysfs.c | 6 ------ 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7b681ae91510..cd99b7399414 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -13,6 +13,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "buckets.h" #include "debug.h" #include "error.h" @@ -668,7 +669,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, i->size = size; i->ret = 0; - do { + while (1) { err = flush_buf(i); if (err) return err; @@ -676,9 +677,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, if (!i->size) break; + if (done) + break; + done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); i->iter++; - } while (!done); + } if (i->buf.allocation_failure) return -ENOMEM; @@ -693,13 +697,45 @@ static const struct file_operations journal_pins_ops = { .read = bch2_journal_pins_read, }; +static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + bch2_btree_updates_to_text(&i->buf, c); + i->iter++; + } + + err = flush_buf(i); + if (err) + return err; + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations btree_updates_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_updates_read, +}; + static int btree_transaction_stats_open(struct inode *inode, struct file *file) { struct bch_fs *c = inode->i_private; struct dump_iter *i; i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); - if (!i) return -ENOMEM; @@ -902,6 +938,9 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); + debugfs_create_file("btree_updates", 0400, c->fs_debug_dir, + c->btree_debug, &btree_updates_ops); + debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, c, &btree_transaction_stats_op); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index c86a93a8d8fc..b18b0cc81b59 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -17,7 +17,6 @@ #include "btree_iter.h" #include "btree_key_cache.h" #include "btree_update.h" -#include "btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" #include "clock.h" @@ -166,7 +165,6 @@ read_attribute(btree_write_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); read_attribute(journal_debug); -read_attribute(btree_updates); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(stripes_heap); @@ -415,9 +413,6 @@ SHOW(bch2_fs) if (attr == &sysfs_journal_debug) bch2_journal_debug_to_text(out, &c->journal); - if (attr == &sysfs_btree_updates) - bch2_btree_updates_to_text(out, c); - if (attr == &sysfs_btree_cache) bch2_btree_cache_to_text(out, c); @@ -639,7 +634,6 @@ SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { &sysfs_flags, &sysfs_journal_debug, - &sysfs_btree_updates, &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_new_stripes, -- cgit v1.2.3 From d880a43836d5e2ba951b10471104cdacc2eefbed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Apr 2024 19:52:10 -0400 Subject: bcachefs: Further improve btree_update_to_text() Print start and end level of the btree update; also a bit of cleanup. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 100 +++++++++++++++++------------------- fs/bcachefs/btree_update_interior.h | 3 +- 2 files changed, 48 insertions(+), 55 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 32397b99752f..24fbd5be70a2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -26,9 +26,9 @@ #include -const char * const bch2_btree_update_modes[] = { +static const char * const bch2_btree_update_modes[] = { #define x(t) #t, - BCH_WATERMARKS() + BTREE_UPDATE_MODES() #undef x NULL }; @@ -850,15 +850,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - BUG_ON(as->mode != BTREE_UPDATE_none); + BUG_ON(as->update_level_end < b->c.level); BUG_ON(!btree_node_dirty(b)); BUG_ON(!b->c.level); + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + as->mode = BTREE_UPDATE_node; as->b = b; + as->update_level_end = b->c.level; set_btree_node_write_blocked(b); list_add(&as->write_blocked_list, &b->write_blocked); @@ -1100,7 +1102,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level, bool split, unsigned flags) + unsigned level_start, bool split, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1108,7 +1110,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) ? BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; - unsigned update_level = level; + unsigned level_end = level_start; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; int ret = 0; u32 restart_count = trans->restart_count; @@ -1140,17 +1142,17 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, } while (1) { - nr_nodes[!!update_level] += 1 + split; - update_level++; + nr_nodes[!!level_end] += 1 + split; + level_end++; - ret = bch2_btree_path_upgrade(trans, path, update_level + 1); + ret = bch2_btree_path_upgrade(trans, path, level_end + 1); if (ret) return ERR_PTR(ret); - if (!btree_path_node(path, update_level)) { + if (!btree_path_node(path, level_end)) { /* Allocating new root? */ nr_nodes[1] += split; - update_level = BTREE_MAX_DEPTH; + level_end = BTREE_MAX_DEPTH; break; } @@ -1158,11 +1160,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead: */ - if (bch2_btree_node_insert_fits(path->l[update_level].b, + if (bch2_btree_node_insert_fits(path->l[level_end].b, BKEY_BTREE_PTR_U64s_MAX * 2)) break; - split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); + split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } if (!down_read_trylock(&c->gc_lock)) { @@ -1176,14 +1178,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); - as->c = c; - as->start_time = start_time; - as->ip_started = _RET_IP_; - as->mode = BTREE_UPDATE_none; - as->watermark = watermark; - as->took_gc_lock = true; - as->btree_id = path->btree_id; - as->update_level = update_level; + as->c = c; + as->start_time = start_time; + as->ip_started = _RET_IP_; + as->mode = BTREE_UPDATE_none; + as->watermark = watermark; + as->took_gc_lock = true; + as->btree_id = path->btree_id; + as->update_level_start = level_start; + as->update_level_end = level_end; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -1373,12 +1376,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, } static void -__bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter node_iter, - struct keylist *keys) +bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) { struct bkey_i *insert = bch2_keylist_front(keys); struct bkey_packed *k; @@ -1534,7 +1537,7 @@ static void btree_split_insert_keys(struct btree_update *as, bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); BUG_ON(bch2_btree_node_check_topology(trans, b)); } @@ -1714,27 +1717,6 @@ err: goto out; } -static void -bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct keylist *keys) -{ - struct btree_path *linked; - unsigned i; - - __bch2_btree_insert_keys_interior(as, trans, path, b, - path->l[b->c.level].iter, keys); - - btree_update_updated_node(as, b); - - trans_for_each_path_with_node(trans, b, linked, i) - bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - - bch2_trans_verify_paths(trans); -} - /** * bch2_btree_insert_node - insert bkeys into a given btree node * @@ -1755,7 +1737,8 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t struct keylist *keys) { struct bch_fs *c = as->c; - struct btree_path *path = trans->paths + path_idx; + struct btree_path *path = trans->paths + path_idx, *linked; + unsigned i; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -1784,7 +1767,13 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t return ret; } - bch2_btree_insert_keys_interior(as, trans, path, b, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); + + trans_for_each_path_with_node(trans, b, linked, i) + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); + + bch2_trans_verify_paths(trans); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -1798,6 +1787,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_maybe_compact_whiteouts(c, b)) bch2_trans_node_reinit_iter(trans, b); + btree_update_updated_node(as, b); bch2_btree_node_unlock_write(trans, path, b); BUG_ON(bch2_btree_node_check_topology(trans, b)); @@ -1807,7 +1797,7 @@ split: * We could attempt to avoid the transaction restart, by calling * bch2_btree_path_upgrade() and allocating more nodes: */ - if (b->c.level >= as->update_level) { + if (b->c.level >= as->update_level_end) { trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } @@ -2519,9 +2509,11 @@ void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned lev static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) { - prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", (void *) as->ip_started, bch2_btree_id_str(as->btree_id), + as->update_level_start, + as->update_level_end, bch2_watermarks[as->watermark], bch2_btree_update_modes[as->mode], as->nodes_written, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 88dcf5a22a3b..c1a479ebaad1 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -57,7 +57,8 @@ struct btree_update { unsigned took_gc_lock:1; enum btree_id btree_id; - unsigned update_level; + unsigned update_level_start; + unsigned update_level_end; struct disk_reservation disk_res; -- cgit v1.2.3 From 9802ff48f3fd8ae5d6699c5a32afc76769920c98 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Feb 2024 21:08:24 -0500 Subject: bcachefs: Print shutdown journal sequence number Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ed63018f21be..8daf80a38d60 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -288,8 +288,13 @@ static void __bch2_fs_read_only(struct bch_fs *c) if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && !test_bit(BCH_FS_emergency_ro, &c->flags)) set_bit(BCH_FS_clean_shutdown, &c->flags); + bch2_fs_journal_stop(&c->journal); + bch_info(c, "%sshutdown complete, journal seq %llu", + test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", + c->journal.seq_ondisk); + /* * After stopping journal: */ -- cgit v1.2.3 From d4e655c49f474deffaf5ed7e65034b8167ee39c8 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Mon, 1 Apr 2024 21:10:38 +0200 Subject: scsi: sg: Avoid race in error handling & drop bogus warn Commit 27f58c04a8f4 ("scsi: sg: Avoid sg device teardown race") introduced an incorrect WARN_ON_ONCE() and missed a sequence where sg_device_destroy() was used after scsi_device_put(). sg_device_destroy() is accessing the parent scsi_device request_queue which will already be set to NULL when the preceding call to scsi_device_put() removed the last reference to the parent scsi_device. Drop the incorrect WARN_ON_ONCE() - allowing more than one concurrent access to the sg device - and make sure sg_device_destroy() is not used after scsi_device_put() in the error handling. Link: https://lore.kernel.org/all/5375B275-D137-4D5F-BE25-6AF8ACAE41EF@linux.ibm.com Fixes: 27f58c04a8f4 ("scsi: sg: Avoid sg device teardown race") Cc: stable@vger.kernel.org Signed-off-by: Alexander Wetzel Link: https://lore.kernel.org/r/20240401191038.18359-1-Alexander@wetzel-home.de Tested-by: Sachin Sant Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/sg.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 386981c6976a..baf870a03ecf 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -285,6 +285,7 @@ sg_open(struct inode *inode, struct file *filp) int dev = iminor(inode); int flags = filp->f_flags; struct request_queue *q; + struct scsi_device *device; Sg_device *sdp; Sg_fd *sfp; int retval; @@ -301,11 +302,12 @@ sg_open(struct inode *inode, struct file *filp) /* This driver's module count bumped by fops_get in */ /* Prevent the device driver from vanishing while we sleep */ - retval = scsi_device_get(sdp->device); + device = sdp->device; + retval = scsi_device_get(device); if (retval) goto sg_put; - retval = scsi_autopm_get_device(sdp->device); + retval = scsi_autopm_get_device(device); if (retval) goto sdp_put; @@ -313,7 +315,7 @@ sg_open(struct inode *inode, struct file *filp) * check if O_NONBLOCK. Permits SCSI commands to be issued * during error recovery. Tread carefully. */ if (!((flags & O_NONBLOCK) || - scsi_block_when_processing_errors(sdp->device))) { + scsi_block_when_processing_errors(device))) { retval = -ENXIO; /* we are in error recovery for this device */ goto error_out; @@ -344,7 +346,7 @@ sg_open(struct inode *inode, struct file *filp) if (sdp->open_cnt < 1) { /* no existing opens */ sdp->sgdebug = 0; - q = sdp->device->request_queue; + q = device->request_queue; sdp->sg_tablesize = queue_max_segments(q); } sfp = sg_add_sfp(sdp); @@ -370,10 +372,11 @@ out_undo: error_mutex_locked: mutex_unlock(&sdp->open_rel_lock); error_out: - scsi_autopm_put_device(sdp->device); + scsi_autopm_put_device(device); sdp_put: - scsi_device_put(sdp->device); - goto sg_put; + kref_put(&sdp->d_ref, sg_device_destroy); + scsi_device_put(device); + return retval; } /* Release resources associated with a successful sg_open() @@ -2233,7 +2236,6 @@ sg_remove_sfp_usercontext(struct work_struct *work) "sg_remove_sfp: sfp=0x%p\n", sfp)); kfree(sfp); - WARN_ON_ONCE(kref_read(&sdp->d_ref) != 1); kref_put(&sdp->d_ref, sg_device_destroy); scsi_device_put(device); module_put(THIS_MODULE); -- cgit v1.2.3 From 8cb4a9a82b21623dbb4b3051dd30d98356cf95bc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Apr 2024 17:16:14 -0700 Subject: x86/cpufeatures: Add CPUID_LNX_5 to track recently added Linux-defined word Add CPUID_LNX_5 to track cpufeatures' word 21, and add the appropriate compile-time assert in KVM to prevent direct lookups on the features in CPUID_LNX_5. KVM uses X86_FEATURE_* flags to manage guest CPUID, and so must translate features that are scattered by Linux from the Linux-defined bit to the hardware-defined bit, i.e. should never try to directly access scattered features in guest CPUID. Opportunistically add NR_CPUID_WORDS to enum cpuid_leafs, along with a compile-time assert in KVM's CPUID infrastructure to ensure that future additions update cpuid_leafs along with NCAPINTS. No functional change intended. Fixes: 7f274e609f3d ("x86/cpufeatures: Add new word for scattered features") Cc: Sandipan Das Signed-off-by: Sean Christopherson Acked-by: Dave Hansen Signed-off-by: Linus Torvalds --- arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/kvm/reverse_cpuid.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 42157ddcc09d..686e92d2663e 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -33,6 +33,8 @@ enum cpuid_leafs CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, + CPUID_LNX_5, + NR_CPUID_WORDS, }; #define X86_CAP_FMT_NUM "%d:%d" diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index aadefcaa9561..58ac8d69c94b 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -102,10 +102,12 @@ static const struct cpuid_reg reverse_cpuid[] = { */ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) { + BUILD_BUG_ON(NR_CPUID_WORDS != NCAPINTS); BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_5); BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); } -- cgit v1.2.3 From d464dac47260a33add5a206fd3289ec1216e8435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 5 Apr 2024 07:58:10 +0200 Subject: usb: gadget: fsl: Initialize udc before using it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fsl_ep_queue() is only called by usb_ep_queue() (as ep->ops->queue()). So _ep isn't NULL. As ep->ops->queue = fsl_ep_queue, the ep was initialized by struct_ep_setup() and so ep->udc isn't NULL either. Drop the check for _ep being NULL and assign udc earlier to prevent following an uninitialized pointer in the two dev_vdbg()s in lines 878 and 882. This fixes a compiler warning when using clang and CONFIG_USB_GADGET_VERBOSE=y. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404050227.TTvcCPBu-lkp@intel.com/ Fixes: 6025f20f16c2 ("usb: gadget: fsl-udc: Replace custom log wrappers by dev_{err,warn,dbg,vdbg}") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240405055812.694123-2-u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/fsl_udc_core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/udc/fsl_udc_core.c b/drivers/usb/gadget/udc/fsl_udc_core.c index e82d03224f94..3432ebfae978 100644 --- a/drivers/usb/gadget/udc/fsl_udc_core.c +++ b/drivers/usb/gadget/udc/fsl_udc_core.c @@ -868,7 +868,7 @@ fsl_ep_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags) { struct fsl_ep *ep = container_of(_ep, struct fsl_ep, ep); struct fsl_req *req = container_of(_req, struct fsl_req, req); - struct fsl_udc *udc; + struct fsl_udc *udc = ep->udc; unsigned long flags; int ret; @@ -878,7 +878,7 @@ fsl_ep_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags) dev_vdbg(&udc->gadget.dev, "%s, bad params\n", __func__); return -EINVAL; } - if (unlikely(!_ep || !ep->ep.desc)) { + if (unlikely(!ep->ep.desc)) { dev_vdbg(&udc->gadget.dev, "%s, bad ep\n", __func__); return -EINVAL; } @@ -887,7 +887,6 @@ fsl_ep_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t gfp_flags) return -EMSGSIZE; } - udc = ep->udc; if (!udc->driver || udc->gadget.speed == USB_SPEED_UNKNOWN) return -ESHUTDOWN; -- cgit v1.2.3 From 5957e0a28b5177849f7666d041b32f5dc7d27427 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2024 02:43:08 -0400 Subject: bcachefs: Fix rebalance from durability=0 device Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 34731ee0217f..0022b51ce3c0 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -598,6 +598,8 @@ int bch2_data_update_init(struct btree_trans *trans, i++; } + unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); + /* * If current extent durability is less than io_opts.data_replicas, * we're not trying to rereplicate the extent up to data_replicas here - @@ -607,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans, * rereplicate, currently, so that users don't get an unexpected -ENOSPC */ if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) && - durability_have >= io_opts.data_replicas) { + !durability_required) { m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ @@ -616,11 +618,18 @@ int bch2_data_update_init(struct btree_trans *trans, goto done; } - m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) + + m->op.nr_replicas = min(durability_removing, durability_required) + m->data_opts.extra_replicas; - m->op.nr_replicas_required = m->op.nr_replicas; - BUG_ON(!m->op.nr_replicas); + /* + * If device(s) were set to durability=0 after data was written to them + * we can end up with a duribilty=0 extent, and the normal algorithm + * that tries not to increase durability doesn't work: + */ + if (!(durability_have + durability_removing)) + m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); + + m->op.nr_replicas_required = m->op.nr_replicas; if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, -- cgit v1.2.3 From bc004f5038220b1891ef4107134ccae44be55109 Mon Sep 17 00:00:00 2001 From: Jammy Huang Date: Wed, 3 Apr 2024 17:02:46 +0800 Subject: drm/ast: Fix soft lockup There is a while-loop in ast_dp_set_on_off() that could lead to infinite-loop. This is because the register, VGACRI-Dx, checked in this API is a scratch register actually controlled by a MCU, named DPMCU, in BMC. These scratch registers are protected by scu-lock. If suc-lock is not off, DPMCU can not update these registers and then host will have soft lockup due to never updated status. DPMCU is used to control DP and relative registers to handshake with host's VGA driver. Even the most time-consuming task, DP's link training, is less than 100ms. 200ms should be enough. Signed-off-by: Jammy Huang Fixes: 594e9c04b586 ("drm/ast: Create the driver for ASPEED proprietory Display-Port") Reviewed-by: Jocelyn Falempe Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Cc: KuoHsiang Chou Cc: Thomas Zimmermann Cc: Dave Airlie Cc: Jocelyn Falempe Cc: dri-devel@lists.freedesktop.org Cc: # v5.19+ Link: https://patchwork.freedesktop.org/patch/msgid/20240403090246.1495487-1-jammy_huang@aspeedtech.com --- drivers/gpu/drm/ast/ast_dp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/ast/ast_dp.c b/drivers/gpu/drm/ast/ast_dp.c index ebb6d8ebd44e..1e9259416980 100644 --- a/drivers/gpu/drm/ast/ast_dp.c +++ b/drivers/gpu/drm/ast/ast_dp.c @@ -180,6 +180,7 @@ void ast_dp_set_on_off(struct drm_device *dev, bool on) { struct ast_device *ast = to_ast_device(dev); u8 video_on_off = on; + u32 i = 0; // Video On/Off ast_set_index_reg_mask(ast, AST_IO_VGACRI, 0xE3, (u8) ~AST_DP_VIDEO_ENABLE, on); @@ -192,6 +193,8 @@ void ast_dp_set_on_off(struct drm_device *dev, bool on) ASTDP_MIRROR_VIDEO_ENABLE) != video_on_off) { // wait 1 ms mdelay(1); + if (++i > 200) + break; } } } -- cgit v1.2.3 From 61f7fdf8fd00ce33d30ca3fae8d643c0850ce945 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Apr 2024 23:48:59 +0200 Subject: timers/migration: Fix ignored event due to missing CPU update When a group event is updated with its expiry unchanged but a different CPU, that target change may go unnoticed and the event may be propagated up with a stale CPU value. The following depicts a scenario that has been actually observed: [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = TGRP1:0 (T0) / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T0 / \ 0 (T0) 1 (T1) idle idle 0) The hierarchy has 3 levels. The left part (GRP1:0) is all idle, including CPU 0 and CPU 1 which have a timer each: T0 and T1. They have the same expiry value. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T0 / \ 0 (T0) 1 (T1) idle idle 1) The migrator in GRP1:1 handles remotely T0. The event is dequeued from the top and T0 executed. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 2) The migrator in GRP1:1 fetches the next timer for CPU 0 and finds none. But it updates the events from its groups, starting with GRP0:0 which now has T1 as its next event. So far so good. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 3) The migrator in GRP1:1 proceeds upward and updates the events in GRP1:0. The child event TGRP0:0 is found queued with the same expiry as before. And therefore it is left unchanged. However the target CPU is not the same but that fact is ignored so TGRP0:0 still points to CPU 0 when it should point to CPU 1. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = TGRP1:0 (T0) / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 4) The propagation has reached the top level and TGRP1:0, having TGRP0:0 as its first event, also wrongly points to CPU 0. TGRP1:0 is added to the top level group. [GRP2:0] migrator = GRP1:1 active = GRP1:1 nextevt = KTIME_MAX / \ [GRP1:0] [GRP1:1] migrator = NONE [...] active = NONE nextevt = TGRP0:0 (T0) / \ [GRP0:0] [...] migrator = NONE active = NONE nextevt = T1 / \ 0 1 (T1) idle idle 5) The migrator in GRP1:1 dequeues the next event in top level pointing to CPU 0. But since it actually doesn't see any real event in CPU 0, it early returns. 6) T1 is left unhandled until either CPU 0 or CPU 1 wake up. Some other bad scenario may involve trees with just two levels. Fix this with unconditionally updating the CPU of the child event before considering to early return while updating a queued event with an unchanged expiry value. Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model") Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Reviewed-by: Anna-Maria Behnsen Link: https://lore.kernel.org/r/Zg2Ct6M2RJAYHgCB@localhost.localdomain --- kernel/time/timer_migration.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index c63a0afdcebe..e3075e40cb43 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -762,8 +762,11 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * queue when the expiry time changed only or when it could be ignored. */ if (timerqueue_node_queued(&evt->nextevt)) { - if ((evt->nextevt.expires == nextexp) && !evt->ignore) + if ((evt->nextevt.expires == nextexp) && !evt->ignore) { + /* Make sure not to miss a new CPU event with the same expiry */ + evt->cpu = first_childevt->cpu; goto check_toplvl; + } if (!timerqueue_del(&group->events, &evt->nextevt)) WRITE_ONCE(group->next_expiry, KTIME_MAX); -- cgit v1.2.3 From 7a96a84bfbee96871bb16c70ee3e93d564e190f4 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 5 Apr 2024 10:53:21 +0200 Subject: timers/migration: Return early on deactivation Commit 4b6f4c5a67c0 ("timer/migration: Remove buggy early return on deactivation") removed the logic to return early in tmigr_update_events() on deactivation. With this the problem with a not properly updated first global event in a hierarchy containing only a single group was fixed. But when having a look at this code path with a hierarchy with more than a single level, now unnecessary work is done (example is partially copied from the message of the commit mentioned above): [GRP1:0] migrator = GRP0:0 active = GRP0:0 nextevt = T0:0i, T0:1 / \ [GRP0:0] [GRP0:1] migrator = 0 migrator = NONE active = 0 active = NONE nextevt = T0i, T1 nextevt = T2 / \ / \ 0 (T0i) 1 (T1) 2 (T2) 3 active idle idle idle 0) CPU 0 is active thus its event is ignored (the letter 'i') and so are upper levels' events. CPU 1 is idle and has the timer T1 enqueued. CPU 2 also has a timer. The expiry order is T0 (ignored) < T1 < T2 [GRP1:0] migrator = GRP0:0 active = GRP0:0 nextevt = T0:0i, T0:1 / \ [GRP0:0] [GRP0:1] migrator = NONE migrator = NONE active = NONE active = NONE nextevt = T1 nextevt = T2 / \ / \ 0 (T0i) 1 (T1) 2 (T2) 3 idle idle idle idle 1) CPU 0 goes idle without global event queued. Therefore KTIME_MAX is pushed as its next expiry and its own event kept as "ignore". Without this early return the following steps happen in tmigr_update_events() when child = null and group = GRP0:0 : lock(GRP0:0->lock); timerqueue_del(GRP0:0, T0i); unlock(GRP0:0->lock); [GRP1:0] migrator = NONE active = NONE nextevt = T0:0, T0:1 / \ [GRP0:0] [GRP0:1] migrator = NONE migrator = NONE active = NONE active = NONE nextevt = T1 nextevt = T2 / \ / \ 0 (T0i) 1 (T1) 2 (T2) 3 idle idle idle idle 2) The change now propagates up to the top. Then tmigr_update_events() updates the group event of GRP0:0 and executes the following steps (child = GRP0:0 and group = GRP0:0): lock(GRP0:0->lock); lock(GRP1:0->lock); evt = tmigr_next_groupevt(GRP0:0); -> this removes the ignored events in GRP0:0 ... update GRP1:0 group event and timerqueue ... unlock(GRP1:0->lock); unlock(GRP0:0->lock); So the dance in 1) with locking the GRP0:0->lock and removing the T0i from the timerqueue is redundand as this is done nevertheless in 2) when tmigr_next_groupevt(GRP0:0) is executed. Revert commit 4b6f4c5a67c0 ("timer/migration: Remove buggy early return on deactivation") and add a condition into return path to skip the return only, when hierarchy contains a single group. Adapt comments accordingly. Fixes: 4b6f4c5a67c0 ("timer/migration: Remove buggy early return on deactivation") Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/87cyr49on2.fsf@somnus --- kernel/time/timer_migration.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index e3075e40cb43..ccba875d2234 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -751,6 +751,33 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, first_childevt = evt = data->evt; + /* + * Walking the hierarchy is required in any case when a + * remote expiry was done before. This ensures to not lose + * already queued events in non active groups (see section + * "Required event and timerqueue update after a remote + * expiry" in the documentation at the top). + * + * The two call sites which are executed without a remote expiry + * before, are not prevented from propagating changes through + * the hierarchy by the return: + * - When entering this path by tmigr_new_timer(), @evt->ignore + * is never set. + * - tmigr_inactive_up() takes care of the propagation by + * itself and ignores the return value. But an immediate + * return is possible if there is a parent, sparing group + * locking at this level, because the upper walking call to + * the parent will take care about removing this event from + * within the group and update next_expiry accordingly. + * + * However if there is no parent, ie: the hierarchy has only a + * single level so @group is the top level group, make sure the + * first event information of the group is updated properly and + * also handled properly, so skip this fast return path. + */ + if (evt->ignore && !remote && group->parent) + return true; + raw_spin_lock(&group->lock); childstate.state = 0; -- cgit v1.2.3 From caeb4b0a11b3393e43f7fa8e0a5a18462acc66bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 31 Mar 2024 17:52:12 -0400 Subject: aio: Fix null ptr deref in aio_complete() wakeup list_del_init_careful() needs to be the last access to the wait queue entry - it effectively unlocks access. Previously, finish_wait() would see the empty list head and skip taking the lock, and then we'd return - but the completion path would still attempt to do the wakeup after the task_struct pointer had been overwritten. Fixes: 71eb6b6b0ba9 ("fs/aio: obey min_nr when doing wakeups") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-fsdevel/CAHTA-ubfwwB51A5Wg5M6H_rPEQK9pNf8FkAGH=vr=FEkyRrtqw@mail.gmail.com/ Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/stable/20240331215212.522544-1-kent.overstreet%40linux.dev Link: https://lore.kernel.org/r/20240331215212.522544-1-kent.overstreet@linux.dev Signed-off-by: Christian Brauner --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index 9cdaa2faa536..0f4f531c9780 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1202,8 +1202,8 @@ static void aio_complete(struct aio_kiocb *iocb) spin_lock_irqsave(&ctx->wait.lock, flags); list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry) if (avail >= curr->min_nr) { - list_del_init_careful(&curr->w.entry); wake_up_process(curr->w.private); + list_del_init_careful(&curr->w.entry); } spin_unlock_irqrestore(&ctx->wait.lock, flags); } -- cgit v1.2.3 From 07ed11afb68d94eadd4ffc082b97c2331307c5ea Mon Sep 17 00:00:00 2001 From: Alex Constantino Date: Thu, 4 Apr 2024 19:14:48 +0100 Subject: Revert "drm/qxl: simplify qxl_fence_wait" This reverts commit 5a838e5d5825c85556011478abde708251cc0776. Changes from commit 5a838e5d5825 ("drm/qxl: simplify qxl_fence_wait") would result in a '[TTM] Buffer eviction failed' exception whenever it reached a timeout. Due to a dependency to DMA_FENCE_WARN this also restores some code deleted by commit d72277b6c37d ("dma-buf: nuke DMA_FENCE_TRACE macros v2"). Fixes: 5a838e5d5825 ("drm/qxl: simplify qxl_fence_wait") Link: https://lore.kernel.org/regressions/ZTgydqRlK6WX_b29@eldamar.lan/ Reported-by: Timo Lindfors Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1054514 Signed-off-by: Alex Constantino Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20240404181448.1643-2-dreaming.about.electric.sheep@gmail.com --- drivers/gpu/drm/qxl/qxl_release.c | 50 +++++++++++++++++++++++++++++++++++---- include/linux/dma-fence.h | 7 ++++++ 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index 368d26da0d6a..9febc8b73f09 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -58,16 +58,56 @@ static long qxl_fence_wait(struct dma_fence *fence, bool intr, signed long timeout) { struct qxl_device *qdev; + struct qxl_release *release; + int count = 0, sc = 0; + bool have_drawable_releases; unsigned long cur, end = jiffies + timeout; qdev = container_of(fence->lock, struct qxl_device, release_lock); + release = container_of(fence, struct qxl_release, base); + have_drawable_releases = release->type == QXL_RELEASE_DRAWABLE; - if (!wait_event_timeout(qdev->release_event, - (dma_fence_is_signaled(fence) || - (qxl_io_notify_oom(qdev), 0)), - timeout)) - return 0; +retry: + sc++; + + if (dma_fence_is_signaled(fence)) + goto signaled; + + qxl_io_notify_oom(qdev); + + for (count = 0; count < 11; count++) { + if (!qxl_queue_garbage_collect(qdev, true)) + break; + + if (dma_fence_is_signaled(fence)) + goto signaled; + } + + if (dma_fence_is_signaled(fence)) + goto signaled; + + if (have_drawable_releases || sc < 4) { + if (sc > 2) + /* back off */ + usleep_range(500, 1000); + + if (time_after(jiffies, end)) + return 0; + + if (have_drawable_releases && sc > 300) { + DMA_FENCE_WARN(fence, + "failed to wait on release %llu after spincount %d\n", + fence->context & ~0xf0000000, sc); + goto signaled; + } + goto retry; + } + /* + * yeah, original sync_obj_wait gave up after 3 spins when + * have_drawable_releases is not set. + */ +signaled: cur = jiffies; if (time_after(cur, end)) return 0; diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index e06bad467f55..c3f9bb6602ba 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -682,4 +682,11 @@ static inline bool dma_fence_is_container(struct dma_fence *fence) return dma_fence_is_array(fence) || dma_fence_is_chain(fence); } +#define DMA_FENCE_WARN(f, fmt, args...) \ + do { \ + struct dma_fence *__ff = (f); \ + pr_warn("f %llu#%llu: " fmt, __ff->context, __ff->seqno,\ + ##args); \ + } while (0) + #endif /* __LINUX_DMA_FENCE_H */ -- cgit v1.2.3 From 24cfd86433c920188ac3f02df8aba6bc4c792f4b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Apr 2024 18:30:14 +0900 Subject: ata: ahci: Add mask_port_map module parameter Commits 0077a504e1a4 ("ahci: asm1166: correct count of reported ports") and 9815e3961754 ("ahci: asm1064: correct count of reported ports") attempted to limit the ports of the ASM1166 and ASM1064 AHCI controllers to avoid long boot times caused by the fact that these adapters report a port map larger than the number of physical ports. The excess ports are "virtual" to hide port multiplier devices and probing these ports takes time. However, these commits caused a regression for users that do use PMP devices, as the ATA devices connected to the PMP cannot be scanned. These commits have thus been reverted by commit 6cd8adc3e18 ("ahci: asm1064: asm1166: don't limit reported ports") to allow the discovery of devices connected through a port multiplier. But this revert re-introduced the long boot times for users that do not use a port multiplier setup. This patch adds the mask_port_map ahci module parameter to allow users to manually specify port map masks for controllers. In the case of the ASMedia 1166 and 1064 controllers, users that do not have port multiplier devices can mask the excess virtual ports exposed by the controller to speedup port scanning, thus reducing boot time. The mask_port_map parameter accepts 2 different formats: - mask_port_map= This applies the same mask to all AHCI controllers present in the system. This format is convenient for small systems that have only a single AHCI controller. - mask_port_map==,=mask,... This applies the specified masks only to the PCI device listed. The field is a regular PCI device ID (domain:bus:dev.func). This ID can be seen following "ahci" in the kernel messages. E.g. for "ahci 0000:01:00.0: 2/2 ports implemented (port mask 0x3)", the field is "0000:01:00.0". When used, the function ahci_save_initial_config() indicates that a port map mask was applied with the message "masking port_map ...". E.g.: without a mask: modprobe ahci dmesg | grep ahci ... ahci 0000:00:17.0: AHCI vers 0001.0301, 32 command slots, 6 Gbps, SATA mode ahci 0000:00:17.0: (0000:00:17.0) 8/8 ports implemented (port mask 0xff) With a mask: modprobe ahci mask_port_map=0000:00:17.0=0x1 dmesg | grep ahci ... ahci 0000:00:17.0: masking port_map 0xff -> 0x1 ahci 0000:00:17.0: AHCI vers 0001.0301, 32 command slots, 6 Gbps, SATA mode ahci 0000:00:17.0: (0000:00:17.0) 1/8 ports implemented (port mask 0x1) Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel --- drivers/ata/ahci.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 562302e2e57c..6548f10e61d9 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -666,6 +666,87 @@ static int mobile_lpm_policy = -1; module_param(mobile_lpm_policy, int, 0644); MODULE_PARM_DESC(mobile_lpm_policy, "Default LPM policy for mobile chipsets"); +static char *ahci_mask_port_map; +module_param_named(mask_port_map, ahci_mask_port_map, charp, 0444); +MODULE_PARM_DESC(mask_port_map, + "32-bits port map masks to ignore controllers ports. " + "Valid values are: " + "\"\" to apply the same mask to all AHCI controller " + "devices, and \"=,=,...\" to " + "specify different masks for the controllers specified, " + "where is the PCI ID of an AHCI controller in the " + "form \"domain:bus:dev.func\""); + +static void ahci_apply_port_map_mask(struct device *dev, + struct ahci_host_priv *hpriv, char *mask_s) +{ + unsigned int mask; + + if (kstrtouint(mask_s, 0, &mask)) { + dev_err(dev, "Invalid port map mask\n"); + return; + } + + hpriv->mask_port_map = mask; +} + +static void ahci_get_port_map_mask(struct device *dev, + struct ahci_host_priv *hpriv) +{ + char *param, *end, *str, *mask_s; + char *name; + + if (!strlen(ahci_mask_port_map)) + return; + + str = kstrdup(ahci_mask_port_map, GFP_KERNEL); + if (!str) + return; + + /* Handle single mask case */ + if (!strchr(str, '=')) { + ahci_apply_port_map_mask(dev, hpriv, str); + goto free; + } + + /* + * Mask list case: parse the parameter to apply the mask only if + * the device name matches. + */ + param = str; + end = param + strlen(param); + while (param && param < end && *param) { + name = param; + param = strchr(name, '='); + if (!param) + break; + + *param = '\0'; + param++; + if (param >= end) + break; + + if (strcmp(dev_name(dev), name) != 0) { + param = strchr(param, ','); + if (param) + param++; + continue; + } + + mask_s = param; + param = strchr(mask_s, ','); + if (param) { + *param = '\0'; + param++; + } + + ahci_apply_port_map_mask(dev, hpriv, mask_s); + } + +free: + kfree(str); +} + static void ahci_pci_save_initial_config(struct pci_dev *pdev, struct ahci_host_priv *hpriv) { @@ -688,6 +769,10 @@ static void ahci_pci_save_initial_config(struct pci_dev *pdev, "Disabling your PATA port. Use the boot option 'ahci.marvell_enable=0' to avoid this.\n"); } + /* Handle port map masks passed as module parameter. */ + if (ahci_mask_port_map) + ahci_get_port_map_mask(&pdev->dev, hpriv); + ahci_save_initial_config(&pdev->dev, hpriv); } -- cgit v1.2.3 From 648dae58a830ecceea3b1bebf68432435980f137 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:12 -0700 Subject: cxl: Remove checking of iter in cxl_endpoint_get_perf_coordinates() The while() loop in cxl_endpoint_get_perf_coordinates() checks to see if 'iter' is valid as part of the condition breaking out of the loop. is_cxl_root() will stop the loop before the next iteration could go NULL. Remove the iter check. The presence of the iter or removing the iter does not impact the behavior of the code. This is a code clean up and not a bug fix. Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-2-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 2b0cab556072..6cbde50a742b 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2197,7 +2197,7 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, * port each iteration. If the parent is cxl root then there is * nothing to gather. */ - while (iter && !is_cxl_root(to_cxl_port(iter->dev.parent))) { + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) { cxl_coordinates_combine(&c, &c, &dport->sw_coord); c.write_latency += dport->link_latency; c.read_latency += dport->link_latency; -- cgit v1.2.3 From 838ae9f45c4e43b4633d8b0ad1fbedff9ecf177d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 30 Mar 2024 07:12:03 -0700 Subject: nouveau/gsp: Avoid addressing beyond end of rpc->entries Using the end of rpc->entries[] for addressing runs into both compile-time and run-time detection of accessing beyond the end of the array. Use the base pointer instead, since was allocated with the additional bytes for storing the strings. Avoids the following warning in future GCC releases with support for __counted_by: In function 'fortify_memcpy_chk', inlined from 'r535_gsp_rpc_set_registry' at ../drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c:1123:3: ../include/linux/fortify-string.h:553:25: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning] 553 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ for this code: strings = (char *)&rpc->entries[NV_GSP_REG_NUM_ENTRIES]; ... memcpy(strings, r535_registry_entries[i].name, name_len); Signed-off-by: Kees Cook Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240330141159.work.063-kees@kernel.org --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index 9994cbd6f1c4..9858c1438aa7 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -1112,7 +1112,7 @@ r535_gsp_rpc_set_registry(struct nvkm_gsp *gsp) rpc->numEntries = NV_GSP_REG_NUM_ENTRIES; str_offset = offsetof(typeof(*rpc), entries[NV_GSP_REG_NUM_ENTRIES]); - strings = (char *)&rpc->entries[NV_GSP_REG_NUM_ENTRIES]; + strings = (char *)rpc + str_offset; for (i = 0; i < NV_GSP_REG_NUM_ENTRIES; i++) { int name_len = strlen(r535_registry_entries[i].name) + 1; -- cgit v1.2.3 From 185fdb4697cc9684a02f2fab0530ecdd0c2f15d4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Apr 2024 18:02:25 +0200 Subject: nouveau: fix function cast warning Calling a function through an incompatible pointer type causes breaks kcfi, so clang warns about the assignment: drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c:73:10: error: cast from 'void (*)(const void *)' to 'void (*)(void *)' converts to incompatible function type [-Werror,-Wcast-function-type-strict] 73 | .fini = (void(*)(void *))kfree, Avoid this with a trivial wrapper. Fixes: c39f472e9f14 ("drm/nouveau: remove symlinks, move core/ to nvkm/ (no code changes)") Signed-off-by: Arnd Bergmann Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240404160234.2923554-1-arnd@kernel.org --- drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c index 4bf486b57101..cb05f7f48a98 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowof.c @@ -66,11 +66,16 @@ of_init(struct nvkm_bios *bios, const char *name) return ERR_PTR(-EINVAL); } +static void of_fini(void *p) +{ + kfree(p); +} + const struct nvbios_source nvbios_of = { .name = "OpenFirmware", .init = of_init, - .fini = (void(*)(void *))kfree, + .fini = of_fini, .read = of_read, .size = of_size, .rw = false, -- cgit v1.2.3 From 0c3b532ad3fbf82884a2e7e83e37c7dcdd4d1d99 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Apr 2024 19:25:21 +0300 Subject: gpio: wcove: Use -ENOTSUPP consistently The GPIO library expects the drivers to return -ENOTSUPP in some cases and not using analogue POSIX code. Make the driver to follow this. Reviewed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Andy Shevchenko --- drivers/gpio/gpio-wcove.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-wcove.c b/drivers/gpio/gpio-wcove.c index c18b6b47384f..94ca9d03c094 100644 --- a/drivers/gpio/gpio-wcove.c +++ b/drivers/gpio/gpio-wcove.c @@ -104,7 +104,7 @@ static inline int to_reg(int gpio, enum ctrl_register type) unsigned int reg = type == CTRL_IN ? GPIO_IN_CTRL_BASE : GPIO_OUT_CTRL_BASE; if (gpio >= WCOVE_GPIO_NUM) - return -EOPNOTSUPP; + return -ENOTSUPP; return reg + gpio; } -- cgit v1.2.3 From ace0ebe5c98d66889f19e0f30e2518d0c58d0e04 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Apr 2024 19:26:22 +0300 Subject: gpio: crystalcove: Use -ENOTSUPP consistently The GPIO library expects the drivers to return -ENOTSUPP in some cases and not using analogue POSIX code. Make the driver to follow this. Signed-off-by: Andy Shevchenko --- drivers/gpio/gpio-crystalcove.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-crystalcove.c b/drivers/gpio/gpio-crystalcove.c index 1ee62cd58582..25db014494a4 100644 --- a/drivers/gpio/gpio-crystalcove.c +++ b/drivers/gpio/gpio-crystalcove.c @@ -92,7 +92,7 @@ static inline int to_reg(int gpio, enum ctrl_register reg_type) case 0x5e: return GPIOPANELCTL; default: - return -EOPNOTSUPP; + return -ENOTSUPP; } } -- cgit v1.2.3 From 10396f4df8b75ff6ab0aa2cd74296565466f2c8d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Apr 2024 13:56:18 -0400 Subject: nfsd: hold a lighter-weight client reference over CB_RECALL_ANY Currently the CB_RECALL_ANY job takes a cl_rpc_users reference to the client. While a callback job is technically an RPC that counter is really more for client-driven RPCs, and this has the effect of preventing the client from being unhashed until the callback completes. If nfsd decides to send a CB_RECALL_ANY just as the client reboots, we can end up in a situation where the callback can't complete on the (now dead) callback channel, but the new client can't connect because the old client can't be unhashed. This usually manifests as a NFS4ERR_DELAY return on the CREATE_SESSION operation. The job is only holding a reference to the client so it can clear a flag after the RPC completes. Fix this by having CB_RECALL_ANY instead hold a reference to the cl_nfsdfs.cl_ref. Typically we only take that sort of reference when dealing with the nfsdfs info files, but it should work appropriately here to ensure that the nfs4_client doesn't disappear. Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") Reported-by: Vladimir Benes Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5fcd93f7cb8c..3cef81e196c6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3042,12 +3042,9 @@ static void nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - spin_lock(&nn->client_lock); clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); - put_client_renew_locked(clp); - spin_unlock(&nn->client_lock); + drop_client(clp); } static int @@ -6616,7 +6613,7 @@ deleg_reaper(struct nfsd_net *nn) list_add(&clp->cl_ra_cblist, &cblist); /* release in nfsd4_cb_recall_any_release */ - atomic_inc(&clp->cl_rpc_users); + kref_get(&clp->cl_nfsdfs.cl_ref); set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); clp->cl_ra_time = ktime_get_boottime_seconds(); } -- cgit v1.2.3 From d3bbc4dfcc8d436b1e27a0bb6a5893b090fc1cab Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 26 Mar 2024 22:23:24 +0100 Subject: drm/msm: fix the `CRASHDUMP_READ` target of `a6xx_get_shader_block()` Clang 14 in an (essentially) defconfig arm64 build for next-20240326 reports [1]: drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c:843:6: error: variable 'out' set but not used [-Werror,-Wunused-but-set-variable] The variable `out` in these functions is meant to compute the `target` of `CRASHDUMP_READ()`, but in this case only the initial value (`dumper->iova + A6XX_CD_DATA_OFFSET`) was being passed. Thus use `out` as it was intended by Connor [2]. There was an alternative patch at [3] that removed the variable altogether, but that would only use the initial value. Fixes: 64d6255650d4 ("drm/msm: More fully implement devcoredump for a7xx") Closes: https://lore.kernel.org/lkml/CANiq72mjc5t4n25SQvYSrOEhxxpXYPZ4pPzneSJHEnc3qApu2Q@mail.gmail.com/ [1] Link: https://lore.kernel.org/lkml/CACu1E7HhCKMJd6fixZSPiNAz6ekoZnkMTHTcLFVmbZ-9VoLxKg@mail.gmail.com/ [2] Link: https://lore.kernel.org/lkml/20240307093727.1978126-1-colin.i.king@gmail.com/ [3] Signed-off-by: Miguel Ojeda Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/584955/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c index 1f5245fc2cdc..a847a0f7a73c 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c @@ -852,7 +852,7 @@ static void a6xx_get_shader_block(struct msm_gpu *gpu, (block->type << 8) | i); in += CRASHDUMP_READ(in, REG_A6XX_HLSQ_DBG_AHB_READ_APERTURE, - block->size, dumper->iova + A6XX_CD_DATA_OFFSET); + block->size, out); out += block->size * sizeof(u32); } -- cgit v1.2.3 From 65291dcfcf8936e1b23cfd7718fdfde7cfaf7706 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 26 Mar 2024 15:32:08 +0100 Subject: mm/secretmem: fix GUP-fast succeeding on secretmem folios folio_is_secretmem() currently relies on secretmem folios being LRU folios, to save some cycles. However, folios might reside in a folio batch without the LRU flag set, or temporarily have their LRU flag cleared. Consequently, the LRU flag is unreliable for this purpose. In particular, this is the case when secretmem_fault() allocates a fresh page and calls filemap_add_folio()->folio_add_lru(). The folio might be added to the per-cpu folio batch and won't get the LRU flag set until the batch was drained using e.g., lru_add_drain(). Consequently, folio_is_secretmem() might not detect secretmem folios and GUP-fast can succeed in grabbing a secretmem folio, crashing the kernel when we would later try reading/writing to the folio, because the folio has been unmapped from the directmap. Fix it by removing that unreliable check. Link: https://lkml.kernel.org/r/20240326143210.291116-2-david@redhat.com Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas") Signed-off-by: David Hildenbrand Reported-by: xingwei lee Reported-by: yue sun Closes: https://lore.kernel.org/lkml/CABOYnLyevJeravW=QrH0JUPYEcDN160aZFb7kwndm-J2rmz0HQ@mail.gmail.com/ Debugged-by: Miklos Szeredi Tested-by: Miklos Szeredi Reviewed-by: Mike Rapoport (IBM) Cc: Lorenzo Stoakes Cc: Signed-off-by: Andrew Morton --- include/linux/secretmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h index 35f3a4a8ceb1..acf7e1a3f3de 100644 --- a/include/linux/secretmem.h +++ b/include/linux/secretmem.h @@ -13,10 +13,10 @@ static inline bool folio_is_secretmem(struct folio *folio) /* * Using folio_mapping() is quite slow because of the actual call * instruction. - * We know that secretmem pages are not compound and LRU so we can + * We know that secretmem pages are not compound, so we can * save a couple of cycles here. */ - if (folio_test_large(folio) || !folio_test_lru(folio)) + if (folio_test_large(folio)) return false; mapping = (struct address_space *) -- cgit v1.2.3 From 8434f9aa6b7e77bc1459d75d1293c3f55bf4687b Mon Sep 17 00:00:00 2001 From: John Sperbeck Date: Sat, 23 Mar 2024 08:29:34 -0700 Subject: init: open output files from cpio unpacking with O_LARGEFILE If a member of a cpio archive for an initrd or initrams is larger than 2Gb, we'll eventually fail to write to that file when we get to that limit, unless O_LARGEFILE is set. The problem can be seen with this recipe, assuming that BLK_DEV_RAM is not configured: cd /tmp dd if=/dev/zero of=BIGFILE bs=1048576 count=2200 echo BIGFILE | cpio -o -H newc -R root:root > initrd.img kexec -l /boot/vmlinuz-$(uname -r) --initrd=initrd.img --reuse-cmdline kexec -e The console will show 'Initramfs unpacking failed: write error'. With the patch, the error is gone. Link: https://lkml.kernel.org/r/20240323152934.3307391-1-jsperbeck@google.com Signed-off-by: John Sperbeck Cc: Jens Axboe Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- init/initramfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index 3127e0bf7bbd..a298a3854a80 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -367,7 +367,7 @@ static int __init do_name(void) if (S_ISREG(mode)) { int ml = maybe_link(); if (ml >= 0) { - int openflags = O_WRONLY|O_CREAT; + int openflags = O_WRONLY|O_CREAT|O_LARGEFILE; if (ml != 1) openflags |= O_TRUNC; wfile = filp_open(collected, openflags, mode); -- cgit v1.2.3 From 4ed91fa9177b236b73a271f11a333a98f076eb63 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Sat, 23 Mar 2024 15:15:44 +0100 Subject: mm: vmalloc: bail out early in find_vmap_area() if vmap is not init During the boot the s390 system triggers "spinlock bad magic" messages if the spinlock debugging is enabled: [ 0.465445] BUG: spinlock bad magic on CPU#0, swapper/0 [ 0.465490] lock: single+0x1860/0x1958, .magic: 00000000, .owner: /-1, .owner_cpu: 0 [ 0.466067] CPU: 0 PID: 0 Comm: swapper Not tainted 6.8.0-12955-g8e938e398669 #1 [ 0.466188] Hardware name: QEMU 8561 QEMU (KVM/Linux) [ 0.466270] Call Trace: [ 0.466470] [<00000000011f26c8>] dump_stack_lvl+0x98/0xd8 [ 0.466516] [<00000000001dcc6a>] do_raw_spin_lock+0x8a/0x108 [ 0.466545] [<000000000042146c>] find_vmap_area+0x6c/0x108 [ 0.466572] [<000000000042175a>] find_vm_area+0x22/0x40 [ 0.466597] [<000000000012f152>] __set_memory+0x132/0x150 [ 0.466624] [<0000000001cc0398>] vmem_map_init+0x40/0x118 [ 0.466651] [<0000000001cc0092>] paging_init+0x22/0x68 [ 0.466677] [<0000000001cbbed2>] setup_arch+0x52a/0x708 [ 0.466702] [<0000000001cb6140>] start_kernel+0x80/0x5c8 [ 0.466727] [<0000000000100036>] startup_continue+0x36/0x40 it happens because such system tries to access some vmap areas whereas the vmalloc initialization is not even yet done: [ 0.465490] lock: single+0x1860/0x1958, .magic: 00000000, .owner: /-1, .owner_cpu: 0 [ 0.466067] CPU: 0 PID: 0 Comm: swapper Not tainted 6.8.0-12955-g8e938e398669 #1 [ 0.466188] Hardware name: QEMU 8561 QEMU (KVM/Linux) [ 0.466270] Call Trace: [ 0.466470] dump_stack_lvl (lib/dump_stack.c:117) [ 0.466516] do_raw_spin_lock (kernel/locking/spinlock_debug.c:87 kernel/locking/spinlock_debug.c:115) [ 0.466545] find_vmap_area (mm/vmalloc.c:1059 mm/vmalloc.c:2364) [ 0.466572] find_vm_area (mm/vmalloc.c:3150) [ 0.466597] __set_memory (arch/s390/mm/pageattr.c:360 arch/s390/mm/pageattr.c:393) [ 0.466624] vmem_map_init (./arch/s390/include/asm/set_memory.h:55 arch/s390/mm/vmem.c:660) [ 0.466651] paging_init (arch/s390/mm/init.c:97) [ 0.466677] setup_arch (arch/s390/kernel/setup.c:972) [ 0.466702] start_kernel (init/main.c:899) [ 0.466727] startup_continue (arch/s390/kernel/head64.S:35) [ 0.466811] INFO: lockdep is turned off. ... [ 0.718250] vmalloc init - busy lock init 0000000002871860 [ 0.718328] vmalloc init - busy lock init 00000000028731b8 Some background. It worked before because the lock that is in question was statically defined and initialized. As of now, the locks and data structures are initialized in the vmalloc_init() function. To address that issue add the check whether the "vmap_initialized" variable is set, if not find_vmap_area() bails out on entry returning NULL. Link: https://lkml.kernel.org/r/20240323141544.4150-1-urezki@gmail.com Fixes: 72210662c5a2 ("mm: vmalloc: offload free_vmap_area_lock lock") Signed-off-by: Uladzislau Rezki (Sony) Tested-by: Guenter Roeck Reviewed-by: Baoquan He Acked-by: Heiko Carstens Cc: Christoph Hellwig Cc: Dave Chinner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 22aa63f4ef63..0d77d171b5d9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2343,6 +2343,9 @@ struct vmap_area *find_vmap_area(unsigned long addr) struct vmap_area *va; int i, j; + if (unlikely(!vmap_initialized)) + return NULL; + /* * An addr_to_node_id(addr) converts an address to a node index * where a VA is located. If VA spans several zones and passed -- cgit v1.2.3 From fc2c22693c608125bbce174c1952eb4db2f8d07f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 28 Mar 2024 15:03:30 +0100 Subject: mm: vmalloc: fix lockdep warning A lockdep reports a possible deadlock in the find_vmap_area_exceed_addr_lock() function: ============================================ WARNING: possible recursive locking detected 6.9.0-rc1-00060-ged3ccc57b108-dirty #6140 Not tainted -------------------------------------------- drgn/455 is trying to acquire lock: ffff0000c00131d0 (&vn->busy.lock/1){+.+.}-{2:2}, at: find_vmap_area_exceed_addr_lock+0x64/0x124 but task is already holding lock: ffff0000c0011878 (&vn->busy.lock/1){+.+.}-{2:2}, at: find_vmap_area_exceed_addr_lock+0x64/0x124 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&vn->busy.lock/1); lock(&vn->busy.lock/1); *** DEADLOCK *** indeed it can happen if the find_vmap_area_exceed_addr_lock() gets called concurrently because it tries to acquire two nodes locks. It was done to prevent removing a lowest VA found on a previous step. To address this a lowest VA is found first without holding a node lock where it resides. As a last step we check if a VA still there because it can go away, if removed, proceed with next lowest. [akpm@linux-foundation.org: fix comment typos, per Baoquan] Link: https://lkml.kernel.org/r/20240328140330.4747-1-urezki@gmail.com Fixes: 53becf32aec1 ("mm: vmalloc: support multiple nodes in vread_iter") Signed-off-by: Uladzislau Rezki (Sony) Tested-by: Jens Axboe Tested-by: Omar Sandoval Reported-by: Jens Axboe Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 73 +++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0d77d171b5d9..68fa001648cc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -989,6 +989,27 @@ unsigned long vmalloc_nr_pages(void) return atomic_long_read(&nr_vmalloc_pages); } +static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + + addr = (unsigned long)kasan_reset_tag((void *)addr); + + while (n) { + struct vmap_area *va; + + va = rb_entry(n, struct vmap_area, rb_node); + if (addr < va->va_start) + n = n->rb_left; + else if (addr >= va->va_end) + n = n->rb_right; + else + return va; + } + + return NULL; +} + /* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area * __find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) @@ -1025,47 +1046,39 @@ __find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) static struct vmap_node * find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) { - struct vmap_node *vn, *va_node = NULL; - struct vmap_area *va_lowest; + unsigned long va_start_lowest; + struct vmap_node *vn; int i; - for (i = 0; i < nr_vmap_nodes; i++) { +repeat: + for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; spin_lock(&vn->busy.lock); - va_lowest = __find_vmap_area_exceed_addr(addr, &vn->busy.root); - if (va_lowest) { - if (!va_node || va_lowest->va_start < (*va)->va_start) { - if (va_node) - spin_unlock(&va_node->busy.lock); - - *va = va_lowest; - va_node = vn; - continue; - } - } + *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root); + + if (*va) + if (!va_start_lowest || (*va)->va_start < va_start_lowest) + va_start_lowest = (*va)->va_start; spin_unlock(&vn->busy.lock); } - return va_node; -} - -static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) -{ - struct rb_node *n = root->rb_node; + /* + * Check if found VA exists, it might have gone away. In this case we + * repeat the search because a VA has been removed concurrently and we + * need to proceed to the next one, which is a rare case. + */ + if (va_start_lowest) { + vn = addr_to_node(va_start_lowest); - addr = (unsigned long)kasan_reset_tag((void *)addr); + spin_lock(&vn->busy.lock); + *va = __find_vmap_area(va_start_lowest, &vn->busy.root); - while (n) { - struct vmap_area *va; + if (*va) + return vn; - va = rb_entry(n, struct vmap_area, rb_node); - if (addr < va->va_start) - n = n->rb_left; - else if (addr >= va->va_end) - n = n->rb_right; - else - return va; + spin_unlock(&vn->busy.lock); + goto repeat; } return NULL; -- cgit v1.2.3 From 176517c9310281d00dd3210ab4cc4d3cdc26b17e Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Fri, 29 Mar 2024 18:58:10 +0000 Subject: selftests/mm: include strings.h for ffsl Got a compilation error on Android for ffsl after 91b80cc5b39f ("selftests: mm: fix map_hugetlb failure on 64K page size systems") included vm_util.h. Link: https://lkml.kernel.org/r/20240329185814.16304-1-edliaw@google.com Fixes: af605d26a8f2 ("selftests/mm: merge util.h into vm_util.h") Signed-off-by: Edward Liaw Reviewed-by: Muhammad Usama Anjum Cc: Axel Rasmussen Cc: David Hildenbrand Cc: "Mike Rapoport (IBM)" Cc: Peter Xu Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/vm_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index c02990bbd56f..9007c420d52c 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -3,7 +3,7 @@ #include #include #include -#include /* ffsl() */ +#include /* ffsl() */ #include /* _SC_PAGESIZE */ #define BIT_ULL(nr) (1ULL << (nr)) -- cgit v1.2.3 From 87f0e65cdf762f7c4d7ad1466f0cc7c1381f1996 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov Date: Tue, 2 Apr 2024 16:23:34 -0700 Subject: MAINTAINERS: change vmware.com addresses to broadcom.com Update all remaining vmware.com email addresses to actual broadcom.com. Add corresponding .mailmap entries for maintainers who contributed in the past as the vmware.com address will start bouncing soon. Maintainership update. Jeff Sipek has left VMware, Nick Shi will be maintaining VMware PTP. Link: https://lkml.kernel.org/r/20240402232334.33167-1-alexey.makhalov@broadcom.com Signed-off-by: Alexey Makhalov Acked-by: Florian Fainelli Acked-by: Ajay Kaher Acked-by: Ronak Doshi Acked-by: Nick Shi Acked-by: Bryan Tan Acked-by: Vishnu Dasa Acked-by: Vishal Bhakta Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- .mailmap | 5 +++++ MAINTAINERS | 46 +++++++++++++++++++++++----------------------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/.mailmap b/.mailmap index 59c9a841bf71..8284692f9610 100644 --- a/.mailmap +++ b/.mailmap @@ -20,6 +20,7 @@ Adam Oldham Adam Radford Adriana Reus Adrian Bunk +Ajay Kaher Akhil P Oommen Alan Cox Alan Cox @@ -36,6 +37,7 @@ Alexei Avshalom Lazar Alexei Starovoitov Alexei Starovoitov Alexei Starovoitov +Alexey Makhalov Alex Hung Alex Shi Alex Shi @@ -110,6 +112,7 @@ Brendan Higgins Brian Avery Brian King Brian Silverman +Bryan Tan Cai Huoqing Can Guo Carl Huang @@ -529,6 +532,7 @@ Rocky Liao Roman Gushchin Roman Gushchin Roman Gushchin +Ronak Doshi Muchun Song Muchun Song Ross Zwisler @@ -651,6 +655,7 @@ Viresh Kumar Viresh Kumar Viresh Kumar Viresh Kumar +Vishnu Dasa Vivek Aknurwar Vivien Didelot Vlad Dogaru diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d..bd9fbb315e69 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16731,9 +16731,9 @@ F: include/uapi/linux/ppdev.h PARAVIRT_OPS INTERFACE M: Juergen Gross -R: Ajay Kaher -R: Alexey Makhalov -R: VMware PV-Drivers Reviewers +R: Ajay Kaher +R: Alexey Makhalov +R: Broadcom internal kernel review list L: virtualization@lists.linux.dev L: x86@kernel.org S: Supported @@ -23652,9 +23652,9 @@ S: Supported F: drivers/misc/vmw_balloon.c VMWARE HYPERVISOR INTERFACE -M: Ajay Kaher -M: Alexey Makhalov -R: VMware PV-Drivers Reviewers +M: Ajay Kaher +M: Alexey Makhalov +R: Broadcom internal kernel review list L: virtualization@lists.linux.dev L: x86@kernel.org S: Supported @@ -23663,34 +23663,34 @@ F: arch/x86/include/asm/vmware.h F: arch/x86/kernel/cpu/vmware.c VMWARE PVRDMA DRIVER -M: Bryan Tan -M: Vishnu Dasa -R: VMware PV-Drivers Reviewers +M: Bryan Tan +M: Vishnu Dasa +R: Broadcom internal kernel review list L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/hw/vmw_pvrdma/ VMWARE PVSCSI DRIVER -M: Vishal Bhakta -R: VMware PV-Drivers Reviewers +M: Vishal Bhakta +R: Broadcom internal kernel review list L: linux-scsi@vger.kernel.org S: Supported F: drivers/scsi/vmw_pvscsi.c F: drivers/scsi/vmw_pvscsi.h VMWARE VIRTUAL PTP CLOCK DRIVER -M: Jeff Sipek -R: Ajay Kaher -R: Alexey Makhalov -R: VMware PV-Drivers Reviewers +M: Nick Shi +R: Ajay Kaher +R: Alexey Makhalov +R: Broadcom internal kernel review list L: netdev@vger.kernel.org S: Supported F: drivers/ptp/ptp_vmw.c VMWARE VMCI DRIVER -M: Bryan Tan -M: Vishnu Dasa -R: VMware PV-Drivers Reviewers +M: Bryan Tan +M: Vishnu Dasa +R: Broadcom internal kernel review list L: linux-kernel@vger.kernel.org S: Supported F: drivers/misc/vmw_vmci/ @@ -23705,16 +23705,16 @@ F: drivers/input/mouse/vmmouse.c F: drivers/input/mouse/vmmouse.h VMWARE VMXNET3 ETHERNET DRIVER -M: Ronak Doshi -R: VMware PV-Drivers Reviewers +M: Ronak Doshi +R: Broadcom internal kernel review list L: netdev@vger.kernel.org S: Supported F: drivers/net/vmxnet3/ VMWARE VSOCK VMCI TRANSPORT DRIVER -M: Bryan Tan -M: Vishnu Dasa -R: VMware PV-Drivers Reviewers +M: Bryan Tan +M: Vishnu Dasa +R: Broadcom internal kernel review list L: linux-kernel@vger.kernel.org S: Supported F: net/vmw_vsock/vmci_transport* -- cgit v1.2.3 From 04c35ab3bdae7fefbd7c7a7355f29fa03a035221 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 3 Apr 2024 23:21:30 +0200 Subject: x86/mm/pat: fix VM_PAT handling in COW mappings PAT handling won't do the right thing in COW mappings: the first PTE (or, in fact, all PTEs) can be replaced during write faults to point at anon folios. Reliably recovering the correct PFN and cachemode using follow_phys() from PTEs will not work in COW mappings. Using follow_phys(), we might just get the address+protection of the anon folio (which is very wrong), or fail on swap/nonswap entries, failing follow_phys() and triggering a WARN_ON_ONCE() in untrack_pfn() and track_pfn_copy(), not properly calling free_pfn_range(). In free_pfn_range(), we either wouldn't call memtype_free() or would call it with the wrong range, possibly leaking memory. To fix that, let's update follow_phys() to refuse returning anon folios, and fallback to using the stored PFN inside vma->vm_pgoff for COW mappings if we run into that. We will now properly handle untrack_pfn() with COW mappings, where we don't need the cachemode. We'll have to fail fork()->track_pfn_copy() if the first page was replaced by an anon folio, though: we'd have to store the cachemode in the VMA to make this work, likely growing the VMA size. For now, lets keep it simple and let track_pfn_copy() just fail in that case: it would have failed in the past with swap/nonswap entries already, and it would have done the wrong thing with anon folios. Simple reproducer to trigger the WARN_ON_ONCE() in untrack_pfn(): <--- C reproducer ---> #include #include #include #include int main(void) { struct io_uring_params p = {}; int ring_fd; size_t size; char *map; ring_fd = io_uring_setup(1, &p); if (ring_fd < 0) { perror("io_uring_setup"); return 1; } size = p.sq_off.array + p.sq_entries * sizeof(unsigned); /* Map the submission queue ring MAP_PRIVATE */ map = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, ring_fd, IORING_OFF_SQ_RING); if (map == MAP_FAILED) { perror("mmap"); return 1; } /* We have at least one page. Let's COW it. */ *map = 0; pause(); return 0; } <--- C reproducer ---> On a system with 16 GiB RAM and swap configured: # ./iouring & # memhog 16G # killall iouring [ 301.552930] ------------[ cut here ]------------ [ 301.553285] WARNING: CPU: 7 PID: 1402 at arch/x86/mm/pat/memtype.c:1060 untrack_pfn+0xf4/0x100 [ 301.553989] Modules linked in: binfmt_misc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_g [ 301.558232] CPU: 7 PID: 1402 Comm: iouring Not tainted 6.7.5-100.fc38.x86_64 #1 [ 301.558772] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebu4 [ 301.559569] RIP: 0010:untrack_pfn+0xf4/0x100 [ 301.559893] Code: 75 c4 eb cf 48 8b 43 10 8b a8 e8 00 00 00 3b 6b 28 74 b8 48 8b 7b 30 e8 ea 1a f7 000 [ 301.561189] RSP: 0018:ffffba2c0377fab8 EFLAGS: 00010282 [ 301.561590] RAX: 00000000ffffffea RBX: ffff9208c8ce9cc0 RCX: 000000010455e047 [ 301.562105] RDX: 07fffffff0eb1e0a RSI: 0000000000000000 RDI: ffff9208c391d200 [ 301.562628] RBP: 0000000000000000 R08: ffffba2c0377fab8 R09: 0000000000000000 [ 301.563145] R10: ffff9208d2292d50 R11: 0000000000000002 R12: 00007fea890e0000 [ 301.563669] R13: 0000000000000000 R14: ffffba2c0377fc08 R15: 0000000000000000 [ 301.564186] FS: 0000000000000000(0000) GS:ffff920c2fbc0000(0000) knlGS:0000000000000000 [ 301.564773] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 301.565197] CR2: 00007fea88ee8a20 CR3: 00000001033a8000 CR4: 0000000000750ef0 [ 301.565725] PKRU: 55555554 [ 301.565944] Call Trace: [ 301.566148] [ 301.566325] ? untrack_pfn+0xf4/0x100 [ 301.566618] ? __warn+0x81/0x130 [ 301.566876] ? untrack_pfn+0xf4/0x100 [ 301.567163] ? report_bug+0x171/0x1a0 [ 301.567466] ? handle_bug+0x3c/0x80 [ 301.567743] ? exc_invalid_op+0x17/0x70 [ 301.568038] ? asm_exc_invalid_op+0x1a/0x20 [ 301.568363] ? untrack_pfn+0xf4/0x100 [ 301.568660] ? untrack_pfn+0x65/0x100 [ 301.568947] unmap_single_vma+0xa6/0xe0 [ 301.569247] unmap_vmas+0xb5/0x190 [ 301.569532] exit_mmap+0xec/0x340 [ 301.569801] __mmput+0x3e/0x130 [ 301.570051] do_exit+0x305/0xaf0 ... Link: https://lkml.kernel.org/r/20240403212131.929421-3-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: Wupeng Ma Closes: https://lkml.kernel.org/r/20240227122814.3781907-1-mawupeng1@huawei.com Fixes: b1a86e15dc03 ("x86, pat: remove the dependency on 'vm_pgoff' in track/untrack pfn vma routines") Fixes: 5899329b1910 ("x86: PAT: implement track/untrack of pfnmap regions for x86 - v3") Acked-by: Ingo Molnar Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton --- arch/x86/mm/pat/memtype.c | 49 +++++++++++++++++++++++++++++++++-------------- mm/memory.c | 4 ++++ 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 0d72183b5dd0..36b603d0cdde 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -947,6 +947,38 @@ static void free_pfn_range(u64 paddr, unsigned long size) memtype_free(paddr, paddr + size); } +static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, + pgprot_t *pgprot) +{ + unsigned long prot; + + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT)); + + /* + * We need the starting PFN and cachemode used for track_pfn_remap() + * that covered the whole VMA. For most mappings, we can obtain that + * information from the page tables. For COW mappings, we might now + * suddenly have anon folios mapped and follow_phys() will fail. + * + * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to + * detect the PFN. If we need the cachemode as well, we're out of luck + * for now and have to fail fork(). + */ + if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) { + if (pgprot) + *pgprot = __pgprot(prot); + return 0; + } + if (is_cow_mapping(vma->vm_flags)) { + if (pgprot) + return -EINVAL; + *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + return 0; + } + WARN_ON_ONCE(1); + return -EINVAL; +} + /* * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). @@ -957,20 +989,13 @@ static void free_pfn_range(u64 paddr, unsigned long size) int track_pfn_copy(struct vm_area_struct *vma) { resource_size_t paddr; - unsigned long prot; unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; if (vma->vm_flags & VM_PAT) { - /* - * reserve the whole chunk covered by vma. We need the - * starting address and protection from pte. - */ - if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { - WARN_ON_ONCE(1); + if (get_pat_info(vma, &paddr, &pgprot)) return -EINVAL; - } - pgprot = __pgprot(prot); + /* reserve the whole chunk covered by vma. */ return reserve_pfn_range(paddr, vma_size, &pgprot, 1); } @@ -1045,7 +1070,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked) { resource_size_t paddr; - unsigned long prot; if (vma && !(vma->vm_flags & VM_PAT)) return; @@ -1053,11 +1077,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, /* free the chunk starting from pfn or the whole chunk */ paddr = (resource_size_t)pfn << PAGE_SHIFT; if (!paddr && !size) { - if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { - WARN_ON_ONCE(1); + if (get_pat_info(vma, &paddr, NULL)) return; - } - size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); diff --git a/mm/memory.c b/mm/memory.c index 904f70b99498..d2155ced45f8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5973,6 +5973,10 @@ int follow_phys(struct vm_area_struct *vma, goto out; pte = ptep_get(ptep); + /* Never return PFNs of anon folios in COW mappings. */ + if (vm_normal_folio(vma, address, pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; -- cgit v1.2.3 From a6c1d9cb9a68bfa4512248419c4f4d880d19fe90 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 1 Apr 2024 17:14:58 -0700 Subject: stackdepot: rename pool_index to pool_index_plus_1 Commit 3ee34eabac2a ("lib/stackdepot: fix first entry having a 0-handle") changed the meaning of the pool_index field to mean "the pool index plus 1". This made the code accessing this field less self-documenting, as well as causing debuggers such as drgn to not be able to easily remain compatible with both old and new kernels, because they typically do that by testing for presence of the new field. Because stackdepot is a debugging tool, we should make sure that it is debugger friendly. Therefore, give the field a different name to improve readability as well as enabling debugger backwards compatibility. This is needed in 6.9, which would otherwise become an odd release with the new semantics and old name so debuggers wouldn't recognize the new semantics there. Fixes: 3ee34eabac2a ("lib/stackdepot: fix first entry having a 0-handle") Link: https://lkml.kernel.org/r/20240402001500.53533-1-pcc@google.com Link: https://linux-review.googlesource.com/id/Ib3e70c36c1d230dd0a118dc22649b33e768b9f88 Signed-off-by: Peter Collingbourne Acked-by: Vlastimil Babka Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Acked-by: Oscar Salvador Cc: Andrey Konovalov Cc: Michal Hocko Cc: Omar Sandoval Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 7 +++---- lib/stackdepot.c | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 3c6caa5abc7c..e9ec32fb97d4 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -44,10 +44,9 @@ typedef u32 depot_stack_handle_t; union handle_parts { depot_stack_handle_t handle; struct { - /* pool_index is offset by 1 */ - u32 pool_index : DEPOT_POOL_INDEX_BITS; - u32 offset : DEPOT_OFFSET_BITS; - u32 extra : STACK_DEPOT_EXTRA_BITS; + u32 pool_index_plus_1 : DEPOT_POOL_INDEX_BITS; + u32 offset : DEPOT_OFFSET_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; diff --git a/lib/stackdepot.c b/lib/stackdepot.c index af6cc19a2003..68c97387aa54 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -330,7 +330,7 @@ static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size) stack = current_pool + pool_offset; /* Pre-initialize handle once. */ - stack->handle.pool_index = pool_index + 1; + stack->handle.pool_index_plus_1 = pool_index + 1; stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; stack->handle.extra = 0; INIT_LIST_HEAD(&stack->hash_list); @@ -441,7 +441,7 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) const int pools_num_cached = READ_ONCE(pools_num); union handle_parts parts = { .handle = handle }; void *pool; - u32 pool_index = parts.pool_index - 1; + u32 pool_index = parts.pool_index_plus_1 - 1; size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; -- cgit v1.2.3 From 9dc23cba0927d09cb481da064c8413eb9df42e2b Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Thu, 28 Mar 2024 09:02:45 +0100 Subject: drm/msm/adreno: Set highest_bank_bit for A619 The default highest_bank_bit of 15 didn't seem to cause issues so far but downstream defines it to be 14. But similar to [0] leaving it on 14 (or 15 for that matter) causes some corruption issues with some resolutions with DisplayPort, like 1920x1200. So set it to 13 for now so that there's no screen corruption. [0] commit 6a0dbcd20ef2 ("drm/msm/a6xx: set highest_bank_bit to 13 for a610") Fixes: b7616b5c69e6 ("drm/msm/adreno: Add A619 support") Signed-off-by: Luca Weiss Patchwork: https://patchwork.freedesktop.org/patch/585215/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 0674aca0f8a3..cf0b1de1c071 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1377,6 +1377,10 @@ static void a6xx_calc_ubwc_config(struct adreno_gpu *gpu) if (adreno_is_a618(gpu)) gpu->ubwc_config.highest_bank_bit = 14; + if (adreno_is_a619(gpu)) + /* TODO: Should be 14 but causes corruption at e.g. 1920x1200 on DP */ + gpu->ubwc_config.highest_bank_bit = 13; + if (adreno_is_a619_holi(gpu)) gpu->ubwc_config.highest_bank_bit = 13; -- cgit v1.2.3 From a6c4162d844dae4dbfea1bf9ecffcb852d3ed615 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Apr 2024 18:01:02 +0300 Subject: bcachefs: fix ! vs ~ typo in __clear_bit_le64() The ! was obviously intended to be ~. As it is, this function does the equivalent to: "addr[bit / 64] = 0;". Fixes: 27fcec6c27ca ("bcachefs: Clear recovery_passes_required as they complete without errors") Signed-off-by: Dan Carpenter Signed-off-by: Kent Overstreet --- fs/bcachefs/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b7e7c29278fc..de639e8a3ab5 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -795,7 +795,7 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr) static inline void __clear_bit_le64(size_t bit, __le64 *addr) { - addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64)); + addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64)); } static inline bool test_bit_le64(size_t bit, __le64 *addr) -- cgit v1.2.3 From cf979fca9a05d7d0b116257e5c4dc12b6bb7eb3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2024 16:21:18 -0400 Subject: bcachefs: fix rand_delete unit test Signed-off-by: Kent Overstreet --- fs/bcachefs/tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index b3fe9fc57747..bfec656f94c0 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) goto err; -- cgit v1.2.3 From 752e3c53de0fa3b7d817a83050b6699b8e9c6ec9 Mon Sep 17 00:00:00 2001 From: Adam Goldman Date: Mon, 25 Mar 2024 07:38:41 +0900 Subject: firewire: ohci: mask bus reset interrupts between ISR and bottom half In the FireWire OHCI interrupt handler, if a bus reset interrupt has occurred, mask bus reset interrupts until bus_reset_work has serviced and cleared the interrupt. Normally, we always leave bus reset interrupts masked. We infer the bus reset from the self-ID interrupt that happens shortly thereafter. A scenario where we unmask bus reset interrupts was introduced in 2008 in a007bb857e0b26f5d8b73c2ff90782d9c0972620: If OHCI_PARAM_DEBUG_BUSRESETS (8) is set in the debug parameter bitmask, we will unmask bus reset interrupts so we can log them. irq_handler logs the bus reset interrupt. However, we can't clear the bus reset event flag in irq_handler, because we won't service the event until later. irq_handler exits with the event flag still set. If the corresponding interrupt is still unmasked, the first bus reset will usually freeze the system due to irq_handler being called again each time it exits. This freeze can be reproduced by loading firewire_ohci with "modprobe firewire_ohci debug=-1" (to enable all debugging output). Apparently there are also some cases where bus_reset_work will get called soon enough to clear the event, and operation will continue normally. This freeze was first reported a few months after a007bb85 was committed, but until now it was never fixed. The debug level could safely be set to -1 through sysfs after the module was loaded, but this would be ineffectual in logging bus reset interrupts since they were only unmasked during initialization. irq_handler will now leave the event flag set but mask bus reset interrupts, so irq_handler won't be called again and there will be no freeze. If OHCI_PARAM_DEBUG_BUSRESETS is enabled, bus_reset_work will unmask the interrupt after servicing the event, so future interrupts will be caught as desired. As a side effect to this change, OHCI_PARAM_DEBUG_BUSRESETS can now be enabled through sysfs in addition to during initial module loading. However, when enabled through sysfs, logging of bus reset interrupts will be effective only starting with the second bus reset, after bus_reset_work has executed. Signed-off-by: Adam Goldman Signed-off-by: Takashi Sakamoto --- drivers/firewire/ohci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 7bc71f4be64a..38d19410a2be 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -2060,6 +2060,8 @@ static void bus_reset_work(struct work_struct *work) ohci->generation = generation; reg_write(ohci, OHCI1394_IntEventClear, OHCI1394_busReset); + if (param_debug & OHCI_PARAM_DEBUG_BUSRESETS) + reg_write(ohci, OHCI1394_IntMaskSet, OHCI1394_busReset); if (ohci->quirks & QUIRK_RESET_PACKET) ohci->request_generation = generation; @@ -2125,12 +2127,14 @@ static irqreturn_t irq_handler(int irq, void *data) return IRQ_NONE; /* - * busReset and postedWriteErr must not be cleared yet + * busReset and postedWriteErr events must not be cleared yet * (OHCI 1.1 clauses 7.2.3.2 and 13.2.8.1) */ reg_write(ohci, OHCI1394_IntEventClear, event & ~(OHCI1394_busReset | OHCI1394_postedWriteErr)); log_irqs(ohci, event); + if (event & OHCI1394_busReset) + reg_write(ohci, OHCI1394_IntMaskClear, OHCI1394_busReset); if (event & OHCI1394_selfIDComplete) queue_work(selfid_workqueue, &ohci->bus_reset_work); -- cgit v1.2.3 From 97a54ef596c3fd24ec2b227ba8aaf2cf5415e779 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 15 Feb 2024 15:39:43 +0100 Subject: scsi: target: Fix SELinux error when systemd-modules loads the target module If the systemd-modules service loads the target module, the credentials of that userspace process will be used to validate the access to the target db directory. SELinux will prevent it, reporting an error like the following: kernel: audit: type=1400 audit(1676301082.205:4): avc: denied { read } for pid=1020 comm="systemd-modules" name="target" dev="dm-3" ino=4657583 scontext=system_u:system_r:systemd_modules_load_t:s0 tcontext=system_u:object_r:targetd_etc_rw_t:s0 tclass=dir permissive=0 Fix the error by using the kernel credentials to access the db directory Signed-off-by: Maurizio Lombardi Link: https://lore.kernel.org/r/20240215143944.847184-2-mlombard@redhat.com Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/target/target_core_configfs.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index c1fbcdd16182..c40217f44b1b 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -3672,6 +3672,8 @@ static int __init target_core_init_configfs(void) { struct configfs_subsystem *subsys = &target_core_fabrics; struct t10_alua_lu_gp *lu_gp; + struct cred *kern_cred; + const struct cred *old_cred; int ret; pr_debug("TARGET_CORE[0]: Loading Generic Kernel Storage" @@ -3748,11 +3750,21 @@ static int __init target_core_init_configfs(void) if (ret < 0) goto out; + /* We use the kernel credentials to access the target directory */ + kern_cred = prepare_kernel_cred(&init_task); + if (!kern_cred) { + ret = -ENOMEM; + goto out; + } + old_cred = override_creds(kern_cred); target_init_dbroot(); + revert_creds(old_cred); + put_cred(kern_cred); return 0; out: + target_xcopy_release_pt(); configfs_unregister_subsystem(subsys); core_dev_release_virtual_lun0(); rd_module_exit(); -- cgit v1.2.3 From 358e919a351f2ea4b412e7dac6b1c23ec10bd4f5 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Tue, 2 Apr 2024 11:55:12 +0800 Subject: scsi: hisi_sas: Handle the NCQ error returned by D2H frame We find that some disks use D2H frame instead of SDB frame to return NCQ error. Currently, only the I/O corresponding to the D2H frame is processed in this scenario, which does not meet the processing requirements of the NCQ error scenario. So we set dev_status to HISI_SAS_DEV_NCQ_ERR and abort all I/Os of the disk in this scenario. Co-developed-by: Xingui Yang Signed-off-by: Xingui Yang Signed-off-by: Xiang Chen Link: https://lore.kernel.org/r/20240402035513.2024241-2-chenxiang66@hisilicon.com Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 7d2a33514538..34f96cc35342 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -2244,7 +2244,15 @@ slot_err_v3_hw(struct hisi_hba *hisi_hba, struct sas_task *task, case SAS_PROTOCOL_SATA | SAS_PROTOCOL_STP: if ((dw0 & CMPLT_HDR_RSPNS_XFRD_MSK) && (sipc_rx_err_type & RX_FIS_STATUS_ERR_MSK)) { - ts->stat = SAS_PROTO_RESPONSE; + if (task->ata_task.use_ncq) { + struct domain_device *device = task->dev; + struct hisi_sas_device *sas_dev = device->lldd_dev; + + sas_dev->dev_status = HISI_SAS_DEV_NCQ_ERR; + slot->abort = 1; + } else { + ts->stat = SAS_PROTO_RESPONSE; + } } else if (dma_rx_err_type & RX_DATA_LEN_UNDERFLOW_MSK) { ts->residual = trans_tx_fail_type; ts->stat = SAS_DATA_UNDERRUN; -- cgit v1.2.3 From 0098c55e0881f0b32591f2110410d5c8b7f9bd5a Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Tue, 2 Apr 2024 11:55:13 +0800 Subject: scsi: hisi_sas: Modify the deadline for ata_wait_after_reset() We found that the second parameter of function ata_wait_after_reset() is incorrectly used. We call smp_ata_check_ready_type() to poll the device type until the 30s timeout, so the correct deadline should be (jiffies + 30000). Fixes: 3c2673a09cf1 ("scsi: hisi_sas: Fix SATA devices missing issue during I_T nexus reset") Co-developed-by: xiabing Signed-off-by: xiabing Co-developed-by: Yihang Li Signed-off-by: Yihang Li Signed-off-by: Xiang Chen Link: https://lore.kernel.org/r/20240402035513.2024241-3-chenxiang66@hisilicon.com Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 097dfe4b620d..35f8e00850d6 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1797,7 +1797,7 @@ static int hisi_sas_debug_I_T_nexus_reset(struct domain_device *device) if (dev_is_sata(device)) { struct ata_link *link = &device->sata_dev.ap->link; - rc = ata_wait_after_reset(link, HISI_SAS_WAIT_PHYUP_TIMEOUT, + rc = ata_wait_after_reset(link, jiffies + HISI_SAS_WAIT_PHYUP_TIMEOUT, smp_ata_check_ready_type); } else { msleep(2000); -- cgit v1.2.3 From 4406e4176f47177f5e51b4cc7e6a7a2ff3dbfbbd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 2 Apr 2024 12:56:54 +0300 Subject: scsi: qla2xxx: Fix off by one in qla_edif_app_getstats() The app_reply->elem[] array is allocated earlier in this function and it has app_req.num_ports elements. Thus this > comparison needs to be >= to prevent memory corruption. Fixes: 7878f22a2e03 ("scsi: qla2xxx: edif: Add getfcinfo and statistic bsgs") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/5c125b2f-92dd-412b-9b6f-fc3a3207bd60@moroto.mountain Reviewed-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_edif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_edif.c b/drivers/scsi/qla2xxx/qla_edif.c index 26e6b3e3af43..dcde55c8ee5d 100644 --- a/drivers/scsi/qla2xxx/qla_edif.c +++ b/drivers/scsi/qla2xxx/qla_edif.c @@ -1100,7 +1100,7 @@ qla_edif_app_getstats(scsi_qla_host_t *vha, struct bsg_job *bsg_job) list_for_each_entry_safe(fcport, tf, &vha->vp_fcports, list) { if (fcport->edif.enable) { - if (pcnt > app_req.num_ports) + if (pcnt >= app_req.num_ports) break; app_reply->elem[pcnt].rekey_count = -- cgit v1.2.3 From 978e5c19dfefc271e5550efba92fcef0d3f62864 Mon Sep 17 00:00:00 2001 From: Alexey Izbyshev Date: Fri, 5 Apr 2024 15:55:51 +0300 Subject: io_uring: Fix io_cqring_wait() not restoring sigmask on get_timespec64() failure This bug was introduced in commit 950e79dd7313 ("io_uring: minor io_cqring_wait() optimization"), which was made in preparation for adc8682ec690 ("io_uring: Add support for napi_busy_poll"). The latter got reverted in cb3182167325 ("Revert "io_uring: Add support for napi_busy_poll""), so simply undo the former as well. Cc: stable@vger.kernel.org Fixes: 950e79dd7313 ("io_uring: minor io_cqring_wait() optimization") Signed-off-by: Alexey Izbyshev Link: https://lore.kernel.org/r/20240405125551.237142-1-izbyshev@ispras.ru Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4521c2b66b98..c170a2b8d2cf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2602,19 +2602,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (__io_cqring_events_user(ctx) >= min_events) return 0; - if (sig) { -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - sigsz); - else -#endif - ret = set_user_sigmask(sig, sigsz); - - if (ret) - return ret; - } - init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); @@ -2633,6 +2620,19 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, io_napi_adjust_timeout(ctx, &iowq, &ts); } + if (sig) { +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, + sigsz); + else +#endif + ret = set_user_sigmask(sig, sigsz); + + if (ret) + return ret; + } + io_napi_busy_loop(ctx, &iowq); trace_io_uring_cqring_wait(ctx, min_events); -- cgit v1.2.3 From beaa51b36012fad5a4d3c18b88a617aea7a9b96d Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 4 Apr 2024 12:32:53 -0400 Subject: blk-iocost: avoid out of bounds shift UBSAN catches undefined behavior in blk-iocost, where sometimes iocg->delay is shifted right by a number that is too large, resulting in undefined behavior on some architectures. [ 186.556576] ------------[ cut here ]------------ UBSAN: shift-out-of-bounds in block/blk-iocost.c:1366:23 shift exponent 64 is too large for 64-bit type 'u64' (aka 'unsigned long long') CPU: 16 PID: 0 Comm: swapper/16 Tainted: G S E N 6.9.0-0_fbk700_debug_rc2_kbuilder_0_gc85af715cac0 #1 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A23 12/08/2020 Call Trace: dump_stack_lvl+0x8f/0xe0 __ubsan_handle_shift_out_of_bounds+0x22c/0x280 iocg_kick_delay+0x30b/0x310 ioc_timer_fn+0x2fb/0x1f80 __run_timer_base+0x1b6/0x250 ... Avoid that undefined behavior by simply taking the "delay = 0" branch if the shift is too large. I am not sure what the symptoms of an undefined value delay will be, but I suspect it could be more than a little annoying to debug. Signed-off-by: Rik van Riel Cc: Tejun Heo Cc: Josef Bacik Cc: Jens Axboe Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240404123253.0f58010f@imladris.surriel.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 9a85bfbbc45a..baa20c85799d 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1347,7 +1347,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); - u64 tdelta, delay, new_delay; + u64 tdelta, delay, new_delay, shift; s64 vover, vover_pct; u32 hwa; @@ -1362,8 +1362,9 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; - if (iocg->delay) - delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); + shift = div64_u64(tdelta, USEC_PER_SEC); + if (iocg->delay && shift < BITS_PER_LONG) + delay = iocg->delay >> shift; else delay = 0; -- cgit v1.2.3 From 4539f91f2a801c0c028c252bffae56030cfb2cae Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 3 Apr 2024 22:38:01 +0200 Subject: net: openvswitch: fix unwanted error log on timeout policy probing On startup, ovs-vswitchd probes different datapath features including support for timeout policies. While probing, it tries to execute certain operations with OVS_PACKET_ATTR_PROBE or OVS_FLOW_ATTR_PROBE attributes set. These attributes tell the openvswitch module to not log any errors when they occur as it is expected that some of the probes will fail. For some reason, setting the timeout policy ignores the PROBE attribute and logs a failure anyway. This is causing the following kernel log on each re-start of ovs-vswitchd: kernel: Failed to associated timeout policy `ovs_test_tp' Fix that by using the same logging macro that all other messages are using. The message will still be printed at info level when needed and will be rate limited, but with a net rate limiter instead of generic printk one. The nf_ct_set_timeout() itself will still print some info messages, but at least this change makes logging in openvswitch module more consistent. Fixes: 06bd2bdf19d2 ("openvswitch: Add timeout support to ct action") Signed-off-by: Ilya Maximets Acked-by: Eelco Chaudron Link: https://lore.kernel.org/r/20240403203803.2137962-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- net/openvswitch/conntrack.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 3019a4406ca4..74b63cdb5992 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1380,8 +1380,9 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (ct_info.timeout[0]) { if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto, ct_info.timeout)) - pr_info_ratelimited("Failed to associated timeout " - "policy `%s'\n", ct_info.timeout); + OVS_NLERR(log, + "Failed to associated timeout policy '%s'", + ct_info.timeout); else ct_info.nf_ct_timeout = rcu_dereference( nf_ct_timeout_find(ct_info.ct)->timeout); -- cgit v1.2.3 From 38a15d0a50e0a43778561a5861403851f0b0194c Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Thu, 4 Apr 2024 09:57:40 +0200 Subject: u64_stats: fix u64_stats_init() for lockdep when used repeatedly in one file Fix bogus lockdep warnings if multiple u64_stats_sync variables are initialized in the same file. With CONFIG_LOCKDEP, seqcount_init() is a macro which declares: static struct lock_class_key __key; Since u64_stats_init() is a function (albeit an inline one), all calls within the same file end up using the same instance, effectively treating them all as a single lock-class. Fixes: 9464ca650008 ("net: make u64_stats_init() a function") Closes: https://lore.kernel.org/netdev/ea1567d9-ce66-45e6-8168-ac40a47d1821@roeck-us.net/ Signed-off-by: Petr Tesarik Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240404075740.30682-1-petr@tesarici.cz Signed-off-by: Jakub Kicinski --- include/linux/u64_stats_sync.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index ffe48e69b3f3..457879938fc1 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -135,10 +135,11 @@ static inline void u64_stats_inc(u64_stats_t *p) p->v++; } -static inline void u64_stats_init(struct u64_stats_sync *syncp) -{ - seqcount_init(&syncp->seq); -} +#define u64_stats_init(syncp) \ + do { \ + struct u64_stats_sync *__s = (syncp); \ + seqcount_init(&__s->seq); \ + } while (0) static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { -- cgit v1.2.3 From 237f3cf13b20db183d3706d997eedc3c49eacd44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Apr 2024 20:27:38 +0000 Subject: xsk: validate user input for XDP_{UMEM|COMPLETION}_FILL_RING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzbot reported an illegal copy in xsk_setsockopt() [1] Make sure to validate setsockopt() @optlen parameter. [1] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in xsk_setsockopt+0x909/0xa40 net/xdp/xsk.c:1420 Read of size 4 at addr ffff888028c6cde3 by task syz-executor.0/7549 CPU: 0 PID: 7549 Comm: syz-executor.0 Not tainted 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] xsk_setsockopt+0x909/0xa40 net/xdp/xsk.c:1420 do_sock_setsockopt+0x3af/0x720 net/socket.c:2311 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7fb40587de69 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fb40665a0c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00007fb4059abf80 RCX: 00007fb40587de69 RDX: 0000000000000005 RSI: 000000000000011b RDI: 0000000000000006 RBP: 00007fb4058ca47a R08: 0000000000000002 R09: 0000000000000000 R10: 0000000020001980 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007fb4059abf80 R15: 00007fff57ee4d08 Allocated by task 7549: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:370 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:387 kasan_kmalloc include/linux/kasan.h:211 [inline] __do_kmalloc_node mm/slub.c:3966 [inline] __kmalloc+0x233/0x4a0 mm/slub.c:3979 kmalloc include/linux/slab.h:632 [inline] __cgroup_bpf_run_filter_setsockopt+0xd2f/0x1040 kernel/bpf/cgroup.c:1869 do_sock_setsockopt+0x6b4/0x720 net/socket.c:2293 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 The buggy address belongs to the object at ffff888028c6cde0 which belongs to the cache kmalloc-8 of size 8 The buggy address is located 1 bytes to the right of allocated 2-byte region [ffff888028c6cde0, ffff888028c6cde2) The buggy address belongs to the physical page: page:ffffea0000a31b00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888028c6c9c0 pfn:0x28c6c anon flags: 0xfff00000000800(slab|node=0|zone=1|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 00fff00000000800 ffff888014c41280 0000000000000000 dead000000000001 raw: ffff888028c6c9c0 0000000080800057 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x112cc0(GFP_USER|__GFP_NOWARN|__GFP_NORETRY), pid 6648, tgid 6644 (syz-executor.0), ts 133906047828, free_ts 133859922223 set_page_owner include/linux/page_owner.h:31 [inline] post_alloc_hook+0x1ea/0x210 mm/page_alloc.c:1533 prep_new_page mm/page_alloc.c:1540 [inline] get_page_from_freelist+0x33ea/0x3580 mm/page_alloc.c:3311 __alloc_pages+0x256/0x680 mm/page_alloc.c:4569 __alloc_pages_node include/linux/gfp.h:238 [inline] alloc_pages_node include/linux/gfp.h:261 [inline] alloc_slab_page+0x5f/0x160 mm/slub.c:2175 allocate_slab mm/slub.c:2338 [inline] new_slab+0x84/0x2f0 mm/slub.c:2391 ___slab_alloc+0xc73/0x1260 mm/slub.c:3525 __slab_alloc mm/slub.c:3610 [inline] __slab_alloc_node mm/slub.c:3663 [inline] slab_alloc_node mm/slub.c:3835 [inline] __do_kmalloc_node mm/slub.c:3965 [inline] __kmalloc_node+0x2db/0x4e0 mm/slub.c:3973 kmalloc_node include/linux/slab.h:648 [inline] __vmalloc_area_node mm/vmalloc.c:3197 [inline] __vmalloc_node_range+0x5f9/0x14a0 mm/vmalloc.c:3392 __vmalloc_node mm/vmalloc.c:3457 [inline] vzalloc+0x79/0x90 mm/vmalloc.c:3530 bpf_check+0x260/0x19010 kernel/bpf/verifier.c:21162 bpf_prog_load+0x1667/0x20f0 kernel/bpf/syscall.c:2895 __sys_bpf+0x4ee/0x810 kernel/bpf/syscall.c:5631 __do_sys_bpf kernel/bpf/syscall.c:5738 [inline] __se_sys_bpf kernel/bpf/syscall.c:5736 [inline] __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:5736 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 page last free pid 6650 tgid 6647 stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1140 [inline] free_unref_page_prepare+0x95d/0xa80 mm/page_alloc.c:2346 free_unref_page_list+0x5a3/0x850 mm/page_alloc.c:2532 release_pages+0x2117/0x2400 mm/swap.c:1042 tlb_batch_pages_flush mm/mmu_gather.c:98 [inline] tlb_flush_mmu_free mm/mmu_gather.c:293 [inline] tlb_flush_mmu+0x34d/0x4e0 mm/mmu_gather.c:300 tlb_finish_mmu+0xd4/0x200 mm/mmu_gather.c:392 exit_mmap+0x4b6/0xd40 mm/mmap.c:3300 __mmput+0x115/0x3c0 kernel/fork.c:1345 exit_mm+0x220/0x310 kernel/exit.c:569 do_exit+0x99e/0x27e0 kernel/exit.c:865 do_group_exit+0x207/0x2c0 kernel/exit.c:1027 get_signal+0x176e/0x1850 kernel/signal.c:2907 arch_do_signal_or_restart+0x96/0x860 arch/x86/kernel/signal.c:310 exit_to_user_mode_loop kernel/entry/common.c:105 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:201 [inline] syscall_exit_to_user_mode+0xc9/0x360 kernel/entry/common.c:212 do_syscall_64+0x10a/0x240 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Memory state around the buggy address: ffff888028c6cc80: fa fc fc fc fa fc fc fc fa fc fc fc fa fc fc fc ffff888028c6cd00: fa fc fc fc fa fc fc fc 00 fc fc fc 06 fc fc fc >ffff888028c6cd80: fa fc fc fc fa fc fc fc fa fc fc fc 02 fc fc fc ^ ffff888028c6ce00: fa fc fc fc fa fc fc fc fa fc fc fc fa fc fc fc ffff888028c6ce80: fa fc fc fc fa fc fc fc fa fc fc fc fa fc fc fc Fixes: 423f38329d26 ("xsk: add umem fill queue support and mmap") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: "Björn Töpel" Cc: Magnus Karlsson Cc: Maciej Fijalkowski Cc: Jonathan Lemon Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240404202738.3634547-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3404d076a8a3..727aa20be4bd 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -1417,6 +1417,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xsk_queue **q; int entries; + if (optlen < sizeof(entries)) + return -EINVAL; if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; -- cgit v1.2.3 From b377c66ae3509ccea596512d6afb4777711c4870 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 5 Apr 2024 16:46:37 +0200 Subject: x86/retpoline: Add NOENDBR annotation to the SRSO dummy return thunk srso_alias_untrain_ret() is special code, even if it is a dummy which is called in the !SRSO case, so annotate it like its real counterpart, to address the following objtool splat: vmlinux.o: warning: objtool: .export_symbol+0x2b290: data relocation to !ENDBR: srso_alias_untrain_ret+0x0 Fixes: 4535e1a4174c ("x86/bugs: Fix the SRSO mitigation on Zen3/4") Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240405144637.17908-1-bp@kernel.org --- arch/x86/lib/retpoline.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 0795b3464058..e674ccf720b9 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -229,6 +229,7 @@ SYM_CODE_END(srso_return_thunk) /* Dummy for the alternative in CALL_UNTRAIN_RET. */ SYM_CODE_START(srso_alias_untrain_ret) ANNOTATE_UNRET_SAFE + ANNOTATE_NOENDBR ret int3 SYM_FUNC_END(srso_alias_untrain_ret) -- cgit v1.2.3 From 374b3d38feff4a1cb4ecadace9bd915ffd91fe4b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2024 22:23:29 -0400 Subject: bcachefs: Fix BCH_IOCTL_FSCK_OFFLINE for encrypted filesystems To open an encrypted filesystem, we use request_key() to get the encryption key from the user's keyring - but request_key() needs to happen in the context of the process that invoked the ioctl. This easily fixed by using bch2_fs_open() in nostart mode. Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 94 +++++++++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index cbfa6459bdbc..1e033df330b0 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -134,42 +134,38 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg struct fsck_thread { struct thread_with_stdio thr; struct bch_fs *c; - char **devs; - size_t nr_devs; struct bch_opts opts; }; static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) { struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); - if (thr->devs) - for (size_t i = 0; i < thr->nr_devs; i++) - kfree(thr->devs[i]); - kfree(thr->devs); kfree(thr); } static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) { struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); - - if (IS_ERR(c)) - return PTR_ERR(c); + struct bch_fs *c = thr->c; - int ret = 0; - if (test_bit(BCH_FS_errors_fixed, &c->flags)) - ret |= 1; - if (test_bit(BCH_FS_error, &c->flags)) - ret |= 4; + int ret = PTR_ERR_OR_ZERO(c); + if (ret) + return ret; - bch2_fs_stop(c); + ret = bch2_fs_start(thr->c); + if (ret) + goto err; - if (ret & 1) + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); - if (ret & 4) + ret |= 1; + } + if (test_bit(BCH_FS_error, &c->flags)) { bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); - + ret |= 4; + } +err: + bch2_fs_stop(c); return ret; } @@ -182,7 +178,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a { struct bch_ioctl_fsck_offline arg; struct fsck_thread *thr = NULL; - u64 *devs = NULL; + darray_str(devs) = {}; long ret = 0; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -194,29 +190,32 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) || - !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) || - !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) { - ret = -ENOMEM; - goto err; - } + for (size_t i = 0; i < arg.nr_devs; i++) { + u64 dev_u64; + ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); + if (ret) + goto err; - thr->opts = bch2_opts_empty(); - thr->nr_devs = arg.nr_devs; + char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); + ret = PTR_ERR_OR_ZERO(dev_str); + if (ret) + goto err; - if (copy_from_user(devs, &user_arg->devs[0], - array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) { - ret = -EINVAL; - goto err; + ret = darray_push(&devs, dev_str); + if (ret) { + kfree(dev_str); + goto err; + } } - for (size_t i = 0; i < arg.nr_devs; i++) { - thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX); - ret = PTR_ERR_OR_ZERO(thr->devs[i]); - if (ret) - goto err; + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; } + thr->opts = bch2_opts_empty(); + if (arg.opts) { char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); @@ -230,15 +229,22 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + /* We need request_key() to be called before we punt to kthread: */ + opt_set(thr->opts, nostart, true); + + thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); -err: - if (ret < 0) { - if (thr) - bch2_fsck_thread_exit(&thr->thr); - pr_err("ret %s", bch2_err_str(ret)); - } - kfree(devs); +out: + darray_for_each(devs, i) + kfree(*i); + darray_exit(&devs); return ret; +err: + if (thr) + bch2_fsck_thread_exit(&thr->thr); + pr_err("ret %s", bch2_err_str(ret)); + goto out; } static long bch2_global_ioctl(unsigned cmd, void __user *arg) -- cgit v1.2.3 From 05801b6526156aefe55c0440fab877109c9a89c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2024 22:30:30 -0400 Subject: bcachefs: Disable errors=panic for BCH_IOCTL_FSCK_OFFLINE BCH_IOCTL_FSCK_OFFLINE allows the userspace fsck tool to use the kernel implementation of fsck - primarily when the kernel version is a better version match. It should look and act exactly like the normal userspace fsck that the user expected to be invoking, so errors should never result in a kernel panic. We may want to consider further restricting errors=panic - it's only intended for debugging in controlled test environments, it should have no purpose it normal usage. Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 1e033df330b0..72781aad6ba7 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -234,6 +234,10 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + if (!IS_ERR(thr->c) && + thr->c->opts.errors == BCH_ON_ERROR_panic) + thr->c->opts.errors = BCH_ON_ERROR_ro; + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); out: darray_for_each(devs, i) -- cgit v1.2.3 From 6088234ce83acec4aaf56ecc0e9525bac18b4295 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Apr 2024 23:27:27 -0400 Subject: bcachefs: JOURNAL_SPACE_LOW "bcachefs; Fix deadlock in bch2_btree_update_start()" was a significant performance regression (nearly 50%) on multithreaded random writes with fio. The reason is that the journal watermark checks multiple things, including the state of the btree write buffer, and on multithreaded update heavy workloads we're bottleneked on write buffer flushing - we don't want kicknig off btree updates to depend on the state of the write buffer. This isn't strictly correct; the interior btree update path does do write buffer updates, but it's a tiny fraction of total accounting updates and we're more concerned with space in the journal itself. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_update_interior.c | 18 +++++++----------- fs/bcachefs/journal_reclaim.c | 2 ++ fs/bcachefs/journal_types.h | 1 + fs/bcachefs/util.h | 8 ++++++++ 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 581edcb0911b..4ae7890f07c6 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -659,7 +659,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, commit_flags |= BCH_WATERMARK_reclaim; if (ck->journal.seq != journal_last_seq(j) || - j->watermark == BCH_WATERMARK_stripe) + !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; ret = bch2_btree_iter_traverse(&b_iter) ?: diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 24fbd5be70a2..a4a63e363047 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1125,18 +1125,14 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (watermark < c->journal.watermark) { - struct journal_res res = { 0 }; - unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; + if (watermark < BCH_WATERMARK_reclaim && + test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) { + if (flags & BCH_TRANS_COMMIT_journal_reclaim) + return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); - if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < BCH_WATERMARK_reclaim) - journal_flags |= JOURNAL_RES_GET_NONBLOCK; - - ret = drop_locks_do(trans, - bch2_journal_res_get(&c->journal, &res, 1, journal_flags)); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = -BCH_ERR_journal_reclaim_would_deadlock; + bch2_trans_unlock(trans); + wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)); + ret = bch2_trans_relock(trans); if (ret) return ERR_PTR(ret); } diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ab811c0dad26..04a577848b01 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -67,6 +67,8 @@ void bch2_journal_set_watermark(struct journal *j) track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); + mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin); + swap(watermark, j->watermark); if (watermark > j->watermark) journal_wake(j); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 8c053cb64ca5..b5161b5d76a0 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -134,6 +134,7 @@ enum journal_flags { JOURNAL_STARTED, JOURNAL_MAY_SKIP_FLUSH, JOURNAL_NEED_FLUSH_WRITE, + JOURNAL_SPACE_LOW, }; /* Reasons we may fail to get a journal reservation: */ diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index de639e8a3ab5..5cf885b09986 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -788,6 +788,14 @@ static inline int copy_from_user_errcode(void *to, const void __user *from, unsi #endif +static inline void mod_bit(long nr, volatile unsigned long *addr, bool v) +{ + if (v) + set_bit(nr, addr); + else + clear_bit(nr, addr); +} + static inline void __set_bit_le64(size_t bit, __le64 *addr) { addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); -- cgit v1.2.3 From aa98e70fc6c97f39a7bd68cb1e641ca50d4f9423 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 5 Apr 2024 14:23:18 +0700 Subject: Documentation: filesystems: Add bcachefs toctree Commit eb386617be4bdf ("bcachefs: Errcode tracepoint, documentation") adds initial bcachefs documentation (private error codes) but without any table of contents tree for the filesystem docs, hence Sphinx warns: Documentation/filesystems/bcachefs/errorcodes.rst: WARNING: document isn't included in any toctree Add bcachefs toctree to fix above warning. Fixes: eb386617be4b ("bcachefs: Errcode tracepoint, documentation") Signed-off-by: Bagas Sanjaya Signed-off-by: Kent Overstreet --- Documentation/filesystems/bcachefs/index.rst | 11 +++++++++++ Documentation/filesystems/index.rst | 1 + 2 files changed, 12 insertions(+) create mode 100644 Documentation/filesystems/bcachefs/index.rst diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst new file mode 100644 index 000000000000..e2bd61ccd96f --- /dev/null +++ b/Documentation/filesystems/bcachefs/index.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +bcachefs Documentation +====================== + +.. toctree:: + :maxdepth: 2 + :numbered: + + errorcodes diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 0ea1e44fa028..1f9b4c905a6a 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -69,6 +69,7 @@ Documentation for filesystem implementations. afs autofs autofs-mount-control + bcachefs/index befs bfs btrfs -- cgit v1.2.3 From 7d83cf53c77c8fdc0a8033e1472ac62745fac2ac Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 5 Apr 2024 14:23:19 +0700 Subject: MAINTAINERS: Add entry for bcachefs documentation Now that bcachefs docs exist in Documentation/filesystems/bcachefs/, cover it in MAINTAINERS entry for the filesystem. Signed-off-by: Bagas Sanjaya Signed-off-by: Kent Overstreet --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d..c0091a206fd2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3573,6 +3573,7 @@ S: Supported C: irc://irc.oftc.net/bcache T: git https://evilpiepirate.org/git/bcachefs.git F: fs/bcachefs/ +F: Documentation/filesystems/bcachefs/ BDISP ST MEDIA DRIVER M: Fabien Dessenne -- cgit v1.2.3 From 2d793e9315e335ef299024fe8bf7742c9c974b33 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 6 Apr 2024 16:19:20 +0200 Subject: bcachefs: Rename struct field swap to prevent macro naming collision The struct field swap can collide with the swap() macro defined in linux/minmax.h. Rename the struct field to prevent such collisions. Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 4ce5e957a6e9..0f955c3c76a7 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -115,7 +115,7 @@ static void swap_bytes(void *a, void *b, size_t n) struct wrapper { cmp_func_t cmp; - swap_func_t swap; + swap_func_t swap_func; }; /* @@ -125,7 +125,7 @@ struct wrapper { static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { if (swap_func == SWAP_WRAPPER) { - ((const struct wrapper *)priv)->swap(a, b, (int)size); + ((const struct wrapper *)priv)->swap_func(a, b, (int)size); return; } @@ -174,7 +174,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, int i, c, r; /* called from 'sort' without swap function, let's pick the default */ - if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) swap_func = NULL; if (!swap_func) { @@ -227,7 +227,7 @@ void eytzinger0_sort(void *base, size_t n, size_t size, { struct wrapper w = { .cmp = cmp_func, - .swap = swap_func, + .swap_func = swap_func, }; return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); -- cgit v1.2.3 From 30e615a2ce6601d85729caefd8ac15634f848e59 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Apr 2024 21:45:46 -0400 Subject: bcachefs: Fix gap buffer bug in bch2_journal_key_insert_take() Multiple bug fixes for journal iters: - When the journal keys gap buffer is resized, we have to adjust the iterators for moving the gap to the end - We don't want to rewind iterators to point to the key we just inserted if it's not for the correct btree/level Also, add some new assertions. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 55 ++++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 5cbcbfe85235..a6413a8747d3 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -130,12 +130,30 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); } +static void journal_iter_verify(struct journal_iter *iter) +{ + struct journal_keys *keys = iter->keys; + size_t gap_size = keys->size - keys->nr; + + BUG_ON(iter->idx >= keys->gap && + iter->idx < keys->gap + gap_size); + + if (iter->idx < keys->size) { + struct journal_key *k = keys->data + iter->idx; + + int cmp = cmp_int(k->btree_id, iter->btree_id) ?: + cmp_int(k->level, iter->level); + BUG_ON(cmp < 0); + } +} + static void journal_iters_fix(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; /* The key we just inserted is immediately before the gap: */ size_t gap_end = keys->gap + (keys->size - keys->nr); - struct btree_and_journal_iter *iter; + struct journal_key *new_key = &keys->data[keys->gap - 1]; + struct journal_iter *iter; /* * If an iterator points one after the key we just inserted, decrement @@ -143,9 +161,14 @@ static void journal_iters_fix(struct bch_fs *c) * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will * handle that: */ - list_for_each_entry(iter, &c->journal_iters, journal.list) - if (iter->journal.idx == gap_end) - iter->journal.idx = keys->gap - 1; + list_for_each_entry(iter, &c->journal_iters, list) { + journal_iter_verify(iter); + if (iter->idx == gap_end && + new_key->btree_id == iter->btree_id && + new_key->level == iter->level) + iter->idx = keys->gap - 1; + journal_iter_verify(iter); + } } static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) @@ -192,7 +215,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (idx > keys->gap) idx -= keys->size - keys->nr; + size_t old_gap = keys->gap; + if (keys->nr == keys->size) { + journal_iters_move_gap(c, old_gap, keys->size); + old_gap = keys->size; + struct journal_keys new_keys = { .nr = keys->nr, .size = max_t(size_t, keys->size, 8) * 2, @@ -216,7 +244,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, keys->gap = keys->nr; } - journal_iters_move_gap(c, keys->gap, idx); + journal_iters_move_gap(c, old_gap, idx); move_gap(keys, idx); @@ -301,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->keys->data + iter->idx; + journal_iter_verify(iter); + + while (iter->idx < iter->keys->size) { + struct journal_key *k = iter->keys->data + iter->idx; + + int cmp = cmp_int(k->btree_id, iter->btree_id) ?: + cmp_int(k->level, iter->level); + if (cmp > 0) + break; + BUG_ON(cmp); - while (k < iter->keys->data + iter->keys->size && - k->btree_id == iter->btree_id && - k->level == iter->level) { if (!k->overwritten) return bkey_i_to_s_c(k->k); bch2_journal_iter_advance(iter); - k = iter->keys->data + iter->idx; } return bkey_s_c_null; @@ -330,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c, iter->level = level; iter->keys = &c->journal_keys; iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); + + journal_iter_verify(iter); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) -- cgit v1.2.3 From 84471d01c92c33b3f4cedfe319639ecf7f8fc4c5 Mon Sep 17 00:00:00 2001 From: Vitaly Rodionov Date: Fri, 5 Apr 2024 22:06:35 +0100 Subject: ALSA: hda/realtek: Add quirk for HP SnowWhite laptops Add support for HP SnowWhite laptops with CS35L51 amplifiers on I2C bus connected to Realtek codec. Signed-off-by: Vitaly Rodionov Message-ID: <20240405210635.22193-1-vitalyr@opensource.cirrus.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index cdcb28aa9d7b..d6940bc4ec39 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10084,6 +10084,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8ca7, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8cdd, "HP Spectre", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8cde, "HP Spectre", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x8cdf, "HP SnowWhite", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8ce0, "HP SnowWhite", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8cf5, "HP ZBook Studio 16", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), -- cgit v1.2.3 From 0b6f0ff01a4a8c1b66c600263465976d57dcc1a3 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Sat, 6 Apr 2024 21:20:09 +0800 Subject: ALSA: hda/tas2781: correct the register for pow calibrated data Calibrated data was written into an incorrect register, which cause speaker protection sometimes malfuctions Fixes: 5be27f1e3ec9 ("ALSA: hda/tas2781: Add tas2781 HDA driver") Signed-off-by: Shenghao Ding Cc: Message-ID: <20240406132010.341-1-shenghao-ding@ti.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_i2c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/tas2781_hda_i2c.c b/sound/pci/hda/tas2781_hda_i2c.c index 48dae3339305..75f7674c66ee 100644 --- a/sound/pci/hda/tas2781_hda_i2c.c +++ b/sound/pci/hda/tas2781_hda_i2c.c @@ -514,10 +514,10 @@ static int tas2563_save_calibration(struct tasdevice_priv *tas_priv) static void tas2781_apply_calib(struct tasdevice_priv *tas_priv) { static const unsigned char page_array[CALIB_MAX] = { - 0x17, 0x18, 0x18, 0x0d, 0x18 + 0x17, 0x18, 0x18, 0x13, 0x18, }; static const unsigned char rgno_array[CALIB_MAX] = { - 0x74, 0x0c, 0x14, 0x3c, 0x7c + 0x74, 0x0c, 0x14, 0x70, 0x7c, }; unsigned char *data; int i, j, rc; -- cgit v1.2.3 From 059a49aa2e25c58f90b50151f109dd3c4cdb3a47 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 3 Apr 2024 08:43:12 -0700 Subject: virtio_net: Do not send RSS key if it is not supported There is a bug when setting the RSS options in virtio_net that can break the whole machine, getting the kernel into an infinite loop. Running the following command in any QEMU virtual machine with virtionet will reproduce this problem: # ethtool -X eth0 hfunc toeplitz This is how the problem happens: 1) ethtool_set_rxfh() calls virtnet_set_rxfh() 2) virtnet_set_rxfh() calls virtnet_commit_rss_command() 3) virtnet_commit_rss_command() populates 4 entries for the rss scatter-gather 4) Since the command above does not have a key, then the last scatter-gatter entry will be zeroed, since rss_key_size == 0. sg_buf_size = vi->rss_key_size; 5) This buffer is passed to qemu, but qemu is not happy with a buffer with zero length, and do the following in virtqueue_map_desc() (QEMU function): if (!sz) { virtio_error(vdev, "virtio: zero sized buffers are not allowed"); 6) virtio_error() (also QEMU function) set the device as broken vdev->broken = true; 7) Qemu bails out, and do not repond this crazy kernel. 8) The kernel is waiting for the response to come back (function virtnet_send_command()) 9) The kernel is waiting doing the following : while (!virtqueue_get_buf(vi->cvq, &tmp) && !virtqueue_is_broken(vi->cvq)) cpu_relax(); 10) None of the following functions above is true, thus, the kernel loops here forever. Keeping in mind that virtqueue_is_broken() does not look at the qemu `vdev->broken`, so, it never realizes that the vitio is broken at QEMU side. Fix it by not sending RSS commands if the feature is not available in the device. Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.") Cc: stable@vger.kernel.org Cc: qemu-devel@nongnu.org Signed-off-by: Breno Leitao Reviewed-by: Heng Qi Reviewed-by: Xuan Zhuo Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index c22d1118a133..115c3c5414f2 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3807,6 +3807,7 @@ static int virtnet_set_rxfh(struct net_device *dev, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); + bool update = false; int i; if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE && @@ -3814,13 +3815,28 @@ static int virtnet_set_rxfh(struct net_device *dev, return -EOPNOTSUPP; if (rxfh->indir) { + if (!vi->has_rss) + return -EOPNOTSUPP; + for (i = 0; i < vi->rss_indir_table_size; ++i) vi->ctrl->rss.indirection_table[i] = rxfh->indir[i]; + update = true; } - if (rxfh->key) + + if (rxfh->key) { + /* If either _F_HASH_REPORT or _F_RSS are negotiated, the + * device provides hash calculation capabilities, that is, + * hash_key is configured. + */ + if (!vi->has_rss && !vi->has_rss_hash_report) + return -EOPNOTSUPP; + memcpy(vi->ctrl->rss.key, rxfh->key, vi->rss_key_size); + update = true; + } - virtnet_commit_rss_command(vi); + if (update) + virtnet_commit_rss_command(vi); return 0; } @@ -4729,13 +4745,15 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) vi->has_rss_hash_report = true; - if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) { vi->has_rss = true; - if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_indir_table_size = virtio_cread16(vdev, offsetof(struct virtio_net_config, rss_max_indirection_table_length)); + } + + if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_key_size = virtio_cread8(vdev, offsetof(struct virtio_net_config, rss_max_key_size)); -- cgit v1.2.3 From bccb798e07f8bb8b91212fe8ed1e421685449076 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Thu, 4 Apr 2024 16:54:27 +0530 Subject: octeontx2-pf: Fix transmit scheduler resource leak Inorder to support shaping and scheduling, Upon class creation Netdev driver allocates trasmit schedulers. The previous patch which added support for Round robin scheduling has a bug due to which driver is not freeing transmit schedulers post class deletion. This patch fixes the same. Fixes: 47a9656f168a ("octeontx2-pf: htb offload support for Round Robin scheduling") Signed-off-by: Hariprasad Kelam Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/qos.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c index 1e77bbf5d22a..1723e9912ae0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c @@ -382,6 +382,7 @@ static void otx2_qos_read_txschq_cfg_tl(struct otx2_qos_node *parent, otx2_qos_read_txschq_cfg_tl(node, cfg); cnt = cfg->static_node_pos[node->level]; cfg->schq_contig_list[node->level][cnt] = node->schq; + cfg->schq_index_used[node->level][cnt] = true; cfg->schq_contig[node->level]++; cfg->static_node_pos[node->level]++; otx2_qos_read_txschq_cfg_schq(node, cfg); -- cgit v1.2.3 From 09e913f5826936c0f6632d6d0d35a36295fac7cc Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 26 Mar 2024 12:04:56 +0800 Subject: bcachefs: fix the count of nr_freed_pcpu after changing bc->freed_nonpcpu list When allocating bkey_cached from bc->freed_pcpu list, it missed decreasing the count of nr_freed_pcpu which would cause the mismatch between the value of nr_freed_pcpu and the list items. This problem also exists in moving new bkey_cached to bc->freed_pcpu list. If these happened, the bug info may appear in bch2_fs_btree_key_cache_exit by the follow code: BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); Fixes: c65c13f0eac6 ("bcachefs: Run btree key cache shrinker less aggressively") Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 4ae7890f07c6..88a3582a3275 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, } else { mutex_lock(&bc->lock); list_move_tail(&ck->list, &bc->freed_pcpu); + bc->nr_freed_pcpu++; mutex_unlock(&bc->lock); } } @@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (!list_empty(&bc->freed_pcpu)) { ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_pcpu--; } mutex_unlock(&bc->lock); } -- cgit v1.2.3 From fec50db7033ea478773b159e0e2efb135270e3b7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 7 Apr 2024 13:22:46 -0700 Subject: Linux 6.9-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4bef6323c47d..e1bf12891cb0 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- cgit v1.2.3 From b897b148ee30c7fca995e6d15cf791f52993920b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Apr 2024 16:20:17 -0400 Subject: bcachefs: fix bch2_get_acl() transaction restart handling bch2_acl_from_disk() uses allocate_dropping_locks, and can thus return a transaction restart - this wasn't handled. Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 3640f417cce1..5c180fdc3efb 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -281,7 +281,6 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; - struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; struct bkey_s_c k; int ret; @@ -290,28 +289,27 @@ retry: ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, inode_inum(inode), &search, 0); - if (ret) { - if (!bch2_err_matches(ret, ENOENT)) - acl = ERR_PTR(ret); - goto out; - } + if (ret) + goto err; k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); - if (ret) { - acl = ERR_PTR(ret); - goto out; - } + if (ret) + goto err; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); + le16_to_cpu(xattr.v->x_val_len)); + ret = PTR_ERR_OR_ZERO(acl); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; - if (!IS_ERR(acl)) + if (ret) + acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL; + + if (!IS_ERR_OR_NULL(acl)) set_cached_acl(&inode->v, type, acl); -out: - if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) - goto retry; bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); -- cgit v1.2.3 From 8b8ace080319a866f5dfe9da8e665ae51d971c54 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 7 Apr 2024 20:59:10 +0800 Subject: block: fix q->blkg_list corruption during disk rebind Multiple gendisk instances can allocated/added for single request queue in case of disk rebind. blkg may still stay in q->blkg_list when calling blkcg_init_disk() for rebind, then q->blkg_list becomes corrupted. Fix the list corruption issue by: - add blkg_init_queue() to initialize q->blkg_list & q->blkcg_mutex only - move calling blkg_init_queue() into blk_alloc_queue() The list corruption should be started since commit f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") which delays removing blkg from q->blkg_list into blkg_free_workfn(). Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") Fixes: 1059699f87eb ("block: move blkcg initialization/destroy into disk allocation/release handler") Cc: Yu Kuai Cc: Tejun Heo Signed-off-by: Ming Lei Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240407125910.4053377-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 ++++++--- block/blk-cgroup.h | 2 ++ block/blk-core.c | 2 ++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bdbb557feb5a..059467086b13 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1409,6 +1409,12 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) return 0; } +void blkg_init_queue(struct request_queue *q) +{ + INIT_LIST_HEAD(&q->blkg_list); + mutex_init(&q->blkcg_mutex); +} + int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -1416,9 +1422,6 @@ int blkcg_init_disk(struct gendisk *disk) bool preloaded; int ret; - INIT_LIST_HEAD(&q->blkg_list); - mutex_init(&q->blkcg_mutex); - new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 78b74106bf10..90b3959d88cf 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -189,6 +189,7 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; +void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); @@ -482,6 +483,7 @@ struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } diff --git a/block/blk-core.c b/block/blk-core.c index a16b5abdbbf5..3a6f5603fb44 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -442,6 +442,8 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); + blkg_init_queue(q); + /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. -- cgit v1.2.3 From b561ea56a26415bf44ce8ca6a8e625c7c390f1ea Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 7 Apr 2024 21:19:31 +0800 Subject: block: allow device to have both virt_boundary_mask and max segment size When one stacking device is over one device with virt_boundary_mask and another one with max segment size, the stacking device have both limits set. This way is allowed before d690cb8ae14b ("block: add an API to atomically update queue limits"). Relax the limit so that we won't break such kind of stacking setting. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218687 Reported-by: janpieter.sollie@edpnet.be Fixes: d690cb8ae14b ("block: add an API to atomically update queue limits") Link: https://lore.kernel.org/linux-block/ZfGl8HzUpiOxCLm3@fedora/ Cc: Christoph Hellwig Cc: Mike Snitzer Cc: dm-devel@lists.linux.dev Cc: Song Liu Cc: linux-raid@vger.kernel.org Signed-off-by: Ming Lei Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20240407131931.4055231-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index cdbaef159c4b..d2731843f2fc 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -182,17 +182,13 @@ static int blk_validate_limits(struct queue_limits *lim) return -EINVAL; /* - * Devices that require a virtual boundary do not support scatter/gather - * I/O natively, but instead require a descriptor list entry for each - * page (which might not be identical to the Linux PAGE_SIZE). Because - * of that they are not limited by our notion of "segment size". + * Stacking device may have both virtual boundary and max segment + * size limit, so allow this setting now, and long-term the two + * might need to move out of stacking limits since we have immutable + * bvec and lower layer bio splitting is supposed to handle the two + * correctly. */ - if (lim->virt_boundary_mask) { - if (WARN_ON_ONCE(lim->max_segment_size && - lim->max_segment_size != UINT_MAX)) - return -EINVAL; - lim->max_segment_size = UINT_MAX; - } else { + if (!lim->virt_boundary_mask) { /* * The maximum segment size has an odd historic 64k default that * drivers probably should override. Just like the I/O size we -- cgit v1.2.3 From 8358a76cfb47c9a5af627a0c4e7168aa14fa25f6 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 11:41:55 -0700 Subject: clk: Remove prepare_lock hold assertion in __clk_release() Removing this assertion lets us move the kref_put() call outside the prepare_lock section. We don't need to hold the prepare_lock here to free memory and destroy the clk_core structure. We've already unlinked the clk from the clk tree and by the time the release function runs nothing holds a reference to the clk_core anymore so anything with the pointer can't access the memory that's being freed anyway. Way back in commit 496eadf821c2 ("clk: Use lockdep asserts to find missing hold of prepare_lock") we didn't need to have this assertion either. Fixes: 496eadf821c2 ("clk: Use lockdep asserts to find missing hold of prepare_lock") Cc: Krzysztof Kozlowski Reviewed-by: Douglas Anderson Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240325184204.745706-2-sboyd@kernel.org --- drivers/clk/clk.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 25371c91a58f..fed20641822c 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -4375,8 +4375,6 @@ static void __clk_release(struct kref *ref) { struct clk_core *core = container_of(ref, struct clk_core, ref); - lockdep_assert_held(&prepare_lock); - clk_core_free_parent_map(core); kfree_const(core->name); kfree(core); -- cgit v1.2.3 From 6f63af7511e7058f3fa4ad5b8102210741c9f947 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 11:41:56 -0700 Subject: clk: Don't hold prepare_lock when calling kref_put() We don't need to hold the prepare_lock when dropping a ref on a struct clk_core. The release function is only freeing memory and any code with a pointer reference has already unlinked anything pointing to the clk_core. This reduces the holding area of the prepare_lock a bit. Note that we also don't call free_clk() with the prepare_lock held. There isn't any reason to do that. Reviewed-by: Douglas Anderson Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240325184204.745706-3-sboyd@kernel.org --- drivers/clk/clk.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index fed20641822c..e950d0cf6b2f 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -4470,7 +4470,8 @@ void clk_unregister(struct clk *clk) if (ops == &clk_nodrv_ops) { pr_err("%s: unregistered clock: %s\n", __func__, clk->core->name); - goto unlock; + clk_prepare_unlock(); + return; } /* * Assign empty clock ops for consumers that might still hold @@ -4504,11 +4505,10 @@ void clk_unregister(struct clk *clk) if (clk->core->protect_count) pr_warn("%s: unregistering protected clock: %s\n", __func__, clk->core->name); + clk_prepare_unlock(); kref_put(&clk->core->ref, __clk_release); free_clk(clk); -unlock: - clk_prepare_unlock(); } EXPORT_SYMBOL_GPL(clk_unregister); @@ -4667,13 +4667,11 @@ void __clk_put(struct clk *clk) if (clk->min_rate > 0 || clk->max_rate < ULONG_MAX) clk_set_rate_range_nolock(clk, 0, ULONG_MAX); - owner = clk->core->owner; - kref_put(&clk->core->ref, __clk_release); - clk_prepare_unlock(); + owner = clk->core->owner; + kref_put(&clk->core->ref, __clk_release); module_put(owner); - free_clk(clk); } -- cgit v1.2.3 From 9d05ae531c2cff20d5d527f04e28d28e04379929 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 11:41:57 -0700 Subject: clk: Initialize struct clk_core kref earlier Initialize this kref once we allocate memory for the struct clk_core so that we can reuse the release function to free any memory associated with the structure. This mostly consolidates code, but also clarifies that the kref lifetime exists once the container structure (struct clk_core) is allocated instead of leaving it in a half-baked state for most of __clk_core_init(). Reviewed-by: Douglas Anderson Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240325184204.745706-4-sboyd@kernel.org --- drivers/clk/clk.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index e950d0cf6b2f..8b35ca32f45e 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -3981,8 +3981,6 @@ static int __clk_core_init(struct clk_core *core) } clk_core_reparent_orphans_nolock(); - - kref_init(&core->ref); out: clk_pm_runtime_put(core); unlock: @@ -4211,6 +4209,16 @@ static void clk_core_free_parent_map(struct clk_core *core) kfree(core->parents); } +/* Free memory allocated for a struct clk_core */ +static void __clk_release(struct kref *ref) +{ + struct clk_core *core = container_of(ref, struct clk_core, ref); + + clk_core_free_parent_map(core); + kfree_const(core->name); + kfree(core); +} + static struct clk * __clk_register(struct device *dev, struct device_node *np, struct clk_hw *hw) { @@ -4231,6 +4239,8 @@ __clk_register(struct device *dev, struct device_node *np, struct clk_hw *hw) goto fail_out; } + kref_init(&core->ref); + core->name = kstrdup_const(init->name, GFP_KERNEL); if (!core->name) { ret = -ENOMEM; @@ -4285,12 +4295,10 @@ __clk_register(struct device *dev, struct device_node *np, struct clk_hw *hw) hw->clk = NULL; fail_create_clk: - clk_core_free_parent_map(core); fail_parents: fail_ops: - kfree_const(core->name); fail_name: - kfree(core); + kref_put(&core->ref, __clk_release); fail_out: return ERR_PTR(ret); } @@ -4370,16 +4378,6 @@ int of_clk_hw_register(struct device_node *node, struct clk_hw *hw) } EXPORT_SYMBOL_GPL(of_clk_hw_register); -/* Free memory allocated for a clock. */ -static void __clk_release(struct kref *ref) -{ - struct clk_core *core = container_of(ref, struct clk_core, ref); - - clk_core_free_parent_map(core); - kfree_const(core->name); - kfree(core); -} - /* * Empty clk_ops for unregistered clocks. These are used temporarily * after clk_unregister() was called on a clock and until last clock -- cgit v1.2.3 From e581cf5d216289ef292d1a4036d53ce90e122469 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 11:41:58 -0700 Subject: clk: Get runtime PM before walking tree during disable_unused Doug reported [1] the following hung task: INFO: task swapper/0:1 blocked for more than 122 seconds. Not tainted 5.15.149-21875-gf795ebc40eb8 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:swapper/0 state:D stack: 0 pid: 1 ppid: 0 flags:0x00000008 Call trace: __switch_to+0xf4/0x1f4 __schedule+0x418/0xb80 schedule+0x5c/0x10c rpm_resume+0xe0/0x52c rpm_resume+0x178/0x52c __pm_runtime_resume+0x58/0x98 clk_pm_runtime_get+0x30/0xb0 clk_disable_unused_subtree+0x58/0x208 clk_disable_unused_subtree+0x38/0x208 clk_disable_unused_subtree+0x38/0x208 clk_disable_unused_subtree+0x38/0x208 clk_disable_unused_subtree+0x38/0x208 clk_disable_unused+0x4c/0xe4 do_one_initcall+0xcc/0x2d8 do_initcall_level+0xa4/0x148 do_initcalls+0x5c/0x9c do_basic_setup+0x24/0x30 kernel_init_freeable+0xec/0x164 kernel_init+0x28/0x120 ret_from_fork+0x10/0x20 INFO: task kworker/u16:0:9 blocked for more than 122 seconds. Not tainted 5.15.149-21875-gf795ebc40eb8 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:kworker/u16:0 state:D stack: 0 pid: 9 ppid: 2 flags:0x00000008 Workqueue: events_unbound deferred_probe_work_func Call trace: __switch_to+0xf4/0x1f4 __schedule+0x418/0xb80 schedule+0x5c/0x10c schedule_preempt_disabled+0x2c/0x48 __mutex_lock+0x238/0x488 __mutex_lock_slowpath+0x1c/0x28 mutex_lock+0x50/0x74 clk_prepare_lock+0x7c/0x9c clk_core_prepare_lock+0x20/0x44 clk_prepare+0x24/0x30 clk_bulk_prepare+0x40/0xb0 mdss_runtime_resume+0x54/0x1c8 pm_generic_runtime_resume+0x30/0x44 __genpd_runtime_resume+0x68/0x7c genpd_runtime_resume+0x108/0x1f4 __rpm_callback+0x84/0x144 rpm_callback+0x30/0x88 rpm_resume+0x1f4/0x52c rpm_resume+0x178/0x52c __pm_runtime_resume+0x58/0x98 __device_attach+0xe0/0x170 device_initial_probe+0x1c/0x28 bus_probe_device+0x3c/0x9c device_add+0x644/0x814 mipi_dsi_device_register_full+0xe4/0x170 devm_mipi_dsi_device_register_full+0x28/0x70 ti_sn_bridge_probe+0x1dc/0x2c0 auxiliary_bus_probe+0x4c/0x94 really_probe+0xcc/0x2c8 __driver_probe_device+0xa8/0x130 driver_probe_device+0x48/0x110 __device_attach_driver+0xa4/0xcc bus_for_each_drv+0x8c/0xd8 __device_attach+0xf8/0x170 device_initial_probe+0x1c/0x28 bus_probe_device+0x3c/0x9c deferred_probe_work_func+0x9c/0xd8 process_one_work+0x148/0x518 worker_thread+0x138/0x350 kthread+0x138/0x1e0 ret_from_fork+0x10/0x20 The first thread is walking the clk tree and calling clk_pm_runtime_get() to power on devices required to read the clk hardware via struct clk_ops::is_enabled(). This thread holds the clk prepare_lock, and is trying to runtime PM resume a device, when it finds that the device is in the process of resuming so the thread schedule()s away waiting for the device to finish resuming before continuing. The second thread is runtime PM resuming the same device, but the runtime resume callback is calling clk_prepare(), trying to grab the prepare_lock waiting on the first thread. This is a classic ABBA deadlock. To properly fix the deadlock, we must never runtime PM resume or suspend a device with the clk prepare_lock held. Actually doing that is near impossible today because the global prepare_lock would have to be dropped in the middle of the tree, the device runtime PM resumed/suspended, and then the prepare_lock grabbed again to ensure consistency of the clk tree topology. If anything changes with the clk tree in the meantime, we've lost and will need to start the operation all over again. Luckily, most of the time we're simply incrementing or decrementing the runtime PM count on an active device, so we don't have the chance to schedule away with the prepare_lock held. Let's fix this immediate problem that can be triggered more easily by simply booting on Qualcomm sc7180. Introduce a list of clk_core structures that have been registered, or are in the process of being registered, that require runtime PM to operate. Iterate this list and call clk_pm_runtime_get() on each of them without holding the prepare_lock during clk_disable_unused(). This way we can be certain that the runtime PM state of the devices will be active and resumed so we can't schedule away while walking the clk tree with the prepare_lock held. Similarly, call clk_pm_runtime_put() without the prepare_lock held to properly drop the runtime PM reference. We remove the calls to clk_pm_runtime_{get,put}() in this path because they're superfluous now that we know the devices are runtime resumed. Reported-by: Douglas Anderson Closes: https://lore.kernel.org/all/20220922084322.RFC.2.I375b6b9e0a0a5348962f004beb3dafee6a12dfbb@changeid/ [1] Closes: https://issuetracker.google.com/328070191 Cc: Marek Szyprowski Cc: Ulf Hansson Cc: Krzysztof Kozlowski Fixes: 9a34b45397e5 ("clk: Add support for runtime PM") Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240325184204.745706-5-sboyd@kernel.org Reviewed-by: Douglas Anderson --- drivers/clk/clk.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 12 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 8b35ca32f45e..9b907cb3ef61 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -37,6 +37,10 @@ static HLIST_HEAD(clk_root_list); static HLIST_HEAD(clk_orphan_list); static LIST_HEAD(clk_notifier_list); +/* List of registered clks that use runtime PM */ +static HLIST_HEAD(clk_rpm_list); +static DEFINE_MUTEX(clk_rpm_list_lock); + static const struct hlist_head *all_lists[] = { &clk_root_list, &clk_orphan_list, @@ -59,6 +63,7 @@ struct clk_core { struct clk_hw *hw; struct module *owner; struct device *dev; + struct hlist_node rpm_node; struct device_node *of_node; struct clk_core *parent; struct clk_parent_map *parents; @@ -122,6 +127,89 @@ static void clk_pm_runtime_put(struct clk_core *core) pm_runtime_put_sync(core->dev); } +/** + * clk_pm_runtime_get_all() - Runtime "get" all clk provider devices + * + * Call clk_pm_runtime_get() on all runtime PM enabled clks in the clk tree so + * that disabling unused clks avoids a deadlock where a device is runtime PM + * resuming/suspending and the runtime PM callback is trying to grab the + * prepare_lock for something like clk_prepare_enable() while + * clk_disable_unused_subtree() holds the prepare_lock and is trying to runtime + * PM resume/suspend the device as well. + * + * Context: Acquires the 'clk_rpm_list_lock' and returns with the lock held on + * success. Otherwise the lock is released on failure. + * + * Return: 0 on success, negative errno otherwise. + */ +static int clk_pm_runtime_get_all(void) +{ + int ret; + struct clk_core *core, *failed; + + /* + * Grab the list lock to prevent any new clks from being registered + * or unregistered until clk_pm_runtime_put_all(). + */ + mutex_lock(&clk_rpm_list_lock); + + /* + * Runtime PM "get" all the devices that are needed for the clks + * currently registered. Do this without holding the prepare_lock, to + * avoid the deadlock. + */ + hlist_for_each_entry(core, &clk_rpm_list, rpm_node) { + ret = clk_pm_runtime_get(core); + if (ret) { + failed = core; + pr_err("clk: Failed to runtime PM get '%s' for clk '%s'\n", + dev_name(failed->dev), failed->name); + goto err; + } + } + + return 0; + +err: + hlist_for_each_entry(core, &clk_rpm_list, rpm_node) { + if (core == failed) + break; + + clk_pm_runtime_put(core); + } + mutex_unlock(&clk_rpm_list_lock); + + return ret; +} + +/** + * clk_pm_runtime_put_all() - Runtime "put" all clk provider devices + * + * Put the runtime PM references taken in clk_pm_runtime_get_all() and release + * the 'clk_rpm_list_lock'. + */ +static void clk_pm_runtime_put_all(void) +{ + struct clk_core *core; + + hlist_for_each_entry(core, &clk_rpm_list, rpm_node) + clk_pm_runtime_put(core); + mutex_unlock(&clk_rpm_list_lock); +} + +static void clk_pm_runtime_init(struct clk_core *core) +{ + struct device *dev = core->dev; + + if (dev && pm_runtime_enabled(dev)) { + core->rpm_enabled = true; + + mutex_lock(&clk_rpm_list_lock); + hlist_add_head(&core->rpm_node, &clk_rpm_list); + mutex_unlock(&clk_rpm_list_lock); + } +} + /*** locking ***/ static void clk_prepare_lock(void) { @@ -1381,9 +1469,6 @@ static void __init clk_unprepare_unused_subtree(struct clk_core *core) if (core->flags & CLK_IGNORE_UNUSED) return; - if (clk_pm_runtime_get(core)) - return; - if (clk_core_is_prepared(core)) { trace_clk_unprepare(core); if (core->ops->unprepare_unused) @@ -1392,8 +1477,6 @@ static void __init clk_unprepare_unused_subtree(struct clk_core *core) core->ops->unprepare(core->hw); trace_clk_unprepare_complete(core); } - - clk_pm_runtime_put(core); } static void __init clk_disable_unused_subtree(struct clk_core *core) @@ -1409,9 +1492,6 @@ static void __init clk_disable_unused_subtree(struct clk_core *core) if (core->flags & CLK_OPS_PARENT_ENABLE) clk_core_prepare_enable(core->parent); - if (clk_pm_runtime_get(core)) - goto unprepare_out; - flags = clk_enable_lock(); if (core->enable_count) @@ -1436,8 +1516,6 @@ static void __init clk_disable_unused_subtree(struct clk_core *core) unlock_out: clk_enable_unlock(flags); - clk_pm_runtime_put(core); -unprepare_out: if (core->flags & CLK_OPS_PARENT_ENABLE) clk_core_disable_unprepare(core->parent); } @@ -1453,6 +1531,7 @@ __setup("clk_ignore_unused", clk_ignore_unused_setup); static int __init clk_disable_unused(void) { struct clk_core *core; + int ret; if (clk_ignore_unused) { pr_warn("clk: Not disabling unused clocks\n"); @@ -1461,6 +1540,13 @@ static int __init clk_disable_unused(void) pr_info("clk: Disabling unused clocks\n"); + ret = clk_pm_runtime_get_all(); + if (ret) + return ret; + /* + * Grab the prepare lock to keep the clk topology stable while iterating + * over clks. + */ clk_prepare_lock(); hlist_for_each_entry(core, &clk_root_list, child_node) @@ -1477,6 +1563,8 @@ static int __init clk_disable_unused(void) clk_prepare_unlock(); + clk_pm_runtime_put_all(); + return 0; } late_initcall_sync(clk_disable_unused); @@ -4214,6 +4302,12 @@ static void __clk_release(struct kref *ref) { struct clk_core *core = container_of(ref, struct clk_core, ref); + if (core->rpm_enabled) { + mutex_lock(&clk_rpm_list_lock); + hlist_del(&core->rpm_node); + mutex_unlock(&clk_rpm_list_lock); + } + clk_core_free_parent_map(core); kfree_const(core->name); kfree(core); @@ -4253,9 +4347,8 @@ __clk_register(struct device *dev, struct device_node *np, struct clk_hw *hw) } core->ops = init->ops; - if (dev && pm_runtime_enabled(dev)) - core->rpm_enabled = true; core->dev = dev; + clk_pm_runtime_init(core); core->of_node = np; if (dev && dev->driver) core->owner = dev->driver->owner; -- cgit v1.2.3 From 9d1e795f754db1ac3344528b7af0b17b8146f321 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2024 11:41:59 -0700 Subject: clk: Get runtime PM before walking tree for clk_summary Similar to the previous commit, we should make sure that all devices are runtime resumed before printing the clk_summary through debugfs. Failure to do so would result in a deadlock if the thread is resuming a device to print clk state and that device is also runtime resuming in another thread, e.g the screen is turning on and the display driver is starting up. We remove the calls to clk_pm_runtime_{get,put}() in this path because they're superfluous now that we know the devices are runtime resumed. This also squashes a bug where the return value of clk_pm_runtime_get() wasn't checked, leading to an RPM count underflow on error paths. Fixes: 1bb294a7981c ("clk: Enable/Disable runtime PM for clk_summary") Cc: Taniya Das Cc: Douglas Anderson Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240325184204.745706-6-sboyd@kernel.org Reviewed-by: Douglas Anderson --- drivers/clk/clk.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 9b907cb3ef61..8cca52be993f 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -3340,9 +3340,7 @@ static void clk_summary_show_subtree(struct seq_file *s, struct clk_core *c, { struct clk_core *child; - clk_pm_runtime_get(c); clk_summary_show_one(s, c, level); - clk_pm_runtime_put(c); hlist_for_each_entry(child, &c->children, child_node) clk_summary_show_subtree(s, child, level + 1); @@ -3352,11 +3350,15 @@ static int clk_summary_show(struct seq_file *s, void *data) { struct clk_core *c; struct hlist_head **lists = s->private; + int ret; seq_puts(s, " enable prepare protect duty hardware connection\n"); seq_puts(s, " clock count count count rate accuracy phase cycle enable consumer id\n"); seq_puts(s, "---------------------------------------------------------------------------------------------------------------------------------------------\n"); + ret = clk_pm_runtime_get_all(); + if (ret) + return ret; clk_prepare_lock(); @@ -3365,6 +3367,7 @@ static int clk_summary_show(struct seq_file *s, void *data) clk_summary_show_subtree(s, c, 0); clk_prepare_unlock(); + clk_pm_runtime_put_all(); return 0; } @@ -3412,8 +3415,14 @@ static int clk_dump_show(struct seq_file *s, void *data) struct clk_core *c; bool first_node = true; struct hlist_head **lists = s->private; + int ret; + + ret = clk_pm_runtime_get_all(); + if (ret) + return ret; seq_putc(s, '{'); + clk_prepare_lock(); for (; *lists; lists++) { @@ -3426,6 +3435,7 @@ static int clk_dump_show(struct seq_file *s, void *data) } clk_prepare_unlock(); + clk_pm_runtime_put_all(); seq_puts(s, "}\n"); return 0; -- cgit v1.2.3 From 22e1992cf7b034db5325660e98c41ca5afa5f519 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 28 Mar 2024 10:21:47 +1000 Subject: vhost: Add smp_rmb() in vhost_vq_avail_empty() A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by Will. Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M \ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_vq_avail_empty(). When tx_can_batch() returns true, it means there's still pending tx buffers. Since it might read indices, so it still can bypass the smp_rmb() in vhost_get_vq_desc(). Note that it should be safe until vq->avail_idx is changed by commit 275bf960ac697 ("vhost: better detection of available buffers"). Fixes: 275bf960ac69 ("vhost: better detection of available buffers") Cc: # v4.11+ Reported-by: Yihuang Yu Suggested-by: Will Deacon Signed-off-by: Gavin Shan Acked-by: Jason Wang Message-Id: <20240328002149.1141302-2-gshan@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/vhost/vhost.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..29df65b2ebf2 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2799,9 +2799,19 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) r = vhost_get_avail_idx(vq, &avail_idx); if (unlikely(r)) return false; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Since we have updated avail_idx, the following + * call to vhost_get_vq_desc() will read available + * ring entries. Make sure that read happens after + * the avail_idx read. + */ + smp_rmb(); + return false; + } - return vq->avail_idx == vq->last_avail_idx; + return true; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); -- cgit v1.2.3 From df9ace7647d4123209395bb9967e998d5758c645 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 28 Mar 2024 10:21:48 +1000 Subject: vhost: Add smp_rmb() in vhost_enable_notify() A smp_rmb() has been missed in vhost_enable_notify(), inspired by Will. Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M \ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_enable_notify(). When it returns true, it means there's still pending tx buffers. Since it might read indices, so it still can bypass the smp_rmb() in vhost_get_vq_desc(). Note that it should be safe until vq->avail_idx is changed by commit d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()"). Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()") Cc: # v5.18+ Reported-by: Yihuang Yu Suggested-by: Will Deacon Signed-off-by: Gavin Shan Acked-by: Jason Wang Message-Id: <20240328002149.1141302-3-gshan@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/vhost/vhost.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 29df65b2ebf2..32686c79c41d 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2848,9 +2848,19 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) &vq->avail->idx, r); return false; } + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Since we have updated avail_idx, the following + * call to vhost_get_vq_desc() will read available + * ring entries. Make sure that read happens after + * the avail_idx read. + */ + smp_rmb(); + return true; + } - return vq->avail_idx != vq->last_avail_idx; + return false; } EXPORT_SYMBOL_GPL(vhost_enable_notify); -- cgit v1.2.3 From ffe6176b7f53ca0c99355f13e14a33a40cf49406 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 31 Mar 2024 10:43:48 +0200 Subject: virtio: store owner from modules with register_virtio_driver() Modules registering driver with register_virtio_driver() might forget to set .owner field. i2c-virtio.c for example has it missing. The field is used by some other kernel parts for reference counting (try_module_get()), so it is expected that drivers will set it. Solve the problem by moving this task away from the drivers to the core virtio code, just like we did for platform_driver in commit 9447057eaff8 ("platform_device: use a macro instead of platform_driver_register"). Fixes: 3cfc88380413 ("i2c: virtio: add a virtio i2c frontend driver") Cc: "Jie Deng" Signed-off-by: Krzysztof Kozlowski Message-Id: <20240331-module-owner-virtio-v2-1-98f04bfaf46a@linaro.org> Signed-off-by: Michael S. Tsirkin --- Documentation/driver-api/virtio/writing_virtio_drivers.rst | 1 - drivers/virtio/virtio.c | 6 ++++-- include/linux/virtio.h | 7 +++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/driver-api/virtio/writing_virtio_drivers.rst b/Documentation/driver-api/virtio/writing_virtio_drivers.rst index e14c58796d25..e5de6f5d061a 100644 --- a/Documentation/driver-api/virtio/writing_virtio_drivers.rst +++ b/Documentation/driver-api/virtio/writing_virtio_drivers.rst @@ -97,7 +97,6 @@ like this:: static struct virtio_driver virtio_dummy_driver = { .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, .id_table = id_table, .probe = virtio_dummy_probe, .remove = virtio_dummy_remove, diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index f173587893cb..9510c551dce8 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -362,14 +362,16 @@ static const struct bus_type virtio_bus = { .remove = virtio_dev_remove, }; -int register_virtio_driver(struct virtio_driver *driver) +int __register_virtio_driver(struct virtio_driver *driver, struct module *owner) { /* Catch this early. */ BUG_ON(driver->feature_table_size && !driver->feature_table); driver->driver.bus = &virtio_bus; + driver->driver.owner = owner; + return driver_register(&driver->driver); } -EXPORT_SYMBOL_GPL(register_virtio_driver); +EXPORT_SYMBOL_GPL(__register_virtio_driver); void unregister_virtio_driver(struct virtio_driver *driver) { diff --git a/include/linux/virtio.h b/include/linux/virtio.h index b0201747a263..26c4325aa373 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -170,7 +170,7 @@ size_t virtio_max_dma_size(const struct virtio_device *vdev); /** * struct virtio_driver - operations for a virtio I/O driver - * @driver: underlying device driver (populate name and owner). + * @driver: underlying device driver (populate name). * @id_table: the ids serviced by this driver. * @feature_table: an array of feature numbers supported by this driver. * @feature_table_size: number of entries in the feature table array. @@ -208,7 +208,10 @@ static inline struct virtio_driver *drv_to_virtio(struct device_driver *drv) return container_of(drv, struct virtio_driver, driver); } -int register_virtio_driver(struct virtio_driver *drv); +/* use a macro to avoid include chaining to get THIS_MODULE */ +#define register_virtio_driver(drv) \ + __register_virtio_driver(drv, THIS_MODULE) +int __register_virtio_driver(struct virtio_driver *drv, struct module *owner); void unregister_virtio_driver(struct virtio_driver *drv); /* module_virtio_driver() - Helper macro for drivers that don't do -- cgit v1.2.3 From 2855c2a7820bc8198ae937a9a67dbdc3990e9d2c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 2 Apr 2024 17:21:43 -0400 Subject: vhost-vdpa: change ioctl # for VDPA_GET_VRING_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VDPA_GET_VRING_SIZE by mistake uses the already occupied ioctl # 0x80 and we never noticed - it happens to work because the direction and size are different, but confuses tools such as perf which like to look at just the number, and breaks the extra robustness of the ioctl numbering macros. To fix, sort the entries and renumber the ioctl - not too late since it wasn't in any released kernels yet. Cc: Arnaldo Carvalho de Melo Reported-by: Namhyung Kim Fixes: 1496c47065f9 ("vhost-vdpa: uapi to support reporting per vq size") Cc: "Zhu Lingshan" Signed-off-by: Michael S. Tsirkin Message-Id: <41c1c5489688abe5bfef9f7cf15584e3fb872ac5.1712092759.git.mst@redhat.com> Reviewed-by: Eugenio Pérez Reviewed-by: Zhu Lingshan Reviewed-by: Stefano Garzarella Acked-by: Jason Wang --- include/uapi/linux/vhost.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index bea697390613..b95dd84eef2d 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -179,12 +179,6 @@ /* Get the config size */ #define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) -/* Get the count of all virtqueues */ -#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) - -/* Get the number of virtqueue groups. */ -#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) - /* Get the number of address spaces. */ #define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) @@ -228,10 +222,17 @@ #define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ struct vhost_vring_state) + +/* Get the count of all virtqueues */ +#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) + +/* Get the number of virtqueue groups. */ +#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) + /* Get the queue size of a specific virtqueue. * userspace set the vring index in vhost_vring_state.index * kernel set the queue size in vhost_vring_state.num */ -#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x80, \ +#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) #endif -- cgit v1.2.3 From 76f408535aab39c33e0a1dcada9fba5631c65595 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Mon, 11 Mar 2024 16:21:09 +0800 Subject: vhost: correct misleading printing information Guest moved avail idx not used idx when we need to print log if '(vq->avail_idx - last_avail_idx) > vq->num', so fix it. Signed-off-by: Xianting Tian Message-Id: <20240311082109.46773-1-xianting.tian@linux.alibaba.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 32686c79c41d..8995730ce0bf 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2515,7 +2515,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, vq->avail_idx = vhost16_to_cpu(vq, avail_idx); if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved used index from %u to %u", + vq_err(vq, "Guest moved avail index from %u to %u", last_avail_idx, vq->avail_idx); return -EFAULT; } -- cgit v1.2.3 From f0cf7ffcd02953c72fed5995378805883d16203e Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Tue, 2 Apr 2024 12:49:22 +0200 Subject: accel/ivpu: Check return code of ipc->lock init Return value of drmm_mutex_init(ipc->lock) was unchecked. Fixes: 5d7422cfb498 ("accel/ivpu: Add IPC driver and JSM messages") Cc: # v6.3+ Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-2-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_ipc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_ipc.c b/drivers/accel/ivpu/ivpu_ipc.c index 04ac4b9840fb..56ff067f63e2 100644 --- a/drivers/accel/ivpu/ivpu_ipc.c +++ b/drivers/accel/ivpu/ivpu_ipc.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include @@ -501,7 +501,11 @@ int ivpu_ipc_init(struct ivpu_device *vdev) spin_lock_init(&ipc->cons_lock); INIT_LIST_HEAD(&ipc->cons_list); INIT_LIST_HEAD(&ipc->cb_msg_list); - drmm_mutex_init(&vdev->drm, &ipc->lock); + ret = drmm_mutex_init(&vdev->drm, &ipc->lock); + if (ret) { + ivpu_err(vdev, "Failed to initialize ipc->lock, ret %d\n", ret); + goto err_free_rx; + } ivpu_ipc_reset(vdev); return 0; -- cgit v1.2.3 From e3caadf1f9dfc9d62b5ffc3bd73ebac0c8f26b3f Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:23 +0200 Subject: accel/ivpu: Remove d3hot_after_power_off WA Always enter D3hot after entering D0i3 an all platforms. This minimizes power usage. Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-3-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 20 ++++++++++---------- drivers/accel/ivpu/ivpu_drv.h | 3 +-- drivers/accel/ivpu/ivpu_hw_37xx.c | 4 +--- drivers/accel/ivpu/ivpu_pm.c | 7 ++----- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 39f6d1b98fd6..303d92753387 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include @@ -387,12 +387,15 @@ int ivpu_shutdown(struct ivpu_device *vdev) { int ret; - ivpu_prepare_for_reset(vdev); + /* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */ + pci_save_state(to_pci_dev(vdev->drm.dev)); ret = ivpu_hw_power_down(vdev); if (ret) ivpu_warn(vdev, "Failed to power down HW: %d\n", ret); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); + return ret; } @@ -560,11 +563,11 @@ static int ivpu_dev_init(struct ivpu_device *vdev) /* Power up early so the rest of init code can access VPU registers */ ret = ivpu_hw_power_up(vdev); if (ret) - goto err_power_down; + goto err_shutdown; ret = ivpu_mmu_global_context_init(vdev); if (ret) - goto err_power_down; + goto err_shutdown; ret = ivpu_mmu_init(vdev); if (ret) @@ -601,10 +604,8 @@ err_mmu_rctx_fini: ivpu_mmu_reserved_context_fini(vdev); err_mmu_gctx_fini: ivpu_mmu_global_context_fini(vdev); -err_power_down: - ivpu_hw_power_down(vdev); - if (IVPU_WA(d3hot_after_power_off)) - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); +err_shutdown: + ivpu_shutdown(vdev); err_xa_destroy: xa_destroy(&vdev->db_xa); xa_destroy(&vdev->submitted_jobs_xa); @@ -628,9 +629,8 @@ static void ivpu_bo_unbind_all_user_contexts(struct ivpu_device *vdev) static void ivpu_dev_fini(struct ivpu_device *vdev) { ivpu_pm_disable(vdev); + ivpu_prepare_for_reset(vdev); ivpu_shutdown(vdev); - if (IVPU_WA(d3hot_after_power_off)) - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); ivpu_jobs_abort_all(vdev); ivpu_job_done_consumer_fini(vdev); diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h index 7be0500d9bb8..bb4374d0eaec 100644 --- a/drivers/accel/ivpu/ivpu_drv.h +++ b/drivers/accel/ivpu/ivpu_drv.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #ifndef __IVPU_DRV_H__ @@ -90,7 +90,6 @@ struct ivpu_wa_table { bool punit_disabled; bool clear_runtime_mem; - bool d3hot_after_power_off; bool interrupt_clear_with_0; bool disable_clock_relinquish; bool disable_d0i3_msg; diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c index 9a0c9498baba..5e2865f9f7d6 100644 --- a/drivers/accel/ivpu/ivpu_hw_37xx.c +++ b/drivers/accel/ivpu/ivpu_hw_37xx.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include "ivpu_drv.h" @@ -75,7 +75,6 @@ static void ivpu_hw_wa_init(struct ivpu_device *vdev) { vdev->wa.punit_disabled = false; vdev->wa.clear_runtime_mem = false; - vdev->wa.d3hot_after_power_off = true; REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, BUTTRESS_ALL_IRQ_MASK); if (REGB_RD32(VPU_37XX_BUTTRESS_INTERRUPT_STAT) == BUTTRESS_ALL_IRQ_MASK) { @@ -86,7 +85,6 @@ static void ivpu_hw_wa_init(struct ivpu_device *vdev) IVPU_PRINT_WA(punit_disabled); IVPU_PRINT_WA(clear_runtime_mem); - IVPU_PRINT_WA(d3hot_after_power_off); IVPU_PRINT_WA(interrupt_clear_with_0); } diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 7cce1c928a7f..9cbd7af6576b 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include @@ -58,15 +58,12 @@ static int ivpu_suspend(struct ivpu_device *vdev) { int ret; - /* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */ - pci_save_state(to_pci_dev(vdev->drm.dev)); + ivpu_prepare_for_reset(vdev); ret = ivpu_shutdown(vdev); if (ret) ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret); - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); - return ret; } -- cgit v1.2.3 From 3534eacbf101f6e66105f03d869a03893407c384 Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Tue, 2 Apr 2024 12:49:24 +0200 Subject: accel/ivpu: Fix PCI D0 state entry in resume In case of failed power up we end up left in PCI D3hot state making it impossible to access NPU registers on retry. Enter D0 state on retry before proceeding with power up sequence. Fixes: 28083ff18d3f ("accel/ivpu: Fix DevTLB errors on suspend/resume and recovery") Cc: # v6.8+ Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-4-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 9cbd7af6576b..325b82f8d971 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -71,10 +71,10 @@ static int ivpu_resume(struct ivpu_device *vdev) { int ret; - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0); +retry: pci_restore_state(to_pci_dev(vdev->drm.dev)); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0); -retry: ret = ivpu_hw_power_up(vdev); if (ret) { ivpu_err(vdev, "Failed to power up HW: %d\n", ret); -- cgit v1.2.3 From 875bc9cd1b33eb027a5663f5e6878a43d98e9a16 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:25 +0200 Subject: accel/ivpu: Put NPU back to D3hot after failed resume Put NPU in D3hot after ivpu_resume() fails to power up the device. This will assure that D3->D0 power cycle will be performed before the next resume and also will minimize power usage in this corner case. Fixes: 28083ff18d3f ("accel/ivpu: Fix DevTLB errors on suspend/resume and recovery") Cc: # v6.8+ Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-5-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 325b82f8d971..ba51781b5896 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -97,6 +97,7 @@ err_mmu_disable: ivpu_mmu_disable(vdev); err_power_down: ivpu_hw_power_down(vdev); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); if (!ivpu_fw_is_cold_boot(vdev)) { ivpu_pm_prepare_cold_boot(vdev); -- cgit v1.2.3 From 3556f922612caf4c9b97cf7337626f8342b3dea3 Mon Sep 17 00:00:00 2001 From: "Wachowski, Karol" Date: Tue, 2 Apr 2024 12:49:26 +0200 Subject: accel/ivpu: Improve clarity of MMU error messages This patch improves readability and clarity of MMU error messages. Previously, the error strings were somewhat confusing and could lead to ambiguous interpretations, making it difficult to diagnose issues. Signed-off-by: Wachowski, Karol Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-6-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_mmu.c b/drivers/accel/ivpu/ivpu_mmu.c index 91bd640655ab..2e46b322c450 100644 --- a/drivers/accel/ivpu/ivpu_mmu.c +++ b/drivers/accel/ivpu/ivpu_mmu.c @@ -278,7 +278,7 @@ static const char *ivpu_mmu_event_to_str(u32 cmd) case IVPU_MMU_EVT_F_VMS_FETCH: return "Fetch of VMS caused external abort"; default: - return "Unknown CMDQ command"; + return "Unknown event"; } } @@ -286,15 +286,15 @@ static const char *ivpu_mmu_cmdq_err_to_str(u32 err) { switch (err) { case IVPU_MMU_CERROR_NONE: - return "No CMDQ Error"; + return "No error"; case IVPU_MMU_CERROR_ILL: return "Illegal command"; case IVPU_MMU_CERROR_ABT: - return "External abort on CMDQ read"; + return "External abort on command queue read"; case IVPU_MMU_CERROR_ATC_INV_SYNC: return "Sync failed to complete ATS invalidation"; default: - return "Unknown CMDQ Error"; + return "Unknown error"; } } -- cgit v1.2.3 From c52c35e5b404b95a5bcff39af9be1b9293be3434 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:27 +0200 Subject: accel/ivpu: Return max freq for DRM_IVPU_PARAM_CORE_CLOCK_RATE DRM_IVPU_PARAM_CORE_CLOCK_RATE returns current NPU frequency which could be 0 if device was sleeping. This value isn't really useful to the user space, so return max freq instead which can be used to estimate NPU performance. Fixes: c39dc15191c4 ("accel/ivpu: Read clock rate only if device is up") Cc: # v6.7 Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-7-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 18 +----------------- drivers/accel/ivpu/ivpu_hw.h | 6 ++++++ drivers/accel/ivpu/ivpu_hw_37xx.c | 7 ++++--- drivers/accel/ivpu/ivpu_hw_40xx.c | 6 ++++++ 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 303d92753387..77283daaedd1 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -131,22 +131,6 @@ static int ivpu_get_capabilities(struct ivpu_device *vdev, struct drm_ivpu_param return 0; } -static int ivpu_get_core_clock_rate(struct ivpu_device *vdev, u64 *clk_rate) -{ - int ret; - - ret = ivpu_rpm_get_if_active(vdev); - if (ret < 0) - return ret; - - *clk_rate = ret ? ivpu_hw_reg_pll_freq_get(vdev) : 0; - - if (ret) - ivpu_rpm_put(vdev); - - return 0; -} - static int ivpu_get_param_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct ivpu_file_priv *file_priv = file->driver_priv; @@ -170,7 +154,7 @@ static int ivpu_get_param_ioctl(struct drm_device *dev, void *data, struct drm_f args->value = vdev->platform; break; case DRM_IVPU_PARAM_CORE_CLOCK_RATE: - ret = ivpu_get_core_clock_rate(vdev, &args->value); + args->value = ivpu_hw_ratio_to_freq(vdev, vdev->hw->pll.max_ratio); break; case DRM_IVPU_PARAM_NUM_CONTEXTS: args->value = ivpu_get_context_count(vdev); diff --git a/drivers/accel/ivpu/ivpu_hw.h b/drivers/accel/ivpu/ivpu_hw.h index b2909168a0a6..094c659d2800 100644 --- a/drivers/accel/ivpu/ivpu_hw.h +++ b/drivers/accel/ivpu/ivpu_hw.h @@ -21,6 +21,7 @@ struct ivpu_hw_ops { u32 (*profiling_freq_get)(struct ivpu_device *vdev); void (*profiling_freq_drive)(struct ivpu_device *vdev, bool enable); u32 (*reg_pll_freq_get)(struct ivpu_device *vdev); + u32 (*ratio_to_freq)(struct ivpu_device *vdev, u32 ratio); u32 (*reg_telemetry_offset_get)(struct ivpu_device *vdev); u32 (*reg_telemetry_size_get)(struct ivpu_device *vdev); u32 (*reg_telemetry_enable_get)(struct ivpu_device *vdev); @@ -130,6 +131,11 @@ static inline u32 ivpu_hw_reg_pll_freq_get(struct ivpu_device *vdev) return vdev->hw->ops->reg_pll_freq_get(vdev); }; +static inline u32 ivpu_hw_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) +{ + return vdev->hw->ops->ratio_to_freq(vdev, ratio); +} + static inline u32 ivpu_hw_reg_telemetry_offset_get(struct ivpu_device *vdev) { return vdev->hw->ops->reg_telemetry_offset_get(vdev); diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c index 5e2865f9f7d6..bd25e2d9fb0f 100644 --- a/drivers/accel/ivpu/ivpu_hw_37xx.c +++ b/drivers/accel/ivpu/ivpu_hw_37xx.c @@ -803,12 +803,12 @@ static void ivpu_hw_37xx_profiling_freq_drive(struct ivpu_device *vdev, bool ena /* Profiling freq - is a debug feature. Unavailable on VPU 37XX. */ } -static u32 ivpu_hw_37xx_pll_to_freq(u32 ratio, u32 config) +static u32 ivpu_hw_37xx_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) { u32 pll_clock = PLL_REF_CLK_FREQ * ratio; u32 cpu_clock; - if ((config & 0xff) == PLL_RATIO_4_3) + if ((vdev->hw->config & 0xff) == PLL_RATIO_4_3) cpu_clock = pll_clock * 2 / 4; else cpu_clock = pll_clock * 2 / 5; @@ -827,7 +827,7 @@ static u32 ivpu_hw_37xx_reg_pll_freq_get(struct ivpu_device *vdev) if (!ivpu_is_silicon(vdev)) return PLL_SIMULATION_FREQ; - return ivpu_hw_37xx_pll_to_freq(pll_curr_ratio, vdev->hw->config); + return ivpu_hw_37xx_ratio_to_freq(vdev, pll_curr_ratio); } static u32 ivpu_hw_37xx_reg_telemetry_offset_get(struct ivpu_device *vdev) @@ -1050,6 +1050,7 @@ const struct ivpu_hw_ops ivpu_hw_37xx_ops = { .profiling_freq_get = ivpu_hw_37xx_profiling_freq_get, .profiling_freq_drive = ivpu_hw_37xx_profiling_freq_drive, .reg_pll_freq_get = ivpu_hw_37xx_reg_pll_freq_get, + .ratio_to_freq = ivpu_hw_37xx_ratio_to_freq, .reg_telemetry_offset_get = ivpu_hw_37xx_reg_telemetry_offset_get, .reg_telemetry_size_get = ivpu_hw_37xx_reg_telemetry_size_get, .reg_telemetry_enable_get = ivpu_hw_37xx_reg_telemetry_enable_get, diff --git a/drivers/accel/ivpu/ivpu_hw_40xx.c b/drivers/accel/ivpu/ivpu_hw_40xx.c index e4eddbf5d11c..b0b88d4c8926 100644 --- a/drivers/accel/ivpu/ivpu_hw_40xx.c +++ b/drivers/accel/ivpu/ivpu_hw_40xx.c @@ -980,6 +980,11 @@ static u32 ivpu_hw_40xx_reg_pll_freq_get(struct ivpu_device *vdev) return PLL_RATIO_TO_FREQ(pll_curr_ratio); } +static u32 ivpu_hw_40xx_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) +{ + return PLL_RATIO_TO_FREQ(ratio); +} + static u32 ivpu_hw_40xx_reg_telemetry_offset_get(struct ivpu_device *vdev) { return REGB_RD32(VPU_40XX_BUTTRESS_VPU_TELEMETRY_OFFSET); @@ -1230,6 +1235,7 @@ const struct ivpu_hw_ops ivpu_hw_40xx_ops = { .profiling_freq_get = ivpu_hw_40xx_profiling_freq_get, .profiling_freq_drive = ivpu_hw_40xx_profiling_freq_drive, .reg_pll_freq_get = ivpu_hw_40xx_reg_pll_freq_get, + .ratio_to_freq = ivpu_hw_40xx_ratio_to_freq, .reg_telemetry_offset_get = ivpu_hw_40xx_reg_telemetry_offset_get, .reg_telemetry_size_get = ivpu_hw_40xx_reg_telemetry_size_get, .reg_telemetry_enable_get = ivpu_hw_40xx_reg_telemetry_enable_get, -- cgit v1.2.3 From 0d298e23292b7a5b58c5589fe33b96e95363214f Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:28 +0200 Subject: accel/ivpu: Fix missed error message after VPU rename Change "VPU" to "NPU" in ivpu_suspend() so it matches all other error messages. Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-8-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index ba51781b5896..4f5ea466731f 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -62,7 +62,7 @@ static int ivpu_suspend(struct ivpu_device *vdev) ret = ivpu_shutdown(vdev); if (ret) - ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret); + ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret); return ret; } -- cgit v1.2.3 From fd7726e75968b27fe98534ccbf47ccd6fef686f3 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 2 Apr 2024 12:49:29 +0200 Subject: accel/ivpu: Fix deadlock in context_xa ivpu_device->context_xa is locked both in kernel thread and IRQ context. It requires XA_FLAGS_LOCK_IRQ flag to be passed during initialization otherwise the lock could be acquired from a thread and interrupted by an IRQ that locks it for the second time causing the deadlock. This deadlock was reported by lockdep and observed in internal tests. Fixes: 35b137630f08 ("accel/ivpu: Introduce a new DRM driver for Intel VPU") Cc: # v6.3+ Signed-off-by: Jacek Lawrynowicz Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20240402104929.941186-9-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 77283daaedd1..51d3f1a55d02 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -517,7 +517,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev) vdev->context_xa_limit.min = IVPU_USER_CONTEXT_MIN_SSID; vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID; atomic64_set(&vdev->unique_id_counter, 0); - xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC); + xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1); xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1); lockdep_set_class(&vdev->submitted_jobs_xa.xa_lock, &submitted_jobs_xa_lock_class_key); -- cgit v1.2.3 From e9d47b7b31563a6524b9f64ea70ed0289cc4d9c4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Apr 2024 18:36:45 +0200 Subject: lib: checksum: hide unused expected_csum_ipv6_magic[] When CONFIG_NET is disabled, an extra warning shows up for this unused variable: lib/checksum_kunit.c:218:18: error: 'expected_csum_ipv6_magic' defined but not used [-Werror=unused-const-variable=] Replace the #ifdef with an IS_ENABLED() check that makes the compiler's dead-code-elimination take care of the link failure. Fixes: f24a70106dc1 ("lib: checksum: Fix build with CONFIG_NET=n") Suggested-by: Christophe Leroy Acked-by: Palmer Dabbelt Acked-by: Jakub Kicinski Signed-off-by: Arnd Bergmann Reviewed-by: Simon Horman Tested-by: Simon Horman # build-tested Signed-off-by: David S. Miller --- lib/checksum_kunit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/checksum_kunit.c b/lib/checksum_kunit.c index bf70850035c7..404dba36bae3 100644 --- a/lib/checksum_kunit.c +++ b/lib/checksum_kunit.c @@ -594,13 +594,15 @@ static void test_ip_fast_csum(struct kunit *test) static void test_csum_ipv6_magic(struct kunit *test) { -#if defined(CONFIG_NET) const struct in6_addr *saddr; const struct in6_addr *daddr; unsigned int len; unsigned char proto; __wsum csum; + if (!IS_ENABLED(CONFIG_NET)) + return; + const int daddr_offset = sizeof(struct in6_addr); const int len_offset = sizeof(struct in6_addr) + sizeof(struct in6_addr); const int proto_offset = sizeof(struct in6_addr) + sizeof(struct in6_addr) + @@ -618,7 +620,6 @@ static void test_csum_ipv6_magic(struct kunit *test) CHECK_EQ(to_sum16(expected_csum_ipv6_magic[i]), csum_ipv6_magic(saddr, daddr, len, proto, csum)); } -#endif /* !CONFIG_NET */ } static struct kunit_case __refdata checksum_test_cases[] = { -- cgit v1.2.3 From be121ffb384f53e966ee7299ffccc6eeb61bc73d Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Wed, 3 Apr 2024 12:03:46 +0300 Subject: RDMA/mlx5: Fix port number for counter query in multi-port configuration Set the correct port when querying PPCNT in multi-port configuration. Distinguish between cases where switchdev mode was enabled to multi-port configuration and don't overwrite the queried port to 1 in multi-port case. Fixes: 74b30b3ad5ce ("RDMA/mlx5: Set local port to one when accessing counters") Signed-off-by: Michael Guralnik Link: https://lore.kernel.org/r/9bfcc8ade958b760a51408c3ad654a01b11f7d76.1712134988.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mad.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 0c3c4e64812c..3e43687a7f6f 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -188,7 +188,8 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, mdev = dev->mdev; mdev_port_num = 1; } - if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1) { + if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1 && + !mlx5_core_mp_enabled(mdev)) { /* set local port to one for Function-Per-Port HCA. */ mdev = dev->mdev; mdev_port_num = 1; -- cgit v1.2.3 From eaac25d026a14be4fe97683103f2a3ae76bff7bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 5 Apr 2024 09:20:41 +0200 Subject: MAINTAINERS: Drop Li Yang as their email address stopped working MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When sending a patch to (among others) Li Yang the nxp MTA replied that the address doesn't exist and so the mail couldn't be delivered. The error code was 550, so at least technically that's not a temporal issue. Signed-off-by: Uwe Kleine-König Signed-off-by: David S. Miller --- MAINTAINERS | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 75381386fe4c..f389bf6d78a3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2191,7 +2191,6 @@ N: mxs ARM/FREESCALE LAYERSCAPE ARM ARCHITECTURE M: Shawn Guo -M: Li Yang L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git @@ -8523,7 +8522,6 @@ S: Maintained F: drivers/video/fbdev/fsl-diu-fb.* FREESCALE DMA DRIVER -M: Li Yang M: Zhang Wei L: linuxppc-dev@lists.ozlabs.org S: Maintained @@ -8688,10 +8686,9 @@ F: drivers/soc/fsl/qe/tsa.h F: include/dt-bindings/soc/cpm1-fsl,tsa.h FREESCALE QUICC ENGINE UCC ETHERNET DRIVER -M: Li Yang L: netdev@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org -S: Maintained +S: Orphan F: drivers/net/ethernet/freescale/ucc_geth* FREESCALE QUICC ENGINE UCC HDLC DRIVER @@ -8708,10 +8705,9 @@ S: Maintained F: drivers/tty/serial/ucc_uart.c FREESCALE SOC DRIVERS -M: Li Yang L: linuxppc-dev@lists.ozlabs.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Maintained +S: Orphan F: Documentation/devicetree/bindings/misc/fsl,dpaa2-console.yaml F: Documentation/devicetree/bindings/soc/fsl/ F: drivers/soc/fsl/ @@ -8745,10 +8741,9 @@ F: Documentation/devicetree/bindings/sound/fsl,qmc-audio.yaml F: sound/soc/fsl/fsl_qmc_audio.c FREESCALE USB PERIPHERAL DRIVERS -M: Li Yang L: linux-usb@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org -S: Maintained +S: Orphan F: drivers/usb/gadget/udc/fsl* FREESCALE USB PHY DRIVER -- cgit v1.2.3 From d8a6213d70accb403b82924a1c229e733433a5ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Apr 2024 10:30:34 +0000 Subject: geneve: fix header validation in geneve[6]_xmit_skb syzbot is able to trigger an uninit-value in geneve_xmit() [1] Problem : While most ip tunnel helpers (like ip_tunnel_get_dsfield()) uses skb_protocol(skb, true), pskb_inet_may_pull() is only using skb->protocol. If anything else than ETH_P_IPV6 or ETH_P_IP is found in skb->protocol, pskb_inet_may_pull() does nothing at all. If a vlan tag was provided by the caller (af_packet in the syzbot case), the network header might not point to the correct location, and skb linear part could be smaller than expected. Add skb_vlan_inet_prepare() to perform a complete mac validation. Use this in geneve for the moment, I suspect we need to adopt this more broadly. v4 - Jakub reported v3 broke l2_tos_ttl_inherit.sh selftest - Only call __vlan_get_protocol() for vlan types. Link: https://lore.kernel.org/netdev/20240404100035.3270a7d5@kernel.org/ v2,v3 - Addressed Sabrina comments on v1 and v2 Link: https://lore.kernel.org/netdev/Zg1l9L2BNoZWZDZG@hog/ [1] BUG: KMSAN: uninit-value in geneve_xmit_skb drivers/net/geneve.c:910 [inline] BUG: KMSAN: uninit-value in geneve_xmit+0x302d/0x5420 drivers/net/geneve.c:1030 geneve_xmit_skb drivers/net/geneve.c:910 [inline] geneve_xmit+0x302d/0x5420 drivers/net/geneve.c:1030 __netdev_start_xmit include/linux/netdevice.h:4903 [inline] netdev_start_xmit include/linux/netdevice.h:4917 [inline] xmit_one net/core/dev.c:3531 [inline] dev_hard_start_xmit+0x247/0xa20 net/core/dev.c:3547 __dev_queue_xmit+0x348d/0x52c0 net/core/dev.c:4335 dev_queue_xmit include/linux/netdevice.h:3091 [inline] packet_xmit+0x9c/0x6c0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3081 [inline] packet_sendmsg+0x8bb0/0x9ef0 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2199 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was created at: slab_post_alloc_hook mm/slub.c:3804 [inline] slab_alloc_node mm/slub.c:3845 [inline] kmem_cache_alloc_node+0x613/0xc50 mm/slub.c:3888 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:577 __alloc_skb+0x35b/0x7a0 net/core/skbuff.c:668 alloc_skb include/linux/skbuff.h:1318 [inline] alloc_skb_with_frags+0xc8/0xbf0 net/core/skbuff.c:6504 sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2795 packet_alloc_skb net/packet/af_packet.c:2930 [inline] packet_snd net/packet/af_packet.c:3024 [inline] packet_sendmsg+0x722d/0x9ef0 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2199 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 CPU: 0 PID: 5033 Comm: syz-executor346 Not tainted 6.9.0-rc1-syzkaller-00005-g928a87efa423 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024 Fixes: d13f048dd40e ("net: geneve: modify IP header check in geneve6_xmit_skb and geneve_xmit_skb") Reported-by: syzbot+9ee20ec1de7b3168db09@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/000000000000d19c3a06152f9ee4@google.com/ Signed-off-by: Eric Dumazet Cc: Phillip Potter Cc: Sabrina Dubroca Reviewed-by: Sabrina Dubroca Reviewed-by: Phillip Potter Signed-off-by: David S. Miller --- drivers/net/geneve.c | 4 ++-- include/net/ip_tunnels.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 2f6739fe78af..6c2835086b57 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -822,7 +822,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!pskb_inet_may_pull(skb)) + if (!skb_vlan_inet_prepare(skb)) return -EINVAL; if (!gs4) @@ -929,7 +929,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!pskb_inet_may_pull(skb)) + if (!skb_vlan_inet_prepare(skb)) return -EINVAL; if (!gs6) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5cd64bb2104d..c286cc2e766e 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -361,6 +361,39 @@ static inline bool pskb_inet_may_pull(struct sk_buff *skb) return pskb_network_may_pull(skb, nhlen); } +/* Variant of pskb_inet_may_pull(). + */ +static inline bool skb_vlan_inet_prepare(struct sk_buff *skb) +{ + int nhlen = 0, maclen = ETH_HLEN; + __be16 type = skb->protocol; + + /* Essentially this is skb_protocol(skb, true) + * And we get MAC len. + */ + if (eth_type_vlan(type)) + type = __vlan_get_protocol(skb, type, &maclen); + + switch (type) { +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + nhlen = sizeof(struct ipv6hdr); + break; +#endif + case htons(ETH_P_IP): + nhlen = sizeof(struct iphdr); + break; + } + /* For ETH_P_IPV6/ETH_P_IP we make sure to pull + * a base network header in skb->head. + */ + if (!pskb_may_pull(skb, maclen + nhlen)) + return false; + + skb_set_network_header(skb, maclen); + return true; +} + static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; -- cgit v1.2.3 From 58effa3476536215530c9ec4910ffc981613b413 Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Fri, 5 Apr 2024 13:16:06 +0200 Subject: s390/ism: fix receive message buffer allocation Since [1], dma_alloc_coherent() does not accept requests for GFP_COMP anymore, even on archs that may be able to fulfill this. Functionality that relied on the receive buffer being a compound page broke at that point: The SMC-D protocol, that utilizes the ism device driver, passes receive buffers to the splice processor in a struct splice_pipe_desc with a single entry list of struct pages. As the buffer is no longer a compound page, the splice processor now rejects requests to handle more than a page worth of data. Replace dma_alloc_coherent() and allocate a buffer with folio_alloc and create a DMA map for it with dma_map_page(). Since only receive buffers on ISM devices use DMA, qualify the mapping as FROM_DEVICE. Since ISM devices are available on arch s390, only and on that arch all DMA is coherent, there is no need to introduce and export some kind of dma_sync_to_cpu() method to be called by the SMC-D protocol layer. Analogously, replace dma_free_coherent by a two step dma_unmap_page, then folio_put to free the receive buffer. [1] https://lore.kernel.org/all/20221113163535.884299-1-hch@lst.de/ Fixes: c08004eede4b ("s390/ism: don't pass bogus GFP_ flags to dma_alloc_coherent") Signed-off-by: Gerd Bayer Signed-off-by: David S. Miller --- drivers/s390/net/ism_drv.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2c8e964425dc..affb05521e14 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "ism.h" @@ -292,13 +294,15 @@ out: static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); - dma_free_coherent(&ism->pdev->dev, dmb->dmb_len, - dmb->cpu_addr, dmb->dma_addr); + dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, + DMA_FROM_DEVICE); + folio_put(virt_to_folio(dmb->cpu_addr)); } static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { unsigned long bit; + int rc; if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; @@ -315,14 +319,30 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) return -EINVAL; - dmb->cpu_addr = dma_alloc_coherent(&ism->pdev->dev, dmb->dmb_len, - &dmb->dma_addr, - GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!dmb->cpu_addr) - clear_bit(dmb->sba_idx, ism->sba_bitmap); + dmb->cpu_addr = + folio_address(folio_alloc(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_NORETRY, + get_order(dmb->dmb_len))); - return dmb->cpu_addr ? 0 : -ENOMEM; + if (!dmb->cpu_addr) { + rc = -ENOMEM; + goto out_bit; + } + dmb->dma_addr = dma_map_page(&ism->pdev->dev, + virt_to_page(dmb->cpu_addr), 0, + dmb->dmb_len, DMA_FROM_DEVICE); + if (dma_mapping_error(&ism->pdev->dev, dmb->dma_addr)) { + rc = -ENOMEM; + goto out_free; + } + + return 0; + +out_free: + kfree(dmb->cpu_addr); +out_bit: + clear_bit(dmb->sba_idx, ism->sba_bitmap); + return rc; } int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, -- cgit v1.2.3 From b45d0d01da542be280d935d87b71465028cdcb1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bernhard=20Rosenkr=C3=A4nzer?= Date: Fri, 29 Mar 2024 16:28:00 +0100 Subject: platform/x86: acer-wmi: Add support for Acer PH18-71 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Acer Predator PH18-71 to acer_quirks with predator_v4 to support mode button and fan speed sensor. Signed-off-by: Bernhard Rosenkränzer Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240329152800.29393-1-bero@baylibre.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/acer-wmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index ee2e164f86b9..38c932df6446 100644 --- a/drivers/platform/x86/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c @@ -597,6 +597,15 @@ static const struct dmi_system_id acer_quirks[] __initconst = { }, .driver_data = &quirk_acer_predator_v4, }, + { + .callback = dmi_matched, + .ident = "Acer Predator PH18-71", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "Predator PH18-71"), + }, + .driver_data = &quirk_acer_predator_v4, + }, { .callback = set_force_caps, .ident = "Acer Aspire Switch 10E SW3-016", -- cgit v1.2.3 From 7ac10c7d728d75bc9daaa8fade3c7a3273b9a9ff Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 5 Apr 2024 16:55:11 -0700 Subject: bnxt_en: Fix possible memory leak in bnxt_rdma_aux_device_init() If ulp = kzalloc() fails, the allocated edev will leak because it is not properly assigned and the cleanup path will not be able to free it. Fix it by assigning it properly immediately after allocation. Fixes: 303432211324 ("bnxt_en: Remove runtime interrupt vector allocation") Reviewed-by: Andy Gospodarek Signed-off-by: Vikas Gupta Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 93f9bd55020f..a5f9c9090a6b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -392,12 +392,13 @@ void bnxt_rdma_aux_device_init(struct bnxt *bp) if (!edev) goto aux_dev_uninit; + aux_priv->edev = edev; + ulp = kzalloc(sizeof(*ulp), GFP_KERNEL); if (!ulp) goto aux_dev_uninit; edev->ulp_tbl = ulp; - aux_priv->edev = edev; bp->edev = edev; bnxt_set_edev_info(edev, bp); -- cgit v1.2.3 From b5ea7d33ba2a42b95b4298d08d2af9cdeeaf0090 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Fri, 5 Apr 2024 16:55:12 -0700 Subject: bnxt_en: Fix error recovery for RoCE ulp client Since runtime MSIXs vector allocation/free has been removed, the L2 driver needs to repopulate the MSIX entries for the ulp client as the irq table may change during the recovery process. Fixes: 303432211324 ("bnxt_en: Remove runtime interrupt vector allocation") Reviewed-by: Andy Gospodarek Signed-off-by: Vikas Gupta Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index a5f9c9090a6b..195c02dc0683 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -210,6 +210,9 @@ void bnxt_ulp_start(struct bnxt *bp, int err) if (err) return; + if (edev->ulp_tbl->msix_requested) + bnxt_fill_msix_vecs(bp, edev->msix_entries); + if (aux_priv) { struct auxiliary_device *adev; -- cgit v1.2.3 From faa12ca245585379d612736a4b5e98e88481ea59 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Fri, 5 Apr 2024 16:55:13 -0700 Subject: bnxt_en: Reset PTP tx_avail after possible firmware reset It is possible that during error recovery and firmware reset, there is a pending TX PTP packet waiting for the timestamp. We need to reset this condition so that after recovery, the tx_avail count for PTP is reset back to the initial value. Otherwise, we may not accept any PTP TX timestamps after recovery. Fixes: 118612d519d8 ("bnxt_en: Add PTP clock APIs, ioctls, and ethtool methods") Reviewed-by: Kalesh AP Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 493b724848c8..57e61f963167 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -11758,6 +11758,8 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) /* VF-reps may need to be re-opened after the PF is re-opened */ if (BNXT_PF(bp)) bnxt_vf_reps_open(bp); + if (bp->ptp_cfg) + atomic_set(&bp->ptp_cfg->tx_avail, BNXT_MAX_TX_TS); bnxt_ptp_init_rtc(bp, true); bnxt_ptp_cfg_tstamp_filters(bp); bnxt_cfg_usr_fltrs(bp); -- cgit v1.2.3 From 3c89a068bfd0698a5478f4cf39493595ef757d5e Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 8 Apr 2024 09:02:23 +0200 Subject: PM: s2idle: Make sure CPUs will wakeup directly on resume s2idle works like a regular suspend with freezing processes and freezing devices. All CPUs except the control CPU go into idle. Once this is completed the control CPU kicks all other CPUs out of idle, so that they reenter the idle loop and then enter s2idle state. The control CPU then issues an swait() on the suspend state and therefore enters the idle loop as well. Due to being kicked out of idle, the other CPUs leave their NOHZ states, which means the tick is active and the corresponding hrtimer is programmed to the next jiffie. On entering s2idle the CPUs shut down their local clockevent device to prevent wakeups. The last CPU which enters s2idle shuts down its local clockevent and freezes timekeeping. On resume, one of the CPUs receives the wakeup interrupt, unfreezes timekeeping and its local clockevent and starts the resume process. At that point all other CPUs are still in s2idle with their clockevents switched off. They only resume when they are kicked by another CPU or after resuming devices and then receiving a device interrupt. That means there is no guarantee that all CPUs will wakeup directly on resume. As a consequence there is no guarantee that timers which are queued on those CPUs and should expire directly after resume, are handled. Also timer list timers which are remotely queued to one of those CPUs after resume will not result in a reprogramming IPI as the tick is active. Queueing a hrtimer will also not result in a reprogramming IPI because the first hrtimer event is already in the past. The recent introduction of the timer pull model (7ee988770326 ("timers: Implement the hierarchical pull model")) amplifies this problem, if the current migrator is one of the non woken up CPUs. When a non pinned timer list timer is queued and the queuing CPU goes idle, it relies on the still suspended migrator CPU to expire the timer which will happen by chance. The problem exists since commit 8d89835b0467 ("PM: suspend: Do not pause cpuidle in the suspend-to-idle path"). There the cpuidle_pause() call which in turn invoked a wakeup for all idle CPUs was moved to a later point in the resume process. This might not be reached or reached very late because it waits on a timer of a still suspended CPU. Address this by kicking all CPUs out of idle after the control CPU returns from swait() so that they resume their timers and restore consistent system state. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218641 Fixes: 8d89835b0467 ("PM: suspend: Do not pause cpuidle in the suspend-to-idle path") Signed-off-by: Anna-Maria Behnsen Reviewed-by: Thomas Gleixner Tested-by: Mario Limonciello Cc: 5.16+ # 5.16+ Acked-by: Peter Zijlstra (Intel) Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index e3ae93bbcb9b..09f8397bae15 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -106,6 +106,12 @@ static void s2idle_enter(void) swait_event_exclusive(s2idle_wait_head, s2idle_state == S2IDLE_STATE_WAKE); + /* + * Kick all CPUs to ensure that they resume their timers and restore + * consistent system state. + */ + wake_up_all_idle_cpus(); + cpus_read_unlock(); raw_spin_lock_irq(&s2idle_lock); -- cgit v1.2.3 From 5ce344beaca688f4cdea07045e0b8f03dc537e74 Mon Sep 17 00:00:00 2001 From: Adam Dunlap Date: Mon, 18 Mar 2024 16:09:27 -0700 Subject: x86/apic: Force native_apic_mem_read() to use the MOV instruction When done from a virtual machine, instructions that touch APIC memory must be emulated. By convention, MMIO accesses are typically performed via io.h helpers such as readl() or writeq() to simplify instruction emulation/decoding (ex: in KVM hosts and SEV guests) [0]. Currently, native_apic_mem_read() does not follow this convention, allowing the compiler to emit instructions other than the MOV instruction generated by readl(). In particular, when the kernel is compiled with clang and run as a SEV-ES or SEV-SNP guest, the compiler would emit a TESTL instruction which is not supported by the SEV-ES emulator, causing a boot failure in that environment. It is likely the same problem would happen in a TDX guest as that uses the same instruction emulator as SEV-ES. To make sure all emulators can emulate APIC memory reads via MOV, use the readl() function in native_apic_mem_read(). It is expected that any emulator would support MOV in any addressing mode as it is the most generic and is what is usually emitted currently. The TESTL instruction is emitted when native_apic_mem_read() is inlined into apic_mem_wait_icr_idle(). The emulator comes from insn_decode_mmio() in arch/x86/lib/insn-eval.c. It's not worth it to extend insn_decode_mmio() to support more instructions since, in theory, the compiler could choose to output nearly any instruction for such reads which would bloat the emulator beyond reason. [0] https://lore.kernel.org/all/20220405232939.73860-12-kirill.shutemov@linux.intel.com/ [ bp: Massage commit message, fix typos. ] Signed-off-by: Adam Dunlap Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Thomas Gleixner Reviewed-by: Ard Biesheuvel Tested-by: Kevin Loughlin Cc: Link: https://lore.kernel.org/r/20240318230927.2191933-1-acdunlap@google.com --- arch/x86/include/asm/apic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 94ce0f7c9d3a..e6ab0cf15ed5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -13,6 +13,7 @@ #include #include #include +#include #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -98,7 +99,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) static inline u32 native_apic_mem_read(u32 reg) { - return *((volatile u32 *)(APIC_BASE + reg)); + return readl((void __iomem *)(APIC_BASE + reg)); } static inline void native_apic_mem_eoi(void) -- cgit v1.2.3 From dfe073f8714dc8022b5578510e2288e5292adeb5 Mon Sep 17 00:00:00 2001 From: Minda Chen Date: Mon, 8 Apr 2024 09:29:42 +0800 Subject: net: stmmac: mmc_core: Add GMAC LPI statistics XGMAC MMC has already added LPI statistics. GMAC MMC lack of these statistics. Add register definition and reading the LPI statistics from registers. Signed-off-by: Minda Chen Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/mmc_core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c index 7eb477faa75a..b0db5f4e8fe8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c @@ -79,6 +79,12 @@ #define MMC_RX_FIFO_OVERFLOW 0xd4 #define MMC_RX_VLAN_FRAMES_GB 0xd8 #define MMC_RX_WATCHDOG_ERROR 0xdc + +#define MMC_TX_LPI_USEC 0xec +#define MMC_TX_LPI_TRAN 0xf0 +#define MMC_RX_LPI_USEC 0xf4 +#define MMC_RX_LPI_TRAN 0xf8 + /* IPC*/ #define MMC_RX_IPC_INTR_MASK 0x100 #define MMC_RX_IPC_INTR 0x108 @@ -283,6 +289,8 @@ static void dwmac_mmc_read(void __iomem *mmcaddr, struct stmmac_counters *mmc) mmc->mmc_tx_excessdef += readl(mmcaddr + MMC_TX_EXCESSDEF); mmc->mmc_tx_pause_frame += readl(mmcaddr + MMC_TX_PAUSE_FRAME); mmc->mmc_tx_vlan_frame_g += readl(mmcaddr + MMC_TX_VLAN_FRAME_G); + mmc->mmc_tx_lpi_usec += readl(mmcaddr + MMC_TX_LPI_USEC); + mmc->mmc_tx_lpi_tran += readl(mmcaddr + MMC_TX_LPI_TRAN); /* MMC RX counter registers */ mmc->mmc_rx_framecount_gb += readl(mmcaddr + MMC_RX_FRAMECOUNT_GB); @@ -316,6 +324,9 @@ static void dwmac_mmc_read(void __iomem *mmcaddr, struct stmmac_counters *mmc) mmc->mmc_rx_fifo_overflow += readl(mmcaddr + MMC_RX_FIFO_OVERFLOW); mmc->mmc_rx_vlan_frames_gb += readl(mmcaddr + MMC_RX_VLAN_FRAMES_GB); mmc->mmc_rx_watchdog_error += readl(mmcaddr + MMC_RX_WATCHDOG_ERROR); + mmc->mmc_rx_lpi_usec += readl(mmcaddr + MMC_RX_LPI_USEC); + mmc->mmc_rx_lpi_tran += readl(mmcaddr + MMC_RX_LPI_TRAN); + /* IPv4 */ mmc->mmc_rx_ipv4_gd += readl(mmcaddr + MMC_RX_IPV4_GD); mmc->mmc_rx_ipv4_hderr += readl(mmcaddr + MMC_RX_IPV4_HDERR); -- cgit v1.2.3 From ff20393bdc4537c5e044e3002d7f25a45f0d0f98 Mon Sep 17 00:00:00 2001 From: Minda Chen Date: Mon, 8 Apr 2024 09:29:43 +0800 Subject: net: stmmac: mmc_core: Add GMAC mmc tx/rx missing statistics The missing statistics including Rx_Receive_Error_Packets and Tx_OSize_Packets_Good. Signed-off-by: Minda Chen Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/mmc.h | 2 ++ drivers/net/ethernet/stmicro/stmmac/mmc_core.c | 4 ++++ drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 2 ++ 3 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc.h b/drivers/net/ethernet/stmicro/stmmac/mmc.h index dff02d75d519..5d1ea3e07459 100644 --- a/drivers/net/ethernet/stmicro/stmmac/mmc.h +++ b/drivers/net/ethernet/stmicro/stmmac/mmc.h @@ -52,6 +52,7 @@ struct stmmac_counters { unsigned int mmc_tx_excessdef; unsigned int mmc_tx_pause_frame; unsigned int mmc_tx_vlan_frame_g; + unsigned int mmc_tx_oversize_g; unsigned int mmc_tx_lpi_usec; unsigned int mmc_tx_lpi_tran; @@ -80,6 +81,7 @@ struct stmmac_counters { unsigned int mmc_rx_fifo_overflow; unsigned int mmc_rx_vlan_frames_gb; unsigned int mmc_rx_watchdog_error; + unsigned int mmc_rx_error; unsigned int mmc_rx_lpi_usec; unsigned int mmc_rx_lpi_tran; unsigned int mmc_rx_discard_frames_gb; diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c index b0db5f4e8fe8..0fab842902a8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c @@ -53,6 +53,7 @@ #define MMC_TX_EXCESSDEF 0x6c #define MMC_TX_PAUSE_FRAME 0x70 #define MMC_TX_VLAN_FRAME_G 0x74 +#define MMC_TX_OVERSIZE_G 0x78 /* MMC RX counter registers */ #define MMC_RX_FRAMECOUNT_GB 0x80 @@ -79,6 +80,7 @@ #define MMC_RX_FIFO_OVERFLOW 0xd4 #define MMC_RX_VLAN_FRAMES_GB 0xd8 #define MMC_RX_WATCHDOG_ERROR 0xdc +#define MMC_RX_ERROR 0xe0 #define MMC_TX_LPI_USEC 0xec #define MMC_TX_LPI_TRAN 0xf0 @@ -289,6 +291,7 @@ static void dwmac_mmc_read(void __iomem *mmcaddr, struct stmmac_counters *mmc) mmc->mmc_tx_excessdef += readl(mmcaddr + MMC_TX_EXCESSDEF); mmc->mmc_tx_pause_frame += readl(mmcaddr + MMC_TX_PAUSE_FRAME); mmc->mmc_tx_vlan_frame_g += readl(mmcaddr + MMC_TX_VLAN_FRAME_G); + mmc->mmc_tx_oversize_g += readl(mmcaddr + MMC_TX_OVERSIZE_G); mmc->mmc_tx_lpi_usec += readl(mmcaddr + MMC_TX_LPI_USEC); mmc->mmc_tx_lpi_tran += readl(mmcaddr + MMC_TX_LPI_TRAN); @@ -324,6 +327,7 @@ static void dwmac_mmc_read(void __iomem *mmcaddr, struct stmmac_counters *mmc) mmc->mmc_rx_fifo_overflow += readl(mmcaddr + MMC_RX_FIFO_OVERFLOW); mmc->mmc_rx_vlan_frames_gb += readl(mmcaddr + MMC_RX_VLAN_FRAMES_GB); mmc->mmc_rx_watchdog_error += readl(mmcaddr + MMC_RX_WATCHDOG_ERROR); + mmc->mmc_rx_error += readl(mmcaddr + MMC_RX_ERROR); mmc->mmc_rx_lpi_usec += readl(mmcaddr + MMC_RX_LPI_USEC); mmc->mmc_rx_lpi_tran += readl(mmcaddr + MMC_RX_LPI_TRAN); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index e1537a57815f..542e2633a6f5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -212,6 +212,7 @@ static const struct stmmac_stats stmmac_mmc[] = { STMMAC_MMC_STAT(mmc_tx_excessdef), STMMAC_MMC_STAT(mmc_tx_pause_frame), STMMAC_MMC_STAT(mmc_tx_vlan_frame_g), + STMMAC_MMC_STAT(mmc_tx_oversize_g), STMMAC_MMC_STAT(mmc_tx_lpi_usec), STMMAC_MMC_STAT(mmc_tx_lpi_tran), STMMAC_MMC_STAT(mmc_rx_framecount_gb), @@ -238,6 +239,7 @@ static const struct stmmac_stats stmmac_mmc[] = { STMMAC_MMC_STAT(mmc_rx_fifo_overflow), STMMAC_MMC_STAT(mmc_rx_vlan_frames_gb), STMMAC_MMC_STAT(mmc_rx_watchdog_error), + STMMAC_MMC_STAT(mmc_rx_error), STMMAC_MMC_STAT(mmc_rx_lpi_usec), STMMAC_MMC_STAT(mmc_rx_lpi_tran), STMMAC_MMC_STAT(mmc_rx_discard_frames_gb), -- cgit v1.2.3 From c1d11fc2c8320871b40730991071dd0a0b405bc8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2024 09:46:01 +0200 Subject: irqflags: Explicitly ignore lockdep_hrtimer_exit() argument When building with 'make W=1' but CONFIG_TRACE_IRQFLAGS=n, the unused argument to lockdep_hrtimer_exit() causes a warning: kernel/time/hrtimer.c:1655:14: error: variable 'expires_in_hardirq' set but not used [-Werror=unused-but-set-variable] This is intentional behavior, so add a cast to void to shut up the warning. Fixes: 73d20564e0dc ("hrtimer: Don't dereference the hrtimer pointer after the callback") Reported-by: kernel test robot Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240408074609.3170807-1-arnd@kernel.org Closes: https://lore.kernel.org/oe-kbuild-all/202311191229.55QXHVc6-lkp@intel.com/ --- include/linux/irqflags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 147feebd508c..3f003d5fde53 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -114,7 +114,7 @@ do { \ # define lockdep_softirq_enter() do { } while (0) # define lockdep_softirq_exit() do { } while (0) # define lockdep_hrtimer_enter(__hrtimer) false -# define lockdep_hrtimer_exit(__context) do { } while (0) +# define lockdep_hrtimer_exit(__context) do { (void)(__context); } while (0) # define lockdep_posixtimer_enter() do { } while (0) # define lockdep_posixtimer_exit() do { } while (0) # define lockdep_irq_work_enter(__work) do { } while (0) -- cgit v1.2.3 From fa1f51162338b3e2f520d4bfedc42b3b2e00da6d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 19 Mar 2024 19:20:50 +0100 Subject: locking: Make rwsem_assert_held_write_nolockdep() build with PREEMPT_RT=y The commit cited below broke the build for PREEMPT_RT because rwsem_assert_held_write_nolockdep() passes a struct rw_semaphore but rw_base_assert_held_write() expects struct rwbase_rt. Fixing the type alone leads to the problem that WARN_ON() is not found because bug.h is missing. In order to resolve this: - Keep the assert (WARN_ON()) in rwsem.h (not rwbase_rt.h) - Make rwsem_assert_held_write_nolockdep() do the implementation specific (rw_base) writer check. - Replace the "inline" with __always_inline which was used before. Fixes: f70405afc99b1 ("locking: Add rwsem_assert_held() and rwsem_assert_held_write()") Reported-by: Clark Williams Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Waiman Long Link: https://lore.kernel.org/r/20240319182050.U4AzUF3I@linutronix.de --- include/linux/rwbase_rt.h | 4 ++-- include/linux/rwsem.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/rwbase_rt.h b/include/linux/rwbase_rt.h index 29c4e4f243e4..f2394a409c9d 100644 --- a/include/linux/rwbase_rt.h +++ b/include/linux/rwbase_rt.h @@ -31,9 +31,9 @@ static __always_inline bool rw_base_is_locked(const struct rwbase_rt *rwb) return atomic_read(&rwb->readers) != READER_BIAS; } -static inline void rw_base_assert_held_write(const struct rwbase_rt *rwb) +static __always_inline bool rw_base_is_write_locked(const struct rwbase_rt *rwb) { - WARN_ON(atomic_read(&rwb->readers) != WRITER_BIAS); + return atomic_read(&rwb->readers) == WRITER_BIAS; } static __always_inline bool rw_base_is_contended(const struct rwbase_rt *rwb) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 4f1c18992f76..c8b543d428b0 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -167,14 +167,14 @@ static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) return rw_base_is_locked(&sem->rwbase); } -static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) +static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!rwsem_is_locked(sem)); } -static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) +static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) { - rw_base_assert_held_write(sem); + WARN_ON(!rw_base_is_write_locked(&sem->rwbase)); } static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) -- cgit v1.2.3 From d730192ff0246356a2d7e63ff5bd501060670eec Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 6 Apr 2024 13:40:52 +0200 Subject: ACPI: scan: Do not increase dep_unmet for already met dependencies On the Toshiba Encore WT10-A tablet the BATC battery ACPI device depends on 3 other devices: Name (_DEP, Package (0x03) // _DEP: Dependencies { I2C1, GPO2, GPO0 }) acpi_scan_check_dep() adds all 3 of these to the acpi_dep_list and then before an acpi_device is created for the BATC handle (and thus before acpi_scan_dep_init() runs) acpi_scan_clear_dep() gets called for both GPIO depenencies, with free_when_met not set for the dependencies. Since there is no adev for BATC yet, there also is no dep_unmet to decrement. The only result of acpi_scan_clear_dep() in this case is dep->met getting set. Soon after acpi_scan_clear_dep() has been called for the GPIO dependencies the acpi_device gets created for the BATC handle and acpi_scan_dep_init() runs, this sees 3 dependencies on the acpi_dep_list and initializes unmet_dep to 3. Later when the dependency for I2C1 is met unmet_dep becomes 2, but since the 2 GPIO deps where already met it never becomes 0 causing battery monitoring to not work. Fix this by modifying acpi_scan_dep_init() to not increase dep_met for dependencies which have already been marked as being met. Fixes: 3ba12d8de3fa ("ACPI: scan: Reduce overhead related to devices with dependencies") Signed-off-by: Hans de Goede Cc: 6.5+ # 6.5+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/scan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 7c157bf92695..d1464324de95 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1843,7 +1843,8 @@ static void acpi_scan_dep_init(struct acpi_device *adev) if (dep->honor_dep) adev->flags.honor_deps = 1; - adev->dep_unmet++; + if (!dep->met) + adev->dep_unmet++; } } } -- cgit v1.2.3 From 8ab58f6841b19423231c5db3378691ec80c778f8 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 14 Mar 2024 16:49:43 +0100 Subject: gpu: host1x: Do not setup DMA for virtual devices The host1x devices are virtual compound devices and do not perform DMA accesses themselves, so they do not need to be set up for DMA. Ideally we would also not need to set up DMA masks for the virtual devices, but we currently still need those for legacy support on old hardware. Tested-by: Jon Hunter Acked-by: Jon Hunter Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20240314154943.2487549-1-thierry.reding@gmail.com --- drivers/gpu/host1x/bus.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c index 783975d1384f..7c52757a89db 100644 --- a/drivers/gpu/host1x/bus.c +++ b/drivers/gpu/host1x/bus.c @@ -351,11 +351,6 @@ static int host1x_device_uevent(const struct device *dev, return 0; } -static int host1x_dma_configure(struct device *dev) -{ - return of_dma_configure(dev, dev->of_node, true); -} - static const struct dev_pm_ops host1x_device_pm_ops = { .suspend = pm_generic_suspend, .resume = pm_generic_resume, @@ -369,7 +364,6 @@ const struct bus_type host1x_bus_type = { .name = "host1x", .match = host1x_device_match, .uevent = host1x_device_uevent, - .dma_configure = host1x_dma_configure, .pm = &host1x_device_pm_ops, }; @@ -458,8 +452,6 @@ static int host1x_device_add(struct host1x *host1x, device->dev.bus = &host1x_bus_type; device->dev.parent = host1x->dev; - of_dma_configure(&device->dev, host1x->dev->of_node, true); - device->dev.dma_parms = &device->dma_parms; dma_set_max_seg_size(&device->dev, UINT_MAX); -- cgit v1.2.3 From aca1a5287ea328fd1f7e2bfa6806646486d86a70 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 28 Mar 2024 09:25:40 +0530 Subject: ACPI: bus: allow _UID matching for integer zero Commit b2b32a173881 ("ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types") added _UID matching support for both integer and string types, which satisfies NULL @uid2 argument for string types using inversion, but this logic prevents _UID comparision in case the argument is integer 0, which may result in false positives. Fix this using _Generic(), which will allow NULL @uid2 argument for string types as well as _UID matching for all possible integer values. Fixes: b2b32a173881 ("ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types") Signed-off-by: Raag Jadav [ rjw: Comment adjustment ] Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 5de954e2b18a..e7796f373d0d 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -911,17 +911,19 @@ static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2) * acpi_dev_hid_uid_match - Match device by supplied HID and UID * @adev: ACPI device to match. * @hid2: Hardware ID of the device. - * @uid2: Unique ID of the device, pass 0 or NULL to not check _UID. + * @uid2: Unique ID of the device, pass NULL to not check _UID. * * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2 * will be treated as a match. If user wants to validate @uid2, it should be * done before calling this function. * - * Returns: %true if matches or @uid2 is 0 or NULL, %false otherwise. + * Returns: %true if matches or @uid2 is NULL, %false otherwise. */ #define acpi_dev_hid_uid_match(adev, hid2, uid2) \ (acpi_dev_hid_match(adev, hid2) && \ - (!(uid2) || acpi_dev_uid_match(adev, uid2))) + /* Distinguish integer 0 from NULL @uid2 */ \ + (_Generic(uid2, ACPI_STR_TYPES(!(uid2)), default: 0) || \ + acpi_dev_uid_match(adev, uid2))) void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); -- cgit v1.2.3 From 3eadd887dbac1df8f25f701e5d404d1b90fd0fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 4 Apr 2024 23:33:25 +0300 Subject: drm/client: Fully protect modes[] with dev->mode_config.mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The modes[] array contains pointers to modes on the connectors' mode lists, which are protected by dev->mode_config.mutex. Thus we need to extend modes[] the same protection or by the time we use it the elements may already be pointing to freed/reused memory. Cc: stable@vger.kernel.org Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10583 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240404203336.10454-2-ville.syrjala@linux.intel.com Reviewed-by: Dmitry Baryshkov Reviewed-by: Jani Nikula Reviewed-by: Thomas Zimmermann --- drivers/gpu/drm/drm_client_modeset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c index 871e4e2129d6..0683a129b362 100644 --- a/drivers/gpu/drm/drm_client_modeset.c +++ b/drivers/gpu/drm/drm_client_modeset.c @@ -777,6 +777,7 @@ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int total_modes_count = 0; struct drm_client_offset *offsets; unsigned int connector_count = 0; + /* points to modes protected by mode_config.mutex */ struct drm_display_mode **modes; struct drm_crtc **crtcs; int i, ret = 0; @@ -845,7 +846,6 @@ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, 0, width, height); } - mutex_unlock(&dev->mode_config.mutex); drm_client_modeset_release(client); @@ -875,6 +875,7 @@ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, modeset->y = offset->y; } } + mutex_unlock(&dev->mode_config.mutex); mutex_unlock(&client->modeset_mutex); out: -- cgit v1.2.3 From 5864e479ca4344f3a5df8074524da24c960f440b Mon Sep 17 00:00:00 2001 From: David McFarland Date: Thu, 4 Apr 2024 08:41:45 -0300 Subject: platform/x86/intel/hid: Don't wake on 5-button releases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If, for example, the power button is configured to suspend, holding it and releasing it after the machine has suspended, will wake the machine. Also on some machines, power button release events are sent during hibernation, even if the button wasn't used to hibernate the machine. This causes hibernation to be aborted. Fixes: 0c4cae1bc00d ("PM: hibernate: Avoid missing wakeup events during hibernation") Signed-off-by: David McFarland Tested-by: Enrik Berkhan Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/878r1tpd6u.fsf_-_@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/hid.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c index 7457ca2b27a6..9ffbdc988fe5 100644 --- a/drivers/platform/x86/intel/hid.c +++ b/drivers/platform/x86/intel/hid.c @@ -504,6 +504,7 @@ static void notify_handler(acpi_handle handle, u32 event, void *context) struct platform_device *device = context; struct intel_hid_priv *priv = dev_get_drvdata(&device->dev); unsigned long long ev_index; + struct key_entry *ke; int err; /* @@ -545,11 +546,15 @@ static void notify_handler(acpi_handle handle, u32 event, void *context) if (event == 0xc0 || !priv->array) return; - if (!sparse_keymap_entry_from_scancode(priv->array, event)) { + ke = sparse_keymap_entry_from_scancode(priv->array, event); + if (!ke) { dev_info(&device->dev, "unknown event 0x%x\n", event); return; } + if (ke->type == KE_IGNORE) + return; + wakeup: pm_wakeup_hard_event(&device->dev); -- cgit v1.2.3 From 79ce88064bb04ec62c4e9e4da4614d36906f8a04 Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Fri, 5 Apr 2024 17:56:30 +0530 Subject: platform/x86/intel/hid: Add Lunar Lake and Arrow Lake support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add INTC107B for Lunar Lake and INTC10CB for Arrow Lake ACPI devices IDs. Signed-off-by: Sumeet Pawnikar Link: https://lore.kernel.org/r/20240405122630.32154-1-sumeet.r.pawnikar@intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/hid.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c index 9ffbdc988fe5..c7a827645864 100644 --- a/drivers/platform/x86/intel/hid.c +++ b/drivers/platform/x86/intel/hid.c @@ -49,6 +49,8 @@ static const struct acpi_device_id intel_hid_ids[] = { {"INTC1076", 0}, {"INTC1077", 0}, {"INTC1078", 0}, + {"INTC107B", 0}, + {"INTC10CB", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, intel_hid_ids); -- cgit v1.2.3 From 592780b8391fe31f129ef4823c1513528f4dcb76 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:13 -0700 Subject: cxl: Fix retrieving of access_coordinates in PCIe path Current loop in cxl_endpoint_get_perf_coordinates() incorrectly assumes the Root Port (RP) dport is the one with generic port access_coordinate. However those coordinates are one level up in the Host Bridge (HB). Current code causes the computation code to pick up 0s as the coordinates and cause minimal bandwidth to result in 0. Add check to skip RP when combining coordinates. Fixes: 14a6960b3e92 ("cxl: Add helper function that calculate performance data for downstream ports") Reported-by: Jonathan Cameron Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 6cbde50a742b..7aadcec4fc64 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2165,6 +2165,11 @@ int cxl_hb_get_perf_coordinates(struct cxl_port *port, return 0; } +static bool parent_port_is_cxl_root(struct cxl_port *port) +{ + return is_cxl_root(to_cxl_port(port->dev.parent)); +} + /** * cxl_endpoint_get_perf_coordinates - Retrieve performance numbers stored in dports * of CXL path @@ -2184,27 +2189,31 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct cxl_dport *dport; struct pci_dev *pdev; unsigned int bw; + bool is_cxl_root; if (!is_cxl_endpoint(port)) return -EINVAL; - dport = iter->parent_dport; - /* - * Exit the loop when the parent port of the current port is cxl root. - * The iterative loop starts at the endpoint and gathers the - * latency of the CXL link from the current iter to the next downstream - * port each iteration. If the parent is cxl root then there is - * nothing to gather. + * Exit the loop when the parent port of the current iter port is cxl + * root. The iterative loop starts at the endpoint and gathers the + * latency of the CXL link from the current device/port to the connected + * downstream port each iteration. */ - while (!is_cxl_root(to_cxl_port(iter->dev.parent))) { - cxl_coordinates_combine(&c, &c, &dport->sw_coord); + do { + dport = iter->parent_dport; + iter = to_cxl_port(iter->dev.parent); + is_cxl_root = parent_port_is_cxl_root(iter); + + /* + * There's no valid access_coordinate for a root port since RPs do not + * have CDAT and therefore needs to be skipped. + */ + if (!is_cxl_root) + cxl_coordinates_combine(&c, &c, &dport->sw_coord); c.write_latency += dport->link_latency; c.read_latency += dport->link_latency; - - iter = to_cxl_port(iter->dev.parent); - dport = iter->parent_dport; - } + } while (!is_cxl_root); /* Get the calculated PCI paths bandwidth */ pdev = to_pci_dev(port->uport_dev->parent); -- cgit v1.2.3 From 51293c565cf4b8d57c154efadb57b17866c74bcb Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:14 -0700 Subject: cxl: Fix incorrect region perf data calculation Current math in cxl_region_perf_data_calculate divides the latency by 1000 every time the function gets called. This causes the region latency to be divided by 1000 per memory device and the math is incorrect. This is user visible as the latency access_coordinate exposed via sysfs will show incorrect latency data. Normalize values from CDAT to nanoseconds. Adjust sub-nanoseconds latency to at least 1. Remove adjustment of perf numbers from the generic target since hmat handling code has already normalized those numbers. Now all computation and stored numbers should be in nanoseconds. cxl_hb_get_perf_coordinates() is removed and HB coords are calculated in the port access_coordinate calculation path since it no longer need to be treated special. Fixes: 3d9f4a197230 ("cxl/region: Calculate performance data for a region") Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-4-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 13 +------ drivers/cxl/core/cdat.c | 94 +++++++++++++++++++++---------------------------- drivers/cxl/core/port.c | 36 +++---------------- drivers/cxl/cxl.h | 2 -- 4 files changed, 45 insertions(+), 100 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index af5cb818f84d..566c387d4385 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -525,22 +525,11 @@ static int get_genport_coordinates(struct device *dev, struct cxl_dport *dport) { struct acpi_device *hb = to_cxl_host_bridge(NULL, dev); u32 uid; - int rc; if (kstrtou32(acpi_device_uid(hb), 0, &uid)) return -EINVAL; - rc = acpi_get_genport_coordinates(uid, dport->hb_coord); - if (rc < 0) - return rc; - - /* Adjust back to picoseconds from nanoseconds */ - for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { - dport->hb_coord[i].read_latency *= 1000; - dport->hb_coord[i].write_latency *= 1000; - } - - return 0; + return acpi_get_genport_coordinates(uid, dport->hb_coord); } static int add_host_bridge_dport(struct device *match, void *arg) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index eddbbe21450c..f66b493b5d3c 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -20,6 +20,36 @@ struct dsmas_entry { int qos_class; }; +static u32 cdat_normalize(u16 entry, u64 base, u8 type) +{ + u32 value; + + /* + * Check for invalid and overflow values + */ + if (entry == 0xffff || !entry) + return 0; + else if (base > (UINT_MAX / (entry))) + return 0; + + /* + * CDAT fields follow the format of HMAT fields. See table 5 Device + * Scoped Latency and Bandwidth Information Structure in Coherent Device + * Attribute Table (CDAT) Specification v1.01. + */ + value = entry * base; + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + case ACPI_HMAT_READ_LATENCY: + case ACPI_HMAT_WRITE_LATENCY: + value = DIV_ROUND_UP(value, 1000); + break; + default: + break; + } + return value; +} + static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, const unsigned long end) { @@ -97,7 +127,6 @@ static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, __le16 le_val; u64 val; u16 len; - int rc; len = le16_to_cpu((__force __le16)hdr->length); if (len != size || (unsigned long)hdr + len > end) { @@ -124,10 +153,8 @@ static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, le_base = (__force __le64)dslbis->entry_base_unit; le_val = (__force __le16)dslbis->entry[0]; - rc = check_mul_overflow(le64_to_cpu(le_base), - le16_to_cpu(le_val), &val); - if (rc) - pr_warn("DSLBIS value overflowed.\n"); + val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), + dslbis->data_type); cxl_access_coordinate_set(&dent->coord, dslbis->data_type, val); @@ -164,7 +191,6 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, struct xarray *dsmas_xa) { struct access_coordinate ep_c; - struct access_coordinate coord[ACCESS_COORDINATE_MAX]; struct dsmas_entry *dent; int valid_entries = 0; unsigned long index; @@ -176,12 +202,6 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, return rc; } - rc = cxl_hb_get_perf_coordinates(port, coord); - if (rc) { - dev_dbg(&port->dev, "Failed to retrieve hb perf coordinates.\n"); - return rc; - } - struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); if (!cxl_root) @@ -194,18 +214,9 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, int qos_class; cxl_coordinates_combine(&dent->coord, &dent->coord, &ep_c); - /* - * Keeping the host bridge coordinates separate from the dsmas - * coordinates in order to allow calculation of access class - * 0 and 1 for region later. - */ - cxl_coordinates_combine(&coord[ACCESS_COORDINATE_CPU], - &coord[ACCESS_COORDINATE_CPU], - &dent->coord); dent->entries = 1; - rc = cxl_root->ops->qos_class(cxl_root, - &coord[ACCESS_COORDINATE_CPU], - 1, &qos_class); + rc = cxl_root->ops->qos_class(cxl_root, &dent->coord, 1, + &qos_class); if (rc != 1) continue; @@ -461,10 +472,8 @@ static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, le_base = (__force __le64)tbl->sslbis_header.entry_base_unit; le_val = (__force __le16)tbl->entries[i].latency_or_bandwidth; - - if (check_mul_overflow(le64_to_cpu(le_base), - le16_to_cpu(le_val), &val)) - dev_warn(dev, "SSLBIS value overflowed!\n"); + val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), + sslbis->data_type); xa_for_each(&port->dports, index, dport) { if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT || @@ -521,17 +530,13 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_port *port = cxlmd->endpoint; struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - struct access_coordinate hb_coord[ACCESS_COORDINATE_MAX]; - struct access_coordinate coord; struct range dpa = { .start = cxled->dpa_res->start, .end = cxled->dpa_res->end, }; struct cxl_dpa_perf *perf; - int rc; switch (cxlr->mode) { case CXL_DECODER_RAM: @@ -549,35 +554,16 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, if (!range_contains(&perf->dpa_range, &dpa)) return; - rc = cxl_hb_get_perf_coordinates(port, hb_coord); - if (rc) { - dev_dbg(&port->dev, "Failed to retrieve hb perf coordinates.\n"); - return; - } - for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { - /* Pickup the host bridge coords */ - cxl_coordinates_combine(&coord, &hb_coord[i], &perf->coord); - /* Get total bandwidth and the worst latency for the cxl region */ cxlr->coord[i].read_latency = max_t(unsigned int, cxlr->coord[i].read_latency, - coord.read_latency); + perf->coord.read_latency); cxlr->coord[i].write_latency = max_t(unsigned int, cxlr->coord[i].write_latency, - coord.write_latency); - cxlr->coord[i].read_bandwidth += coord.read_bandwidth; - cxlr->coord[i].write_bandwidth += coord.write_bandwidth; - - /* - * Convert latency to nanosec from picosec to be consistent - * with the resulting latency coordinates computed by the - * HMAT_REPORTING code. - */ - cxlr->coord[i].read_latency = - DIV_ROUND_UP(cxlr->coord[i].read_latency, 1000); - cxlr->coord[i].write_latency = - DIV_ROUND_UP(cxlr->coord[i].write_latency, 1000); + perf->coord.write_latency); + cxlr->coord[i].read_bandwidth += perf->coord.read_bandwidth; + cxlr->coord[i].write_bandwidth += perf->coord.write_bandwidth; } } diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 7aadcec4fc64..c7c00eb373af 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2133,38 +2133,6 @@ bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) } EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL); -/** - * cxl_hb_get_perf_coordinates - Retrieve performance numbers between initiator - * and host bridge - * - * @port: endpoint cxl_port - * @coord: output access coordinates - * - * Return: errno on failure, 0 on success. - */ -int cxl_hb_get_perf_coordinates(struct cxl_port *port, - struct access_coordinate *coord) -{ - struct cxl_port *iter = port; - struct cxl_dport *dport; - - if (!is_cxl_endpoint(port)) - return -EINVAL; - - dport = iter->parent_dport; - while (iter && !is_cxl_root(to_cxl_port(iter->dev.parent))) { - iter = to_cxl_port(iter->dev.parent); - dport = iter->parent_dport; - } - - coord[ACCESS_COORDINATE_LOCAL] = - dport->hb_coord[ACCESS_COORDINATE_LOCAL]; - coord[ACCESS_COORDINATE_CPU] = - dport->hb_coord[ACCESS_COORDINATE_CPU]; - - return 0; -} - static bool parent_port_is_cxl_root(struct cxl_port *port) { return is_cxl_root(to_cxl_port(port->dev.parent)); @@ -2215,6 +2183,10 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, c.read_latency += dport->link_latency; } while (!is_cxl_root); + dport = iter->parent_dport; + /* Retrieve HB coords */ + cxl_coordinates_combine(&c, &c, dport->hb_coord); + /* Get the calculated PCI paths bandwidth */ pdev = to_pci_dev(port->uport_dev->parent); bw = pcie_bandwidth_available(pdev, NULL, NULL, NULL); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 534e25e2f0a4..ed02373ce3d9 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -884,8 +884,6 @@ void cxl_switch_parse_cdat(struct cxl_port *port); int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct access_coordinate *coord); -int cxl_hb_get_perf_coordinates(struct cxl_port *port, - struct access_coordinate *coord); void cxl_region_perf_data_calculate(struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled); -- cgit v1.2.3 From 001c5d19341a39cb683ab0a18ce4b662a09d96a0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:15 -0700 Subject: cxl: Consolidate dport access_coordinate ->hb_coord and ->sw_coord into ->coord The driver stores access_coordinate for host bridge in ->hb_coord and switch CDAT access_coordinate in ->sw_coord. Since neither of these access_coordinate clobber each other, the variable name can be consolidated into ->coord to simplify the code. Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-5-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 2 +- drivers/cxl/core/cdat.c | 74 ++++++++++++++++++++++++++++---------------- drivers/cxl/core/port.c | 48 +++++++++++++++++++++------- drivers/cxl/cxl.h | 6 ++-- drivers/cxl/cxlmem.h | 2 +- tools/testing/cxl/test/cxl.c | 10 +++--- 6 files changed, 94 insertions(+), 48 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 566c387d4385..cb8c155a2c9b 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -529,7 +529,7 @@ static int get_genport_coordinates(struct device *dev, struct cxl_dport *dport) if (kstrtou32(acpi_device_uid(hb), 0, &uid)) return -EINVAL; - return acpi_get_genport_coordinates(uid, dport->hb_coord); + return acpi_get_genport_coordinates(uid, dport->coord); } static int add_host_bridge_dport(struct device *match, void *arg) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index f66b493b5d3c..bb83867d9fec 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -14,7 +14,7 @@ struct dsmas_entry { struct range dpa_range; u8 handle; - struct access_coordinate coord; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; int entries; int qos_class; @@ -88,8 +88,8 @@ static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, return 0; } -static void cxl_access_coordinate_set(struct access_coordinate *coord, - int access, unsigned int val) +static void __cxl_access_coordinate_set(struct access_coordinate *coord, + int access, unsigned int val) { switch (access) { case ACPI_HMAT_ACCESS_LATENCY: @@ -115,6 +115,13 @@ static void cxl_access_coordinate_set(struct access_coordinate *coord, } } +static void cxl_access_coordinate_set(struct access_coordinate *coord, + int access, unsigned int val) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + __cxl_access_coordinate_set(&coord[i], access, val); +} + static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, const unsigned long end) { @@ -156,7 +163,7 @@ static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), dslbis->data_type); - cxl_access_coordinate_set(&dent->coord, dslbis->data_type, val); + cxl_access_coordinate_set(dent->coord, dslbis->data_type, val); return 0; } @@ -190,13 +197,13 @@ static int cxl_cdat_endpoint_process(struct cxl_port *port, static int cxl_port_perf_data_calculate(struct cxl_port *port, struct xarray *dsmas_xa) { - struct access_coordinate ep_c; + struct access_coordinate ep_c[ACCESS_COORDINATE_MAX]; struct dsmas_entry *dent; int valid_entries = 0; unsigned long index; int rc; - rc = cxl_endpoint_get_perf_coordinates(port, &ep_c); + rc = cxl_endpoint_get_perf_coordinates(port, ep_c); if (rc) { dev_dbg(&port->dev, "Failed to retrieve ep perf coordinates.\n"); return rc; @@ -213,10 +220,11 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, xa_for_each(dsmas_xa, index, dent) { int qos_class; - cxl_coordinates_combine(&dent->coord, &dent->coord, &ep_c); + cxl_coordinates_combine(dent->coord, dent->coord, ep_c); dent->entries = 1; - rc = cxl_root->ops->qos_class(cxl_root, &dent->coord, 1, - &qos_class); + rc = cxl_root->ops->qos_class(cxl_root, + &dent->coord[ACCESS_COORDINATE_CPU], + 1, &qos_class); if (rc != 1) continue; @@ -233,14 +241,17 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, static void update_perf_entry(struct device *dev, struct dsmas_entry *dent, struct cxl_dpa_perf *dpa_perf) { + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + dpa_perf->coord[i] = dent->coord[i]; dpa_perf->dpa_range = dent->dpa_range; - dpa_perf->coord = dent->coord; dpa_perf->qos_class = dent->qos_class; dev_dbg(dev, "DSMAS: dpa: %#llx qos: %d read_bw: %d write_bw %d read_lat: %d write_lat: %d\n", dent->dpa_range.start, dpa_perf->qos_class, - dent->coord.read_bandwidth, dent->coord.write_bandwidth, - dent->coord.read_latency, dent->coord.write_latency); + dent->coord[ACCESS_COORDINATE_CPU].read_bandwidth, + dent->coord[ACCESS_COORDINATE_CPU].write_bandwidth, + dent->coord[ACCESS_COORDINATE_CPU].read_latency, + dent->coord[ACCESS_COORDINATE_CPU].write_latency); } static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, @@ -477,10 +488,11 @@ static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, xa_for_each(&port->dports, index, dport) { if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT || - dsp_id == dport->port_id) - cxl_access_coordinate_set(&dport->sw_coord, + dsp_id == dport->port_id) { + cxl_access_coordinate_set(dport->coord, sslbis->data_type, val); + } } } @@ -502,6 +514,21 @@ void cxl_switch_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(cxl_switch_parse_cdat, CXL); +static void __cxl_coordinates_combine(struct access_coordinate *out, + struct access_coordinate *c1, + struct access_coordinate *c2) +{ + if (c1->write_bandwidth && c2->write_bandwidth) + out->write_bandwidth = min(c1->write_bandwidth, + c2->write_bandwidth); + out->write_latency = c1->write_latency + c2->write_latency; + + if (c1->read_bandwidth && c2->read_bandwidth) + out->read_bandwidth = min(c1->read_bandwidth, + c2->read_bandwidth); + out->read_latency = c1->read_latency + c2->read_latency; +} + /** * cxl_coordinates_combine - Combine the two input coordinates * @@ -513,15 +540,8 @@ void cxl_coordinates_combine(struct access_coordinate *out, struct access_coordinate *c1, struct access_coordinate *c2) { - if (c1->write_bandwidth && c2->write_bandwidth) - out->write_bandwidth = min(c1->write_bandwidth, - c2->write_bandwidth); - out->write_latency = c1->write_latency + c2->write_latency; - - if (c1->read_bandwidth && c2->read_bandwidth) - out->read_bandwidth = min(c1->read_bandwidth, - c2->read_bandwidth); - out->read_latency = c1->read_latency + c2->read_latency; + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + __cxl_coordinates_combine(&out[i], &c1[i], &c2[i]); } MODULE_IMPORT_NS(CXL); @@ -558,12 +578,12 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, /* Get total bandwidth and the worst latency for the cxl region */ cxlr->coord[i].read_latency = max_t(unsigned int, cxlr->coord[i].read_latency, - perf->coord.read_latency); + perf->coord[i].read_latency); cxlr->coord[i].write_latency = max_t(unsigned int, cxlr->coord[i].write_latency, - perf->coord.write_latency); - cxlr->coord[i].read_bandwidth += perf->coord.read_bandwidth; - cxlr->coord[i].write_bandwidth += perf->coord.write_bandwidth; + perf->coord[i].write_latency); + cxlr->coord[i].read_bandwidth += perf->coord[i].read_bandwidth; + cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth; } } diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c7c00eb373af..801c4018a1dd 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2133,6 +2133,29 @@ bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) } EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL); +static void add_latency(struct access_coordinate *c, long latency) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].write_latency += latency; + c[i].read_latency += latency; + } +} + +static void set_min_bandwidth(struct access_coordinate *c, unsigned int bw) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].write_bandwidth = min(c[i].write_bandwidth, bw); + c[i].read_bandwidth = min(c[i].read_bandwidth, bw); + } +} + +static void set_access_coordinates(struct access_coordinate *out, + struct access_coordinate *in) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + out[i] = in[i]; +} + static bool parent_port_is_cxl_root(struct cxl_port *port) { return is_cxl_root(to_cxl_port(port->dev.parent)); @@ -2149,9 +2172,15 @@ static bool parent_port_is_cxl_root(struct cxl_port *port) int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, struct access_coordinate *coord) { - struct access_coordinate c = { - .read_bandwidth = UINT_MAX, - .write_bandwidth = UINT_MAX, + struct access_coordinate c[] = { + { + .read_bandwidth = UINT_MAX, + .write_bandwidth = UINT_MAX, + }, + { + .read_bandwidth = UINT_MAX, + .write_bandwidth = UINT_MAX, + }, }; struct cxl_port *iter = port; struct cxl_dport *dport; @@ -2178,14 +2207,13 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, * have CDAT and therefore needs to be skipped. */ if (!is_cxl_root) - cxl_coordinates_combine(&c, &c, &dport->sw_coord); - c.write_latency += dport->link_latency; - c.read_latency += dport->link_latency; + cxl_coordinates_combine(c, c, dport->coord); + add_latency(c, dport->link_latency); } while (!is_cxl_root); dport = iter->parent_dport; /* Retrieve HB coords */ - cxl_coordinates_combine(&c, &c, dport->hb_coord); + cxl_coordinates_combine(c, c, dport->coord); /* Get the calculated PCI paths bandwidth */ pdev = to_pci_dev(port->uport_dev->parent); @@ -2194,10 +2222,8 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, return -ENXIO; bw /= BITS_PER_BYTE; - c.write_bandwidth = min(c.write_bandwidth, bw); - c.read_bandwidth = min(c.read_bandwidth, bw); - - *coord = c; + set_min_bandwidth(c, bw); + set_access_coordinates(coord, c); return 0; } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ed02373ce3d9..036d17db68e0 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -663,8 +663,7 @@ struct cxl_rcrb_info { * @rch: Indicate whether this dport was enumerated in RCH or VH mode * @port: reference to cxl_port that contains this downstream port * @regs: Dport parsed register blocks - * @sw_coord: access coordinates (performance) for switch from CDAT - * @hb_coord: access coordinates (performance) from ACPI generic port (host bridge) + * @coord: access coordinates (bandwidth and latency performance attributes) * @link_latency: calculated PCIe downstream latency */ struct cxl_dport { @@ -675,8 +674,7 @@ struct cxl_dport { bool rch; struct cxl_port *port; struct cxl_regs regs; - struct access_coordinate sw_coord; - struct access_coordinate hb_coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; long link_latency; }; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 20fb3b35e89e..36cee9c30ceb 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -401,7 +401,7 @@ enum cxl_devtype { */ struct cxl_dpa_perf { struct range dpa_range; - struct access_coordinate coord; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; int qos_class; }; diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 908e0d083936..61c69297e797 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -986,10 +986,12 @@ static void dpa_perf_setup(struct cxl_port *endpoint, struct range *range, { dpa_perf->qos_class = FAKE_QTG_ID; dpa_perf->dpa_range = *range; - dpa_perf->coord.read_latency = 500; - dpa_perf->coord.write_latency = 500; - dpa_perf->coord.read_bandwidth = 1000; - dpa_perf->coord.write_bandwidth = 1000; + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + dpa_perf->coord[i].read_latency = 500; + dpa_perf->coord[i].write_latency = 500; + dpa_perf->coord[i].read_bandwidth = 1000; + dpa_perf->coord[i].write_bandwidth = 1000; + } } static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) -- cgit v1.2.3 From 7bcf809b1e7889ab7e75fe1fcf8f1a98332f36d2 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 3 Apr 2024 08:47:16 -0700 Subject: cxl: Add checks to access_coordinate calculation to fail missing data Jonathan noted that when the coordinates for host bridge and switches can be 0s if no actual data are retrieved and the calculation continues. The resulting number would be inaccurate. Add checks to ensure that the calculation would complete only if the numbers are valid. While not seen in the wild, issue may show up with a BIOS that reported CXL root ports via Generic Ports (via a PCI handle in the SRAT entry). Fixes: 14a6960b3e92 ("cxl: Add helper function that calculate performance data for downstream ports") Reported-by: Jonathan Cameron Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20240403154844.3403859-6-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 801c4018a1dd..762783bb091a 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2141,6 +2141,18 @@ static void add_latency(struct access_coordinate *c, long latency) } } +static bool coordinates_valid(struct access_coordinate *c) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + if (c[i].read_bandwidth && c[i].write_bandwidth && + c[i].read_latency && c[i].write_latency) + continue; + return false; + } + + return true; +} + static void set_min_bandwidth(struct access_coordinate *c, unsigned int bw) { for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { @@ -2206,13 +2218,18 @@ int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, * There's no valid access_coordinate for a root port since RPs do not * have CDAT and therefore needs to be skipped. */ - if (!is_cxl_root) + if (!is_cxl_root) { + if (!coordinates_valid(dport->coord)) + return -EINVAL; cxl_coordinates_combine(c, c, dport->coord); + } add_latency(c, dport->link_latency); } while (!is_cxl_root); dport = iter->parent_dport; /* Retrieve HB coords */ + if (!coordinates_valid(dport->coord)) + return -EINVAL; cxl_coordinates_combine(c, c, dport->coord); /* Get the calculated PCI paths bandwidth */ -- cgit v1.2.3 From 0dd50b3e2c3d651ea972c97cff1af67870f3deaf Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 2 Apr 2024 14:43:51 +0200 Subject: platform/x86: toshiba_acpi: Silence logging for some events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stop logging unknown event / unknown keycode messages on suspend / resume on a Toshiba Portege Z830: 1. The Toshiba Portege Z830 sends a 0x8e event when the power button is pressed, ignore this. 2. The Toshiba Portege Z830 sends a 0xe00 hotkey event on resume from suspend, ignore this. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240402124351.167152-1-hdegoede@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/toshiba_acpi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 291f14ef6702..77244c9aa60d 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -264,6 +264,7 @@ static const struct key_entry toshiba_acpi_keymap[] = { { KE_KEY, 0xb32, { KEY_NEXTSONG } }, { KE_KEY, 0xb33, { KEY_PLAYPAUSE } }, { KE_KEY, 0xb5a, { KEY_MEDIA } }, + { KE_IGNORE, 0x0e00, { KEY_RESERVED } }, /* Wake from sleep */ { KE_IGNORE, 0x1430, { KEY_RESERVED } }, /* Wake from sleep */ { KE_IGNORE, 0x1501, { KEY_RESERVED } }, /* Output changed */ { KE_IGNORE, 0x1502, { KEY_RESERVED } }, /* HDMI plugged/unplugged */ @@ -3523,9 +3524,10 @@ static void toshiba_acpi_notify(struct acpi_device *acpi_dev, u32 event) (dev->kbd_mode == SCI_KBD_MODE_ON) ? LED_FULL : LED_OFF); break; + case 0x8e: /* Power button pressed */ + break; case 0x85: /* Unknown */ case 0x8d: /* Unknown */ - case 0x8e: /* Unknown */ case 0x94: /* Unknown */ case 0x95: /* Unknown */ default: -- cgit v1.2.3 From 868adf8a29179c00309ddd8ffe0afa2043f42cb5 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Fri, 29 Mar 2024 07:32:05 -0700 Subject: platform/x86: intel-vbtn: Use acpi_has_method to check for switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check for a device having virtual buttons is done using acpi_has_method(..."VBDL"). Mimic that for checking virtual switch presence. Signed-off-by: Gwendal Grignou Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240329143206.2977734-2-gwendal@chromium.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vbtn.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/platform/x86/intel/vbtn.c b/drivers/platform/x86/intel/vbtn.c index 084c355c86f5..48f0ac19d6dd 100644 --- a/drivers/platform/x86/intel/vbtn.c +++ b/drivers/platform/x86/intel/vbtn.c @@ -258,9 +258,6 @@ static const struct dmi_system_id dmi_switches_allow_list[] = { static bool intel_vbtn_has_switches(acpi_handle handle, bool dual_accel) { - unsigned long long vgbs; - acpi_status status; - /* See dual_accel_detect.h for more info */ if (dual_accel) return false; @@ -268,8 +265,7 @@ static bool intel_vbtn_has_switches(acpi_handle handle, bool dual_accel) if (!dmi_check_system(dmi_switches_allow_list)) return false; - status = acpi_evaluate_integer(handle, "VGBS", NULL, &vgbs); - return ACPI_SUCCESS(status); + return acpi_has_method(handle, "VGBS"); } static int intel_vbtn_probe(struct platform_device *device) -- cgit v1.2.3 From 434e5781d8cd2d0ed512d920c6cdeba4b33a2e81 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Fri, 29 Mar 2024 07:32:06 -0700 Subject: platform/x86: intel-vbtn: Update tablet mode switch at end of probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ACER Vivobook Flip (TP401NAS) virtual intel switch is implemented as follow: Device (VGBI) { Name (_HID, EisaId ("INT33D6") ... Name (VBDS, Zero) Method (_STA, 0, Serialized) // _STA: Status ... Method (VBDL, 0, Serialized) { PB1E |= 0x20 VBDS |= 0x40 } Method (VGBS, 0, Serialized) { Return (VBDS) /* \_SB_.PCI0.SBRG.EC0_.VGBI.VBDS */ } ... } By default VBDS is set to 0. At boot it is set to clamshell (bit 6 set) only after method VBDL is executed. Since VBDL is now evaluated in the probe routine later, after the device is registered, the retrieved value of VBDS was still 0 ("tablet mode") when setting up the virtual switch. Make sure to evaluate VGBS after VBDL, to ensure the convertible boots in clamshell mode, the expected default. Fixes: 26173179fae1 ("platform/x86: intel-vbtn: Eval VBDL after registering our notifier") Signed-off-by: Gwendal Grignou Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240329143206.2977734-3-gwendal@chromium.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vbtn.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/vbtn.c b/drivers/platform/x86/intel/vbtn.c index 48f0ac19d6dd..79bb2c801daa 100644 --- a/drivers/platform/x86/intel/vbtn.c +++ b/drivers/platform/x86/intel/vbtn.c @@ -136,8 +136,6 @@ static int intel_vbtn_input_setup(struct platform_device *device) priv->switches_dev->id.bustype = BUS_HOST; if (priv->has_switches) { - detect_tablet_mode(&device->dev); - ret = input_register_device(priv->switches_dev); if (ret) return ret; @@ -312,6 +310,9 @@ static int intel_vbtn_probe(struct platform_device *device) if (ACPI_FAILURE(status)) dev_err(&device->dev, "Error VBDL failed with ACPI status %d\n", status); } + // Check switches after buttons since VBDL may have side effects. + if (has_switches) + detect_tablet_mode(&device->dev); device_init_wakeup(&device->dev, true); /* -- cgit v1.2.3 From e71c8481692582c70cdfd0996c20cdcc71e425d3 Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Wed, 3 Apr 2024 16:34:27 +0200 Subject: platform/x86: lg-laptop: fix %s null argument warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit W=1 warns about null argument to kprintf: warning: ‘%s’ directive argument is null [-Wformat-overflow=] pr_info("product: %s year: %d\n", product, year); Use "unknown" instead of NULL. Signed-off-by: Gergo Koteles Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://lore.kernel.org/r/33d40e976f08f82b9227d0ecae38c787fcc0c0b2.1712154684.git.soyer@irl.hu Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lg-laptop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/lg-laptop.c b/drivers/platform/x86/lg-laptop.c index ad3c39e9e9f5..e714ee6298dd 100644 --- a/drivers/platform/x86/lg-laptop.c +++ b/drivers/platform/x86/lg-laptop.c @@ -736,7 +736,7 @@ static int acpi_add(struct acpi_device *device) default: year = 2019; } - pr_info("product: %s year: %d\n", product, year); + pr_info("product: %s year: %d\n", product ?: "unknown", year); if (year >= 2019) battery_limit_use_wmbb = 1; -- cgit v1.2.3 From 7b1f6b5aaec0f849e19c3e99d4eea75876853cdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 2 Apr 2024 18:50:03 +0300 Subject: drm/i915/cdclk: Fix CDCLK programming order when pipes are active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we always reprogram CDCLK from the intel_set_cdclk_pre_plane_update() when using squash/crawl. The code only works correctly for the cd2x update or full modeset cases, and it was simply never updated to deal with squash/crawl. If the CDCLK frequency is increasing we must reprogram it before we do anything else that might depend on the new higher frequency, and conversely we must not decrease the frequency until everything that might still depend on the old higher frequency has been dealt with. Since cdclk_state->pipe is only relevant when doing a cd2x update we can't use it to determine the correct sequence during squash/crawl. To that end introduce cdclk_state->disable_pipes which simply indicates that we must perform the update while the pipes are disable (ie. during intel_set_cdclk_pre_plane_update()). Otherwise we use the same old vs. new CDCLK frequency comparsiong as for cd2x updates. The only remaining problem case is when the voltage_level needs to increase due to a DDI port, but the CDCLK frequency is decreasing (and not all pipes are being disabled). The current approach will not bump the voltage level up until after the port has already been enabled, which is too late. But we'll take care of that case separately. v2: Don't break the "must disable pipes case" v3: Keep the on stack 'pipe' for future use Cc: stable@vger.kernel.org Fixes: d62686ba3b54 ("drm/i915/adl_p: CDCLK crawl support for ADL") Reviewed-by: Uma Shankar Reviewed-by: Gustavo Sousa Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240402155016.13733-2-ville.syrjala@linux.intel.com (cherry picked from commit 3aecee90ac12a351905f12dda7643d5b0676d6ca) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_cdclk.c | 7 +++++-- drivers/gpu/drm/i915/display/intel_cdclk.h | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c index ed89b86ea625..4833479e2e17 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.c +++ b/drivers/gpu/drm/i915/display/intel_cdclk.c @@ -2543,7 +2543,7 @@ intel_set_cdclk_pre_plane_update(struct intel_atomic_state *state) if (IS_DG2(i915)) intel_cdclk_pcode_pre_notify(state); - if (pipe == INVALID_PIPE || + if (new_cdclk_state->disable_pipes || old_cdclk_state->actual.cdclk <= new_cdclk_state->actual.cdclk) { drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); @@ -2575,7 +2575,7 @@ intel_set_cdclk_post_plane_update(struct intel_atomic_state *state) if (IS_DG2(i915)) intel_cdclk_pcode_post_notify(state); - if (pipe != INVALID_PIPE && + if (!new_cdclk_state->disable_pipes && old_cdclk_state->actual.cdclk > new_cdclk_state->actual.cdclk) { drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); @@ -3058,6 +3058,7 @@ static struct intel_global_state *intel_cdclk_duplicate_state(struct intel_globa return NULL; cdclk_state->pipe = INVALID_PIPE; + cdclk_state->disable_pipes = false; return &cdclk_state->base; } @@ -3236,6 +3237,8 @@ int intel_modeset_calc_cdclk(struct intel_atomic_state *state) if (ret) return ret; + new_cdclk_state->disable_pipes = true; + drm_dbg_kms(&dev_priv->drm, "Modeset required for cdclk change\n"); } diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.h b/drivers/gpu/drm/i915/display/intel_cdclk.h index 48fd7d39e0cd..71bc032bfef1 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.h +++ b/drivers/gpu/drm/i915/display/intel_cdclk.h @@ -51,6 +51,9 @@ struct intel_cdclk_state { /* bitmask of active pipes */ u8 active_pipes; + + /* update cdclk with pipes disabled */ + bool disable_pipes; }; int intel_crtc_compute_min_cdclk(const struct intel_crtc_state *crtc_state); -- cgit v1.2.3 From 6154cc9177ccea00c89ce0bf93352e474b819ff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 2 Apr 2024 18:50:04 +0300 Subject: drm/i915/cdclk: Fix voltage_level programming edge case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we only consider the relationship of the old and new CDCLK frequencies when determining whether to do the repgramming from intel_set_cdclk_pre_plane_update() or intel_set_cdclk_post_plane_update(). It is technically possible to have a situation where the CDCLK frequency is decreasing, but the voltage_level is increasing due a DDI port. In this case we should bump the voltage level already in intel_set_cdclk_pre_plane_update() (so that the voltage_level will have been increased by the time the port gets enabled), while leaving the CDCLK frequency unchanged (as active planes/etc. may still depend on it). We can then reduce the CDCLK frequency to its final value from intel_set_cdclk_post_plane_update(). In order to handle that correctly we shall construct a suitable amalgam of the old and new cdclk states in intel_set_cdclk_pre_plane_update(). And we can simply call intel_set_cdclk() unconditionally in both places as it will not do anything if nothing actually changes vs. the current hw state. v2: Handle cdclk_state->disable_pipes v3: Only synchronize the cd2x update against the pipe's vblank when the cdclk frequency is changing during the current commit phase (Gustavo) Cc: stable@vger.kernel.org Cc: Gustavo Sousa Reviewed-by: Uma Shankar Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240402155016.13733-3-ville.syrjala@linux.intel.com (cherry picked from commit 34d127e2bdef73a923aa0dcd95cbc3257ad5af52) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_cdclk.c | 37 ++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c index 4833479e2e17..f672bfd70d45 100644 --- a/drivers/gpu/drm/i915/display/intel_cdclk.c +++ b/drivers/gpu/drm/i915/display/intel_cdclk.c @@ -2534,7 +2534,8 @@ intel_set_cdclk_pre_plane_update(struct intel_atomic_state *state) intel_atomic_get_old_cdclk_state(state); const struct intel_cdclk_state *new_cdclk_state = intel_atomic_get_new_cdclk_state(state); - enum pipe pipe = new_cdclk_state->pipe; + struct intel_cdclk_config cdclk_config; + enum pipe pipe; if (!intel_cdclk_changed(&old_cdclk_state->actual, &new_cdclk_state->actual)) @@ -2543,12 +2544,25 @@ intel_set_cdclk_pre_plane_update(struct intel_atomic_state *state) if (IS_DG2(i915)) intel_cdclk_pcode_pre_notify(state); - if (new_cdclk_state->disable_pipes || - old_cdclk_state->actual.cdclk <= new_cdclk_state->actual.cdclk) { - drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + if (new_cdclk_state->disable_pipes) { + cdclk_config = new_cdclk_state->actual; + pipe = INVALID_PIPE; + } else { + if (new_cdclk_state->actual.cdclk >= old_cdclk_state->actual.cdclk) { + cdclk_config = new_cdclk_state->actual; + pipe = new_cdclk_state->pipe; + } else { + cdclk_config = old_cdclk_state->actual; + pipe = INVALID_PIPE; + } - intel_set_cdclk(i915, &new_cdclk_state->actual, pipe); + cdclk_config.voltage_level = max(new_cdclk_state->actual.voltage_level, + old_cdclk_state->actual.voltage_level); } + + drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + + intel_set_cdclk(i915, &cdclk_config, pipe); } /** @@ -2566,7 +2580,7 @@ intel_set_cdclk_post_plane_update(struct intel_atomic_state *state) intel_atomic_get_old_cdclk_state(state); const struct intel_cdclk_state *new_cdclk_state = intel_atomic_get_new_cdclk_state(state); - enum pipe pipe = new_cdclk_state->pipe; + enum pipe pipe; if (!intel_cdclk_changed(&old_cdclk_state->actual, &new_cdclk_state->actual)) @@ -2576,11 +2590,14 @@ intel_set_cdclk_post_plane_update(struct intel_atomic_state *state) intel_cdclk_pcode_post_notify(state); if (!new_cdclk_state->disable_pipes && - old_cdclk_state->actual.cdclk > new_cdclk_state->actual.cdclk) { - drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + new_cdclk_state->actual.cdclk < old_cdclk_state->actual.cdclk) + pipe = new_cdclk_state->pipe; + else + pipe = INVALID_PIPE; - intel_set_cdclk(i915, &new_cdclk_state->actual, pipe); - } + drm_WARN_ON(&i915->drm, !new_cdclk_state->base.changed); + + intel_set_cdclk(i915, &new_cdclk_state->actual, pipe); } static int intel_pixel_rate_to_cdclk(const struct intel_crtc_state *crtc_state) -- cgit v1.2.3 From 12bcd9108f9d3b8d4b5f4418bd16df4628b6fa8f Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Mon, 1 Apr 2024 11:26:53 +0530 Subject: drm/i915/hdcp: Fix get remote hdcp capability function HDCP 1.x capability needs to be checked even if setup is not HDCP 2.x capable. --v2 -Assign hdcp_capable and hdcp2_capable to false [Chaitanya] --v3 -Fix variable assignment [Chaitanya] Fixes: 813cca96e4ac ("drm/i915/hdcp: Add new remote capability check shim function") Signed-off-by: Suraj Kandpal Reviewed-by: Chaitanya Kumar Borah Signed-off-by: Animesh Manna Link: https://patchwork.freedesktop.org/patch/msgid/20240401055652.276785-2-suraj.kandpal@intel.com (cherry picked from commit 6809f9246d43f7cb07310ca6a3deb7aa1c0ea938) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp_hdcp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_hdcp.c b/drivers/gpu/drm/i915/display/intel_dp_hdcp.c index b98a87883fef..9db43bd81ce2 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_hdcp.c +++ b/drivers/gpu/drm/i915/display/intel_dp_hdcp.c @@ -691,12 +691,15 @@ int intel_dp_hdcp_get_remote_capability(struct intel_connector *connector, u8 bcaps; int ret; + *hdcp_capable = false; + *hdcp2_capable = false; if (!intel_encoder_is_mst(connector->encoder)) return -EINVAL; ret = _intel_dp_hdcp2_get_capability(aux, hdcp2_capable); if (ret) - return ret; + drm_dbg_kms(&i915->drm, + "HDCP2 DPCD capability read failed err: %d\n", ret); ret = intel_dp_hdcp_read_bcaps(aux, i915, &bcaps); if (ret) -- cgit v1.2.3 From 152191e5e94bba55c938c18688e66c7276b765a7 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 29 Mar 2024 16:53:05 -0700 Subject: drm/i915/guc: Fix the fix for reset lock confusion The previous fix for the circlular lock splat about the busyness worker wasn't quite complete. Even though the reset-in-progress flag is cleared at the start of intel_uc_reset_finish, the entire function is still inside the reset mutex lock. Not sure why the patch appeared to fix the issue both locally and in CI. However, it is now back again. There is a further complication that the wedge code path within intel_gt_reset() jumps around so much that it results in nested reset_prepare/_finish calls. That is, the call sequence is: intel_gt_reset | reset_prepare | __intel_gt_set_wedged | | reset_prepare | | reset_finish | reset_finish The nested finish means that even if the clear of the in-progress flag was moved to the end of _finish, it would still be clear for the entire second call. Surprisingly, this does not seem to be causing any other problems at present. As an aside, a wedge on fini does not call the finish functions at all. The reset_in_progress flag is left set (twice). So instead of trying to cancel the worker anywhere at all in the reset path, just add a cancel to intel_guc_submission_fini instead. Note that it is not a problem if the worker is still active during a reset. Either it will run before the reset path starts locking things and will simply block the reset code for a tiny amount of time. Or it will run after the locks have been acquired and will early exit due to the try-lock. Also, do not use the reset-in-progress flag to decide whether a synchronous cancel is safe (from a lockdep perspective) or not. Instead, use the actual reset mutex state (both the genuine one and the custom rolled BACKOFF one). Fixes: 0e00a8814eec ("drm/i915/guc: Avoid circular locking issue on busyness flush") Signed-off-by: John Harrison Cc: Zhanjun Dong Cc: John Harrison Cc: Andi Shyti Cc: Daniel Vetter Cc: Daniel Vetter Cc: Rodrigo Vivi Cc: Nirmoy Das Cc: Tvrtko Ursulin Cc: Umesh Nerlige Ramappa Cc: Andrzej Hajda Cc: Matt Roper Cc: Jonathan Cavitt Cc: Prathap Kumar Valsan Cc: Alan Previn Cc: Madhumitha Tolakanahalli Pradeep Cc: Daniele Ceraolo Spurio Cc: Ashutosh Dixit Cc: Dnyaneshwar Bhadane Reviewed-by: Nirmoy Das Reviewed-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240329235306.1559639-1-John.C.Harrison@Intel.com (cherry picked from commit 3563d855312acedcd445a3767f0cb07906f1c26f) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 23 +++++++++-------------- drivers/gpu/drm/i915/gt/uc/intel_uc.c | 4 ++++ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index f3dcae4b9d45..0f83c6d4376f 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1403,14 +1403,17 @@ static void guc_cancel_busyness_worker(struct intel_guc *guc) * Trying to pass a 'need_sync' or 'in_reset' flag all the way down through * every possible call stack is unfeasible. It would be too intrusive to many * areas that really don't care about the GuC backend. However, there is the - * 'reset_in_progress' flag available, so just use that. + * I915_RESET_BACKOFF flag and the gt->reset.mutex can be tested for is_locked. + * So just use those. Note that testing both is required due to the hideously + * complex nature of the i915 driver's reset code paths. * * And note that in the case of a reset occurring during driver unload - * (wedge_on_fini), skipping the cancel in _prepare (when the reset flag is set - * is fine because there is another cancel in _finish (when the reset flag is - * not). + * (wedged_on_fini), skipping the cancel in reset_prepare/reset_fini (when the + * reset flag/mutex are set) is fine because there is another explicit cancel in + * intel_guc_submission_fini (when the reset flag/mutex are not). */ - if (guc_to_gt(guc)->uc.reset_in_progress) + if (mutex_is_locked(&guc_to_gt(guc)->reset.mutex) || + test_bit(I915_RESET_BACKOFF, &guc_to_gt(guc)->reset.flags)) cancel_delayed_work(&guc->timestamp.work); else cancel_delayed_work_sync(&guc->timestamp.work); @@ -1424,8 +1427,6 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) unsigned long flags; ktime_t unused; - guc_cancel_busyness_worker(guc); - spin_lock_irqsave(&guc->timestamp.lock, flags); guc_update_pm_timestamp(guc, &unused); @@ -2004,13 +2005,6 @@ void intel_guc_submission_cancel_requests(struct intel_guc *guc) void intel_guc_submission_reset_finish(struct intel_guc *guc) { - /* - * Ensure the busyness worker gets cancelled even on a fatal wedge. - * Note that reset_prepare is not allowed to because it confuses lockdep. - */ - if (guc_submission_initialized(guc)) - guc_cancel_busyness_worker(guc); - /* Reset called during driver load or during wedge? */ if (unlikely(!guc_submission_initialized(guc) || !intel_guc_is_fw_running(guc) || @@ -2136,6 +2130,7 @@ void intel_guc_submission_fini(struct intel_guc *guc) if (!guc->submission_initialized) return; + guc_fini_engine_stats(guc); guc_flush_destroyed_contexts(guc); guc_lrc_desc_pool_destroy_v69(guc); i915_sched_engine_put(guc->sched_engine); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c index 6dfe5d9456c6..399bc319180b 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c @@ -637,6 +637,10 @@ void intel_uc_reset_finish(struct intel_uc *uc) { struct intel_guc *guc = &uc->guc; + /* + * NB: The wedge code path results in prepare -> prepare -> finish -> finish. + * So this function is sometimes called with the in-progress flag not set. + */ uc->reset_in_progress = false; /* Firmware expected to be running when this function is called */ -- cgit v1.2.3 From e3d4ead4d48c05355bd3b99c8162428f68c3c1a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:26 +0300 Subject: drm/i915/psr: Disable PSR when bigjoiner is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bigjoiner seem to be causing all kinds of grief to the PSR code currently. I don't believe there is any hardware issue but the code simply not handling this correctly. For now just disable PSR when bigjoiner is needed. Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-3-ville.syrjala@linux.intel.com Reviewed-by: Arun R Murthy Acked-by: Jouni Högander Signed-off-by: Ville Syrjälä (cherry picked from commit 372fa0c79d3f289f813d8001e0a8a96d1011826c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_psr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index b6e539f1342c..aabd018bd737 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -1422,6 +1422,17 @@ void intel_psr_compute_config(struct intel_dp *intel_dp, return; } + /* + * FIXME figure out what is wrong with PSR+bigjoiner and + * fix it. Presumably something related to the fact that + * PSR is a transcoder level feature. + */ + if (crtc_state->bigjoiner_pipes) { + drm_dbg_kms(&dev_priv->drm, + "PSR disabled due to bigjoiner\n"); + return; + } + if (CAN_PANEL_REPLAY(intel_dp)) crtc_state->has_panel_replay = true; else -- cgit v1.2.3 From 0653d501409eeb9f1deb7e4c12e4d0d2c9f1cba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:27 +0300 Subject: drm/i915: Disable port sync when bigjoiner is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current modeset sequence can't handle port sync and bigjoiner at the same time. Refuse port sync when bigjoiner is needed, at least until we fix the modeset sequence. v2: Add a FIXME (Vandite) Cc: stable@vger.kernel.org Tested-by: Vidya Srinivas Reviewed-by: Vandita Kulkarni Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-4-ville.syrjala@linux.intel.com Signed-off-by: Ville Syrjälä (cherry picked from commit b37e1347b991459c38c56ec2476087854a4f720b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_ddi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index c587a8efeafc..c17462b4c2ac 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -4256,7 +4256,12 @@ static bool m_n_equal(const struct intel_link_m_n *m_n_1, static bool crtcs_port_sync_compatible(const struct intel_crtc_state *crtc_state1, const struct intel_crtc_state *crtc_state2) { + /* + * FIXME the modeset sequence is currently wrong and + * can't deal with bigjoiner + port sync at the same time. + */ return crtc_state1->hw.active && crtc_state2->hw.active && + !crtc_state1->bigjoiner_pipes && !crtc_state2->bigjoiner_pipes && crtc_state1->output_types == crtc_state2->output_types && crtc_state1->output_format == crtc_state2->output_format && crtc_state1->lane_count == crtc_state2->lane_count && -- cgit v1.2.3 From 4a36e46df7aa781c756f09727d37dc2783f1ee75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:28 +0300 Subject: drm/i915: Disable live M/N updates when using bigjoiner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All joined pipes share the same transcoder/timing generator. Currently we just do the commits per-pipe, which doesn't really work if we need to change the timings at the same time. For now just disable live M/N updates when bigjoiner is needed. Cc: stable@vger.kernel.org Tested-by: Vidya Srinivas Reviewed-by: Arun R Murthy Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-5-ville.syrjala@linux.intel.com Signed-off-by: Ville Syrjälä (cherry picked from commit ef79820db723a2a7c229a7251c12859e7e25a247) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index abd62bebc46d..e583515f9b25 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -2725,7 +2725,11 @@ intel_dp_drrs_compute_config(struct intel_connector *connector, intel_panel_downclock_mode(connector, &pipe_config->hw.adjusted_mode); int pixel_clock; - if (has_seamless_m_n(connector)) + /* + * FIXME all joined pipes share the same transcoder. + * Need to account for that when updating M/N live. + */ + if (has_seamless_m_n(connector) && !pipe_config->bigjoiner_pipes) pipe_config->update_m_n = true; if (!can_enable_drrs(connector, pipe_config, downclock_mode)) { -- cgit v1.2.3 From dcd8992e47f13afb5c11a61e8d9c141c35e23751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Apr 2024 00:34:29 +0300 Subject: drm/i915/vrr: Disable VRR when using bigjoiner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All joined pipes share the same transcoder/timing generator. Currently we just do the commits per-pipe, which doesn't really work if we need to change switch between non-VRR and VRR timings generators on the fly, or even when sending the push to the transcoder. For now just disable VRR when bigjoiner is needed. Cc: stable@vger.kernel.org Tested-by: Vidya Srinivas Reviewed-by: Vandita Kulkarni Link: https://patchwork.freedesktop.org/patch/msgid/20240404213441.17637-6-ville.syrjala@linux.intel.com Signed-off-by: Ville Syrjälä (cherry picked from commit f9d5e51db65652dbd8a2102fd7619440e3599fd2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_vrr.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_vrr.c b/drivers/gpu/drm/i915/display/intel_vrr.c index eb5bd0743902..f542ee1db1d9 100644 --- a/drivers/gpu/drm/i915/display/intel_vrr.c +++ b/drivers/gpu/drm/i915/display/intel_vrr.c @@ -117,6 +117,13 @@ intel_vrr_compute_config(struct intel_crtc_state *crtc_state, const struct drm_display_info *info = &connector->base.display_info; int vmin, vmax; + /* + * FIXME all joined pipes share the same transcoder. + * Need to account for that during VRR toggle/push/etc. + */ + if (crtc_state->bigjoiner_pipes) + return; + if (adjusted_mode->flags & DRM_MODE_FLAG_INTERLACE) return; -- cgit v1.2.3 From 0cd01ac5dcb1e18eb18df0f0d05b5de76522a437 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 5 Apr 2024 11:14:13 -0700 Subject: x86/bugs: Change commas to semicolons in 'spectre_v2' sysfs file Change the format of the 'spectre_v2' vulnerabilities sysfs file slightly by converting the commas to semicolons, so that mitigations for future variants can be grouped together and separated by commas. Signed-off-by: Josh Poimboeuf Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/bugs.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e7ba936d798b..e8c7146f9ed7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2695,15 +2695,15 @@ static char *stibp_state(void) switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: - return ", STIBP: disabled"; + return "; STIBP: disabled"; case SPECTRE_V2_USER_STRICT: - return ", STIBP: forced"; + return "; STIBP: forced"; case SPECTRE_V2_USER_STRICT_PREFERRED: - return ", STIBP: always-on"; + return "; STIBP: always-on"; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: if (static_key_enabled(&switch_to_cond_stibp)) - return ", STIBP: conditional"; + return "; STIBP: conditional"; } return ""; } @@ -2712,10 +2712,10 @@ static char *ibpb_state(void) { if (boot_cpu_has(X86_FEATURE_IBPB)) { if (static_key_enabled(&switch_mm_always_ibpb)) - return ", IBPB: always-on"; + return "; IBPB: always-on"; if (static_key_enabled(&switch_mm_cond_ibpb)) - return ", IBPB: conditional"; - return ", IBPB: disabled"; + return "; IBPB: conditional"; + return "; IBPB: disabled"; } return ""; } @@ -2725,11 +2725,11 @@ static char *pbrsb_eibrs_state(void) if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) - return ", PBRSB-eIBRS: SW sequence"; + return "; PBRSB-eIBRS: SW sequence"; else - return ", PBRSB-eIBRS: Vulnerable"; + return "; PBRSB-eIBRS: Vulnerable"; } else { - return ", PBRSB-eIBRS: Not affected"; + return "; PBRSB-eIBRS: Not affected"; } } @@ -2748,9 +2748,9 @@ static ssize_t spectre_v2_show_state(char *buf) return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "", stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "", pbrsb_eibrs_state(), spectre_v2_module_string()); } -- cgit v1.2.3 From 1e3ad78334a69b36e107232e337f9d693dcc9df2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 3 Apr 2024 16:36:44 -0700 Subject: x86/syscall: Don't force use of indirect calls for system calls Make build a switch statement instead, and the compiler can either decide to generate an indirect jump, or - more likely these days due to mitigations - just a series of conditional branches. Yes, the conditional branches also have branch prediction, but the branch prediction is much more controlled, in that it just causes speculatively running the wrong system call (harmless), rather than speculatively running possibly wrong random less controlled code gadgets. This doesn't mitigate other indirect calls, but the system call indirection is the first and most easily triggered case. Signed-off-by: Linus Torvalds Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf --- arch/x86/entry/common.c | 6 +++--- arch/x86/entry/syscall_32.c | 21 +++++++++++++++++++-- arch/x86/entry/syscall_64.c | 19 +++++++++++++++++-- arch/x86/entry/syscall_x32.c | 10 +++++++--- arch/x86/include/asm/syscall.h | 10 ++++------ 5 files changed, 50 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 6356060caaf3..cea0e2a23b42 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -49,7 +49,7 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) if (likely(unr < NR_syscalls)) { unr = array_index_nospec(unr, NR_syscalls); - regs->ax = sys_call_table[unr](regs); + regs->ax = x64_sys_call(regs, unr); return true; } return false; @@ -66,7 +66,7 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { xnr = array_index_nospec(xnr, X32_NR_syscalls); - regs->ax = x32_sys_call_table[xnr](regs); + regs->ax = x32_sys_call(regs, xnr); return true; } return false; @@ -162,7 +162,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) if (likely(unr < IA32_NR_syscalls)) { unr = array_index_nospec(unr, IA32_NR_syscalls); - regs->ax = ia32_sys_call_table[unr](regs); + regs->ax = ia32_sys_call(regs, unr); } else if (nr != -1) { regs->ax = __ia32_sys_ni_syscall(regs); } diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8cfc9bc73e7f..c2235bae17ef 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -18,8 +18,25 @@ #include #undef __SYSCALL +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ +#ifdef CONFIG_X86_32 #define __SYSCALL(nr, sym) __ia32_##sym, - -__visible const sys_call_ptr_t ia32_sys_call_table[] = { +const sys_call_ptr_t sys_call_table[] = { #include }; +#undef __SYSCALL +#endif + +#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs); + +long ia32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include + default: return __ia32_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index be120eec1fc9..33b3f09e6f15 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -11,8 +11,23 @@ #include #undef __SYSCALL +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ #define __SYSCALL(nr, sym) __x64_##sym, - -asmlinkage const sys_call_ptr_t sys_call_table[] = { +const sys_call_ptr_t sys_call_table[] = { #include }; +#undef __SYSCALL + +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); + +long x64_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include + default: return __x64_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index bdd0e03a1265..03de4a932131 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -11,8 +11,12 @@ #include #undef __SYSCALL -#define __SYSCALL(nr, sym) __x64_##sym, +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); -asmlinkage const sys_call_ptr_t x32_sys_call_table[] = { -#include +long x32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include + default: return __x64_sys_ni_syscall(regs); + } }; diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index f44e2f9ab65d..3c28f26bfe22 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,19 +16,17 @@ #include /* for TS_COMPAT */ #include +/* This is used purely for kernel/trace/trace_syscalls.c */ typedef long (*sys_call_ptr_t)(const struct pt_regs *); extern const sys_call_ptr_t sys_call_table[]; -#if defined(CONFIG_X86_32) -#define ia32_sys_call_table sys_call_table -#else /* * These may not exist, but still put the prototypes in so we * can use IS_ENABLED(). */ -extern const sys_call_ptr_t ia32_sys_call_table[]; -extern const sys_call_ptr_t x32_sys_call_table[]; -#endif +extern long ia32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x64_sys_call(const struct pt_regs *, unsigned int nr); /* * Only the low 32 bits of orig_ax are meaningful, so we return int. -- cgit v1.2.3 From 7390db8aea0d64e9deb28b8e1ce716f5020c7ee5 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 11 Mar 2024 08:56:58 -0700 Subject: x86/bhi: Add support for clearing branch history at syscall entry Branch History Injection (BHI) attacks may allow a malicious application to influence indirect branch prediction in kernel by poisoning the branch history. eIBRS isolates indirect branch targets in ring0. The BHB can still influence the choice of indirect branch predictor entry, and although branch predictor entries are isolated between modes when eIBRS is enabled, the BHB itself is not isolated between modes. Alder Lake and new processors supports a hardware control BHI_DIS_S to mitigate BHI. For older processors Intel has released a software sequence to clear the branch history on parts that don't support BHI_DIS_S. Add support to execute the software sequence at syscall entry and VMexit to overwrite the branch history. For now, branch history is not cleared at interrupt entry, as malicious applications are not believed to have sufficient control over the registers, since previous register state is cleared at interrupt entry. Researchers continue to poke at this area and it may become necessary to clear at interrupt entry as well in the future. This mitigation is only defined here. It is enabled later. Signed-off-by: Pawan Gupta Co-developed-by: Daniel Sneddon Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- arch/x86/entry/common.c | 4 +-- arch/x86/entry/entry_64.S | 61 ++++++++++++++++++++++++++++++++++++ arch/x86/entry/entry_64_compat.S | 16 ++++++++++ arch/x86/include/asm/cpufeatures.h | 3 +- arch/x86/include/asm/nospec-branch.h | 12 +++++++ arch/x86/include/asm/syscall.h | 1 + arch/x86/kvm/vmx/vmenter.S | 2 ++ 7 files changed, 96 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index cea0e2a23b42..6de50b80702e 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -189,7 +189,7 @@ static __always_inline bool int80_is_external(void) } /** - * int80_emulation - 32-bit legacy syscall entry + * do_int80_emulation - 32-bit legacy syscall C entry from asm * * This entry point can be used by 32-bit and 64-bit programs to perform * 32-bit system calls. Instances of INT $0x80 can be found inline in @@ -207,7 +207,7 @@ static __always_inline bool int80_is_external(void) * eax: system call number * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 */ -DEFINE_IDTENTRY_RAW(int80_emulation) +__visible noinstr void do_int80_emulation(struct pt_regs *regs) { int nr; diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 8af2a26b24f6..1b5be07f8669 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -116,6 +116,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) /* clobbers %rax, make sure it is after saving the syscall nr */ IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY call do_syscall_64 /* returns with IRQs disabled */ @@ -1491,3 +1492,63 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) call make_task_dead SYM_CODE_END(rewind_stack_and_make_dead) .popsection + +/* + * This sequence executes branches in order to remove user branch information + * from the branch history tracker in the Branch Predictor, therefore removing + * user influence on subsequent BTB lookups. + * + * It should be used on parts prior to Alder Lake. Newer parts should use the + * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being + * virtualized on newer hardware the VMM should protect against BHI attacks by + * setting BHI_DIS_S for the guests. + * + * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging + * and not clearing the branch history. The call tree looks like: + * + * call 1 + * call 2 + * call 2 + * call 2 + * call 2 + * call 2 + * ret + * ret + * ret + * ret + * ret + * ret + * + * This means that the stack is non-constant and ORC can't unwind it with %rsp + * alone. Therefore we unconditionally set up the frame pointer, which allows + * ORC to unwind properly. + * + * The alignment is for performance and not for safety, and may be safely + * refactored in the future if needed. + */ +SYM_FUNC_START(clear_bhb_loop) + push %rbp + mov %rsp, %rbp + movl $5, %ecx + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f + .align 64, 0xcc + ANNOTATE_INTRA_FUNCTION_CALL +1: call 2f + RET + .align 64, 0xcc +2: movl $5, %eax +3: jmp 4f + nop +4: sub $1, %eax + jnz 3b + sub $1, %ecx + jnz 1b + RET +5: lfence + pop %rbp + RET +SYM_FUNC_END(clear_bhb_loop) +EXPORT_SYMBOL_GPL(clear_bhb_loop) +STACK_FRAME_NON_STANDARD(clear_bhb_loop) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index eabf48c4d4b4..c779046cc3fe 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -92,6 +92,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY /* * SYSENTER doesn't filter flags, so we need to clear NT and AC @@ -206,6 +207,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY movq %rsp, %rdi call do_fast_syscall_32 @@ -276,3 +278,17 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR int3 SYM_CODE_END(entry_SYSCALL_compat) + +/* + * int 0x80 is used by 32 bit mode as a system call entry. Normally idt entries + * point to C routines, however since this is a system call interface the branch + * history needs to be scrubbed to protect against BHI attacks, and that + * scrubbing needs to take place in assembly code prior to entering any C + * routines. + */ +SYM_CODE_START(int80_emulation) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + CLEAR_BRANCH_HISTORY + jmp do_int80_emulation +SYM_CODE_END(int80_emulation) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a38f8f9ba657..9b94a86f4600 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -461,11 +461,12 @@ /* * Extended auxiliary flags: Linux defined - for features scattered in various - * CPUID levels like 0x80000022, etc. + * CPUID levels like 0x80000022, etc and Linux defined features. * * Reuse free bits when adding new feature flags! */ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ +#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ /* * BUG word(s) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 170c89ed22fc..aea40204278b 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -326,6 +326,14 @@ ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF .endm +#ifdef CONFIG_X86_64 +.macro CLEAR_BRANCH_HISTORY + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP +.endm +#else +#define CLEAR_BRANCH_HISTORY +#endif + #else /* __ASSEMBLY__ */ #define ANNOTATE_RETPOLINE_SAFE \ @@ -368,6 +376,10 @@ extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); extern void entry_ibpb(void); +#ifdef CONFIG_X86_64 +extern void clear_bhb_loop(void); +#endif + extern void (*x86_return_thunk)(void); extern void __warn_thunk(void); diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 3c28f26bfe22..2fc7bc3863ff 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -125,6 +125,7 @@ static inline int syscall_get_arch(struct task_struct *task) } bool do_syscall_64(struct pt_regs *regs, int nr); +void do_int80_emulation(struct pt_regs *regs); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 2bfbf758d061..0f3593e10c57 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -275,6 +275,8 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) call vmx_spec_ctrl_restore_host + CLEAR_BRANCH_HISTORY + /* Put return value in AX */ mov %_ASM_BX, %_ASM_AX -- cgit v1.2.3 From 0f4a837615ff925ba62648d280a861adf1582df7 Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Wed, 13 Mar 2024 09:47:57 -0700 Subject: x86/bhi: Define SPEC_CTRL_BHI_DIS_S Newer processors supports a hardware control BHI_DIS_S to mitigate Branch History Injection (BHI). Setting BHI_DIS_S protects the kernel from userspace BHI attacks without having to manually overwrite the branch history. Define MSR_SPEC_CTRL bit BHI_DIS_S and its enumeration CPUID.BHI_CTRL. Mitigation is enabled later. Signed-off-by: Daniel Sneddon Signed-off-by: Pawan Gupta Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 5 ++++- arch/x86/kernel/cpu/scattered.c | 1 + arch/x86/kvm/reverse_cpuid.h | 3 ++- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 9b94a86f4600..408509070429 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -467,6 +467,7 @@ */ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ #define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ +#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ /* * BUG word(s) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 05956bd8bacf..24615e826998 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -61,10 +61,13 @@ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ #define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) +#define SPEC_CTRL_BHI_DIS_S_SHIFT 10 /* Disable Branch History Injection behavior */ +#define SPEC_CTRL_BHI_DIS_S BIT(SPEC_CTRL_BHI_DIS_S_SHIFT) /* A mask for bits which the kernel toggles when controlling mitigations */ #define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \ - | SPEC_CTRL_RRSBA_DIS_S) + | SPEC_CTRL_RRSBA_DIS_S \ + | SPEC_CTRL_BHI_DIS_S) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index a515328d9d7d..af5aa2c754c2 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -28,6 +28,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, + { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index aadefcaa9561..da9880d74a0b 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -52,7 +52,7 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1) #define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2) #define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3) -#define X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) +#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) /* CPUID level 0x80000007 (EDX). */ @@ -126,6 +126,7 @@ static __always_inline u32 __feature_translate(int x86_feature) KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC); KVM_X86_TRANSLATE_FEATURE(PERFMON_V2); KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL); + KVM_X86_TRANSLATE_FEATURE(BHI_CTRL); default: return x86_feature; } -- cgit v1.2.3 From be482ff9500999f56093738f9219bbabc729d163 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 11 Mar 2024 08:57:03 -0700 Subject: x86/bhi: Enumerate Branch History Injection (BHI) bug Mitigation for BHI is selected based on the bug enumeration. Add bits needed to enumerate BHI bug. Signed-off-by: Pawan Gupta Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 4 ++++ arch/x86/kernel/cpu/common.c | 24 ++++++++++++++++-------- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 408509070429..8edd7a28869a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -517,4 +517,5 @@ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 24615e826998..e72c2b872957 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -166,6 +166,10 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_BHI_NO BIT(20) /* + * CPU is not affected by Branch + * History Injection. + */ #define ARCH_CAP_PBRSB_NO BIT(24) /* * Not susceptible to Post-Barrier * Return Stack Buffer Predictions. diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5c1e6d6be267..754d91857d63 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1120,6 +1120,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SPECTRE_V2 BIT(8) #define NO_MMIO BIT(9) #define NO_EIBRS_PBRSB BIT(10) +#define NO_BHI BIT(11) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) @@ -1182,18 +1183,18 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), /* Zhaoxin Family 7 */ - VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), - VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), {} }; @@ -1435,6 +1436,13 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (vulnerable_to_rfds(ia32_cap)) setup_force_cpu_bug(X86_BUG_RFDS); + /* When virtualized, eIBRS could be hidden, assume vulnerable */ + if (!(ia32_cap & ARCH_CAP_BHI_NO) && + !cpu_matches(cpu_vuln_whitelist, NO_BHI) && + (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || + boot_cpu_has(X86_FEATURE_HYPERVISOR))) + setup_force_cpu_bug(X86_BUG_BHI); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; -- cgit v1.2.3 From ec9404e40e8f36421a2b66ecb76dc2209fe7f3ef Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 11 Mar 2024 08:57:05 -0700 Subject: x86/bhi: Add BHI mitigation knob Branch history clearing software sequences and hardware control BHI_DIS_S were defined to mitigate Branch History Injection (BHI). Add cmdline spectre_bhi={on|off|auto} to control BHI mitigation: auto - Deploy the hardware mitigation BHI_DIS_S, if available. on - Deploy the hardware mitigation BHI_DIS_S, if available, otherwise deploy the software sequence at syscall entry and VMexit. off - Turn off BHI mitigation. The default is auto mode which does not deploy the software sequence mitigation. This is because of the hardening done in the syscall dispatch path, which is the likely target of BHI. Signed-off-by: Pawan Gupta Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- Documentation/admin-guide/hw-vuln/spectre.rst | 45 +++++++++++-- Documentation/admin-guide/kernel-parameters.txt | 11 +++ arch/x86/Kconfig | 25 +++++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/bugs.c | 90 ++++++++++++++++++++++++- 5 files changed, 165 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index cce768afec6b..7cb99b09827c 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -138,11 +138,10 @@ associated with the source address of the indirect branch. Specifically, the BHB might be shared across privilege levels even in the presence of Enhanced IBRS. -Currently the only known real-world BHB attack vector is via -unprivileged eBPF. Therefore, it's highly recommended to not enable -unprivileged eBPF, especially when eIBRS is used (without retpolines). -For a full mitigation against BHB attacks, it's recommended to use -retpolines (or eIBRS combined with retpolines). +Previously the only known real-world BHB attack vector was via unprivileged +eBPF. Further research has found attacks that don't require unprivileged eBPF. +For a full mitigation against BHB attacks it is recommended to set BHI_DIS_S or +use the BHB clearing sequence. Attack scenarios ---------------- @@ -430,6 +429,21 @@ The possible values in this file are: 'PBRSB-eIBRS: Not affected' CPU is not affected by PBRSB =========================== ======================================================= + - Branch History Injection (BHI) protection status: + +.. list-table:: + + * - BHI: Not affected + - System is not affected + * - BHI: Retpoline + - System is protected by retpoline + * - BHI: BHI_DIS_S + - System is protected by BHI_DIS_S + * - BHI: SW loop + - System is protected by software clearing sequence + * - BHI: Syscall hardening + - Syscalls are hardened against BHI + Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will report vulnerability. @@ -484,7 +498,11 @@ Spectre variant 2 Systems which support enhanced IBRS (eIBRS) enable IBRS protection once at boot, by setting the IBRS bit, and they're automatically protected against - Spectre v2 variant attacks. + some Spectre v2 variant attacks. The BHB can still influence the choice of + indirect branch predictor entry, and although branch predictor entries are + isolated between modes when eIBRS is enabled, the BHB itself is not isolated + between modes. Systems which support BHI_DIS_S will set it to protect against + BHI attacks. On Intel's enhanced IBRS systems, this includes cross-thread branch target injections on SMT systems (STIBP). In other words, Intel eIBRS enables @@ -638,6 +656,21 @@ kernel command line. spectre_v2=off. Spectre variant 1 mitigations cannot be disabled. + spectre_bhi= + + [X86] Control mitigation of Branch History Injection + (BHI) vulnerability. Syscalls are hardened against BHI + regardless of this setting. This setting affects the deployment + of the HW BHI control and the SW BHB clearing sequence. + + on + unconditionally enable. + off + unconditionally disable. + auto + enable if hardware mitigation + control(BHI_DIS_S) is available. + For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt Mitigation selection guide diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bb884c14b2f6..2dbe60c1db22 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6063,6 +6063,17 @@ sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst + spectre_bhi= [X86] Control mitigation of Branch History Injection + (BHI) vulnerability. Syscalls are hardened against BHI + reglardless of this setting. This setting affects the + deployment of the HW BHI control and the SW BHB + clearing sequence. + + on - unconditionally enable. + off - unconditionally disable. + auto - (default) enable only if hardware mitigation + control(BHI_DIS_S) is available. + spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. The default operation protects the kernel from diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4fff6ed46e90..29a7b69a1316 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2633,6 +2633,31 @@ config MITIGATION_RFDS stored in floating point, vector and integer registers. See also +choice + prompt "Clear branch history" + depends on CPU_SUP_INTEL + default SPECTRE_BHI_AUTO + help + Enable BHI mitigations. BHI attacks are a form of Spectre V2 attacks + where the branch history buffer is poisoned to speculatively steer + indirect branches. + See + +config SPECTRE_BHI_ON + bool "on" + help + Equivalent to setting spectre_bhi=on command line parameter. +config SPECTRE_BHI_OFF + bool "off" + help + Equivalent to setting spectre_bhi=off command line parameter. +config SPECTRE_BHI_AUTO + bool "auto" + help + Equivalent to setting spectre_bhi=auto command line parameter. + +endchoice + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8edd7a28869a..a2ee9a00e4a7 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -468,6 +468,7 @@ #define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ #define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ +#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ /* * BUG word(s) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e8c7146f9ed7..1ab275029888 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1607,6 +1607,74 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ dump_stack(); } +/* + * Set BHI_DIS_S to prevent indirect branches in kernel to be influenced by + * branch history in userspace. Not needed if BHI_NO is set. + */ +static bool __init spec_ctrl_bhi_dis(void) +{ + if (!boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW); + + return true; +} + +enum bhi_mitigations { + BHI_MITIGATION_OFF, + BHI_MITIGATION_ON, + BHI_MITIGATION_AUTO, +}; + +static enum bhi_mitigations bhi_mitigation __ro_after_init = + IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : + IS_ENABLED(CONFIG_SPECTRE_BHI_OFF) ? BHI_MITIGATION_OFF : + BHI_MITIGATION_AUTO; + +static int __init spectre_bhi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + bhi_mitigation = BHI_MITIGATION_OFF; + else if (!strcmp(str, "on")) + bhi_mitigation = BHI_MITIGATION_ON; + else if (!strcmp(str, "auto")) + bhi_mitigation = BHI_MITIGATION_AUTO; + else + pr_err("Ignoring unknown spectre_bhi option (%s)", str); + + return 0; +} +early_param("spectre_bhi", spectre_bhi_parse_cmdline); + +static void __init bhi_select_mitigation(void) +{ + if (bhi_mitigation == BHI_MITIGATION_OFF) + return; + + /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) + return; + + if (spec_ctrl_bhi_dis()) + return; + + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (bhi_mitigation == BHI_MITIGATION_AUTO) + return; + + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); + pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n"); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1718,6 +1786,9 @@ static void __init spectre_v2_select_mitigation(void) mode == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); + if (boot_cpu_has(X86_BUG_BHI)) + bhi_select_mitigation(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -2733,6 +2804,21 @@ static char *pbrsb_eibrs_state(void) } } +static const char * const spectre_bhi_state(void) +{ + if (!boot_cpu_has_bug(X86_BUG_BHI)) + return "; BHI: Not affected"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) + return "; BHI: BHI_DIS_S"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) + return "; BHI: SW loop"; + else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) + return "; BHI: Retpoline"; + + return "; BHI: Vulnerable (Syscall hardening enabled)"; +} + static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) @@ -2745,13 +2831,15 @@ static ssize_t spectre_v2_show_state(char *buf) spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", + return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "", stibp_state(), boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "", pbrsb_eibrs_state(), + spectre_bhi_state(), + /* this should always be at the end */ spectre_v2_module_string()); } -- cgit v1.2.3 From 95a6ccbdc7199a14b71ad8901cb788ba7fb5167b Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 11 Mar 2024 08:57:09 -0700 Subject: x86/bhi: Mitigate KVM by default BHI mitigation mode spectre_bhi=auto does not deploy the software mitigation by default. In a cloud environment, it is a likely scenario where userspace is trusted but the guests are not trusted. Deploying system wide mitigation in such cases is not desirable. Update the auto mode to unconditionally mitigate against malicious guests. Deploy the software sequence at VMexit in auto mode also, when hardware mitigation is not available. Unlike the force =on mode, software sequence is not deployed at syscalls in auto mode. Suggested-by: Alexandre Chartre Signed-off-by: Pawan Gupta Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- Documentation/admin-guide/hw-vuln/spectre.rst | 7 +++++-- Documentation/admin-guide/kernel-parameters.txt | 5 +++-- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/nospec-branch.h | 5 +++++ arch/x86/kernel/cpu/bugs.c | 9 ++++++++- arch/x86/kvm/vmx/vmenter.S | 2 +- 6 files changed, 23 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 7cb99b09827c..b70b1d8bd8e6 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -439,10 +439,12 @@ The possible values in this file are: - System is protected by retpoline * - BHI: BHI_DIS_S - System is protected by BHI_DIS_S - * - BHI: SW loop + * - BHI: SW loop; KVM SW loop - System is protected by software clearing sequence * - BHI: Syscall hardening - Syscalls are hardened against BHI + * - BHI: Syscall hardening; KVM: SW loop + - System is protected from userspace attacks by syscall hardening; KVM is protected by software clearing sequence Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will @@ -669,7 +671,8 @@ kernel command line. unconditionally disable. auto enable if hardware mitigation - control(BHI_DIS_S) is available. + control(BHI_DIS_S) is available, otherwise + enable alternate mitigation in KVM. For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2dbe60c1db22..4fa46302f436 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6071,8 +6071,9 @@ on - unconditionally enable. off - unconditionally disable. - auto - (default) enable only if hardware mitigation - control(BHI_DIS_S) is available. + auto - (default) enable hardware mitigation + (BHI_DIS_S) if available, otherwise enable + alternate mitigation in KVM. spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a2ee9a00e4a7..3c7434329661 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -469,6 +469,7 @@ #define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ +#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ /* * BUG word(s) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index aea40204278b..ff5f1ecc7d1e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -330,8 +330,13 @@ .macro CLEAR_BRANCH_HISTORY ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP .endm + +.macro CLEAR_BRANCH_HISTORY_VMEXIT + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT +.endm #else #define CLEAR_BRANCH_HISTORY +#define CLEAR_BRANCH_HISTORY_VMEXIT #endif #else /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1ab275029888..295463707e68 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1668,9 +1668,14 @@ static void __init bhi_select_mitigation(void) if (!IS_ENABLED(CONFIG_X86_64)) return; + /* Mitigate KVM by default */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n"); + if (bhi_mitigation == BHI_MITIGATION_AUTO) return; + /* Mitigate syscalls when the mitigation is forced =on */ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n"); } @@ -2811,10 +2816,12 @@ static const char * const spectre_bhi_state(void) else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) return "; BHI: BHI_DIS_S"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) - return "; BHI: SW loop"; + return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) return "; BHI: Retpoline"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + return "; BHI: Syscall hardening, KVM: SW loop"; return "; BHI: Vulnerable (Syscall hardening enabled)"; } diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 0f3593e10c57..f6986dee6f8c 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -275,7 +275,7 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) call vmx_spec_ctrl_restore_host - CLEAR_BRANCH_HISTORY + CLEAR_BRANCH_HISTORY_VMEXIT /* Put return value in AX */ mov %_ASM_BX, %_ASM_AX -- cgit v1.2.3 From ed2e8d49b54d677f3123668a21a57822d679651f Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Wed, 13 Mar 2024 09:49:17 -0700 Subject: KVM: x86: Add BHI_NO Intel processors that aren't vulnerable to BHI will set MSR_IA32_ARCH_CAPABILITIES[BHI_NO] = 1;. Guests may use this BHI_NO bit to determine if they need to implement BHI mitigations or not. Allow this bit to be passed to the guests. Signed-off-by: Daniel Sneddon Signed-off-by: Pawan Gupta Signed-off-by: Daniel Sneddon Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Josh Poimboeuf --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 47d9f03b7778..984ea2089efc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1621,7 +1621,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr) ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) static u64 kvm_get_arch_capabilities(void) { -- cgit v1.2.3 From 8db8f6ce556af60ca9a9fd5e826d369ded70fcc7 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Wed, 3 Apr 2024 18:50:03 +0530 Subject: scsi: ufs: qcom: Add missing interconnect bandwidth values for Gear 5 These entries are necessary to scale the interconnect bandwidth while operating in Gear 5. Cc: Amit Pundir Fixes: 03ce80a1bb86 ("scsi: ufs: qcom: Add support for scaling interconnects") Tested-by: Amit Pundir Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20240403-ufs-icc-fix-v2-1-958412a5eb45@linaro.org Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-qcom.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 06859e17b67b..7a00004bfd03 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -47,7 +47,7 @@ enum { TSTBUS_MAX, }; -#define QCOM_UFS_MAX_GEAR 4 +#define QCOM_UFS_MAX_GEAR 5 #define QCOM_UFS_MAX_LANE 2 enum { @@ -67,26 +67,32 @@ static const struct __ufs_qcom_bw_table { [MODE_PWM][UFS_PWM_G2][UFS_LANE_1] = { 1844, 1000 }, [MODE_PWM][UFS_PWM_G3][UFS_LANE_1] = { 3688, 1000 }, [MODE_PWM][UFS_PWM_G4][UFS_LANE_1] = { 7376, 1000 }, + [MODE_PWM][UFS_PWM_G5][UFS_LANE_1] = { 14752, 1000 }, [MODE_PWM][UFS_PWM_G1][UFS_LANE_2] = { 1844, 1000 }, [MODE_PWM][UFS_PWM_G2][UFS_LANE_2] = { 3688, 1000 }, [MODE_PWM][UFS_PWM_G3][UFS_LANE_2] = { 7376, 1000 }, [MODE_PWM][UFS_PWM_G4][UFS_LANE_2] = { 14752, 1000 }, + [MODE_PWM][UFS_PWM_G5][UFS_LANE_2] = { 29504, 1000 }, [MODE_HS_RA][UFS_HS_G1][UFS_LANE_1] = { 127796, 1000 }, [MODE_HS_RA][UFS_HS_G2][UFS_LANE_1] = { 255591, 1000 }, [MODE_HS_RA][UFS_HS_G3][UFS_LANE_1] = { 1492582, 102400 }, [MODE_HS_RA][UFS_HS_G4][UFS_LANE_1] = { 2915200, 204800 }, + [MODE_HS_RA][UFS_HS_G5][UFS_LANE_1] = { 5836800, 409600 }, [MODE_HS_RA][UFS_HS_G1][UFS_LANE_2] = { 255591, 1000 }, [MODE_HS_RA][UFS_HS_G2][UFS_LANE_2] = { 511181, 1000 }, [MODE_HS_RA][UFS_HS_G3][UFS_LANE_2] = { 1492582, 204800 }, [MODE_HS_RA][UFS_HS_G4][UFS_LANE_2] = { 2915200, 409600 }, + [MODE_HS_RA][UFS_HS_G5][UFS_LANE_2] = { 5836800, 819200 }, [MODE_HS_RB][UFS_HS_G1][UFS_LANE_1] = { 149422, 1000 }, [MODE_HS_RB][UFS_HS_G2][UFS_LANE_1] = { 298189, 1000 }, [MODE_HS_RB][UFS_HS_G3][UFS_LANE_1] = { 1492582, 102400 }, [MODE_HS_RB][UFS_HS_G4][UFS_LANE_1] = { 2915200, 204800 }, + [MODE_HS_RB][UFS_HS_G5][UFS_LANE_1] = { 5836800, 409600 }, [MODE_HS_RB][UFS_HS_G1][UFS_LANE_2] = { 298189, 1000 }, [MODE_HS_RB][UFS_HS_G2][UFS_LANE_2] = { 596378, 1000 }, [MODE_HS_RB][UFS_HS_G3][UFS_LANE_2] = { 1492582, 204800 }, [MODE_HS_RB][UFS_HS_G4][UFS_LANE_2] = { 2915200, 409600 }, + [MODE_HS_RB][UFS_HS_G5][UFS_LANE_2] = { 5836800, 819200 }, [MODE_MAX][0][0] = { 7643136, 307200 }, }; -- cgit v1.2.3 From 18f06e97692516d28c3cdc577fb5c501d690b303 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Mar 2024 17:15:40 -0700 Subject: KVM: Add helpers to consolidate gfn_to_pfn_cache's page split check Add a helper to check that the incoming length for a gfn_to_pfn_cache is valid with respect to the cache's GPA and/or HVA. To avoid activating a cache with a bogus GPA, a future fix will fork the page split check in the inner refresh path into activate() and the public rerfresh() APIs, at which point KVM will check the length in three separate places. Deliberately keep the "page offset" logic open coded, as the only other path that consumes the offset, __kvm_gpc_refresh(), already needs to differentiate between GPA-based and HVA-based caches, and it's not obvious that using a helper is a net positive in overall code readability. Note, for GPA-based caches, this has a subtle side effect of using the GPA instead of the resolved HVA in the check() path, but that should be a nop as the HVA offset is derived from the GPA, i.e. the two offsets are identical, barring a KVM bug. Reviewed-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240320001542.3203871-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 4e07112a24c2..8f2121b5f2a0 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -57,6 +57,19 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, spin_unlock(&kvm->gpc_lock); } +static bool kvm_gpc_is_valid_len(gpa_t gpa, unsigned long uhva, + unsigned long len) +{ + unsigned long offset = kvm_is_error_gpa(gpa) ? offset_in_page(uhva) : + offset_in_page(gpa); + + /* + * The cached access must fit within a single page. The 'len' argument + * to activate() and refresh() exists only to enforce that. + */ + return offset + len <= PAGE_SIZE; +} + bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); @@ -74,7 +87,7 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) if (kvm_is_error_hva(gpc->uhva)) return false; - if (offset_in_page(gpc->uhva) + len > PAGE_SIZE) + if (!kvm_gpc_is_valid_len(gpc->gpa, gpc->uhva, len)) return false; if (!gpc->valid) @@ -247,13 +260,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l if (WARN_ON_ONCE(kvm_is_error_gpa(gpa) == kvm_is_error_hva(uhva))) return -EINVAL; - /* - * The cached acces must fit within a single page. The 'len' argument - * exists only to enforce that. - */ - page_offset = kvm_is_error_gpa(gpa) ? offset_in_page(uhva) : - offset_in_page(gpa); - if (page_offset + len > PAGE_SIZE) + if (!kvm_gpc_is_valid_len(gpa, uhva, len)) return -EINVAL; lockdep_assert_held(&gpc->refresh_lock); @@ -270,6 +277,8 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l old_uhva = PAGE_ALIGN_DOWN(gpc->uhva); if (kvm_is_error_gpa(gpa)) { + page_offset = offset_in_page(uhva); + gpc->gpa = INVALID_GPA; gpc->memslot = NULL; gpc->uhva = PAGE_ALIGN_DOWN(uhva); @@ -279,6 +288,8 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l } else { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); + page_offset = offset_in_page(gpa); + if (gpc->gpa != gpa || gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) { gfn_t gfn = gpa_to_gfn(gpa); -- cgit v1.2.3 From 5c9ca4ed890889a2b7c300c4f63f3baf3f63383f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Mar 2024 17:15:41 -0700 Subject: KVM: Check validity of offset+length of gfn_to_pfn_cache prior to activation When activating a gfn_to_pfn_cache, verify that the offset+length is sane and usable before marking the cache active. Letting __kvm_gpc_refresh() detect the problem results in a cache being marked active without setting the GPA (or any other fields), which in turn results in KVM trying to refresh a cache with INVALID_GPA. Attempting to refresh a cache with INVALID_GPA isn't functionally problematic, but it runs afoul of the sanity check that exactly one of GPA or userspace HVA is valid, i.e. that a cache is either GPA-based or HVA-based. Reported-by: syzbot+106a4f72b0474e1d1b33@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/0000000000005fa5cc0613f1cebd@google.com Fixes: 721f5b0dda78 ("KVM: pfncache: allow a cache to be activated with a fixed (userspace) HVA") Cc: David Woodhouse Cc: Paul Durrant Reviewed-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240320001542.3203871-3-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 8f2121b5f2a0..91b0e329006b 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -245,8 +245,7 @@ out_error: return -EFAULT; } -static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva, - unsigned long len) +static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva) { unsigned long page_offset; bool unmap_old = false; @@ -260,9 +259,6 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l if (WARN_ON_ONCE(kvm_is_error_gpa(gpa) == kvm_is_error_hva(uhva))) return -EINVAL; - if (!kvm_gpc_is_valid_len(gpa, uhva, len)) - return -EINVAL; - lockdep_assert_held(&gpc->refresh_lock); write_lock_irq(&gpc->lock); @@ -365,6 +361,9 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) guard(mutex)(&gpc->refresh_lock); + if (!kvm_gpc_is_valid_len(gpc->gpa, gpc->uhva, len)) + return -EINVAL; + /* * If the GPA is valid then ignore the HVA, as a cache can be GPA-based * or HVA-based, not both. For GPA-based caches, the HVA will be @@ -372,7 +371,7 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) */ uhva = kvm_is_error_gpa(gpc->gpa) ? gpc->uhva : KVM_HVA_ERR_BAD; - return __kvm_gpc_refresh(gpc, gpc->gpa, uhva, len); + return __kvm_gpc_refresh(gpc, gpc->gpa, uhva); } void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm) @@ -392,6 +391,9 @@ static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned { struct kvm *kvm = gpc->kvm; + if (!kvm_gpc_is_valid_len(gpa, uhva, len)) + return -EINVAL; + guard(mutex)(&gpc->refresh_lock); if (!gpc->active) { @@ -411,7 +413,7 @@ static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned gpc->active = true; write_unlock_irq(&gpc->lock); } - return __kvm_gpc_refresh(gpc, gpa, uhva, len); + return __kvm_gpc_refresh(gpc, gpa, uhva); } int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) -- cgit v1.2.3 From fc62a4e8dee2d1a9037e8cdeaa52ba67457f7300 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 19 Mar 2024 17:15:42 -0700 Subject: KVM: Explicitly disallow activatating a gfn_to_pfn_cache with INVALID_GPA Explicit disallow activating a gfn_to_pfn_cache with an error gpa, i.e. INVALID_GPA, to ensure that KVM doesn't mistake a GPA-based cache for an HVA-based cache (KVM uses INVALID_GPA as a magic value to differentiate between GPA-based and HVA-based caches). WARN if KVM attempts to activate a cache with INVALID_GPA, purely so that new caches need to at least consider what to do with a "bad" GPA, as all existing usage of kvm_gpc_activate() guarantees gpa != INVALID_GPA. I.e. removing the WARN in the future is completely reasonable if doing so would yield cleaner/better code overall. Reviewed-by: David Woodhouse Reviewed-by: Paul Durrant Link: https://lore.kernel.org/r/20240320001542.3203871-4-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 91b0e329006b..f618719644e0 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -418,6 +418,13 @@ static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) { + /* + * Explicitly disallow INVALID_GPA so that the magic value can be used + * by KVM to differentiate between GPA-based and HVA-based caches. + */ + if (WARN_ON_ONCE(kvm_is_error_gpa(gpa))) + return -EINVAL; + return __kvm_gpc_activate(gpc, gpa, KVM_HVA_ERR_BAD, len); } -- cgit v1.2.3 From 9e985cbf2942a1bb8fcef9adc2a17d90fd7ca8ee Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Mar 2024 16:58:33 -0800 Subject: KVM: x86/pmu: Disable support for adaptive PEBS Drop support for virtualizing adaptive PEBS, as KVM's implementation is architecturally broken without an obvious/easy path forward, and because exposing adaptive PEBS can leak host LBRs to the guest, i.e. can leak host kernel addresses to the guest. Bug #1 is that KVM doesn't account for the upper 32 bits of IA32_FIXED_CTR_CTRL when (re)programming fixed counters, e.g fixed_ctrl_field() drops the upper bits, reprogram_fixed_counters() stores local variables as u8s and truncates the upper bits too, etc. Bug #2 is that, because KVM _always_ sets precise_ip to a non-zero value for PEBS events, perf will _always_ generate an adaptive record, even if the guest requested a basic record. Note, KVM will also enable adaptive PEBS in individual *counter*, even if adaptive PEBS isn't exposed to the guest, but this is benign as MSR_PEBS_DATA_CFG is guaranteed to be zero, i.e. the guest will only ever see Basic records. Bug #3 is in perf. intel_pmu_disable_fixed() doesn't clear the upper bits either, i.e. leaves ICL_FIXED_0_ADAPTIVE set, and intel_pmu_enable_fixed() effectively doesn't clear ICL_FIXED_0_ADAPTIVE either. I.e. perf _always_ enables ADAPTIVE counters, regardless of what KVM requests. Bug #4 is that adaptive PEBS *might* effectively bypass event filters set by the host, as "Updated Memory Access Info Group" records information that might be disallowed by userspace via KVM_SET_PMU_EVENT_FILTER. Bug #5 is that KVM doesn't ensure LBR MSRs hold guest values (or at least zeros) when entering a vCPU with adaptive PEBS, which allows the guest to read host LBRs, i.e. host RIPs/addresses, by enabling "LBR Entries" records. Disable adaptive PEBS support as an immediate fix due to the severity of the LBR leak in particular, and because fixing all of the bugs will be non-trivial, e.g. not suitable for backporting to stable kernels. Note! This will break live migration, but trying to make KVM play nice with live migration would be quite complicated, wouldn't be guaranteed to work (i.e. KVM might still kill/confuse the guest), and it's not clear that there are any publicly available VMMs that support adaptive PEBS, let alone live migrate VMs that support adaptive PEBS, e.g. QEMU doesn't support PEBS in any capacity. Link: https://lore.kernel.org/all/20240306230153.786365-1-seanjc@google.com Link: https://lore.kernel.org/all/ZeepGjHCeSfadANM@google.com Fixes: c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") Cc: stable@vger.kernel.org Cc: Like Xu Cc: Mingwei Zhang Cc: Zhenyu Wang Cc: Zhang Xiong Cc: Lv Zhiyuan Cc: Dapeng Mi Cc: Jim Mattson Acked-by: Like Xu Link: https://lore.kernel.org/r/20240307005833.827147-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c37a89eda90f..e3315bda6237 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7882,8 +7882,28 @@ static u64 vmx_get_perf_capabilities(void) if (vmx_pebs_supported()) { perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; - if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) - perf_cap &= ~PERF_CAP_PEBS_BASELINE; + + /* + * Disallow adaptive PEBS as it is functionally broken, can be + * used by the guest to read *host* LBRs, and can be used to + * bypass userspace event filters. To correctly and safely + * support adaptive PEBS, KVM needs to: + * + * 1. Account for the ADAPTIVE flag when (re)programming fixed + * counters. + * + * 2. Gain support from perf (or take direct control of counter + * programming) to support events without adaptive PEBS + * enabled for the hardware counter. + * + * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with + * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. + * + * 4. Document which PMU events are effectively exposed to the + * guest via adaptive PEBS, and make adaptive PEBS mutually + * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. + */ + perf_cap &= ~PERF_CAP_PEBS_BASELINE; } return perf_cap; -- cgit v1.2.3 From 992b54bd083c5bee24ff7cc35991388ab08598c4 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 14 Mar 2024 14:29:02 -0700 Subject: KVM: x86/mmu: x86: Don't overflow lpage_info when checking attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix KVM_SET_MEMORY_ATTRIBUTES to not overflow lpage_info array and trigger KASAN splat, as seen in the private_mem_conversions_test selftest. When memory attributes are set on a GFN range, that range will have specific properties applied to the TDP. A huge page cannot be used when the attributes are inconsistent, so they are disabled for those the specific huge pages. For internal KVM reasons, huge pages are also not allowed to span adjacent memslots regardless of whether the backing memory could be mapped as huge. What GFNs support which huge page sizes is tracked by an array of arrays 'lpage_info' on the memslot, of ‘kvm_lpage_info’ structs. Each index of lpage_info contains a vmalloc allocated array of these for a specific supported page size. The kvm_lpage_info denotes whether a specific huge page (GFN and page size) on the memslot is supported. These arrays include indices for unaligned head and tail huge pages. Preventing huge pages from spanning adjacent memslot is covered by incrementing the count in head and tail kvm_lpage_info when the memslot is allocated, but disallowing huge pages for memory that has mixed attributes has to be done in a more complicated way. During the KVM_SET_MEMORY_ATTRIBUTES ioctl KVM updates lpage_info for each memslot in the range that has mismatched attributes. KVM does this a memslot at a time, and marks a special bit, KVM_LPAGE_MIXED_FLAG, in the kvm_lpage_info for any huge page. This bit is essentially a permanently elevated count. So huge pages will not be mapped for the GFN at that page size if the count is elevated in either case: a huge head or tail page unaligned to the memslot or if KVM_LPAGE_MIXED_FLAG is set because it has mixed attributes. To determine whether a huge page has consistent attributes, the KVM_SET_MEMORY_ATTRIBUTES operation checks an xarray to make sure it consistently has the incoming attribute. Since level - 1 huge pages are aligned to level huge pages, it employs an optimization. As long as the level - 1 huge pages are checked first, it can just check these and assume that if each level - 1 huge page contained within the level sized huge page is not mixed, then the level size huge page is not mixed. This optimization happens in the helper hugepage_has_attrs(). Unfortunately, although the kvm_lpage_info array representing page size 'level' will contain an entry for an unaligned tail page of size level, the array for level - 1 will not contain an entry for each GFN at page size level. The level - 1 array will only contain an index for any unaligned region covered by level - 1 huge page size, which can be a smaller region. So this causes the optimization to overflow the level - 1 kvm_lpage_info and perform a vmalloc out of bounds read. In some cases of head and tail pages where an overflow could happen, callers skip the operation completely as KVM_LPAGE_MIXED_FLAG is not required to prevent huge pages as discussed earlier. But for memslots that are smaller than the 1GB page size, it does call hugepage_has_attrs(). In this case the huge page is both the head and tail page. The issue can be observed simply by compiling the kernel with CONFIG_KASAN_VMALLOC and running the selftest “private_mem_conversions_test”, which produces the output like the following: BUG: KASAN: vmalloc-out-of-bounds in hugepage_has_attrs+0x7e/0x110 Read of size 4 at addr ffffc900000a3008 by task private_mem_con/169 Call Trace: dump_stack_lvl print_report ? __virt_addr_valid ? hugepage_has_attrs ? hugepage_has_attrs kasan_report ? hugepage_has_attrs hugepage_has_attrs kvm_arch_post_set_memory_attributes kvm_vm_ioctl It is a little ambiguous whether the unaligned head page (in the bug case also the tail page) should be expected to have KVM_LPAGE_MIXED_FLAG set. It is not functionally required, as the unaligned head/tail pages will already have their kvm_lpage_info count incremented. The comments imply not setting it on unaligned head pages is intentional, so fix the callers to skip trying to set KVM_LPAGE_MIXED_FLAG in this case, and in doing so not call hugepage_has_attrs(). Cc: stable@vger.kernel.org Fixes: 90b4fe17981e ("KVM: x86: Disallow hugepages when memory attributes are mixed") Signed-off-by: Rick Edgecombe Reviewed-by: Kai Huang Reviewed-by: Chao Peng Link: https://lore.kernel.org/r/20240314212902.2762507-1-rick.p.edgecombe@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 992e651540e8..18a404b5923f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7399,7 +7399,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, * by the memslot, KVM can't use a hugepage due to the * misaligned address regardless of memory attributes. */ - if (gfn >= slot->base_gfn) { + if (gfn >= slot->base_gfn && + gfn + nr_pages <= slot->base_gfn + slot->npages) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else -- cgit v1.2.3 From de120e1d692d73c7eefa3278837b1eb68f90728a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Mar 2024 17:36:40 -0800 Subject: KVM: x86/pmu: Set enable bits for GP counters in PERF_GLOBAL_CTRL at "RESET" Set the enable bits for general purpose counters in IA32_PERF_GLOBAL_CTRL when refreshing the PMU to emulate the MSR's architecturally defined post-RESET behavior. Per Intel's SDM: IA32_PERF_GLOBAL_CTRL: Sets bits n-1:0 and clears the upper bits. and Where "n" is the number of general-purpose counters available in the processor. AMD also documents this behavior for PerfMonV2 CPUs in one of AMD's many PPRs. Do not set any PERF_GLOBAL_CTRL bits if there are no general purpose counters, although a literal reading of the SDM would require the CPU to set either bits 63:0 or 31:0. The intent of the behavior is to globally enable all GP counters; honor the intent, if not the letter of the law. Leaving PERF_GLOBAL_CTRL '0' effectively breaks PMU usage in guests that haven't been updated to work with PMUs that support PERF_GLOBAL_CTRL. This bug was recently exposed when KVM added supported for AMD's PerfMonV2, i.e. when KVM started exposing a vPMU with PERF_GLOBAL_CTRL to guest software that only knew how to program v1 PMUs (that don't support PERF_GLOBAL_CTRL). Failure to emulate the post-RESET behavior results in such guests unknowingly leaving all general purpose counters globally disabled (the entire reason the post-RESET value sets the GP counter enable bits is to maintain backwards compatibility). The bug has likely gone unnoticed because PERF_GLOBAL_CTRL has been supported on Intel CPUs for as long as KVM has existed, i.e. hardly anyone is running guest software that isn't aware of PERF_GLOBAL_CTRL on Intel PMUs. And because up until v6.0, KVM _did_ emulate the behavior for Intel CPUs, although the old behavior was likely dumb luck. Because (a) that old code was also broken in its own way (the history of this code is a comedy of errors), and (b) PERF_GLOBAL_CTRL was documented as having a value of '0' post-RESET in all SDMs before March 2023. Initial vPMU support in commit f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests") *almost* got it right (again likely by dumb luck), but for some reason only set the bits if the guest PMU was advertised as v1: if (pmu->version == 1) { pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1; return; } Commit f19a0c2c2e6a ("KVM: PMU emulation: GLOBAL_CTRL MSR should be enabled on reset") then tried to remedy that goof, presumably because guest PMUs were leaving PERF_GLOBAL_CTRL '0', i.e. weren't enabling counters. pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; That was KVM's behavior up until commit c49467a45fe0 ("KVM: x86/pmu: Don't overwrite the pmu->global_ctrl when refreshing") removed *everything*. However, it did so based on the behavior defined by the SDM , which at the time stated that "Global Perf Counter Controls" is '0' at Power-Up and RESET. But then the March 2023 SDM (325462-079US), stealthily changed its "IA-32 and Intel 64 Processor States Following Power-up, Reset, or INIT" table to say: IA32_PERF_GLOBAL_CTRL: Sets bits n-1:0 and clears the upper bits. Note, kvm_pmu_refresh() can be invoked multiple times, i.e. it's not a "pure" RESET flow. But it can only be called prior to the first KVM_RUN, i.e. the guest will only ever observe the final value. Note #2, KVM has always cleared global_ctrl during refresh (see commit f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests")), i.e. there is no danger of breaking existing setups by clobbering a value set by userspace. Reported-by: Babu Moger Cc: Sandipan Das Cc: Like Xu Cc: Mingwei Zhang Cc: Dapeng Mi Cc: stable@vger.kernel.org Reviewed-by: Dapeng Mi Tested-by: Dapeng Mi Link: https://lore.kernel.org/r/20240309013641.1413400-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c397b28e3d1b..a593b03c9aed 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -775,8 +775,20 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_data_cfg_mask = ~0ull; bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); - if (vcpu->kvm->arch.enable_pmu) - static_call(kvm_x86_pmu_refresh)(vcpu); + if (!vcpu->kvm->arch.enable_pmu) + return; + + static_call(kvm_x86_pmu_refresh)(vcpu); + + /* + * At RESET, both Intel and AMD CPUs set all enable bits for general + * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that + * was written for v1 PMUs don't unknowingly leave GP counters disabled + * in the global controls). Emulate that behavior when refreshing the + * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. + */ + if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) + pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); } void kvm_pmu_init(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 08a828249b16f69248652937be7f1b8dab340a22 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Mar 2024 17:36:41 -0800 Subject: KVM: selftests: Verify post-RESET value of PERF_GLOBAL_CTRL in PMCs test Add a guest assert in the PMU counters test to verify that KVM stuffs the vCPU's post-RESET value to globally enable all general purpose counters. Per Intel's SDM, IA32_PERF_GLOBAL_CTRL: Sets bits n-1:0 and clears the upper bits. and Where "n" is the number of general-purpose counters available in the processor. For the edge case where there are zero GP counters, follow the spirit of the architecture, not the SDM's literal wording, which doesn't account for this possibility and would require the CPU to set _all_ bits in PERF_GLOBAL_CTRL. Reviewed-by: Dapeng Mi Tested-by: Dapeng Mi Link: https://lore.kernel.org/r/20240309013641.1413400-3-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/x86_64/pmu_counters_test.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c index 29609b52f8fa..26c85815f7e9 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c @@ -416,12 +416,30 @@ static void guest_rd_wr_counters(uint32_t base_msr, uint8_t nr_possible_counters static void guest_test_gp_counters(void) { + uint8_t pmu_version = guest_get_pmu_version(); uint8_t nr_gp_counters = 0; uint32_t base_msr; - if (guest_get_pmu_version()) + if (pmu_version) nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); + /* + * For v2+ PMUs, PERF_GLOBAL_CTRL's architectural post-RESET value is + * "Sets bits n-1:0 and clears the upper bits", where 'n' is the number + * of GP counters. If there are no GP counters, require KVM to leave + * PERF_GLOBAL_CTRL '0'. This edge case isn't covered by the SDM, but + * follow the spirit of the architecture and only globally enable GP + * counters, of which there are none. + */ + if (pmu_version > 1) { + uint64_t global_ctrl = rdmsr(MSR_CORE_PERF_GLOBAL_CTRL); + + if (nr_gp_counters) + GUEST_ASSERT_EQ(global_ctrl, GENMASK_ULL(nr_gp_counters - 1, 0)); + else + GUEST_ASSERT_EQ(global_ctrl, 0); + } + if (this_cpu_has(X86_FEATURE_PDCM) && rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES) base_msr = MSR_IA32_PMC0; -- cgit v1.2.3 From 0ef2dd1f4144bc024d3c98954fa061a102f6481b Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Fri, 15 Mar 2024 10:35:07 -0400 Subject: KVM: selftests: fix max_guest_memory_test with more that 256 vCPUs max_guest_memory_test uses ucalls to sync with the host, but it also resets the guest RIP back to its initial value in between tests stages. This makes the guest never reach the code which frees the ucall struct and since a fixed pool of 512 ucall structs is used, the test starts to fail when more that 256 vCPUs are used. Fix that by replacing the manual register reset with a loop in the guest code. Signed-off-by: Maxim Levitsky Link: https://lore.kernel.org/r/20240315143507.102629-1-mlevitsk@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/max_guest_memory_test.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c index 6628dc4dda89..1a6da7389bf1 100644 --- a/tools/testing/selftests/kvm/max_guest_memory_test.c +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -22,10 +22,11 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) { uint64_t gpa; - for (gpa = start_gpa; gpa < end_gpa; gpa += stride) - *((volatile uint64_t *)gpa) = gpa; - - GUEST_DONE(); + for (;;) { + for (gpa = start_gpa; gpa < end_gpa; gpa += stride) + *((volatile uint64_t *)gpa) = gpa; + GUEST_SYNC(0); + } } struct vcpu_info { @@ -55,7 +56,7 @@ static void rendezvous_with_boss(void) static void run_vcpu(struct kvm_vcpu *vcpu) { vcpu_run(vcpu); - TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); } static void *vcpu_worker(void *data) @@ -64,17 +65,13 @@ static void *vcpu_worker(void *data) struct kvm_vcpu *vcpu = info->vcpu; struct kvm_vm *vm = vcpu->vm; struct kvm_sregs sregs; - struct kvm_regs regs; vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size); - /* Snapshot regs before the first run. */ - vcpu_regs_get(vcpu, ®s); rendezvous_with_boss(); run_vcpu(vcpu); rendezvous_with_boss(); - vcpu_regs_set(vcpu, ®s); vcpu_sregs_get(vcpu, &sregs); #ifdef __x86_64__ /* Toggle CR0.WP to trigger a MMU context reset. */ -- cgit v1.2.3 From 449c0811d8729068646e1a270be72bf2da2e66f3 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 3 Apr 2024 14:33:01 +0200 Subject: KVM: selftests: fix supported_flags for riscv commit 849c1816436f ("KVM: selftests: fix supported_flags for aarch64") fixed the set-memory-region test for aarch64 by declaring the read-only flag is supported. riscv also supports the read-only flag. Fix it too. Signed-off-by: Andrew Jones Link: https://lore.kernel.org/r/20240403123300.63923-2-ajones@ventanamicro.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/set_memory_region_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 06b43ed23580..bd57d991e27d 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -333,7 +333,7 @@ static void test_invalid_memory_region_flags(void) struct kvm_vm *vm; int r, i; -#if defined __aarch64__ || defined __x86_64__ +#if defined __aarch64__ || defined __riscv || defined __x86_64__ supported_flags |= KVM_MEM_READONLY; #endif -- cgit v1.2.3 From 7f2817ef52a1cc3ee0ace35eb8df7a39bd4fc9b7 Mon Sep 17 00:00:00 2001 From: Tao Su Date: Tue, 19 Mar 2024 11:11:11 +0800 Subject: KVM: VMX: Ignore MKTME KeyID bits when intercepting #PF for allow_smaller_maxphyaddr Use the raw/true host.MAXPHYADDR when deciding whether or not KVM must intercept #PFs when allow_smaller_maxphyaddr is enabled, as any adjustments the kernel makes to boot_cpu_data.x86_phys_bits to account for MKTME KeyID bits do not apply to the guest physical address space. I.e. the KeyID are off-limits for host physical addresses, but are not reserved for GPAs as far as hardware is concerned. Signed-off-by: Tao Su Link: https://lore.kernel.org/r/20240319031111.495006-1-tao1.su@linux.intel.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 65786dbe7d60..1742c88ba9c8 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -15,6 +15,7 @@ #include "vmx_ops.h" #include "../cpuid.h" #include "run_flags.h" +#include "../mmu.h" #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 @@ -719,7 +720,8 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) if (!enable_ept) return true; - return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; + return allow_smaller_maxphyaddr && + cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits(); } static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From ca91259b775f6fd98ae5d23bb4eec101d468ba8d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 25 Mar 2024 15:44:17 -0700 Subject: scsi: core: Fix handling of SCMD_FAIL_IF_RECOVERING There is code in the SCSI core that sets the SCMD_FAIL_IF_RECOVERING flag but there is no code that clears this flag. Instead of only clearing SCMD_INITIALIZED in scsi_end_request(), clear all flags. It is never necessary to preserve any command flags inside scsi_end_request(). Cc: stable@vger.kernel.org Fixes: 310bcaef6d7e ("scsi: core: Support failing requests while recovering") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240325224417.1477135-1-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 2e28e2360c85..5b3230ef51fe 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -635,10 +635,9 @@ static bool scsi_end_request(struct request *req, blk_status_t error, if (blk_queue_add_random(q)) add_disk_randomness(req->q->disk); - if (!blk_rq_is_passthrough(req)) { - WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED)); - cmd->flags &= ~SCMD_INITIALIZED; - } + WARN_ON_ONCE(!blk_rq_is_passthrough(req) && + !(cmd->flags & SCMD_INITIALIZED)); + cmd->flags = 0; /* * Calling rcu_barrier() is not necessary here because the -- cgit v1.2.3 From f96f700449b6d190e06272f1cf732ae8e45b73df Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 5 Apr 2024 22:30:39 +0200 Subject: net: ks8851: Inline ks8851_rx_skb() Both ks8851_rx_skb_par() and ks8851_rx_skb_spi() call netif_rx(skb), inline the netif_rx(skb) call directly into ks8851_common.c and drop the .rx_skb callback and ks8851_rx_skb() wrapper. This removes one indirect call from the driver, no functional change otherwise. Signed-off-by: Marek Vasut Link: https://lore.kernel.org/r/20240405203204.82062-1-marex@denx.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/micrel/ks8851.h | 3 --- drivers/net/ethernet/micrel/ks8851_common.c | 12 +----------- drivers/net/ethernet/micrel/ks8851_par.c | 11 ----------- drivers/net/ethernet/micrel/ks8851_spi.c | 11 ----------- 4 files changed, 1 insertion(+), 36 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851.h b/drivers/net/ethernet/micrel/ks8851.h index e5ec0a363aff..31f75b4a67fd 100644 --- a/drivers/net/ethernet/micrel/ks8851.h +++ b/drivers/net/ethernet/micrel/ks8851.h @@ -368,7 +368,6 @@ union ks8851_tx_hdr { * @rdfifo: FIFO read callback * @wrfifo: FIFO write callback * @start_xmit: start_xmit() implementation callback - * @rx_skb: rx_skb() implementation callback * @flush_tx_work: flush_tx_work() implementation callback * * The @statelock is used to protect information in the structure which may @@ -423,8 +422,6 @@ struct ks8851_net { struct sk_buff *txp, bool irq); netdev_tx_t (*start_xmit)(struct sk_buff *skb, struct net_device *dev); - void (*rx_skb)(struct ks8851_net *ks, - struct sk_buff *skb); void (*flush_tx_work)(struct ks8851_net *ks); }; diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c index 0bf13b38b8f5..896d43bb8883 100644 --- a/drivers/net/ethernet/micrel/ks8851_common.c +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -231,16 +231,6 @@ static void ks8851_dbg_dumpkkt(struct ks8851_net *ks, u8 *rxpkt) rxpkt[12], rxpkt[13], rxpkt[14], rxpkt[15]); } -/** - * ks8851_rx_skb - receive skbuff - * @ks: The device state. - * @skb: The skbuff - */ -static void ks8851_rx_skb(struct ks8851_net *ks, struct sk_buff *skb) -{ - ks->rx_skb(ks, skb); -} - /** * ks8851_rx_pkts - receive packets from the host * @ks: The device information. @@ -309,7 +299,7 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) ks8851_dbg_dumpkkt(ks, rxpkt); skb->protocol = eth_type_trans(skb, ks->netdev); - ks8851_rx_skb(ks, skb); + netif_rx(skb); ks->netdev->stats.rx_packets++; ks->netdev->stats.rx_bytes += rxlen; diff --git a/drivers/net/ethernet/micrel/ks8851_par.c b/drivers/net/ethernet/micrel/ks8851_par.c index 2a7f29854267..381b9cd285eb 100644 --- a/drivers/net/ethernet/micrel/ks8851_par.c +++ b/drivers/net/ethernet/micrel/ks8851_par.c @@ -210,16 +210,6 @@ static void ks8851_wrfifo_par(struct ks8851_net *ks, struct sk_buff *txp, iowrite16_rep(ksp->hw_addr, txp->data, len / 2); } -/** - * ks8851_rx_skb_par - receive skbuff - * @ks: The device state. - * @skb: The skbuff - */ -static void ks8851_rx_skb_par(struct ks8851_net *ks, struct sk_buff *skb) -{ - netif_rx(skb); -} - static unsigned int ks8851_rdreg16_par_txqcr(struct ks8851_net *ks) { return ks8851_rdreg16_par(ks, KS_TXQCR); @@ -298,7 +288,6 @@ static int ks8851_probe_par(struct platform_device *pdev) ks->rdfifo = ks8851_rdfifo_par; ks->wrfifo = ks8851_wrfifo_par; ks->start_xmit = ks8851_start_xmit_par; - ks->rx_skb = ks8851_rx_skb_par; #define STD_IRQ (IRQ_LCI | /* Link Change */ \ IRQ_RXI | /* RX done */ \ diff --git a/drivers/net/ethernet/micrel/ks8851_spi.c b/drivers/net/ethernet/micrel/ks8851_spi.c index 2f803377c9f9..670c1de966db 100644 --- a/drivers/net/ethernet/micrel/ks8851_spi.c +++ b/drivers/net/ethernet/micrel/ks8851_spi.c @@ -298,16 +298,6 @@ static unsigned int calc_txlen(unsigned int len) return ALIGN(len + 4, 4); } -/** - * ks8851_rx_skb_spi - receive skbuff - * @ks: The device state - * @skb: The skbuff - */ -static void ks8851_rx_skb_spi(struct ks8851_net *ks, struct sk_buff *skb) -{ - netif_rx(skb); -} - /** * ks8851_tx_work - process tx packet(s) * @work: The work strucutre what was scheduled. @@ -435,7 +425,6 @@ static int ks8851_probe_spi(struct spi_device *spi) ks->rdfifo = ks8851_rdfifo_spi; ks->wrfifo = ks8851_wrfifo_spi; ks->start_xmit = ks8851_start_xmit_spi; - ks->rx_skb = ks8851_rx_skb_spi; ks->flush_tx_work = ks8851_flush_tx_work_spi; #define STD_IRQ (IRQ_LCI | /* Link Change */ \ -- cgit v1.2.3 From be0384bf599cf1eb8d337517feeb732d71f75a6f Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 5 Apr 2024 22:30:40 +0200 Subject: net: ks8851: Handle softirqs at the end of IRQ thread to fix hang The ks8851_irq() thread may call ks8851_rx_pkts() in case there are any packets in the MAC FIFO, which calls netif_rx(). This netif_rx() implementation is guarded by local_bh_disable() and local_bh_enable(). The local_bh_enable() may call do_softirq() to run softirqs in case any are pending. One of the softirqs is net_rx_action, which ultimately reaches the driver .start_xmit callback. If that happens, the system hangs. The entire call chain is below: ks8851_start_xmit_par from netdev_start_xmit netdev_start_xmit from dev_hard_start_xmit dev_hard_start_xmit from sch_direct_xmit sch_direct_xmit from __dev_queue_xmit __dev_queue_xmit from __neigh_update __neigh_update from neigh_update neigh_update from arp_process.constprop.0 arp_process.constprop.0 from __netif_receive_skb_one_core __netif_receive_skb_one_core from process_backlog process_backlog from __napi_poll.constprop.0 __napi_poll.constprop.0 from net_rx_action net_rx_action from __do_softirq __do_softirq from call_with_stack call_with_stack from do_softirq do_softirq from __local_bh_enable_ip __local_bh_enable_ip from netif_rx netif_rx from ks8851_irq ks8851_irq from irq_thread_fn irq_thread_fn from irq_thread irq_thread from kthread kthread from ret_from_fork The hang happens because ks8851_irq() first locks a spinlock in ks8851_par.c ks8851_lock_par() spin_lock_irqsave(&ksp->lock, ...) and with that spinlock locked, calls netif_rx(). Once the execution reaches ks8851_start_xmit_par(), it calls ks8851_lock_par() again which attempts to claim the already locked spinlock again, and the hang happens. Move the do_softirq() call outside of the spinlock protected section of ks8851_irq() by disabling BHs around the entire spinlock protected section of ks8851_irq() handler. Place local_bh_enable() outside of the spinlock protected section, so that it can trigger do_softirq() without the ks8851_par.c ks8851_lock_par() spinlock being held, and safely call ks8851_start_xmit_par() without attempting to lock the already locked spinlock. Since ks8851_irq() is protected by local_bh_disable()/local_bh_enable() now, replace netif_rx() with __netif_rx() which is not duplicating the local_bh_disable()/local_bh_enable() calls. Fixes: 797047f875b5 ("net: ks8851: Implement Parallel bus operations") Signed-off-by: Marek Vasut Link: https://lore.kernel.org/r/20240405203204.82062-2-marex@denx.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/micrel/ks8851_common.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c index 896d43bb8883..d4cdf3d4f552 100644 --- a/drivers/net/ethernet/micrel/ks8851_common.c +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -299,7 +299,7 @@ static void ks8851_rx_pkts(struct ks8851_net *ks) ks8851_dbg_dumpkkt(ks, rxpkt); skb->protocol = eth_type_trans(skb, ks->netdev); - netif_rx(skb); + __netif_rx(skb); ks->netdev->stats.rx_packets++; ks->netdev->stats.rx_bytes += rxlen; @@ -330,6 +330,8 @@ static irqreturn_t ks8851_irq(int irq, void *_ks) unsigned long flags; unsigned int status; + local_bh_disable(); + ks8851_lock(ks, &flags); status = ks8851_rdreg16(ks, KS_ISR); @@ -406,6 +408,8 @@ static irqreturn_t ks8851_irq(int irq, void *_ks) if (status & IRQ_LCI) mii_check_link(&ks->mii); + local_bh_enable(); + return IRQ_HANDLED; } -- cgit v1.2.3 From 9c432404b9555c9444cbf6c8feaf52c0d8cad486 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 8 Apr 2024 22:32:08 -0400 Subject: bcachefs: fix eytzinger0_find_gt() - fix return types: promoting from unsigned to ssize_t does not do what we want here, and was pointless since the rest of the eytzinger code is u32 - nr, not size Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index ee0e2df33322..24840aee335c 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -242,8 +242,8 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) = eytzinger0_next((_i), (_size))) /* return greatest node <= @search, or -1 if not found */ -static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) +static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) { unsigned i, n = 0; @@ -256,18 +256,32 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, } while (n < nr); if (n & 1) { - /* @i was greater than @search, return previous node: */ + /* + * @i was greater than @search, return previous node: + * + * if @i was leftmost/smallest element, + * eytzinger0_prev(eytzinger0_first())) returns -1, as expected + */ return eytzinger0_prev(i, nr); } else { return i; } } -static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size, - cmp_func_t cmp, const void *search) +static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) { ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); - return eytzinger0_next(idx, size); + + /* + * if eytitzinger0_find_le() returned -1 - no element was <= search - we + * want to return the first element; next/prev identities mean this work + * as expected + * + * similarly if find_le() returns last element, we should return -1; + * identities mean this all works out: + */ + return eytzinger0_next(idx, nr); } #define eytzinger0_find(base, nr, size, _cmp, search) \ -- cgit v1.2.3 From b46f4eaa4f0ec38909fb0072eea3aeddb32f954e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 5 Apr 2024 15:10:57 -0700 Subject: af_unix: Clear stale u->oob_skb. syzkaller started to report deadlock of unix_gc_lock after commit 4090fa373f0e ("af_unix: Replace garbage collection algorithm."), but it just uncovers the bug that has been there since commit 314001f0bf92 ("af_unix: Add OOB support"). The repro basically does the following. from socket import * from array import array c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) c1.sendmsg([b'a'], [(SOL_SOCKET, SCM_RIGHTS, array("i", [c2.fileno()]))], MSG_OOB) c2.recv(1) # blocked as no normal data in recv queue c2.close() # done async and unblock recv() c1.close() # done async and trigger GC A socket sends its file descriptor to itself as OOB data and tries to receive normal data, but finally recv() fails due to async close(). The problem here is wrong handling of OOB skb in manage_oob(). When recvmsg() is called without MSG_OOB, manage_oob() is called to check if the peeked skb is OOB skb. In such a case, manage_oob() pops it out of the receive queue but does not clear unix_sock(sk)->oob_skb. This is wrong in terms of uAPI. Let's say we send "hello" with MSG_OOB, and "world" without MSG_OOB. The 'o' is handled as OOB data. When recv() is called twice without MSG_OOB, the OOB data should be lost. >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM, 0) >>> c1.send(b'hello', MSG_OOB) # 'o' is OOB data 5 >>> c1.send(b'world') 5 >>> c2.recv(5) # OOB data is not received b'hell' >>> c2.recv(5) # OOB date is skipped b'world' >>> c2.recv(5, MSG_OOB) # This should return an error b'o' In the same situation, TCP actually returns -EINVAL for the last recv(). Also, if we do not clear unix_sk(sk)->oob_skb, unix_poll() always set EPOLLPRI even though the data has passed through by previous recv(). To avoid these issues, we must clear unix_sk(sk)->oob_skb when dequeuing it from recv queue. The reason why the old GC did not trigger the deadlock is because the old GC relied on the receive queue to detect the loop. When it is triggered, the socket with OOB data is marked as GC candidate because file refcount == inflight count (1). However, after traversing all inflight sockets, the socket still has a positive inflight count (1), thus the socket is excluded from candidates. Then, the old GC lose the chance to garbage-collect the socket. With the old GC, the repro continues to create true garbage that will never be freed nor detected by kmemleak as it's linked to the global inflight list. That's why we couldn't even notice the issue. Fixes: 314001f0bf92 ("af_unix: Add OOB support") Reported-by: syzbot+7f7f201cc2668a8fd169@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7f7f201cc2668a8fd169 Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240405221057.2406-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5b41e2321209..d032eb5fa6df 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2665,7 +2665,9 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, } } else if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); - consume_skb(skb); + WRITE_ONCE(u->oob_skb, NULL); + if (!WARN_ON_ONCE(skb_unref(skb))) + kfree_skb(skb); skb = skb_peek(&sk->sk_receive_queue); } } -- cgit v1.2.3 From 718c4fb221dbeff9072810841b949413c5ffc345 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 8 Apr 2024 16:42:43 +1000 Subject: nouveau: fix devinit paths to only handle display on GSP. This reverts: nouveau/gsp: don't check devinit disable on GSP. and applies a further fix. It turns out the open gpu driver, checks this register, but only for display. Match that behaviour and in the turing path only disable the display block. (ampere already only does displays). Fixes: 5d4e8ae6e57b ("nouveau/gsp: don't check devinit disable on GSP.") Reviewed-by: Danilo Krummrich Signed-off-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20240408064243.2219527-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c | 12 ++++++++---- drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c index 7bcbc4895ec2..271bfa038f5b 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/gm107.c @@ -25,6 +25,7 @@ #include #include +#include void gm107_devinit_disable(struct nvkm_devinit *init) @@ -33,10 +34,13 @@ gm107_devinit_disable(struct nvkm_devinit *init) u32 r021c00 = nvkm_rd32(device, 0x021c00); u32 r021c04 = nvkm_rd32(device, 0x021c04); - if (r021c00 & 0x00000001) - nvkm_subdev_disable(device, NVKM_ENGINE_CE, 0); - if (r021c00 & 0x00000004) - nvkm_subdev_disable(device, NVKM_ENGINE_CE, 2); + /* gsp only wants to enable/disable display */ + if (!nvkm_gsp_rm(device->gsp)) { + if (r021c00 & 0x00000001) + nvkm_subdev_disable(device, NVKM_ENGINE_CE, 0); + if (r021c00 & 0x00000004) + nvkm_subdev_disable(device, NVKM_ENGINE_CE, 2); + } if (r021c04 & 0x00000001) nvkm_subdev_disable(device, NVKM_ENGINE_DISP, 0); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c index 11b4c9c274a1..666eb93b1742 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c @@ -41,6 +41,7 @@ r535_devinit_new(const struct nvkm_devinit_func *hw, rm->dtor = r535_devinit_dtor; rm->post = hw->post; + rm->disable = hw->disable; ret = nv50_devinit_new_(rm, device, type, inst, pdevinit); if (ret) -- cgit v1.2.3 From 4fe82aedeb8a8cb09bfa60f55ab57b5c10a74ac4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Apr 2024 18:11:09 +0100 Subject: io_uring/net: restore msg_control on sendzc retry cac9e4418f4cb ("io_uring/net: save msghdr->msg_control for retries") reinstatiates msg_control before every __sys_sendmsg_sock(), since the function can overwrite the value in msghdr. We need to do same for zerocopy sendmsg. Cc: stable@vger.kernel.org Fixes: 493108d95f146 ("io_uring/net: zerocopy sendmsg") Link: https://github.com/axboe/liburing/issues/1067 Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cc1d5d9df0576fa66ddad4420d240a98a020b267.1712596179.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/net.c b/io_uring/net.c index 1e7665ff6ef7..4afb475d4197 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1276,6 +1276,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { kmsg = req->async_data; + kmsg->msg.msg_control_user = sr->msg_control; } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) -- cgit v1.2.3 From 359571c327a726d622786aef3833637dacfd5d38 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Apr 2024 00:02:47 -0400 Subject: bcachefs: Fix check_topology() when using node scan shoot down journal keys _before_ populating journal keys with pointers to scanned nodes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6280da1244b5..a1c2c0ffb0cc 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -544,12 +544,12 @@ reconstruct_root: bch2_btree_root_alloc_fake(c, i, 0); } else { bch2_btree_root_alloc_fake(c, i, 1); + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); if (ret) break; } - bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); reconstructed_root = true; } -- cgit v1.2.3 From 5ab4beb759c05c74fb385ac5ca0ade5d3db67975 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Apr 2024 00:49:39 -0400 Subject: bcachefs: Don't scan for btree nodes when we can reconstruct Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 14 ++++++++++++++ fs/bcachefs/btree_gc.c | 11 ++++++++--- fs/bcachefs/btree_node_scan.c | 8 +++++++- fs/bcachefs/recovery.c | 14 -------------- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 63102992d955..364ae42022af 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1535,6 +1535,20 @@ enum btree_id { BTREE_ID_NR }; +static inline bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { + case BTREE_ID_alloc: + case BTREE_ID_backpointers: + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: + return true; + default: + return false; + } +} + #define BTREE_MAX_DEPTH 4U /* Btree nodes */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a1c2c0ffb0cc..d2555da55c6d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -368,11 +368,16 @@ again: buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); cur = NULL; - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: - bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); if (ret) break; + + if (!btree_id_is_alloc(b->c.btree_id)) { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + break; + } continue; } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 3f33be7e5e5c..a7d0593b3871 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -133,6 +133,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (le64_to_cpu(bn->magic) != bset_magic(c)) return; + if (btree_id_is_alloc(BTREE_NODE_ID(bn))) + return; + rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), @@ -290,7 +293,7 @@ again: found_btree_node_to_text(&buf, c, n); bch_err(c, "%s", buf.buf); printbuf_exit(&buf); - return -1; + return -BCH_ERR_fsck_repair_unimplemented; } } @@ -436,6 +439,9 @@ bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos node_min, struct bpos node_max) { + if (btree_id_is_alloc(btree)) + return 0; + struct find_btree_nodes *f = &c->found_btree_nodes; int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b76c16152579..0f328aba9760 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -47,20 +47,6 @@ void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) } } -static bool btree_id_is_alloc(enum btree_id id) -{ - switch (id) { - case BTREE_ID_alloc: - case BTREE_ID_backpointers: - case BTREE_ID_need_discard: - case BTREE_ID_freespace: - case BTREE_ID_bucket_gens: - return true; - default: - return false; - } -} - /* for -o reconstruct_alloc: */ static void bch2_reconstruct_alloc(struct bch_fs *c) { -- cgit v1.2.3 From 80e9963fb3b5509dfcabe9652d56bf4b35542055 Mon Sep 17 00:00:00 2001 From: Nianyao Tang Date: Sat, 6 Apr 2024 02:27:37 +0000 Subject: irqchip/gic-v3-its: Fix VSYNC referencing an unmapped VPE on GIC v4.1 As per the GICv4.1 spec (Arm IHI 0069H, 5.3.19): "A VMAPP with {V, Alloc}=={0, x} is self-synchronizing, This means the ITS command queue does not show the command as consumed until all of its effects are completed." Furthermore, VSYNC is allowed to deliver an SError when referencing a non existent VPE. By these definitions, a VMAPP followed by a VSYNC is a bug, as the later references a VPE that has been unmapped by the former. Fix it by eliding the VSYNC in this scenario. Fixes: 64edfaa9a234 ("irqchip/gic-v4.1: Implement the v4.1 flavour of VMAPP") Signed-off-by: Nianyao Tang Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20240406022737.3898763-1-tangnianyao@huawei.com --- drivers/irqchip/irq-gic-v3-its.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index fca888b36680..2a537cbfcb07 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -786,6 +786,7 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, struct its_cmd_block *cmd, struct its_cmd_desc *desc) { + struct its_vpe *vpe = valid_vpe(its, desc->its_vmapp_cmd.vpe); unsigned long vpt_addr, vconf_addr; u64 target; bool alloc; @@ -798,6 +799,11 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, if (is_v4_1(its)) { alloc = !atomic_dec_return(&desc->its_vmapp_cmd.vpe->vmapp_count); its_encode_alloc(cmd, alloc); + /* + * Unmapping a VPE is self-synchronizing on GICv4.1, + * no need to issue a VSYNC. + */ + vpe = NULL; } goto out; @@ -832,7 +838,7 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, out: its_fixup_cmd(cmd); - return valid_vpe(its, desc->its_vmapp_cmd.vpe); + return vpe; } static struct its_vpe *its_build_vmapti_cmd(struct its_node *its, -- cgit v1.2.3 From faf23006185e777db18912685922c5ddb2df383f Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Mon, 8 Apr 2024 12:06:43 +0530 Subject: octeontx2-af: Fix NIX SQ mode and BP config NIX SQ mode and link backpressure configuration is required for all platforms. But in current driver this code is wrongly placed under specific platform check. This patch fixes the issue by moving the code out of platform check. Fixes: 5d9b976d4480 ("octeontx2-af: Support fixed transmit scheduler topology") Signed-off-by: Geetha sowjanya Link: https://lore.kernel.org/r/20240408063643.26288-1-gakula@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index d39001cdc707..00af8888e329 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -4819,18 +4819,18 @@ static int rvu_nix_block_init(struct rvu *rvu, struct nix_hw *nix_hw) */ rvu_write64(rvu, blkaddr, NIX_AF_CFG, rvu_read64(rvu, blkaddr, NIX_AF_CFG) | 0x40ULL); + } - /* Set chan/link to backpressure TL3 instead of TL2 */ - rvu_write64(rvu, blkaddr, NIX_AF_PSE_CHANNEL_LEVEL, 0x01); + /* Set chan/link to backpressure TL3 instead of TL2 */ + rvu_write64(rvu, blkaddr, NIX_AF_PSE_CHANNEL_LEVEL, 0x01); - /* Disable SQ manager's sticky mode operation (set TM6 = 0) - * This sticky mode is known to cause SQ stalls when multiple - * SQs are mapped to same SMQ and transmitting pkts at a time. - */ - cfg = rvu_read64(rvu, blkaddr, NIX_AF_SQM_DBG_CTL_STATUS); - cfg &= ~BIT_ULL(15); - rvu_write64(rvu, blkaddr, NIX_AF_SQM_DBG_CTL_STATUS, cfg); - } + /* Disable SQ manager's sticky mode operation (set TM6 = 0) + * This sticky mode is known to cause SQ stalls when multiple + * SQs are mapped to same SMQ and transmitting pkts at a time. + */ + cfg = rvu_read64(rvu, blkaddr, NIX_AF_SQM_DBG_CTL_STATUS); + cfg &= ~BIT_ULL(15); + rvu_write64(rvu, blkaddr, NIX_AF_SQM_DBG_CTL_STATUS, cfg); ltdefs = rvu->kpu.lt_def; /* Calibrate X2P bus to check if CGX/LBK links are fine */ -- cgit v1.2.3 From 74043489fcb5e5ca4074133582b5b8011b67f9e7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2024 09:42:02 +0200 Subject: ipv6: fib: hide unused 'pn' variable When CONFIG_IPV6_SUBTREES is disabled, the only user is hidden, causing a 'make W=1' warning: net/ipv6/ip6_fib.c: In function 'fib6_add': net/ipv6/ip6_fib.c:1388:32: error: variable 'pn' set but not used [-Werror=unused-but-set-variable] Add another #ifdef around the variable declaration, matching the other uses in this file. Fixes: 66729e18df08 ("[IPV6] ROUTE: Make sure we have fn->leaf when adding a node on subtree.") Link: https://lore.kernel.org/netdev/20240322131746.904943-1-arnd@kernel.org/ Reviewed-by: David Ahern Signed-off-by: Arnd Bergmann Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240408074219.3030256-1-arnd@kernel.org Signed-off-by: Paolo Abeni --- net/ipv6/ip6_fib.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7209419cfb0e..c1f62352a481 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1385,7 +1385,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; - struct fib6_node *fn, *pn = NULL; + struct fib6_node *fn; +#ifdef CONFIG_IPV6_SUBTREES + struct fib6_node *pn = NULL; +#endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; @@ -1409,9 +1412,9 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, goto out; } +#ifdef CONFIG_IPV6_SUBTREES pn = fn; -#ifdef CONFIG_IPV6_SUBTREES if (rt->fib6_src.plen) { struct fib6_node *sn; -- cgit v1.2.3 From cf1b7201df59fb936f40f4a807433fe3f2ce310a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2024 09:42:03 +0200 Subject: ipv4/route: avoid unused-but-set-variable warning The log_martians variable is only used in an #ifdef, causing a 'make W=1' warning with gcc: net/ipv4/route.c: In function 'ip_rt_send_redirect': net/ipv4/route.c:880:13: error: variable 'log_martians' set but not used [-Werror=unused-but-set-variable] Change the #ifdef to an equivalent IS_ENABLED() to let the compiler see where the variable is used. Fixes: 30038fc61adf ("net: ip_rt_send_redirect() optimization") Reviewed-by: David Ahern Signed-off-by: Arnd Bergmann Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240408074219.3030256-2-arnd@kernel.org Signed-off-by: Paolo Abeni --- net/ipv4/route.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c8f76f56dc16..d36ace160d42 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -926,13 +926,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->n_redirects; -#ifdef CONFIG_IP_ROUTE_VERBOSE - if (log_martians && + if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians && peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); -#endif } out_put_peer: inet_putpeer(peer); -- cgit v1.2.3 From 638441bed666619b4275f68bbca9d1cd731a2063 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 16 Mar 2024 12:30:09 +0300 Subject: serial: 8250_lpc18xx: disable clks on error in probe() Goto the clean up path to clean up a couple clocks before returning on this error path. Fixes: 0087b9e694ee ("serial: 8250_lpc18xx: Switch to use uart_read_port_properties()") Signed-off-by: Dan Carpenter Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/92646c10-e0b5-4117-a9ac-ce9987d33ce3@moroto.mountain Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_lpc18xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_lpc18xx.c b/drivers/tty/serial/8250/8250_lpc18xx.c index 7984ee05af1d..47e1a056a60c 100644 --- a/drivers/tty/serial/8250/8250_lpc18xx.c +++ b/drivers/tty/serial/8250/8250_lpc18xx.c @@ -151,7 +151,7 @@ static int lpc18xx_serial_probe(struct platform_device *pdev) ret = uart_read_port_properties(&uart.port); if (ret) - return ret; + goto dis_uart_clk; uart.port.iotype = UPIO_MEM32; uart.port.regshift = 2; -- cgit v1.2.3 From 7dfae6cbadc1ac99e38ad19fb08810b31ff167be Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 17 Mar 2024 22:41:23 +0100 Subject: serial: 8250_dw: Revert: Do not reclock if already at correct rate Commit e5d6bd25f93d ("serial: 8250_dw: Do not reclock if already at correct rate") breaks the dw UARTs on Intel Bay Trail (BYT) and Cherry Trail (CHT) SoCs. Before this change the RTL8732BS Bluetooth HCI which is found connected over the dw UART on both BYT and CHT boards works properly: Bluetooth: hci0: RTL: examining hci_ver=06 hci_rev=000b lmp_ver=06 lmp_subver=8723 Bluetooth: hci0: RTL: rom_version status=0 version=1 Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_fw.bin Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_config-OBDA8723.bin Bluetooth: hci0: RTL: cfg_sz 64, total sz 24508 Bluetooth: hci0: RTL: fw version 0x365d462e where as after this change probing it fails: Bluetooth: hci0: RTL: examining hci_ver=06 hci_rev=000b lmp_ver=06 lmp_subver=8723 Bluetooth: hci0: RTL: rom_version status=0 version=1 Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_fw.bin Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_config-OBDA8723.bin Bluetooth: hci0: RTL: cfg_sz 64, total sz 24508 Bluetooth: hci0: command 0xfc20 tx timeout Bluetooth: hci0: RTL: download fw command failed (-110) Revert the changes to fix this regression. Fixes: e5d6bd25f93d ("serial: 8250_dw: Do not reclock if already at correct rate") Cc: stable@vger.kernel.org Cc: Peter Collingbourne Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Acked-by: Peter Collingbourne Link: https://lore.kernel.org/r/20240317214123.34482-1-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_dw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c index a3acbf0f5da1..1300c92b8702 100644 --- a/drivers/tty/serial/8250/8250_dw.c +++ b/drivers/tty/serial/8250/8250_dw.c @@ -356,9 +356,9 @@ static void dw8250_set_termios(struct uart_port *p, struct ktermios *termios, long rate; int ret; + clk_disable_unprepare(d->clk); rate = clk_round_rate(d->clk, newrate); - if (rate > 0 && p->uartclk != rate) { - clk_disable_unprepare(d->clk); + if (rate > 0) { /* * Note that any clock-notifer worker will block in * serial8250_update_uartclk() until we are done. @@ -366,8 +366,8 @@ static void dw8250_set_termios(struct uart_port *p, struct ktermios *termios, ret = clk_set_rate(d->clk, newrate); if (!ret) p->uartclk = rate; - clk_prepare_enable(d->clk); } + clk_prepare_enable(d->clk); dw8250_do_set_termios(p, termios, old); } -- cgit v1.2.3 From 54c4ec5f8c471b7c1137a1f769648549c423c026 Mon Sep 17 00:00:00 2001 From: Emil Kronborg Date: Wed, 20 Mar 2024 12:15:36 +0000 Subject: serial: mxs-auart: add spinlock around changing cts state The uart_handle_cts_change() function in serial_core expects the caller to hold uport->lock. For example, I have seen the below kernel splat, when the Bluetooth driver is loaded on an i.MX28 board. [ 85.119255] ------------[ cut here ]------------ [ 85.124413] WARNING: CPU: 0 PID: 27 at /drivers/tty/serial/serial_core.c:3453 uart_handle_cts_change+0xb4/0xec [ 85.134694] Modules linked in: hci_uart bluetooth ecdh_generic ecc wlcore_sdio configfs [ 85.143314] CPU: 0 PID: 27 Comm: kworker/u3:0 Not tainted 6.6.3-00021-gd62a2f068f92 #1 [ 85.151396] Hardware name: Freescale MXS (Device Tree) [ 85.156679] Workqueue: hci0 hci_power_on [bluetooth] (...) [ 85.191765] uart_handle_cts_change from mxs_auart_irq_handle+0x380/0x3f4 [ 85.198787] mxs_auart_irq_handle from __handle_irq_event_percpu+0x88/0x210 (...) Cc: stable@vger.kernel.org Fixes: 4d90bb147ef6 ("serial: core: Document and assert lock requirements for irq helpers") Reviewed-by: Frank Li Signed-off-by: Emil Kronborg Link: https://lore.kernel.org/r/20240320121530.11348-1-emil.kronborg@protonmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/mxs-auart.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/mxs-auart.c b/drivers/tty/serial/mxs-auart.c index 4749331fe618..1e8853eae504 100644 --- a/drivers/tty/serial/mxs-auart.c +++ b/drivers/tty/serial/mxs-auart.c @@ -1086,11 +1086,13 @@ static void mxs_auart_set_ldisc(struct uart_port *port, static irqreturn_t mxs_auart_irq_handle(int irq, void *context) { - u32 istat; + u32 istat, stat; struct mxs_auart_port *s = context; u32 mctrl_temp = s->mctrl_prev; - u32 stat = mxs_read(s, REG_STAT); + uart_port_lock(&s->port); + + stat = mxs_read(s, REG_STAT); istat = mxs_read(s, REG_INTR); /* ack irq */ @@ -1126,6 +1128,8 @@ static irqreturn_t mxs_auart_irq_handle(int irq, void *context) istat &= ~AUART_INTR_TXIS; } + uart_port_unlock(&s->port); + return IRQ_HANDLED; } -- cgit v1.2.3 From 5555980571cc744cd99b6455e3e388b54519db8f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 25 Mar 2024 09:16:47 +0200 Subject: serial: core: Fix regression when runtime PM is not enabled Commit 45a3a8ef8129 ("serial: core: Revert checks for tx runtime PM state") caused a regression for Sun Ultra 60 for the sunsab driver as reported by Nick Bowler . We need to add back the check runtime PM enabled state for serial port controller device, I wrongly assumed earlier we could just remove it. Fixes: 45a3a8ef8129 ("serial: core: Revert checks for tx runtime PM state") Cc: stable Reported-by: Nick Bowler Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20240325071649.27040-1-tony@atomide.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index ff85ebd3a007..25a83820927a 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -156,7 +156,7 @@ static void __uart_start(struct uart_state *state) * enabled, serial_port_runtime_resume() calls start_tx() again * after enabling the device. */ - if (pm_runtime_active(&port_dev->dev)) + if (!pm_runtime_enabled(port->dev) || pm_runtime_active(&port_dev->dev)) port->ops->start_tx(port); pm_runtime_mark_last_busy(&port_dev->dev); pm_runtime_put_autosuspend(&port_dev->dev); -- cgit v1.2.3 From 90452456eb69297fe7ae34e56e40d8e47dc9e019 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 4 Apr 2024 01:41:52 +0300 Subject: serial: 8250_pci: Remove redundant PCI IDs Driver complains that PCI IDs are not needed for some of the LAVA cards: [ 0.297252] serial 0000:04:00.0: Redundant entry in serial pci_table. [ 0.297252] Please send the output of lspci -vv, this [ 0.297252] message (0x1407,0x0120,0x0000,0x0000), the [ 0.297252] manufacturer and name of serial board or [ 0.297252] modem board to . Do as suggested. Reported-by: Jimmy A Closes: https://lore.kernel.org/r/VI1P194MB052751BE157EFE9CEAB75725CE362@VI1P194MB0527.EURP194.PROD.OUTLOOK.COM Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240403224152.945099-1-andy.shevchenko@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_pci.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index 0d35c77fad9e..e2e4f99f9d34 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -5010,12 +5010,6 @@ static const struct pci_device_id serial_pci_tbl[] = { { PCI_VENDOR_ID_LAVA, PCI_DEVICE_ID_LAVA_QUATRO_B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, pbn_b0_bt_2_115200 }, - { PCI_VENDOR_ID_LAVA, PCI_DEVICE_ID_LAVA_QUATTRO_A, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, - pbn_b0_bt_2_115200 }, - { PCI_VENDOR_ID_LAVA, PCI_DEVICE_ID_LAVA_QUATTRO_B, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, - pbn_b0_bt_2_115200 }, { PCI_VENDOR_ID_LAVA, PCI_DEVICE_ID_LAVA_OCTO_A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, pbn_b0_bt_4_460800 }, -- cgit v1.2.3 From 1be3226445362bfbf461c92a5bcdb1723f2e4907 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Mon, 8 Apr 2024 19:23:43 +1000 Subject: serial/pmac_zilog: Remove flawed mitigation for rx irq flood The mitigation was intended to stop the irq completely. That may be better than a hard lock-up but it turns out that you get a crash anyway if you're using pmac_zilog as a serial console: ttyPZ0: pmz: rx irq flood ! BUG: spinlock recursion on CPU#0, swapper/0 That's because the pr_err() call in pmz_receive_chars() results in pmz_console_write() attempting to lock a spinlock already locked in pmz_interrupt(). With CONFIG_DEBUG_SPINLOCK=y, this produces a fatal BUG splat. The spinlock in question is the one in struct uart_port. Even when it's not fatal, the serial port rx function ceases to work. Also, the iteration limit doesn't play nicely with QEMU, as can be seen in the bug report linked below. A web search for other reports of the error message "pmz: rx irq flood" didn't produce anything. So I don't think this code is needed any more. Remove it. Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Aneesh Kumar K.V Cc: Naveen N. Rao Cc: Andy Shevchenko Cc: stable@kernel.org Cc: linux-m68k@lists.linux-m68k.org Link: https://github.com/vivier/qemu-m68k/issues/44 Link: https://lore.kernel.org/all/1078874617.9746.36.camel@gaston/ Acked-by: Michael Ellerman Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable Signed-off-by: Finn Thain Link: https://lore.kernel.org/r/e853cf2c762f23101cd2ddec0cc0c2be0e72685f.1712568223.git.fthain@linux-m68k.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/pmac_zilog.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/tty/serial/pmac_zilog.c b/drivers/tty/serial/pmac_zilog.c index 05d97e89511e..92195f984de1 100644 --- a/drivers/tty/serial/pmac_zilog.c +++ b/drivers/tty/serial/pmac_zilog.c @@ -210,7 +210,6 @@ static bool pmz_receive_chars(struct uart_pmac_port *uap) { struct tty_port *port; unsigned char ch, r1, drop, flag; - int loops = 0; /* Sanity check, make sure the old bug is no longer happening */ if (uap->port.state == NULL) { @@ -291,24 +290,11 @@ static bool pmz_receive_chars(struct uart_pmac_port *uap) if (r1 & Rx_OVR) tty_insert_flip_char(port, 0, TTY_OVERRUN); next_char: - /* We can get stuck in an infinite loop getting char 0 when the - * line is in a wrong HW state, we break that here. - * When that happens, I disable the receive side of the driver. - * Note that what I've been experiencing is a real irq loop where - * I'm getting flooded regardless of the actual port speed. - * Something strange is going on with the HW - */ - if ((++loops) > 1000) - goto flood; ch = read_zsreg(uap, R0); if (!(ch & Rx_CH_AV)) break; } - return true; - flood: - pmz_interrupt_control(uap, 0); - pmz_error("pmz: rx irq flood !\n"); return true; } -- cgit v1.2.3 From d325a858a53b5816a60447887f7148eace999e00 Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Mon, 8 Apr 2024 12:13:29 +0200 Subject: MAINTAINERS: mailmap: update Richard Genoud's email address I'm working now at bootlin, so I'll use my bootlin address for kernel development from now on. Update also the yaml file for atmel-serial accordingly. Signed-off-by: Richard Genoud Reviewed-by: Nicolas Ferre Link: https://lore.kernel.org/r/20240408101329.9448-1-richard.genoud@bootlin.com Signed-off-by: Greg Kroah-Hartman --- .mailmap | 1 + Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml | 2 +- MAINTAINERS | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 8284692f9610..71e28f4e0d4a 100644 --- a/.mailmap +++ b/.mailmap @@ -524,6 +524,7 @@ Rémi Denis-Courmont Ricardo Ribalda Ricardo Ribalda Ricardo Ribalda Delgado Ricardo Ribalda +Richard Genoud Richard Leitner Richard Leitner Richard Leitner diff --git a/Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml b/Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml index 65cb2e5c5eee..eb2992a447d7 100644 --- a/Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml +++ b/Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml @@ -8,7 +8,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Atmel Universal Synchronous Asynchronous Receiver/Transmitter (USART) maintainers: - - Richard Genoud + - Richard Genoud properties: compatible: diff --git a/MAINTAINERS b/MAINTAINERS index aea47e04c3a5..0dbdc81e46c6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14363,7 +14363,7 @@ F: drivers/dma/at_xdmac.c F: include/dt-bindings/dma/at91.h MICROCHIP AT91 SERIAL DRIVER -M: Richard Genoud +M: Richard Genoud S: Maintained F: Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml F: drivers/tty/serial/atmel_serial.c -- cgit v1.2.3 From fbbdc255fbee59b4207a5398fdb4f04590681a79 Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Mon, 8 Apr 2024 21:43:57 -0700 Subject: fs/proc: remove redundant comments from /proc/bootconfig commit 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to /proc/bootconfig") adds bootloader argument comments into /proc/bootconfig. /proc/bootconfig shows boot_command_line[] multiple times following every xbc key value pair, that's duplicated and not necessary. Remove redundant ones. Output before and after the fix is like: key1 = value1 *bootloader argument comments* key2 = value2 *bootloader argument comments* key3 = value3 *bootloader argument comments* ... key1 = value1 key2 = value2 key3 = value3 *bootloader argument comments* ... Link: https://lore.kernel.org/all/20240409044358.1156477-1-paulmck@kernel.org/ Fixes: 717c7c894d4b ("fs/proc: Add boot loader arguments as comment to /proc/bootconfig") Signed-off-by: Zhenhua Huang Signed-off-by: Paul E. McKenney Cc: Cc: Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- fs/proc/bootconfig.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 902b326e1e56..e5635a6b127b 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) break; dst += ret; } - if (ret >= 0 && boot_command_line[0]) { - ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", - boot_command_line); - if (ret > 0) - dst += ret; - } + } + if (ret >= 0 && boot_command_line[0]) { + ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", + boot_command_line); + if (ret > 0) + dst += ret; } out: kfree(key); -- cgit v1.2.3 From c722cea208789d9e2660992bcd05fb9fac3adb56 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 8 Apr 2024 21:43:58 -0700 Subject: fs/proc: Skip bootloader comment if no embedded kernel parameters If the "bootconfig" kernel command-line argument was specified or if the kernel was built with CONFIG_BOOT_CONFIG_FORCE, but if there are no embedded kernel parameter, omit the "# Parameters from bootloader:" comment from the /proc/bootconfig file. This will cause automation to fall back to the /proc/cmdline file, which will be identical to the comment in this no-embedded-kernel-parameters case. Link: https://lore.kernel.org/all/20240409044358.1156477-2-paulmck@kernel.org/ Fixes: 8b8ce6c75430 ("fs/proc: remove redundant comments from /proc/bootconfig") Signed-off-by: Masami Hiramatsu Signed-off-by: Paul E. McKenney Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- fs/proc/bootconfig.c | 2 +- include/linux/bootconfig.h | 1 + init/main.c | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index e5635a6b127b..87dcaae32ff8 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -63,7 +63,7 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) dst += ret; } } - if (ret >= 0 && boot_command_line[0]) { + if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) { ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", boot_command_line); if (ret > 0) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index ca73940e26df..e5ee2c694401 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -10,6 +10,7 @@ #ifdef __KERNEL__ #include #include +bool __init cmdline_has_extra_options(void); #else /* !__KERNEL__ */ /* * NOTE: This is only for tools/bootconfig, because tools/bootconfig will diff --git a/init/main.c b/init/main.c index 2ca52474d0c3..881f6230ee59 100644 --- a/init/main.c +++ b/init/main.c @@ -487,6 +487,11 @@ static int __init warn_bootconfig(char *str) early_param("bootconfig", warn_bootconfig); +bool __init cmdline_has_extra_options(void) +{ + return extra_command_line || extra_init_args; +} + /* Change NUL term back to "=", to make "param" the whole string. */ static void __init repair_env_string(char *param, char *val) { -- cgit v1.2.3 From 4370b673ccf240bf7587b0cb8e6726a5ccaf1f17 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Thu, 28 Mar 2024 14:27:56 +0000 Subject: MIPS: scall: Save thread_info.syscall unconditionally on entry thread_info.syscall is used by syscall_get_nr to supply syscall nr over a thread stack frame. Previously, thread_info.syscall is only saved at syscall_trace_enter when syscall tracing is enabled. However rest of the kernel code do expect syscall_get_nr to be available without syscall tracing. The previous design breaks collect_syscall. Move saving process to syscall entry to fix it. Reported-by: Xi Ruoyao Link: https://github.com/util-linux/util-linux/issues/2867 Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/ptrace.h | 2 +- arch/mips/kernel/asm-offsets.c | 1 + arch/mips/kernel/ptrace.c | 15 ++++++--------- arch/mips/kernel/scall32-o32.S | 23 +++++++++++++---------- arch/mips/kernel/scall64-n32.S | 3 ++- arch/mips/kernel/scall64-n64.S | 3 ++- arch/mips/kernel/scall64-o32.S | 33 +++++++++++++++++---------------- 7 files changed, 42 insertions(+), 38 deletions(-) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index d14d0e37ad02..4a2b40ce39e0 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -159,7 +159,7 @@ extern unsigned long exception_ip(struct pt_regs *regs); #define exception_ip(regs) exception_ip(regs) #define profile_pc(regs) instruction_pointer(regs) -extern asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall); +extern asmlinkage long syscall_trace_enter(struct pt_regs *regs); extern asmlinkage void syscall_trace_leave(struct pt_regs *regs); extern void die(const char *, struct pt_regs *) __noreturn; diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index d1b11f66f748..cb1045ebab06 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -101,6 +101,7 @@ void output_thread_info_defines(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); OFFSET(TI_REGS, thread_info, regs); + OFFSET(TI_SYSCALL, thread_info, syscall); DEFINE(_THREAD_SIZE, THREAD_SIZE); DEFINE(_THREAD_MASK, THREAD_MASK); DEFINE(_IRQ_STACK_SIZE, IRQ_STACK_SIZE); diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 59288c13b581..61503a36067e 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -1317,16 +1317,13 @@ long arch_ptrace(struct task_struct *child, long request, * Notification of system call entry/exit * - triggered by current->work.syscall_trace */ -asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) +asmlinkage long syscall_trace_enter(struct pt_regs *regs) { user_exit(); - current_thread_info()->syscall = syscall; - if (test_thread_flag(TIF_SYSCALL_TRACE)) { if (ptrace_report_syscall_entry(regs)) return -1; - syscall = current_thread_info()->syscall; } #ifdef CONFIG_SECCOMP @@ -1335,7 +1332,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) struct seccomp_data sd; unsigned long args[6]; - sd.nr = syscall; + sd.nr = current_thread_info()->syscall; sd.arch = syscall_get_arch(current); syscall_get_arguments(current, regs, args); for (i = 0; i < 6; i++) @@ -1345,23 +1342,23 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) ret = __secure_computing(&sd); if (ret == -1) return ret; - syscall = current_thread_info()->syscall; } #endif if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[2]); - audit_syscall_entry(syscall, regs->regs[4], regs->regs[5], + audit_syscall_entry(current_thread_info()->syscall, + regs->regs[4], regs->regs[5], regs->regs[6], regs->regs[7]); /* * Negative syscall numbers are mistaken for rejected syscalls, but * won't have had the return value set appropriately, so we do so now. */ - if (syscall < 0) + if (current_thread_info()->syscall < 0) syscall_set_return_value(current, regs, -ENOSYS, 0); - return syscall; + return current_thread_info()->syscall; } /* diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 18dc9b345056..2c604717e630 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -77,6 +77,18 @@ loads_done: PTR_WD load_a7, bad_stack_a7 .previous + /* + * syscall number is in v0 unless we called syscall(__NR_###) + * where the real syscall number is in a0 + */ + subu t2, v0, __NR_O32_Linux + bnez t2, 1f /* __NR_syscall at offset 0 */ + LONG_S a0, TI_SYSCALL($28) # Save a0 as syscall number + b 2f +1: + LONG_S v0, TI_SYSCALL($28) # Save v0 as syscall number +2: + lw t0, TI_FLAGS($28) # syscall tracing enabled? li t1, _TIF_WORK_SYSCALL_ENTRY and t0, t1 @@ -114,16 +126,7 @@ syscall_trace_entry: SAVE_STATIC move a0, sp - /* - * syscall number is in v0 unless we called syscall(__NR_###) - * where the real syscall number is in a0 - */ - move a1, v0 - subu t2, v0, __NR_O32_Linux - bnez t2, 1f /* __NR_syscall at offset 0 */ - lw a1, PT_R4(sp) - -1: jal syscall_trace_enter + jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 97456b2ca7dc..97788859238c 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -44,6 +44,8 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd a3, PT_R26(sp) # save a3 for syscall restarting + LONG_S v0, TI_SYSCALL($28) # Store syscall number + li t1, _TIF_WORK_SYSCALL_ENTRY LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? and t0, t1, t0 @@ -72,7 +74,6 @@ syscall_common: n32_syscall_trace_entry: SAVE_STATIC move a0, sp - move a1, v0 jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall diff --git a/arch/mips/kernel/scall64-n64.S b/arch/mips/kernel/scall64-n64.S index e6264aa62e45..be11ea5cc67e 100644 --- a/arch/mips/kernel/scall64-n64.S +++ b/arch/mips/kernel/scall64-n64.S @@ -46,6 +46,8 @@ NESTED(handle_sys64, PT_SIZE, sp) sd a3, PT_R26(sp) # save a3 for syscall restarting + LONG_S v0, TI_SYSCALL($28) # Store syscall number + li t1, _TIF_WORK_SYSCALL_ENTRY LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? and t0, t1, t0 @@ -82,7 +84,6 @@ n64_syscall_exit: syscall_trace_entry: SAVE_STATIC move a0, sp - move a1, v0 jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index d3c2616cba22..7a5abb73e531 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -79,6 +79,22 @@ loads_done: PTR_WD load_a7, bad_stack_a7 .previous + /* + * absolute syscall number is in v0 unless we called syscall(__NR_###) + * where the real syscall number is in a0 + * note: NR_syscall is the first O32 syscall but the macro is + * only defined when compiling with -mabi=32 (CONFIG_32BIT) + * therefore __NR_O32_Linux is used (4000) + */ + + subu t2, v0, __NR_O32_Linux + bnez t2, 1f /* __NR_syscall at offset 0 */ + LONG_S a0, TI_SYSCALL($28) # Save a0 as syscall number + b 2f +1: + LONG_S v0, TI_SYSCALL($28) # Save v0 as syscall number +2: + li t1, _TIF_WORK_SYSCALL_ENTRY LONG_L t0, TI_FLAGS($28) # syscall tracing enabled? and t0, t1, t0 @@ -113,22 +129,7 @@ trace_a_syscall: sd a7, PT_R11(sp) # For indirect syscalls move a0, sp - /* - * absolute syscall number is in v0 unless we called syscall(__NR_###) - * where the real syscall number is in a0 - * note: NR_syscall is the first O32 syscall but the macro is - * only defined when compiling with -mabi=32 (CONFIG_32BIT) - * therefore __NR_O32_Linux is used (4000) - */ - .set push - .set reorder - subu t1, v0, __NR_O32_Linux - move a1, v0 - bnez t1, 1f /* __NR_syscall at offset 0 */ - ld a1, PT_R4(sp) /* Arg1 for __NR_syscall case */ - .set pop - -1: jal syscall_trace_enter + jal syscall_trace_enter bltz v0, 1f # seccomp failed? Skip syscall -- cgit v1.2.3 From 9cf7ea2eeb745213dc2a04103e426b960e807940 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 4 Apr 2024 17:59:26 +0300 Subject: serial: core: Clearing the circular buffer before NULLifying it The circular buffer is NULLified in uart_tty_port_shutdown() under the spin lock. However, the PM or other timer based callbacks may still trigger after this event without knowning that buffer pointer is not valid. Since the serial code is a bit inconsistent in checking the buffer state (some rely on the head-tail positions, some on the buffer pointer), it's better to have both aligned, i.e. buffer pointer to be NULL and head-tail possitions to be the same, meaning it's empty. This will prevent asynchronous calls to dereference NULL pointer as reported recently in 8250 case: BUG: kernel NULL pointer dereference, address: 00000cf5 Workqueue: pm pm_runtime_work EIP: serial8250_tx_chars (drivers/tty/serial/8250/8250_port.c:1809) ... ? serial8250_tx_chars (drivers/tty/serial/8250/8250_port.c:1809) __start_tx (drivers/tty/serial/8250/8250_port.c:1551) serial8250_start_tx (drivers/tty/serial/8250/8250_port.c:1654) serial_port_runtime_suspend (include/linux/serial_core.h:667 drivers/tty/serial/serial_port.c:63) __rpm_callback (drivers/base/power/runtime.c:393) ? serial_port_remove (drivers/tty/serial/serial_port.c:50) rpm_suspend (drivers/base/power/runtime.c:447) The proposed change will prevent ->start_tx() to be called during suspend on shut down port. Fixes: 43066e32227e ("serial: port: Don't suspend if the port is still busy") Cc: stable Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202404031607.2e92eebe-lkp@intel.com Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240404150034.41648-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 25a83820927a..2247efe97250 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -1788,6 +1788,7 @@ static void uart_tty_port_shutdown(struct tty_port *port) * Free the transmit buffer. */ uart_port_lock_irq(uport); + uart_circ_clear(&state->xmit); buf = state->xmit.buf; state->xmit.buf = NULL; uart_port_unlock_irq(uport); -- cgit v1.2.3 From 011d79ef1cfad701c2d8e7e80d8c77523af9c771 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 8 Apr 2024 09:32:34 +0200 Subject: MAINTAINERS: Change Krzysztof Kozlowski's email address Switch Krzysztof Kozlowski's to @kernel.org account. Link: https://lore.kernel.org/r/20240329174823.74918-1-krzysztof.kozlowski@linaro.org Signed-off-by: Krzysztof Kozlowski Signed-off-by: Arnd Bergmann --- MAINTAINERS | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d..cb0049030acd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2708,7 +2708,7 @@ F: sound/soc/rockchip/ N: rockchip ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-samsung-soc@vger.kernel.org @@ -5557,7 +5557,7 @@ F: drivers/cpuidle/cpuidle-big_little.c CPUIDLE DRIVER - ARM EXYNOS M: Daniel Lezcano M: Kukjin Kim -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski L: linux-pm@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -9000,7 +9000,7 @@ F: drivers/i2c/muxes/i2c-mux-gpio.c F: include/linux/platform_data/i2c-mux-gpio.h GENERIC GPIO RESET DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski S: Maintained F: drivers/reset/reset-gpio.c @@ -13295,7 +13295,7 @@ F: drivers/iio/adc/max11205.c MAXIM MAX17040 FAMILY FUEL GAUGE DRIVERS R: Iskren Chernev -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski R: Marek Szyprowski R: Matheus Castello L: linux-pm@vger.kernel.org @@ -13305,7 +13305,7 @@ F: drivers/power/supply/max17040_battery.c MAXIM MAX17042 FAMILY FUEL GAUGE DRIVERS R: Hans de Goede -R: Krzysztof Kozlowski +R: Krzysztof Kozlowski R: Marek Szyprowski R: Sebastian Krzyszkowiak R: Purism Kernel Team @@ -13363,7 +13363,7 @@ F: Documentation/devicetree/bindings/power/supply/maxim,max77976.yaml F: drivers/power/supply/max77976_charger.c MAXIM MUIC CHARGER DRIVERS FOR EXYNOS BASED BOARDS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-pm@vger.kernel.org S: Maintained B: mailto:linux-samsung-soc@vger.kernel.org @@ -13374,7 +13374,7 @@ F: drivers/power/supply/max77693_charger.c MAXIM PMIC AND MUIC DRIVERS FOR EXYNOS BASED BOARDS M: Chanwoo Choi -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained B: mailto:linux-samsung-soc@vger.kernel.org @@ -14156,7 +14156,7 @@ F: mm/mm_init.c F: tools/testing/memblock/ MEMORY CONTROLLER DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained B: mailto:krzysztof.kozlowski@linaro.org @@ -15537,7 +15537,7 @@ F: include/uapi/linux/nexthop.h F: net/ipv4/nexthop.c NFC SUBSYSTEM -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/nfc/ @@ -15914,7 +15914,7 @@ F: Documentation/devicetree/bindings/regulator/nxp,pf8x00-regulator.yaml F: drivers/regulator/pf8x00-regulator.c NXP PTN5150A CC LOGIC AND EXTCON DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/extcon/extcon-ptn5150.yaml @@ -16525,7 +16525,7 @@ K: of_overlay_remove OPEN FIRMWARE AND FLATTENED DEVICE TREE BINDINGS M: Rob Herring -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Conor Dooley L: devicetree@vger.kernel.org S: Maintained @@ -17483,7 +17483,7 @@ F: Documentation/devicetree/bindings/pinctrl/renesas,* F: drivers/pinctrl/renesas/ PIN CONTROLLER - SAMSUNG -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Sylwester Nawrocki R: Alim Akhtar L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -19451,7 +19451,7 @@ F: Documentation/devicetree/bindings/sound/samsung* F: sound/soc/samsung/ SAMSUNG EXYNOS PSEUDO RANDOM NUMBER GENERATOR (RNG) DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-crypto@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -19486,7 +19486,7 @@ S: Maintained F: drivers/platform/x86/samsung-laptop.c SAMSUNG MULTIFUNCTION PMIC DEVICE DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -19512,7 +19512,7 @@ F: drivers/media/platform/samsung/s3c-camif/ F: include/media/drv-intf/s3c_camif.h SAMSUNG S3FWRN5 NFC DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski S: Maintained F: Documentation/devicetree/bindings/net/nfc/samsung,s3fwrn5.yaml F: drivers/nfc/s3fwrn5 @@ -19533,7 +19533,7 @@ S: Supported F: drivers/media/i2c/s5k5baf.c SAMSUNG S5P Security SubSystem (SSS) DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Vladimir Zapolskiy L: linux-crypto@vger.kernel.org L: linux-samsung-soc@vger.kernel.org @@ -19555,7 +19555,7 @@ F: Documentation/devicetree/bindings/media/samsung,fimc.yaml F: drivers/media/platform/samsung/exynos4-is/ SAMSUNG SOC CLOCK DRIVERS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski M: Sylwester Nawrocki M: Chanwoo Choi R: Alim Akhtar @@ -19587,7 +19587,7 @@ F: drivers/net/ethernet/samsung/sxgbe/ SAMSUNG THERMAL DRIVER M: Bartlomiej Zolnierkiewicz -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-pm@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Maintained @@ -23782,7 +23782,7 @@ S: Orphan F: drivers/mmc/host/vub300.c W1 DALLAS'S 1-WIRE BUS -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski S: Maintained F: Documentation/devicetree/bindings/w1/ F: Documentation/w1/ -- cgit v1.2.3 From fbdd90334a6205e8a99d0bc2dfc738ee438f00bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 5 Apr 2024 09:20:41 +0200 Subject: MAINTAINERS: Drop Li Yang as their email address stopped working MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When sending a patch to (among others) Li Yang the nxp MTA replied that the address doesn't exist and so the mail couldn't be delivered. The error code was 550, so at least technically that's not a temporal issue. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240405072042.697182-2-u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3c78979fb819..da6b81a9e336 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2191,7 +2191,6 @@ N: mxs ARM/FREESCALE LAYERSCAPE ARM ARCHITECTURE M: Shawn Guo -M: Li Yang L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git @@ -8523,7 +8522,6 @@ S: Maintained F: drivers/video/fbdev/fsl-diu-fb.* FREESCALE DMA DRIVER -M: Li Yang M: Zhang Wei L: linuxppc-dev@lists.ozlabs.org S: Maintained @@ -8688,10 +8686,9 @@ F: drivers/soc/fsl/qe/tsa.h F: include/dt-bindings/soc/cpm1-fsl,tsa.h FREESCALE QUICC ENGINE UCC ETHERNET DRIVER -M: Li Yang L: netdev@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org -S: Maintained +S: Orphan F: drivers/net/ethernet/freescale/ucc_geth* FREESCALE QUICC ENGINE UCC HDLC DRIVER @@ -8708,10 +8705,9 @@ S: Maintained F: drivers/tty/serial/ucc_uart.c FREESCALE SOC DRIVERS -M: Li Yang L: linuxppc-dev@lists.ozlabs.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Maintained +S: Orphan F: Documentation/devicetree/bindings/misc/fsl,dpaa2-console.yaml F: Documentation/devicetree/bindings/soc/fsl/ F: drivers/soc/fsl/ @@ -8745,10 +8741,9 @@ F: Documentation/devicetree/bindings/sound/fsl,qmc-audio.yaml F: sound/soc/fsl/fsl_qmc_audio.c FREESCALE USB PERIPHERAL DRIVERS -M: Li Yang L: linux-usb@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org -S: Maintained +S: Orphan F: drivers/usb/gadget/udc/fsl* FREESCALE USB PHY DRIVER -- cgit v1.2.3 From 3461e02066758b78a0731eb71faecfb1eccd0e6c Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 9 Apr 2024 13:36:46 +0200 Subject: usb: typec: mux: it5205: Fix ChipID value typo The ChipID bytes are read in inverse order: invert the ChipID value defined as IT5205FN_CHIP_ID and used for validating the same. Fixes: 41fe9ea1696c ("usb: typec: mux: Add ITE IT5205 Alternate Mode Passive MUX driver") Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20240409113646.305105-1-angelogioacchino.delregno@collabora.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/mux/it5205.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/mux/it5205.c b/drivers/usb/typec/mux/it5205.c index 5535932e42cd..4357cc67a867 100644 --- a/drivers/usb/typec/mux/it5205.c +++ b/drivers/usb/typec/mux/it5205.c @@ -22,7 +22,7 @@ #include #define IT5205_REG_CHIP_ID(x) (0x4 + (x)) -#define IT5205FN_CHIP_ID 0x35323035 /* "5205" */ +#define IT5205FN_CHIP_ID 0x35303235 /* "5025" -> "5205" */ /* MUX power down register */ #define IT5205_REG_MUXPDR 0x10 -- cgit v1.2.3 From eed04fa96c48790c1cce73c8a248e9d460b088f8 Mon Sep 17 00:00:00 2001 From: Minas Harutyunyan Date: Tue, 9 Apr 2024 12:27:54 +0000 Subject: usb: dwc2: host: Fix dereference issue in DDMA completion flow. Fixed variable dereference issue in DDMA completion flow. Fixes: b258e4268850 ("usb: dwc2: host: Fix ISOC flow in DDMA mode") CC: stable@vger.kernel.org Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-usb/2024040834-ethically-rumble-701f@gregkh/T/#m4c4b83bef0ebb4b67fe2e0a7d6466cbb6f416e39 Signed-off-by: Minas Harutyunyan Link: https://lore.kernel.org/r/cc826d3ef53c934d8e6d98870f17f3cdc3d2755d.1712665387.git.Minas.Harutyunyan@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/hcd_ddma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc2/hcd_ddma.c b/drivers/usb/dwc2/hcd_ddma.c index 79582b102c7e..994a78ad084b 100644 --- a/drivers/usb/dwc2/hcd_ddma.c +++ b/drivers/usb/dwc2/hcd_ddma.c @@ -867,13 +867,15 @@ static int dwc2_cmpl_host_isoc_dma_desc(struct dwc2_hsotg *hsotg, struct dwc2_dma_desc *dma_desc; struct dwc2_hcd_iso_packet_desc *frame_desc; u16 frame_desc_idx; - struct urb *usb_urb = qtd->urb->priv; + struct urb *usb_urb; u16 remain = 0; int rc = 0; if (!qtd->urb) return -EINVAL; + usb_urb = qtd->urb->priv; + dma_sync_single_for_cpu(hsotg->dev, qh->desc_list_dma + (idx * sizeof(struct dwc2_dma_desc)), sizeof(struct dwc2_dma_desc), -- cgit v1.2.3 From 6d029c25b71f2de2838a6f093ce0fa0e69336154 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 9 Apr 2024 15:38:03 +0200 Subject: selftests/timers/posix_timers: Reimplement check_timer_distribution() check_timer_distribution() runs ten threads in a busy loop and tries to test that the kernel distributes a process posix CPU timer signal to every thread over time. There is not guarantee that this is true even after commit bcb7ee79029d ("posix-timers: Prefer delivery of signals to the current thread") because that commit only avoids waking up the sleeping process leader thread, but that has nothing to do with the actual signal delivery. As the signal is process wide the first thread which observes sigpending and wins the race to lock sighand will deliver the signal. Testing shows that this hangs on a regular base because some threads never win the race. The comment "This primarily tests that the kernel does not favour any one." is wrong. The kernel does favour a thread which hits the timer interrupt when CLOCK_PROCESS_CPUTIME_ID expires. Rewrite the test so it only checks that the group leader sleeping in join() never receives SIGALRM and the thread which burns CPU cycles receives all signals. In older kernels which do not have commit bcb7ee79029d ("posix-timers: Prefer delivery of signals to the current thread") the test-case fails immediately, the very 1st tick wakes the leader up. Otherwise it quickly succeeds after 100 ticks. CI testing wants to use newer selftest versions on stable kernels. In this case the test is guaranteed to fail. So check in the failure case whether the kernel version is less than v6.3 and skip the test result in that case. [ tglx: Massaged change log, renamed the version check helper ] Fixes: e797203fb3ba ("selftests/timers/posix_timers: Test delivery of signals across threads") Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240409133802.GD29396@redhat.com --- tools/testing/selftests/kselftest.h | 13 ++++ tools/testing/selftests/timers/posix_timers.c | 103 ++++++++++++-------------- 2 files changed, 60 insertions(+), 56 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 541bf192e30e..973b18e156b2 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -51,6 +51,7 @@ #include #include #include +#include #endif #ifndef ARRAY_SIZE @@ -388,4 +389,16 @@ static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...) exit(KSFT_SKIP); } +static inline int ksft_min_kernel_version(unsigned int min_major, + unsigned int min_minor) +{ + unsigned int major, minor; + struct utsname info; + + if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) + ksft_exit_fail_msg("Can't parse kernel version\n"); + + return major > min_major || (major == min_major && minor >= min_minor); +} + #endif /* __KSELFTEST_H */ diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index d49dd3ffd0d9..d86a0e00711e 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -184,80 +184,71 @@ static int check_timer_create(int which) return 0; } -int remain; -__thread int got_signal; +static pthread_t ctd_thread; +static volatile int ctd_count, ctd_failed; -static void *distribution_thread(void *arg) +static void ctd_sighandler(int sig) { - while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); - return NULL; + if (pthread_self() != ctd_thread) + ctd_failed = 1; + ctd_count--; } -static void distribution_handler(int nr) +static void *ctd_thread_func(void *arg) { - if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED)) - __atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED); -} - -/* - * Test that all running threads _eventually_ receive CLOCK_PROCESS_CPUTIME_ID - * timer signals. This primarily tests that the kernel does not favour any one. - */ -static int check_timer_distribution(void) -{ - int err, i; - timer_t id; - const int nthreads = 10; - pthread_t threads[nthreads]; struct itimerspec val = { .it_value.tv_sec = 0, .it_value.tv_nsec = 1000 * 1000, .it_interval.tv_sec = 0, .it_interval.tv_nsec = 1000 * 1000, }; + timer_t id; - remain = nthreads + 1; /* worker threads + this thread */ - signal(SIGALRM, distribution_handler); - err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); - if (err < 0) { - ksft_perror("Can't create timer"); - return -1; - } - err = timer_settime(id, 0, &val, NULL); - if (err < 0) { - ksft_perror("Can't set timer"); - return -1; - } + /* 1/10 seconds to ensure the leader sleeps */ + usleep(10000); - for (i = 0; i < nthreads; i++) { - err = pthread_create(&threads[i], NULL, distribution_thread, - NULL); - if (err) { - ksft_print_msg("Can't create thread: %s (%d)\n", - strerror(errno), errno); - return -1; - } - } + ctd_count = 100; + if (timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id)) + return "Can't create timer\n"; + if (timer_settime(id, 0, &val, NULL)) + return "Can't set timer\n"; - /* Wait for all threads to receive the signal. */ - while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); + while (ctd_count > 0 && !ctd_failed) + ; - for (i = 0; i < nthreads; i++) { - err = pthread_join(threads[i], NULL); - if (err) { - ksft_print_msg("Can't join thread: %s (%d)\n", - strerror(errno), errno); - return -1; - } - } + if (timer_delete(id)) + return "Can't delete timer\n"; - if (timer_delete(id)) { - ksft_perror("Can't delete timer"); - return -1; - } + return NULL; +} + +/* + * Test that only the running thread receives the timer signal. + */ +static int check_timer_distribution(void) +{ + const char *errmsg; - ksft_test_result_pass("check_timer_distribution\n"); + signal(SIGALRM, ctd_sighandler); + + errmsg = "Can't create thread\n"; + if (pthread_create(&ctd_thread, NULL, ctd_thread_func, NULL)) + goto err; + + errmsg = "Can't join thread\n"; + if (pthread_join(ctd_thread, (void **)&errmsg) || errmsg) + goto err; + + if (!ctd_failed) + ksft_test_result_pass("check signal distribution\n"); + else if (ksft_min_kernel_version(6, 3)) + ksft_test_result_fail("check signal distribution\n"); + else + ksft_test_result_skip("check signal distribution (old kernel)\n"); return 0; +err: + ksft_print_msg(errmsg); + return -1; } int main(int argc, char **argv) -- cgit v1.2.3 From d7a62d0a9a17e97ce2d4d40431094e09956a0568 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 9 Apr 2024 17:46:23 +0200 Subject: compiler.h: Add missing quote in macro comment Add a missing doublequote in the __is_constexpr() macro comment. Signed-off-by: Thorsten Blum Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c00cc6c0878a..8c252e073bd8 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -268,7 +268,7 @@ static inline void *offset_to_ptr(const int *off) * - When one operand is a null pointer constant (i.e. when x is an integer * constant expression) and the other is an object pointer (i.e. our * third operand), the conditional operator returns the type of the - * object pointer operand (i.e. "int *). Here, within the sizeof(), we + * object pointer operand (i.e. "int *"). Here, within the sizeof(), we * would then get: * sizeof(*((int *)(...)) == sizeof(int) == 4 * - When one operand is a void pointer (i.e. when x is not an integer -- cgit v1.2.3 From 4710e4fc3e2afa6c092db2c65eef365b0836d6db Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 9 Mar 2024 18:15:45 +0100 Subject: KVM: SVM: Remove a useless zeroing of allocated memory Remove KVM's unnecessary zeroing of memory when allocating the pages array in sev_pin_memory() via __vmalloc(), as the array is only used to hold kernel pointers. The kmalloc() path for "small" regions doesn't zero the array, and if KVM leaks state and/or accesses uninitialized data, then the kernel has bigger problems. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/c7619a3d3cbb36463531a7c73ccbde9db587986c.1710004509.git.christophe.jaillet@wanadoo.fr [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 61a7531d41b0..759581bb2128 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -434,7 +434,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); -- cgit v1.2.3 From 19597a71a0c8603f8c8599686ac9b0ff4b846716 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:26 -0800 Subject: KVM: SVM: Create a stack frame in __svm_vcpu_run() for unwinding Unconditionally create a stack frame in __svm_vcpu_run() to play nice with unwinding via frame pointers, at least until the point where RBP is loaded with the guest's value. Don't bother conditioning the code on CONFIG_FRAME_POINTER=y, as RBP needs to be saved and restored anyways (due to it being clobbered with the guest's value); omitting the "MOV RSP, RBP" is not worth the extra #ifdef. Creating a stack frame will allow removing the OBJECT_FILES_NON_STANDARD tag from vmenter.S once __svm_sev_es_vcpu_run() is fixed to not stomp all over RBP for no reason. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 187018c424bf..82637abf0b7e 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -99,6 +99,7 @@ */ SYM_FUNC_START(__svm_vcpu_run) push %_ASM_BP + mov %_ASM_SP, %_ASM_BP #ifdef CONFIG_X86_64 push %r15 push %r14 -- cgit v1.2.3 From 7774c8f32e99b1f314c0df7c204a897792b4f378 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:27 -0800 Subject: KVM: SVM: Wrap __svm_sev_es_vcpu_run() with #ifdef CONFIG_KVM_AMD_SEV Compile (and link) __svm_sev_es_vcpu_run() if and only if SEV support is actually enabled. This will allow dropping non-existent 32-bit "support" from __svm_sev_es_vcpu_run() without causing undue confusion. Intentionally don't provide a stub (but keep the declaration), as any sane compiler, even with things like KASAN enabled, should eliminate the call to __svm_sev_es_vcpu_run() since sev_es_guest() unconditionally returns "false" if CONFIG_KVM_AMD_SEV=n. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 82637abf0b7e..6f57d496867a 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -291,6 +291,7 @@ SYM_FUNC_START(__svm_vcpu_run) SYM_FUNC_END(__svm_vcpu_run) +#ifdef CONFIG_KVM_AMD_SEV /** * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode * @svm: struct vcpu_svm * @@ -389,3 +390,4 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) _ASM_EXTABLE(1b, 3b) SYM_FUNC_END(__svm_sev_es_vcpu_run) +#endif /* CONFIG_KVM_AMD_SEV */ -- cgit v1.2.3 From 331282fdb15edaf1beb1d27a64d3f65a34d7394d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:28 -0800 Subject: KVM: SVM: Drop 32-bit "support" from __svm_sev_es_vcpu_run() Drop 32-bit "support" from __svm_sev_es_vcpu_run(), as SEV/SEV-ES firmly 64-bit only. The "support" was purely the result of bad copy+paste from __svm_vcpu_run(), which in turn was slightly less bad copy+paste from __vmx_vcpu_run(). Opportunistically convert to unadulterated register accesses so that it's easier (but still not easy) to follow which registers hold what arguments, and when. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 44 +++++++++++++------------------------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 6f57d496867a..c057866a459b 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -298,17 +298,12 @@ SYM_FUNC_END(__svm_vcpu_run) * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_sev_es_vcpu_run) - push %_ASM_BP -#ifdef CONFIG_X86_64 + push %rbp push %r15 push %r14 push %r13 push %r12 -#else - push %edi - push %esi -#endif - push %_ASM_BX + push %rbx /* * Save variables needed after vmexit on the stack, in inverse @@ -316,39 +311,31 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ - push %_ASM_ARG2 + push %rsi /* Save @svm. */ - push %_ASM_ARG1 - -.ifnc _ASM_ARG1, _ASM_DI - /* - * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX - * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL. - */ - mov %_ASM_ARG1, %_ASM_DI -.endif + push %rdi /* Clobbers RAX, RCX, RDX. */ RESTORE_GUEST_SPEC_CTRL /* Get svm->current_vmcb->pa into RAX. */ - mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX - mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX + mov SVM_current_vmcb(%rdi), %rax + mov KVM_VMCB_pa(%rax), %rax /* Enter guest mode */ sti -1: vmrun %_ASM_AX +1: vmrun %rax 2: cli /* Pop @svm to RDI, guest registers have been saved already. */ - pop %_ASM_DI + pop %rdi #ifdef CONFIG_MITIGATION_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif /* Clobbers RAX, RCX, RDX. */ @@ -364,26 +351,21 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) UNTRAIN_RET_VM /* "Pop" @spec_ctrl_intercepted. */ - pop %_ASM_BX + pop %rbx - pop %_ASM_BX + pop %rbx -#ifdef CONFIG_X86_64 pop %r12 pop %r13 pop %r14 pop %r15 -#else - pop %esi - pop %edi -#endif - pop %_ASM_BP + pop %rbp RET RESTORE_GUEST_SPEC_CTRL_BODY RESTORE_HOST_SPEC_CTRL_BODY -3: cmpb $0, _ASM_RIP(kvm_rebooting) +3: cmpb $0, kvm_rebooting(%rip) jne 2b ud2 -- cgit v1.2.3 From 87e8e360a05fd29465691aeac179bcf585600c59 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:29 -0800 Subject: KVM: SVM: Clobber RAX instead of RBX when discarding spec_ctrl_intercepted POP @spec_ctrl_intercepted into RAX instead of RBX when discarding it from the stack so that __svm_sev_es_vcpu_run() doesn't modify any non-volatile registers. __svm_sev_es_vcpu_run() doesn't return a value, and RAX is already are clobbered multiple times in the #VMEXIT path. This will allowing using the host save area to save/restore non-volatile registers in __svm_sev_es_vcpu_run(). Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index c057866a459b..db94fb6f610a 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -350,8 +350,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ UNTRAIN_RET_VM - /* "Pop" @spec_ctrl_intercepted. */ - pop %rbx + /* "Pop" and discard @spec_ctrl_intercepted. */ + pop %rax pop %rbx -- cgit v1.2.3 From c92be2fd8edf7b300a758c185fe032fd0257b886 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:30 -0800 Subject: KVM: SVM: Save/restore non-volatile GPRs in SEV-ES VMRUN via host save area Use the host save area to save/restore non-volatile (callee-saved) registers in __svm_sev_es_vcpu_run() to take advantage of hardware loading all registers from the save area on #VMEXIT. KVM still needs to save the registers it wants restored, but the loads are handled automatically by hardware. Aside from less assembly code, letting hardware do the restoration means stack frames are preserved for the entirety of __svm_sev_es_vcpu_run(). Opportunistically add a comment to call out why @svm needs to be saved across VMRUN->#VMEXIT, as it's not easy to decipher that from the macro hell. Cc: Tom Lendacky Cc: Michael Roth Cc: Alexey Kardashevskiy Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 17 ++++++++++------- arch/x86/kvm/svm/svm.h | 3 ++- arch/x86/kvm/svm/vmenter.S | 41 +++++++++++++++++++++++------------------ 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d1a9f9951635..9aaf83c8d57d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1503,6 +1503,11 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } +static struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) +{ + return page_address(sd->save_area) + 0x400; +} + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1519,12 +1524,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * or subsequent vmload of host save area. */ vmsave(sd->save_area_pa); - if (sev_es_guest(vcpu->kvm)) { - struct sev_es_save_area *hostsa; - hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); - - sev_es_prepare_switch_to_guest(hostsa); - } + if (sev_es_guest(vcpu->kvm)) + sev_es_prepare_switch_to_guest(sev_es_host_save_area(sd)); if (tsc_scaling) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); @@ -4101,6 +4102,7 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) { + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); guest_state_enter_irqoff(); @@ -4108,7 +4110,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) - __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted); + __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted, + sev_es_host_save_area(sd)); else __svm_vcpu_run(svm, spec_ctrl_intercepted); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7f1fbd874c45..33878efdebc8 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -698,7 +698,8 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); /* vmenter.S */ -void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); +void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted, + struct sev_es_save_area *hostsa); void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); #define DEFINE_KVM_GHCB_ACCESSORS(field) \ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index db94fb6f610a..7f627d535afc 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -292,23 +292,35 @@ SYM_FUNC_START(__svm_vcpu_run) SYM_FUNC_END(__svm_vcpu_run) #ifdef CONFIG_KVM_AMD_SEV + + +#ifdef CONFIG_X86_64 +#define SEV_ES_GPRS_BASE 0x300 +#define SEV_ES_RBX (SEV_ES_GPRS_BASE + __VCPU_REGS_RBX * WORD_SIZE) +#define SEV_ES_RBP (SEV_ES_GPRS_BASE + __VCPU_REGS_RBP * WORD_SIZE) +#define SEV_ES_R12 (SEV_ES_GPRS_BASE + __VCPU_REGS_R12 * WORD_SIZE) +#define SEV_ES_R13 (SEV_ES_GPRS_BASE + __VCPU_REGS_R13 * WORD_SIZE) +#define SEV_ES_R14 (SEV_ES_GPRS_BASE + __VCPU_REGS_R14 * WORD_SIZE) +#define SEV_ES_R15 (SEV_ES_GPRS_BASE + __VCPU_REGS_R15 * WORD_SIZE) +#endif + /** * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode * @svm: struct vcpu_svm * * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_sev_es_vcpu_run) - push %rbp - push %r15 - push %r14 - push %r13 - push %r12 - push %rbx - /* - * Save variables needed after vmexit on the stack, in inverse - * order compared to when they are needed. + * Save non-volatile (callee-saved) registers to the host save area. + * Except for RAX and RSP, all GPRs are restored on #VMEXIT, but not + * saved on VMRUN. */ + mov %rbp, SEV_ES_RBP (%rdx) + mov %r15, SEV_ES_R15 (%rdx) + mov %r14, SEV_ES_R14 (%rdx) + mov %r13, SEV_ES_R13 (%rdx) + mov %r12, SEV_ES_R12 (%rdx) + mov %rbx, SEV_ES_RBX (%rdx) /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ push %rsi @@ -316,7 +328,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) /* Save @svm. */ push %rdi - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX (@hostsa). */ RESTORE_GUEST_SPEC_CTRL /* Get svm->current_vmcb->pa into RAX. */ @@ -338,7 +350,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX, consumes RDI (@svm). */ RESTORE_HOST_SPEC_CTRL /* @@ -353,13 +365,6 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) /* "Pop" and discard @spec_ctrl_intercepted. */ pop %rax - pop %rbx - - pop %r12 - pop %r13 - pop %r14 - pop %r15 - pop %rbp RET RESTORE_GUEST_SPEC_CTRL_BODY -- cgit v1.2.3 From adac42bf42c1608f23938c03e3ca53fa6c87f337 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:31 -0800 Subject: KVM: SVM: Save/restore args across SEV-ES VMRUN via host save area Use the host save area to preserve volatile registers that are used in __svm_sev_es_vcpu_run() to access function parameters after #VMEXIT. Like saving/restoring non-volatile registers, there's no reason not to take advantage of hardware restoring registers on #VMEXIT, as doing so shaves a few instructions and the save area is going to be accessed no matter what. Converting all register save/restore code to use the host save area also make it easier to follow the SEV-ES VMRUN flow in its entirety, as opposed to having a mix of stack-based versus host save area save/restore. Add a parameter to RESTORE_HOST_SPEC_CTRL_BODY so that the SEV-ES path doesn't need to write @spec_ctrl_intercepted to memory just to play nice with the common macro. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 7f627d535afc..d1613395dd39 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -67,7 +67,7 @@ "", X86_FEATURE_V_SPEC_CTRL 901: .endm -.macro RESTORE_HOST_SPEC_CTRL_BODY +.macro RESTORE_HOST_SPEC_CTRL_BODY spec_ctrl_intercepted:req 900: /* Same for after vmexit. */ mov $MSR_IA32_SPEC_CTRL, %ecx @@ -76,7 +76,7 @@ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL, * if it was not intercepted during guest execution. */ - cmpb $0, (%_ASM_SP) + cmpb $0, \spec_ctrl_intercepted jnz 998f rdmsr movl %eax, SVM_spec_ctrl(%_ASM_DI) @@ -269,7 +269,7 @@ SYM_FUNC_START(__svm_vcpu_run) RET RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP) 10: cmpb $0, _ASM_RIP(kvm_rebooting) jne 2b @@ -298,6 +298,8 @@ SYM_FUNC_END(__svm_vcpu_run) #define SEV_ES_GPRS_BASE 0x300 #define SEV_ES_RBX (SEV_ES_GPRS_BASE + __VCPU_REGS_RBX * WORD_SIZE) #define SEV_ES_RBP (SEV_ES_GPRS_BASE + __VCPU_REGS_RBP * WORD_SIZE) +#define SEV_ES_RSI (SEV_ES_GPRS_BASE + __VCPU_REGS_RSI * WORD_SIZE) +#define SEV_ES_RDI (SEV_ES_GPRS_BASE + __VCPU_REGS_RDI * WORD_SIZE) #define SEV_ES_R12 (SEV_ES_GPRS_BASE + __VCPU_REGS_R12 * WORD_SIZE) #define SEV_ES_R13 (SEV_ES_GPRS_BASE + __VCPU_REGS_R13 * WORD_SIZE) #define SEV_ES_R14 (SEV_ES_GPRS_BASE + __VCPU_REGS_R14 * WORD_SIZE) @@ -322,11 +324,12 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov %r12, SEV_ES_R12 (%rdx) mov %rbx, SEV_ES_RBX (%rdx) - /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ - push %rsi - - /* Save @svm. */ - push %rdi + /* + * Save volatile registers that hold arguments that are needed after + * #VMEXIT (RDI=@svm and RSI=@spec_ctrl_intercepted). + */ + mov %rdi, SEV_ES_RDI (%rdx) + mov %rsi, SEV_ES_RSI (%rdx) /* Clobbers RAX, RCX, RDX (@hostsa). */ RESTORE_GUEST_SPEC_CTRL @@ -342,15 +345,12 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) 2: cli - /* Pop @svm to RDI, guest registers have been saved already. */ - pop %rdi - #ifdef CONFIG_MITIGATION_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif - /* Clobbers RAX, RCX, RDX, consumes RDI (@svm). */ + /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */ RESTORE_HOST_SPEC_CTRL /* @@ -362,13 +362,10 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ UNTRAIN_RET_VM - /* "Pop" and discard @spec_ctrl_intercepted. */ - pop %rax - RET RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY %sil 3: cmpb $0, kvm_rebooting(%rip) jne 2b -- cgit v1.2.3 From 4367a75887ec8d68932cd84ea9cffe24d7a55fa0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:32 -0800 Subject: KVM: SVM: Create a stack frame in __svm_sev_es_vcpu_run() Now that KVM uses the host save area to context switch RBP, i.e. preserves RBP for the entirety of __svm_sev_es_vcpu_run(), create a stack frame using the standared FRAME_{BEGIN,END} macros. Note, __svm_sev_es_vcpu_run() is subtly not a leaf function as it can call into ibpb_feature() via UNTRAIN_RET_VM. Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/vmenter.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index d1613395dd39..a0c8eb37d3e1 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include "kvm-asm-offsets.h" @@ -312,6 +313,8 @@ SYM_FUNC_END(__svm_vcpu_run) * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_sev_es_vcpu_run) + FRAME_BEGIN + /* * Save non-volatile (callee-saved) registers to the host save area. * Except for RAX and RSP, all GPRs are restored on #VMEXIT, but not @@ -362,6 +365,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ UNTRAIN_RET_VM + FRAME_END RET RESTORE_GUEST_SPEC_CTRL_BODY -- cgit v1.2.3 From 27ca867042affef7eba692d3bac7b51ee85e36ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 23 Feb 2024 12:42:33 -0800 Subject: KVM: x86: Stop compiling vmenter.S with OBJECT_FILES_NON_STANDARD Stop compiling vmenter.S with OBJECT_FILES_NON_STANDARD to skip objtool's stack validation now that __svm_vcpu_run() and __svm_sev_es_vcpu_run() create stack frames (though the former's effectiveness is dubious). Note, due to a quirk in how OBJECT_FILES_NON_STANDARD was handled by the build system prior to commit bf48d9b756b9 ("kbuild: change tool coverage variables to take the path relative to $(obj)"), vmx/vmenter.S got lumped in with svm/vmenter.S. __vmx_vcpu_run() already plays nice with frame pointers, i.e. it was collateral damage when commit 7f4b5cde2409 ("kvm: Disable objtool frame pointer checking for vmenter.S") added the OBJECT_FILES_NON_STANDARD hack-a-fix. Link: https://lore.kernel.org/all/20240217055504.2059803-1-masahiroy@kernel.org Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240223204233.3337324-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a88bb14266b6..addc44fc7187 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -3,11 +3,6 @@ ccflags-y += -I $(srctree)/arch/x86/kvm ccflags-$(CONFIG_KVM_WERROR) += -Werror -ifeq ($(CONFIG_FRAME_POINTER),y) -OBJECT_FILES_NON_STANDARD_vmx/vmenter.o := y -OBJECT_FILES_NON_STANDARD_svm/vmenter.o := y -endif - include $(srctree)/virt/kvm/Makefile.kvm kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ -- cgit v1.2.3 From 4c08f01934ab67d1d283d5cbaa52b923abcfe4cd Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Sun, 7 Apr 2024 22:28:02 -0400 Subject: drm/vmwgfx: Enable DMA mappings with SEV Enable DMA mappings in vmwgfx after TTM has been fixed in commit 3bf3710e3718 ("drm/ttm: Add a generic TTM memcpy move for page-based iomem") This enables full guest-backed memory support and in particular allows usage of screen targets as the presentation mechanism. Signed-off-by: Zack Rusin Reported-by: Ye Li Tested-by: Ye Li Fixes: 3b0d6458c705 ("drm/vmwgfx: Refuse DMA operation when SEV encryption is active") Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v6.6+ Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20240408022802.358641-1-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index c7d90f96d16a..0a304706e013 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -666,11 +666,12 @@ static int vmw_dma_select_mode(struct vmw_private *dev_priv) [vmw_dma_map_populate] = "Caching DMA mappings.", [vmw_dma_map_bind] = "Giving up DMA mappings early."}; - /* TTM currently doesn't fully support SEV encryption. */ - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) - return -EINVAL; - - if (vmw_force_coherent) + /* + * When running with SEV we always want dma mappings, because + * otherwise ttm tt pool pages will bounce through swiotlb running + * out of available space. + */ + if (vmw_force_coherent || cc_platform_has(CC_ATTR_MEM_ENCRYPT)) dev_priv->map_mode = vmw_dma_alloc_coherent; else if (vmw_restrict_iommu) dev_priv->map_mode = vmw_dma_map_bind; -- cgit v1.2.3 From 05a2f07db8883b027c0b4a475fcc586278922b8d Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 5 Mar 2024 12:27:27 +0100 Subject: tools/power turbostat: read RAPL counters via perf Some of the future Intel platforms will require reading the RAPL counters via perf and not MSR. On current platforms we can still read them using both ways. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 786 ++++++++++++++++++++++++++++------ 1 file changed, 649 insertions(+), 137 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a380829c5890..283dffb987b5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -37,6 +37,7 @@ #include #include #include +#include #define UNUSED(x) (void)(x) @@ -60,6 +61,7 @@ enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; +enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; struct msr_counter { unsigned int msr_num; @@ -958,6 +960,175 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi #define MAX_ADDED_THREAD_COUNTERS 24 #define BITMASK_SIZE 32 +/* Indexes used to map data read from perf and MSRs into global variables */ +enum rapl_rci_index { + RAPL_RCI_INDEX_ENERGY_PKG = 0, + RAPL_RCI_INDEX_ENERGY_CORES = 1, + RAPL_RCI_INDEX_DRAM = 2, + RAPL_RCI_INDEX_GFX = 3, + RAPL_RCI_INDEX_PKG_PERF_STATUS = 4, + RAPL_RCI_INDEX_DRAM_PERF_STATUS = 5, + RAPL_RCI_INDEX_CORE_ENERGY = 6, + NUM_RAPL_COUNTERS, +}; + +enum rapl_unit { + RAPL_UNIT_INVALID, + RAPL_UNIT_JOULES, + RAPL_UNIT_WATTS, +}; + +struct rapl_counter_info_t { + unsigned long long data[NUM_RAPL_COUNTERS]; + enum rapl_source source[NUM_RAPL_COUNTERS]; + unsigned long long flags[NUM_RAPL_COUNTERS]; + double scale[NUM_RAPL_COUNTERS]; + enum rapl_unit unit[NUM_RAPL_COUNTERS]; + + union { + /* Active when source == RAPL_SOURCE_MSR */ + struct { + unsigned long long msr[NUM_RAPL_COUNTERS]; + unsigned long long msr_mask[NUM_RAPL_COUNTERS]; + int msr_shift[NUM_RAPL_COUNTERS]; + }; + }; + + int fd_perf; +}; + +/* struct rapl_counter_info_t for each RAPL domain */ +struct rapl_counter_info_t *rapl_counter_info_perdomain; + +#define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1) + +struct rapl_counter_arch_info { + int feature_mask; /* Mask for testing if the counter is supported on host */ + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned long long msr_mask; + int msr_shift; /* Positive mean shift right, negative mean shift left */ + double *platform_rapl_msr_scale; /* Scale applied to values read by MSR (platform dependent, filled at runtime) */ + unsigned int rci_index; /* Maps data from perf counters to global variables */ + unsigned long long bic; + double compat_scale; /* Some counters require constant scaling to be in the same range as other, similar ones */ + unsigned long long flags; +}; + +static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = { + { + .feature_mask = RAPL_PKG, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic = BIC_PkgWatt | BIC_Pkg_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = "power", + .perf_name = "energy-pkg", + .msr = MSR_PKG_ENERGY_STAT, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_PKG, + .bic = BIC_PkgWatt | BIC_Pkg_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_CORE_ENERGY_STATUS, + .perf_subsys = "power", + .perf_name = "energy-cores", + .msr = MSR_PP0_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_ENERGY_CORES, + .bic = BIC_CorWatt | BIC_Cor_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_DRAM, + .perf_subsys = "power", + .perf_name = "energy-ram", + .msr = MSR_DRAM_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_dram_energy_units, + .rci_index = RAPL_RCI_INDEX_DRAM, + .bic = BIC_RAMWatt | BIC_RAM_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_GFX, + .perf_subsys = "power", + .perf_name = "energy-gpu", + .msr = MSR_PP1_ENERGY_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_GFX, + .bic = BIC_GFXWatt | BIC_GFX_J, + .compat_scale = 1.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_PKG_PERF_STATUS, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_PKG_PERF_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_time_units, + .rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS, + .bic = BIC_PKG__, + .compat_scale = 100.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_DRAM_PERF_STATUS, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_DRAM_PERF_STATUS, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_time_units, + .rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS, + .bic = BIC_RAM__, + .compat_scale = 100.0, + .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM, + }, + { + .feature_mask = RAPL_AMD_F17H, + .perf_subsys = NULL, + .perf_name = NULL, + .msr = MSR_CORE_ENERGY_STAT, + .msr_mask = 0xFFFFFFFF, + .msr_shift = 0, + .platform_rapl_msr_scale = &rapl_energy_units, + .rci_index = RAPL_RCI_INDEX_CORE_ENERGY, + .bic = BIC_CorWatt | BIC_Cor_J, + .compat_scale = 1.0, + .flags = 0, + }, +}; + +struct rapl_counter { + unsigned long long raw_value; + enum rapl_unit unit; + double scale; +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -984,7 +1155,7 @@ struct core_data { unsigned long long c7; unsigned long long mc6_us; /* duplicate as per-core for now, even though per module */ unsigned int core_temp_c; - unsigned int core_energy; /* MSR_CORE_ENERGY_STAT */ + struct rapl_counter core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_COUNTERS]; @@ -1009,12 +1180,12 @@ struct pkg_data { unsigned int gfx_mhz; unsigned int gfx_act_mhz; unsigned int package_id; - unsigned long long energy_pkg; /* MSR_PKG_ENERGY_STATUS */ - unsigned long long energy_dram; /* MSR_DRAM_ENERGY_STATUS */ - unsigned long long energy_cores; /* MSR_PP0_ENERGY_STATUS */ - unsigned long long energy_gfx; /* MSR_PP1_ENERGY_STATUS */ - unsigned long long rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ - unsigned long long rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ + struct rapl_counter energy_pkg; /* MSR_PKG_ENERGY_STATUS */ + struct rapl_counter energy_dram; /* MSR_DRAM_ENERGY_STATUS */ + struct rapl_counter energy_cores; /* MSR_PP0_ENERGY_STATUS */ + struct rapl_counter energy_gfx; /* MSR_PP1_ENERGY_STATUS */ + struct rapl_counter rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ + struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; unsigned long long counter[MAX_ADDED_COUNTERS]; @@ -1403,6 +1574,21 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) return 0; } +int probe_msr(int cpu, off_t offset) +{ + ssize_t retval; + unsigned long long dummy; + + assert(!no_msr); + + retval = pread(get_msr_fd(cpu), &dummy, sizeof(dummy), offset); + + if (retval != sizeof(dummy)) + return 1; + + return 0; +} + #define MAX_DEFERRED 16 char *deferred_add_names[MAX_DEFERRED]; char *deferred_skip_names[MAX_DEFERRED]; @@ -1755,7 +1941,11 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "c7: %016llX\n", c->c7); outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c); outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt); - outp += sprintf(outp, "Joules: %0X\n", c->core_energy); + + const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale; + const double energy_scale = c->core_energy.scale; + if (c->core_energy.unit == RAPL_UNIT_JOULES) + outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { outp += @@ -1785,12 +1975,12 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p outp += sprintf(outp, "pc10: %016llX\n", p->pc10); outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi); outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi); - outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg); - outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores); - outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx); - outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram); - outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status); - outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status); + outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg.raw_value); + outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value); + outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value); + outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value); + outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value); + outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status.raw_value); outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { @@ -1805,6 +1995,23 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; } +double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desired_unit, double interval) +{ + assert(desired_unit != RAPL_UNIT_INVALID); + + /* + * For now we don't expect anything other than joules, + * so just simplify the logic. + */ + assert(c->unit == RAPL_UNIT_JOULES); + + const double scaled = c->raw_value * c->scale; + + if (desired_unit == RAPL_UNIT_WATTS) + return scaled / interval; + return scaled; +} + /* * column formatting convention & formats */ @@ -1998,9 +2205,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float)); /* print per-package data only for 1st core in package */ if (!is_cpu_first_core_in_package(t, c, p)) @@ -2072,34 +2281,40 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_PkgWatt)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); - + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_GFXWatt)) outp += - sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float); + sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAMWatt)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - p->energy_dram * rapl_dram_energy_units / interval_float); + rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_Pkg_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_GFX_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_RAM_J)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units); + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), + rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float)); if (DO_BIC(BIC_PKG__)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); + rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float)); if (DO_BIC(BIC_RAM__)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), - 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float)); /* UncMHz */ if (DO_BIC(BIC_UNCORE_MHZ)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz); @@ -2208,12 +2423,13 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->gfx_mhz = new->gfx_mhz; old->gfx_act_mhz = new->gfx_act_mhz; - old->energy_pkg = new->energy_pkg - old->energy_pkg; - old->energy_cores = new->energy_cores - old->energy_cores; - old->energy_gfx = new->energy_gfx - old->energy_gfx; - old->energy_dram = new->energy_dram - old->energy_dram; - old->rapl_pkg_perf_status = new->rapl_pkg_perf_status - old->rapl_pkg_perf_status; - old->rapl_dram_perf_status = new->rapl_dram_perf_status - old->rapl_dram_perf_status; + old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value; + old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value; + old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value; + old->energy_dram.raw_value = new->energy_dram.raw_value - old->energy_dram.raw_value; + old->rapl_pkg_perf_status.raw_value = new->rapl_pkg_perf_status.raw_value - old->rapl_pkg_perf_status.raw_value; + old->rapl_dram_perf_status.raw_value = + new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2237,7 +2453,7 @@ void delta_core(struct core_data *new, struct core_data *old) old->core_throt_cnt = new->core_throt_cnt; old->mc6_us = new->mc6_us - old->mc6_us; - DELTA_WRAP32(new->core_energy, old->core_energy); + DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2364,6 +2580,13 @@ int delta_cpu(struct thread_data *t, struct core_data *c, return retval; } +void rapl_counter_clear(struct rapl_counter *c) +{ + c->raw_value = 0; + c->scale = 0.0; + c->unit = RAPL_UNIT_INVALID; +} + void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; @@ -2391,7 +2614,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data c->c7 = 0; c->mc6_us = 0; c->core_temp_c = 0; - c->core_energy = 0; + rapl_counter_clear(&c->core_energy); c->core_throt_cnt = 0; p->pkg_wtd_core_c0 = 0; @@ -2412,12 +2635,12 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->cpu_lpi = 0; p->sys_lpi = 0; - p->energy_pkg = 0; - p->energy_dram = 0; - p->energy_cores = 0; - p->energy_gfx = 0; - p->rapl_pkg_perf_status = 0; - p->rapl_dram_perf_status = 0; + rapl_counter_clear(&p->energy_pkg); + rapl_counter_clear(&p->energy_dram); + rapl_counter_clear(&p->energy_cores); + rapl_counter_clear(&p->energy_gfx); + rapl_counter_clear(&p->rapl_pkg_perf_status); + rapl_counter_clear(&p->rapl_dram_perf_status); p->pkg_temp_c = 0; p->gfx_rc6_ms = 0; @@ -2434,6 +2657,20 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->counter[i] = 0; } +void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src) +{ + /* Copy unit and scale from src if dst is not initialized */ + if (dst->unit == RAPL_UNIT_INVALID) { + dst->unit = src->unit; + dst->scale = src->scale; + } + + assert(dst->unit == src->unit); + assert(dst->scale == src->scale); + + dst->raw_value += src->raw_value; +} + int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; @@ -2480,7 +2717,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c); average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt); - average.cores.core_energy += c->core_energy; + rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2515,10 +2752,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.cpu_lpi = p->cpu_lpi; average.packages.sys_lpi = p->sys_lpi; - average.packages.energy_pkg += p->energy_pkg; - average.packages.energy_dram += p->energy_dram; - average.packages.energy_cores += p->energy_cores; - average.packages.energy_gfx += p->energy_gfx; + rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg); + rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram); + rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores); + rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx); average.packages.gfx_rc6_ms = p->gfx_rc6_ms; average.packages.uncore_mhz = p->uncore_mhz; @@ -2527,8 +2764,8 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); - average.packages.rapl_pkg_perf_status += p->rapl_pkg_perf_status; - average.packages.rapl_dram_perf_status += p->rapl_dram_perf_status; + rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status); + rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0)) @@ -2797,25 +3034,53 @@ struct amperf_group_fd { int mperf; }; -static unsigned int read_perf_counter_info(const char *const path, const char *const parse_format) +static int read_perf_counter_info(const char *const path, const char *const parse_format, void *value_ptr) { int fdmt; - char buf[16]; - unsigned int v; + int bytes_read; + char buf[64]; + int ret = -1; fdmt = open(path, O_RDONLY, 0); - if (fdmt == -1) - errx(1, "Failed to read perf counter info %s\n", path); + if (fdmt == -1) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } - if (read(fdmt, buf, sizeof(buf)) <= 0) - return 0; + bytes_read = read(fdmt, buf, sizeof(buf) - 1); + if (bytes_read <= 0 || bytes_read >= (int)sizeof(buf)) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } - buf[sizeof(buf) - 1] = '\0'; + buf[bytes_read] = '\0'; - if (sscanf(buf, parse_format, &v) != 1) - errx(1, "Failed to parse perf counter info %s\n", path); + if (sscanf(buf, parse_format, value_ptr) != 1) { + if (debug) + fprintf(stderr, "Failed to parse perf counter info %s\n", path); + ret = -1; + goto cleanup_and_exit; + } + ret = 0; + +cleanup_and_exit: close(fdmt); + return ret; +} + +static unsigned int read_perf_counter_info_n(const char *const path, const char *const parse_format) +{ + unsigned int v; + int status; + + status = read_perf_counter_info(path, parse_format, &v); + if (status) + v = -1; return v; } @@ -2825,7 +3090,7 @@ static unsigned read_msr_type(void) const char *const path = "/sys/bus/event_source/devices/msr/type"; const char *const format = "%u"; - return read_perf_counter_info(path, format); + return read_perf_counter_info_n(path, format); } static unsigned read_aperf_config(void) @@ -2833,7 +3098,7 @@ static unsigned read_aperf_config(void) const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; const char *const format = "event=%x"; - return read_perf_counter_info(path, format); + return read_perf_counter_info_n(path, format); } static unsigned read_mperf_config(void) @@ -2841,7 +3106,60 @@ static unsigned read_mperf_config(void) const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; const char *const format = "event=%x"; - return read_perf_counter_info(path, format); + return read_perf_counter_info_n(path, format); +} + +static unsigned read_perf_type(const char *subsys) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/type"; + const char *const format = "%u"; + char path[128]; + + snprintf(path, sizeof(path), path_format, subsys); + + return read_perf_counter_info_n(path, format); +} + +static unsigned read_rapl_config(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; + const char *const format = "event=%x"; + char path[128]; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + return read_perf_counter_info_n(path, format); +} + +static unsigned read_perf_rapl_unit(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit"; + const char *const format = "%s"; + char path[128]; + char unit_buffer[16]; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + read_perf_counter_info(path, format, &unit_buffer); + if (strcmp("Joules", unit_buffer) == 0) + return RAPL_UNIT_JOULES; + + return RAPL_UNIT_INVALID; +} + +static double read_perf_rapl_scale(const char *subsys, const char *event_name) +{ + const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.scale"; + const char *const format = "%lf"; + char path[128]; + double scale; + + snprintf(path, sizeof(path), path_format, subsys, event_name); + + if (read_perf_counter_info(path, format, &scale)) + return 0.0; + + return scale; } static struct amperf_group_fd open_amperf_fd(int cpu) @@ -2961,6 +3279,99 @@ retry: return 0; } +size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_RAPL_COUNTERS; ++i) + if (rci->source[i] == RAPL_SOURCE_PERF) + ++ret; + + return ret; +} + +void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx) +{ + rc->raw_value = rci->data[idx]; + rc->unit = rci->unit[idx]; + rc->scale = rci->scale[idx]; +} + +int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data *p) +{ + unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; + + if (debug) + fprintf(stderr, "get_rapl_counters: cpu%d domain%d\n", cpu, domain); + + assert(rapl_counter_info_perdomain); + + /* + * If we have any perf counters to read, read them all now, in bulk + */ + if (rci->fd_perf != -1) { + size_t num_perf_counters = rapl_counter_info_count_perf(rci); + const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); + const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data)); + if (actual_read_size != expected_read_size) + err(-1, "get_rapl_counters: failed to read perf_data (%zu %zu)", expected_read_size, + actual_read_size); + } + + for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { + switch (rci->source[i]) { + case RAPL_SOURCE_NONE: + break; + + case RAPL_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(rci->fd_perf != -1); + + if (debug) + fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n", + i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]); + + rci->data[i] = perf_data[pi]; + + ++pi; + break; + + case RAPL_SOURCE_MSR: + if (debug) + fprintf(stderr, "Reading rapl counter via msr at %u\n", i); + + assert(!no_msr); + if (rci->flags[i] & RAPL_COUNTER_FLAG_USE_MSR_SUM) { + if (get_msr_sum(cpu, rci->msr[i], &rci->data[i])) + return -13 - i; + } else { + if (get_msr(cpu, rci->msr[i], &rci->data[i])) + return -13 - i; + } + + rci->data[i] &= rci->msr_mask[i]; + if (rci->msr_shift[i] >= 0) + rci->data[i] >>= abs(rci->msr_shift[i]); + else + rci->data[i] <<= abs(rci->msr_shift[i]); + + break; + } + } + + _Static_assert(NUM_RAPL_COUNTERS == 7); + write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG); + write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES); + write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM); + write_rapl_counter(&p->energy_gfx, rci, RAPL_RCI_INDEX_GFX); + write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS); + write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS); + write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY); + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -2972,6 +3383,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) unsigned long long msr; struct msr_counter *mp; int i; + int status; if (cpu_migrate(cpu)) { fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu); @@ -3029,6 +3441,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (!is_cpu_first_thread_in_core(t, c, p)) goto done; + if (platform->has_per_core_rapl) { + status = get_rapl_counters(cpu, c->core_id, c, p); + if (status != 0) + return status; + } + if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) return -6; @@ -3069,12 +3487,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_CORE_THROT_CNT)) get_core_throt_cnt(cpu, &c->core_throt_cnt); - if ((platform->rapl_msrs & RAPL_AMD_F17H) && !no_msr) { - if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) - return -14; - c->core_energy = msr & 0xFFFFFFFF; - } - for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &c->counter[i])) return -10; @@ -3134,42 +3546,10 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_SYS_LPI)) p->sys_lpi = cpuidle_cur_sys_lpi_us; - if (!no_msr) { - if (platform->rapl_msrs & RAPL_PKG) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) - return -13; - p->energy_pkg = msr; - } - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { - if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) - return -14; - p->energy_cores = msr; - } - if (platform->rapl_msrs & RAPL_DRAM) { - if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) - return -15; - p->energy_dram = msr; - } - if (platform->rapl_msrs & RAPL_GFX) { - if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) - return -16; - p->energy_gfx = msr; - } - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) - return -16; - p->rapl_pkg_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { - if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) - return -16; - p->rapl_dram_perf_status = msr; - } - if (platform->rapl_msrs & RAPL_AMD_F17H) { - if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) - return -13; - p->energy_pkg = msr; - } + if (!platform->has_per_core_rapl) { + status = get_rapl_counters(cpu, p->package_id, c, p); + if (status != 0) + return status; } if (DO_BIC(BIC_PkgTmp)) { @@ -3714,6 +4094,21 @@ void free_fd_instr_count_percpu(void) fd_instr_count_percpu = NULL; } +void free_fd_rapl_percpu(void) +{ + if (!rapl_counter_info_perdomain) + return; + + const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + + for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + if (rapl_counter_info_perdomain[domain_id].fd_perf != -1) + close(rapl_counter_info_perdomain[domain_id].fd_perf); + } + + free(rapl_counter_info_perdomain); +} + void free_all_buffers(void) { int i; @@ -3757,6 +4152,7 @@ void free_all_buffers(void) free_fd_percpu(); free_fd_instr_count_percpu(); free_fd_amperf_percpu(); + free_fd_rapl_percpu(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4091,12 +4487,14 @@ static void update_effective_set(bool startup) } void linux_perf_init(void); +void rapl_perf_init(void); void re_initialize(void) { free_all_buffers(); setup_all_buffers(false); linux_perf_init(); + rapl_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -5315,31 +5713,18 @@ void rapl_probe_intel(void) unsigned long long msr; unsigned int time_unit; double tdp; + const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt; + const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J; - if (rapl_joules) { - if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) - BIC_PRESENT(BIC_Pkg_J); - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) - BIC_PRESENT(BIC_Cor_J); - if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) - BIC_PRESENT(BIC_RAM_J); - if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) - BIC_PRESENT(BIC_GFX_J); - } else { - if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) - BIC_PRESENT(BIC_PkgWatt); - if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) - BIC_PRESENT(BIC_CorWatt); - if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) - BIC_PRESENT(BIC_RAMWatt); - if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) - BIC_PRESENT(BIC_GFXWatt); - } + if (rapl_joules) + bic_enabled &= ~bic_watt_bits; + else + bic_enabled &= ~bic_joules_bits; - if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) - BIC_PRESENT(BIC_PKG__); - if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) - BIC_PRESENT(BIC_RAM__); + if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS)) + bic_enabled &= ~BIC_PKG__; + if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS)) + bic_enabled &= ~BIC_RAM__; /* units on package 0, verify later other packages match */ if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr)) @@ -5373,14 +5758,13 @@ void rapl_probe_amd(void) { unsigned long long msr; double tdp; + const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt; + const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - } + if (rapl_joules) + bic_enabled &= ~bic_watt_bits; + else + bic_enabled &= ~bic_joules_bits; if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr)) return; @@ -5885,6 +6269,48 @@ bool is_aperf_access_required(void) || BIC_IS_ENABLED(BIC_IPC); } +int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, + double *scale_, enum rapl_unit *unit_) +{ + if (no_perf) + return -1; + + const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + if (scale == 0.0) + return -1; + + const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name); + if (unit == RAPL_UNIT_INVALID) + return -1; + + const unsigned rapl_type = read_perf_type(cai->perf_subsys); + const unsigned rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + + const int fd_counter = + open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); + if (fd_counter == -1) + return -1; + + /* If it's the first counter opened, make it a group descriptor */ + if (rci->fd_perf == -1) + rci->fd_perf = fd_counter; + + *scale_ = scale; + *unit_ = unit; + return fd_counter; +} + +int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, + double *scale, enum rapl_unit *unit) +{ + int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); + + if (debug) + fprintf(stderr, "add_rapl_perf_counter: %d (cpu: %d)\n", ret, cpu); + + return ret; +} + /* * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover @@ -5908,17 +6334,101 @@ void linux_perf_init(void) } } -static int has_amperf_access_via_msr(void) +void rapl_perf_init(void) { - unsigned long long dummy; + const int num_domains = platform->has_per_core_rapl ? topo.num_cores : topo.num_packages; + bool *domain_visited = calloc(num_domains, sizeof(bool)); + + rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain)); + if (rapl_counter_info_perdomain == NULL) + err(-1, "calloc rapl_counter_info_percpu"); + /* + * Initialize rapl_counter_info_percpu + */ + for (int domain_id = 0; domain_id < num_domains; ++domain_id) { + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; + rci->fd_perf = -1; + for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) { + rci->data[i] = 0; + rci->source[i] = RAPL_SOURCE_NONE; + } + } + + /* + * Open/probe the counters + * If can't get it via perf, fallback to MSR + */ + for (size_t i = 0; i < ARRAY_SIZE(rapl_counter_arch_infos); ++i) { + + const struct rapl_counter_arch_info *const cai = &rapl_counter_arch_infos[i]; + bool has_counter = 0; + double scale; + enum rapl_unit unit; + int next_domain; + + memset(domain_visited, 0, num_domains * sizeof(*domain_visited)); + + for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) { + + if (cpu_is_not_allowed(cpu)) + continue; + + /* Skip already seen and handled RAPL domains */ + next_domain = + platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id; + + if (domain_visited[next_domain]) + continue; + + domain_visited[next_domain] = 1; + + struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain]; + + /* Check if the counter is enabled and accessible */ + if (BIC_IS_ENABLED(cai->bic) && (platform->rapl_msrs & cai->feature_mask)) { + + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name + && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) { + rci->source[cai->rci_index] = RAPL_SOURCE_PERF; + rci->scale[cai->rci_index] = scale * cai->compat_scale; + rci->unit[cai->rci_index] = unit; + rci->flags[cai->rci_index] = cai->flags; + + /* Use MSR for this counter */ + } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + rci->source[cai->rci_index] = RAPL_SOURCE_MSR; + rci->msr[cai->rci_index] = cai->msr; + rci->msr_mask[cai->rci_index] = cai->msr_mask; + rci->msr_shift[cai->rci_index] = cai->msr_shift; + rci->unit[cai->rci_index] = RAPL_UNIT_JOULES; + rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale; + rci->flags[cai->rci_index] = cai->flags; + } + } + + if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE) + has_counter = 1; + } + + /* If any CPU has access to the counter, make it present */ + if (has_counter) + BIC_PRESENT(cai->bic); + } + + free(domain_visited); +} + +static int has_amperf_access_via_msr(void) +{ if (no_msr) return 0; - if (get_msr(base_cpu, MSR_IA32_APERF, &dummy)) + if (probe_msr(base_cpu, MSR_IA32_APERF)) return 0; - if (get_msr(base_cpu, MSR_IA32_MPERF, &dummy)) + if (probe_msr(base_cpu, MSR_IA32_MPERF)) return 0; return 1; @@ -6696,6 +7206,7 @@ bool is_msr_access_required(void) || BIC_IS_ENABLED(BIC_Pkgpc8) || BIC_IS_ENABLED(BIC_Pkgpc9) || BIC_IS_ENABLED(BIC_Pkgpc10) + /* TODO: Multiplex access with perf */ || BIC_IS_ENABLED(BIC_CorWatt) || BIC_IS_ENABLED(BIC_Cor_J) || BIC_IS_ENABLED(BIC_PkgWatt) @@ -6749,6 +7260,7 @@ void turbostat_init() probe_pm_features(); set_amperf_source(); linux_perf_init(); + rapl_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); -- cgit v1.2.3 From 17d1ea136be86f53be0461b0c33daf6b58e6cbf7 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 7 Mar 2024 17:00:35 +0100 Subject: tools/power turbostat: Add selftests Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/testing/selftests/turbostat/defcolumns.py | 59 +++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100755 tools/testing/selftests/turbostat/defcolumns.py diff --git a/tools/testing/selftests/turbostat/defcolumns.py b/tools/testing/selftests/turbostat/defcolumns.py new file mode 100755 index 000000000000..70d3b7780311 --- /dev/null +++ b/tools/testing/selftests/turbostat/defcolumns.py @@ -0,0 +1,59 @@ +#!/bin/env python3 + +import subprocess +from shutil import which + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +# +# By default --list reports also "usec" and "Time_Of_Day_Seconds" columns +# which are only visible when running with --debug. +# +expected_columns_debug = proc_turbostat.stdout.replace(b',', b'\t').strip() +expected_columns = expected_columns_debug.replace(b'usec\t', b'').replace(b'Time_Of_Day_Seconds\t', b'').replace(b'X2APIC\t', b'').replace(b'APIC\t', b'') + +# +# Run turbostat with no options for 10 seconds and send SIGINT +# +timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '1s'] +turbostat_argv = [turbostat, '-i', '0.250'] + +print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) +actual_columns = proc_turbostat.stdout.split(b'\n')[0] +if expected_columns != actual_columns: + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) +print('OK') + +# +# Same, but with --debug +# +turbostat_argv.append('--debug') + +print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) +actual_columns = proc_turbostat.stdout.split(b'\n')[0] +if expected_columns_debug != actual_columns: + print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}') + exit(1) +print('OK') -- cgit v1.2.3 From bb5db22c13125b38b0740e19c18ae94f8e5a0eb6 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 12 Mar 2024 11:19:15 +0800 Subject: tools/power/turbostat: Enable MSR_CORE_C1_RES support for ICX Enable Core C1 hardware residency counter (MSR_CORE_C1_RES) on ICX. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 283dffb987b5..372f67a70d8a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -664,6 +664,7 @@ static const struct platform_features icx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, + .has_msr_core_c1_res = 1, .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, -- cgit v1.2.3 From 4e2bbbf78cf7144204214fd0bd7cca309acd8f89 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 12 Mar 2024 14:23:37 +0800 Subject: tools/power/turbostat: Cache graphics sysfs path Graphics drivers (i915/Xe) have different sysfs knobs on different platforms, and it is possible that different sysfs knobs fit into the same turbostat columns. Instead of specifying different sysfs knobs every time, detect them once and cache the path for future use. No functional change. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 45 +++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 372f67a70d8a..78db4f65a237 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -276,6 +276,19 @@ bool no_msr; bool no_perf; enum amperf_source amperf_source; +enum gfx_sysfs_idx { + GFX_rc6, + GFX_MHz, + GFX_ACTMHz, + GFX_MAX +}; + +struct gfx_sysfs_info { + const char *path; +}; + +static struct gfx_sysfs_info gfx_info[GFX_MAX]; + int get_msr(int cpu, off_t offset, unsigned long long *msr); /* Model specific support Start */ @@ -4620,7 +4633,7 @@ int snapshot_gfx_rc6_ms(void) FILE *fp; int retval; - fp = fopen_or_die("/sys/class/drm/card0/power/rc6_residency_ms", "r"); + fp = fopen_or_die(gfx_info[GFX_rc6].path, "r"); retval = fscanf(fp, "%lld", &gfx_cur_rc6_ms); if (retval != 1) @@ -4645,9 +4658,7 @@ int snapshot_gfx_mhz(void) int retval; if (fp == NULL) { - fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r"); - if (!fp) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); + fp = fopen_or_die(gfx_info[GFX_MHz].path, "r"); } else { rewind(fp); fflush(fp); @@ -4674,9 +4685,7 @@ int snapshot_gfx_act_mhz(void) int retval; if (fp == NULL) { - fp = fopen("/sys/class/drm/card0/gt_act_freq_mhz", "r"); - if (!fp) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); + fp = fopen_or_die(gfx_info[GFX_ACTMHz].path, "r"); } else { rewind(fp); fflush(fp); @@ -5338,14 +5347,24 @@ probe_cluster: static void probe_graphics(void) { if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) - BIC_PRESENT(BIC_GFX_rc6); + gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms"; - if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) - BIC_PRESENT(BIC_GFXMHz); + if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt_cur_freq_mhz"; + else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz"; + + + if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz"; + else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz"; - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + if (gfx_info[GFX_rc6].path) + BIC_PRESENT(BIC_GFX_rc6); + if (gfx_info[GFX_MHz].path) + BIC_PRESENT(BIC_GFXMHz); + if (gfx_info[GFX_ACTMHz].path) BIC_PRESENT(BIC_GFXACTMHz); } -- cgit v1.2.3 From de39d38c06eb047954c5ad20a3f9acb6d3c78498 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 13 Mar 2024 10:12:19 +0800 Subject: tools/power/turbostat: Unify graphics sysfs snapshots Graphics sysfs snapshots share similar logic. Combine them into one function to avoid code duplication. No functional change. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 109 +++++++++++----------------------- 1 file changed, 34 insertions(+), 75 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 78db4f65a237..e5b6161fef48 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -251,11 +251,8 @@ char *output_buffer, *outp; unsigned int do_dts; unsigned int do_ptm; unsigned int do_ipc; -unsigned long long gfx_cur_rc6_ms; unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; -unsigned int gfx_cur_mhz; -unsigned int gfx_act_mhz; unsigned int tj_max; unsigned int tj_max_override; double rapl_power_units, rapl_time_units; @@ -285,6 +282,9 @@ enum gfx_sysfs_idx { struct gfx_sysfs_info { const char *path; + FILE *fp; + unsigned int val; + unsigned long long val_ull; }; static struct gfx_sysfs_info gfx_info[GFX_MAX]; @@ -3573,17 +3573,17 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } if (DO_BIC(BIC_GFX_rc6)) - p->gfx_rc6_ms = gfx_cur_rc6_ms; + p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; /* n.b. assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) p->uncore_mhz = get_uncore_mhz(p->package_id, 0); if (DO_BIC(BIC_GFXMHz)) - p->gfx_mhz = gfx_cur_mhz; + p->gfx_mhz = gfx_info[GFX_MHz].val; if (DO_BIC(BIC_GFXACTMHz)) - p->gfx_act_mhz = gfx_act_mhz; + p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val; for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &p->counter[i])) @@ -4621,81 +4621,40 @@ int snapshot_proc_interrupts(void) } /* - * snapshot_gfx_rc6_ms() + * snapshot_graphics() * - * record snapshot of - * /sys/class/drm/card0/power/rc6_residency_ms + * record snapshot of specified graphics sysfs knob * * return 1 if config change requires a restart, else return 0 */ -int snapshot_gfx_rc6_ms(void) +int snapshot_graphics(int idx) { FILE *fp; int retval; - fp = fopen_or_die(gfx_info[GFX_rc6].path, "r"); - - retval = fscanf(fp, "%lld", &gfx_cur_rc6_ms); - if (retval != 1) - err(1, "GFX rc6"); - - fclose(fp); - - return 0; -} - -/* - * snapshot_gfx_mhz() - * - * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz - * when /sys/class/drm/card0/gt_cur_freq_mhz is not available. - * - * return 1 if config change requires a restart, else return 0 - */ -int snapshot_gfx_mhz(void) -{ - static FILE *fp; - int retval; - - if (fp == NULL) { - fp = fopen_or_die(gfx_info[GFX_MHz].path, "r"); - } else { - rewind(fp); - fflush(fp); - } - - retval = fscanf(fp, "%d", &gfx_cur_mhz); - if (retval != 1) - err(1, "GFX MHz"); - - return 0; -} - -/* - * snapshot_gfx_cur_mhz() - * - * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz - * when /sys/class/drm/card0/gt_act_freq_mhz is not available. - * - * return 1 if config change requires a restart, else return 0 - */ -int snapshot_gfx_act_mhz(void) -{ - static FILE *fp; - int retval; - - if (fp == NULL) { - fp = fopen_or_die(gfx_info[GFX_ACTMHz].path, "r"); - } else { - rewind(fp); - fflush(fp); + switch (idx) { + case GFX_rc6: + fp = fopen_or_die(gfx_info[idx].path, "r"); + retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull); + if (retval != 1) + err(1, "rc6"); + fclose(fp); + return 0; + case GFX_MHz: + case GFX_ACTMHz: + if (gfx_info[idx].fp == NULL) { + gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); + } else { + rewind(gfx_info[idx].fp); + fflush(gfx_info[idx].fp); + } + retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val); + if (retval != 1) + err(1, "MHz"); + return 0; + default: + return -EINVAL; } - - retval = fscanf(fp, "%d", &gfx_act_mhz); - if (retval != 1) - err(1, "GFX ACT MHz"); - - return 0; } /* @@ -4760,13 +4719,13 @@ int snapshot_proc_sysfs_files(void) return 1; if (DO_BIC(BIC_GFX_rc6)) - snapshot_gfx_rc6_ms(); + snapshot_graphics(GFX_rc6); if (DO_BIC(BIC_GFXMHz)) - snapshot_gfx_mhz(); + snapshot_graphics(GFX_MHz); if (DO_BIC(BIC_GFXACTMHz)) - snapshot_gfx_act_mhz(); + snapshot_graphics(GFX_ACTMHz); if (DO_BIC(BIC_CPU_LPI)) snapshot_cpu_lpi_us(); -- cgit v1.2.3 From 60add818ab2543b7e4f2bfeaacf2504743c1eb50 Mon Sep 17 00:00:00 2001 From: Justin Ernst Date: Tue, 2 Apr 2024 13:40:29 -0400 Subject: tools/power/turbostat: Fix uncore frequency file string Running turbostat on a 16 socket HPE Scale-up Compute 3200 (SapphireRapids) fails with: turbostat: /sys/devices/system/cpu/intel_uncore_frequency/package_010_die_00/current_freq_khz: open failed: No such file or directory We observe the sysfs uncore frequency directories named: ... package_09_die_00/ package_10_die_00/ package_11_die_00/ ... package_15_die_00/ The culprit is an incorrect sprintf format string "package_0%d_die_0%d" used with each instance of reading uncore frequency files. uncore-frequency-common.c creates the sysfs directory with the format "package_%02d_die_%02d". Once the package value reaches double digits, the formats diverge. Change each instance of "package_0%d_die_0%d" to "package_%02d_die_%02d". [lenb: deleted the probe part of this patch, as it was already fixed] Signed-off-by: Justin Ernst Reviewed-by: Thomas Renninger Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e5b6161fef48..016a5c7dc9bf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2939,7 +2939,7 @@ unsigned long long get_uncore_mhz(int package, int die) { char path[128]; - sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/current_freq_khz", package, + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz", package, die); return (snapshot_sysfs_counter(path) / 1000); -- cgit v1.2.3 From ff81dade48608363136d52bb2493a6df76458b28 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Wed, 10 Apr 2024 01:35:28 +0800 Subject: io-uring: correct typo in comment for IOU_F_TWQ_LAZY_WAKE The 'r' key is near to 't' key, that makes 'with' to be 'wirh' ? :) Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/r/20240409173531.846714-1-haiyue.wang@intel.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 05df0e399d7c..ac333ea81d31 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -13,7 +13,7 @@ enum { * A hint to not wake right away but delay until there are enough of * tw's queued to match the number of CQEs the task is waiting for. * - * Must not be used wirh requests generating more than one CQE. + * Must not be used with requests generating more than one CQE. * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. */ IOU_F_TWQ_LAZY_WAKE = 1, -- cgit v1.2.3 From 68879386180c0efd5a11e800b0525a01068c9457 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 26 Mar 2024 14:39:20 +0900 Subject: btrfs: zoned: do not flag ZEROOUT on non-dirty extent buffer Btrfs clears the content of an extent buffer marked as EXTENT_BUFFER_ZONED_ZEROOUT before the bio submission. This mechanism is introduced to prevent a write hole of an extent buffer, which is once allocated, marked dirty, but turns out unnecessary and cleaned up within one transaction operation. Currently, btrfs_clear_buffer_dirty() marks the extent buffer as EXTENT_BUFFER_ZONED_ZEROOUT, and skips the entry function. If this call happens while the buffer is under IO (with the WRITEBACK flag set, without the DIRTY flag), we can add the ZEROOUT flag and clear the buffer's content just before a bio submission. As a result: 1) it can lead to adding faulty delayed reference item which leads to a FS corrupted (EUCLEAN) error, and 2) it writes out cleared tree node on disk The former issue is previously discussed in [1]. The corruption happens when it runs a delayed reference update. So, on-disk data is safe. [1] https://lore.kernel.org/linux-btrfs/3f4f2a0ff1a6c818050434288925bdcf3cd719e5.1709124777.git.naohiro.aota@wdc.com/ The latter one can reach on-disk data. But, as that node is already processed by btrfs_clear_buffer_dirty(), that will be invalidated in the next transaction commit anyway. So, the chance of hitting the corruption is relatively small. Anyway, we should skip flagging ZEROOUT on a non-DIRTY extent buffer, to keep the content under IO intact. Fixes: aa6313e6ff2b ("btrfs: zoned: don't clear dirty flag of extent buffer") CC: stable@vger.kernel.org # 6.8 Link: https://lore.kernel.org/linux-btrfs/oadvdekkturysgfgi4qzuemd57zudeasynswurjxw3ocdfsef6@sjyufeugh63f/ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 61594eaf1f89..df3fe36126f9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4154,7 +4154,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, * The actual zeroout of the buffer will happen later in * btree_csum_one_bio. */ - if (btrfs_is_zoned(fs_info)) { + if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); return; } -- cgit v1.2.3 From 073bda7a541731f41ed08f32d286394236c74005 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 26 Mar 2024 14:39:21 +0900 Subject: btrfs: zoned: add ASSERT and WARN for EXTENT_BUFFER_ZONED_ZEROOUT handling Add an ASSERT to catch a faulty delayed reference item resulting from prematurely cleared extent buffer. Also, add a WARN to detect if we try to dirty a ZEROOUT buffer again, which is suspicious as its update will be lost. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 ++++++++ fs/btrfs/extent_io.c | 1 + 2 files changed, 9 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index beedd6ed64d3..257d044bca91 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3464,6 +3464,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root_id != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_ref generic_ref = { 0 }; + /* + * Assert that the extent buffer is not cleared due to + * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer + * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for + * detail. + */ + ASSERT(btrfs_header_bytenr(buf) != 0); + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, buf->start, buf->len, parent, btrfs_header_owner(buf)); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index df3fe36126f9..b18034f2ab80 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4193,6 +4193,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { bool subpage = eb->fs_info->nodesize < PAGE_SIZE; -- cgit v1.2.3 From 1db7959aacd905e6487d0478ac01d89f86eb1e51 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Mar 2024 09:16:46 +1030 Subject: btrfs: do not wait for short bulk allocation [BUG] There is a recent report that when memory pressure is high (including cached pages), btrfs can spend most of its time on memory allocation in btrfs_alloc_page_array() for compressed read/write. [CAUSE] For btrfs_alloc_page_array() we always go alloc_pages_bulk_array(), and even if the bulk allocation failed (fell back to single page allocation) we still retry but with extra memalloc_retry_wait(). If the bulk alloc only returned one page a time, we would spend a lot of time on the retry wait. The behavior was introduced in commit 395cb57e8560 ("btrfs: wait between incomplete batch memory allocations"). [FIX] Although the commit mentioned that other filesystems do the wait, it's not the case at least nowadays. All the mainlined filesystems only call memalloc_retry_wait() if they failed to allocate any page (not only for bulk allocation). If there is any progress, they won't call memalloc_retry_wait() at all. For example, xfs_buf_alloc_pages() would only call memalloc_retry_wait() if there is no allocation progress at all, and the call is not for metadata readahead. So I don't believe we should call memalloc_retry_wait() unconditionally for short allocation. Call memalloc_retry_wait() if it fails to allocate any page for tree block allocation (which goes with __GFP_NOFAIL and may not need the special handling anyway), and reduce the latency for btrfs_alloc_page_array(). Reported-by: Julian Taylor Tested-by: Julian Taylor Link: https://lore.kernel.org/all/8966c095-cbe7-4d22-9784-a647d1bf27c3@1und1.de/ Fixes: 395cb57e8560 ("btrfs: wait between incomplete batch memory allocations") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Sweet Tea Dorminy Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b18034f2ab80..2776112dbdf8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -681,31 +681,21 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, gfp_t extra_gfp) { + const gfp_t gfp = GFP_NOFS | extra_gfp; unsigned int allocated; for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp, - nr_pages, page_array); - - if (allocated == nr_pages) - return 0; - - /* - * During this iteration, no page could be allocated, even - * though alloc_pages_bulk_array() falls back to alloc_page() - * if it could not bulk-allocate. So we must be out of memory. - */ - if (allocated == last) { + allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array); + if (unlikely(allocated == last)) { + /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { __free_page(page_array[i]); page_array[i] = NULL; } return -ENOMEM; } - - memalloc_retry_wait(GFP_NOFS); } return 0; } -- cgit v1.2.3 From 60b703c71fa80de0c2e14af66e57e234019b7da2 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 2 Apr 2024 12:17:16 +0200 Subject: zonefs: Use str_plural() to fix Coccinelle warning Fixes the following Coccinelle/coccicheck warning reported by string_choices.cocci: opportunity for str_plural(zgroup->g_nr_zones) Signed-off-by: Thorsten Blum Signed-off-by: Damien Le Moal --- fs/zonefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index c6a124e8d565..964fa7f24003 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1048,7 +1048,7 @@ static int zonefs_init_zgroup(struct super_block *sb, zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", zonefs_zgroup_name(ztype), zgroup->g_nr_zones, - zgroup->g_nr_zones > 1 ? "s" : ""); + str_plural(zgroup->g_nr_zones)); return 0; } -- cgit v1.2.3 From 9b31152fd74eeb10a20345909e542fef6f1d98e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Apr 2024 18:50:27 -0400 Subject: bcachefs: btree_node_scan: Respect member.data_allowed If a device wasn't used for btree nodes, no need to scan for them. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_node_scan.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index a7d0593b3871..556f76f5c84e 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -216,6 +216,9 @@ static int read_btree_nodes(struct find_btree_nodes *f) closure_init_stack(&cl); for_each_online_member(c, ca) { + if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) + continue; + struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); struct task_struct *t; -- cgit v1.2.3 From 6e45a30fe5e7cf5d42ac07262a3d97644f23dc68 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Tue, 9 Apr 2024 23:53:00 +0000 Subject: fs/9p: remove erroneous nlink init from legacy stat2inode In 9p2000 legacy mode, stat2inode initializes nlink to 1, which is redundant with what alloc_inode should have already set. 9p2000.u overrides this with extensions if present in the stat structure, and 9p2000.L incorporates nlink into its stat structure. At the very least this probably messes with directory nlink accounting in legacy mode. Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index c5b4d3631c47..47bd77199e20 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1064,8 +1064,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct v9fs_session_info *v9ses = sb->s_fs_info; struct v9fs_inode *v9inode = V9FS_I(inode); - set_nlink(inode, 1); - inode_set_atime(inode, stat->atime, 0); inode_set_mtime(inode, stat->mtime, 0); inode_set_ctime(inode, stat->mtime, 0); -- cgit v1.2.3 From 6309863b31dd80317cd7d6824820b44e254e2a9c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2024 08:28:43 +0000 Subject: net: add copy_safe_from_sockptr() helper copy_from_sockptr() helper is unsafe, unless callers did the prior check against user provided optlen. Too many callers get this wrong, lets add a helper to fix them and avoid future copy/paste bugs. Instead of : if (optlen < sizeof(opt)) { err = -EINVAL; break; } if (copy_from_sockptr(&opt, optval, sizeof(opt)) { err = -EFAULT; break; } Use : err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240408082845.3957374-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/sockptr.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index 307961b41541..317200cd3a60 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -50,11 +50,36 @@ static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, return 0; } +/* Deprecated. + * This is unsafe, unless caller checked user provided optlen. + * Prefer copy_safe_from_sockptr() instead. + */ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) { return copy_from_sockptr_offset(dst, src, 0, size); } +/** + * copy_safe_from_sockptr: copy a struct from sockptr + * @dst: Destination address, in kernel space. This buffer must be @ksize + * bytes long. + * @ksize: Size of @dst struct. + * @optval: Source address. (in user or kernel space) + * @optlen: Size of @optval data. + * + * Returns: + * * -EINVAL: @optlen < @ksize + * * -EFAULT: access to userspace failed. + * * 0 : @ksize bytes were copied + */ +static inline int copy_safe_from_sockptr(void *dst, size_t ksize, + sockptr_t optval, unsigned int optlen) +{ + if (optlen < ksize) + return -EINVAL; + return copy_from_sockptr(dst, optval, ksize); +} + static inline int copy_struct_from_sockptr(void *dst, size_t ksize, sockptr_t src, size_t usize) { -- cgit v1.2.3 From 138b787804f4a10417618e8d1e6e2700539fd88c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2024 08:28:44 +0000 Subject: mISDN: fix MISDN_TIME_STAMP handling syzbot reports one unsafe call to copy_from_sockptr() [1] Use copy_safe_from_sockptr() instead. [1] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in data_sock_setsockopt+0x46c/0x4cc drivers/isdn/mISDN/socket.c:417 Read of size 4 at addr ffff0000c6d54083 by task syz-executor406/6167 CPU: 1 PID: 6167 Comm: syz-executor406 Not tainted 6.8.0-rc7-syzkaller-g707081b61156 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call trace: dump_backtrace+0x1b8/0x1e4 arch/arm64/kernel/stacktrace.c:291 show_stack+0x2c/0x3c arch/arm64/kernel/stacktrace.c:298 __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd0/0x124 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:377 [inline] print_report+0x178/0x518 mm/kasan/report.c:488 kasan_report+0xd8/0x138 mm/kasan/report.c:601 __asan_report_load_n_noabort+0x1c/0x28 mm/kasan/report_generic.c:391 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] data_sock_setsockopt+0x46c/0x4cc drivers/isdn/mISDN/socket.c:417 do_sock_setsockopt+0x2a0/0x4e0 net/socket.c:2311 __sys_setsockopt+0x128/0x1a8 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __arm64_sys_setsockopt+0xb8/0xd4 net/socket.c:2340 __invoke_syscall arch/arm64/kernel/syscall.c:34 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:48 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:133 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:152 el0_svc+0x54/0x168 arch/arm64/kernel/entry-common.c:712 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Fixes: 1b2b03f8e514 ("Add mISDN core files") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Karsten Keil Link: https://lore.kernel.org/r/20240408082845.3957374-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/isdn/mISDN/socket.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 2776ca5fc33f..b215b28cad7b 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -401,23 +401,23 @@ data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } static int data_sock_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int len) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0, opt = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %d, %x, optval, %d)\n", __func__, sock, - level, optname, len); + level, optname, optlen); lock_sock(sk); switch (optname) { case MISDN_TIME_STAMP: - if (copy_from_sockptr(&opt, optval, sizeof(int))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt) _pms(sk)->cmask |= MISDN_TIME_STAMP; -- cgit v1.2.3 From 7a87441c9651ba37842f4809224aca13a554a26f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2024 08:28:45 +0000 Subject: nfc: llcp: fix nfc_llcp_setsockopt() unsafe copies syzbot reported unsafe calls to copy_from_sockptr() [1] Use copy_safe_from_sockptr() instead. [1] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in nfc_llcp_setsockopt+0x6c2/0x850 net/nfc/llcp_sock.c:255 Read of size 4 at addr ffff88801caa1ec3 by task syz-executor459/5078 CPU: 0 PID: 5078 Comm: syz-executor459 Not tainted 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] nfc_llcp_setsockopt+0x6c2/0x850 net/nfc/llcp_sock.c:255 do_sock_setsockopt+0x3b1/0x720 net/socket.c:2311 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfd/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7f7fac07fd89 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 91 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff660eb788 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f7fac07fd89 RDX: 0000000000000000 RSI: 0000000000000118 RDI: 0000000000000004 RBP: 0000000000000000 R08: 0000000000000002 R09: 0000000000000000 R10: 0000000020000a80 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240408082845.3957374-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/nfc/llcp_sock.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 819157bbb5a2..d5344563e525 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -252,10 +252,10 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt > LLCP_MAX_RW) { err = -EINVAL; @@ -274,10 +274,10 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt > LLCP_MAX_MIUX) { err = -EINVAL; -- cgit v1.2.3 From 7633c4da919ad51164acbf1aa322cc1a3ead6129 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Mon, 8 Apr 2024 16:18:21 +0200 Subject: ipv6: fix race condition between ipv6_get_ifaddr and ipv6_del_addr Although ipv6_get_ifaddr walks inet6_addr_lst under the RCU lock, it still means hlist_for_each_entry_rcu can return an item that got removed from the list. The memory itself of such item is not freed thanks to RCU but nothing guarantees the actual content of the memory is sane. In particular, the reference count can be zero. This can happen if ipv6_del_addr is called in parallel. ipv6_del_addr removes the entry from inet6_addr_lst (hlist_del_init_rcu(&ifp->addr_lst)) and drops all references (__in6_ifa_put(ifp) + in6_ifa_put(ifp)). With bad enough timing, this can happen: 1. In ipv6_get_ifaddr, hlist_for_each_entry_rcu returns an entry. 2. Then, the whole ipv6_del_addr is executed for the given entry. The reference count drops to zero and kfree_rcu is scheduled. 3. ipv6_get_ifaddr continues and tries to increments the reference count (in6_ifa_hold). 4. The rcu is unlocked and the entry is freed. 5. The freed entry is returned. Prevent increasing of the reference count in such case. The name in6_ifa_hold_safe is chosen to mimic the existing fib6_info_hold_safe. [ 41.506330] refcount_t: addition on 0; use-after-free. [ 41.506760] WARNING: CPU: 0 PID: 595 at lib/refcount.c:25 refcount_warn_saturate+0xa5/0x130 [ 41.507413] Modules linked in: veth bridge stp llc [ 41.507821] CPU: 0 PID: 595 Comm: python3 Not tainted 6.9.0-rc2.main-00208-g49563be82afa #14 [ 41.508479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) [ 41.509163] RIP: 0010:refcount_warn_saturate+0xa5/0x130 [ 41.509586] Code: ad ff 90 0f 0b 90 90 c3 cc cc cc cc 80 3d c0 30 ad 01 00 75 a0 c6 05 b7 30 ad 01 01 90 48 c7 c7 38 cc 7a 8c e8 cc 18 ad ff 90 <0f> 0b 90 90 c3 cc cc cc cc 80 3d 98 30 ad 01 00 0f 85 75 ff ff ff [ 41.510956] RSP: 0018:ffffbda3c026baf0 EFLAGS: 00010282 [ 41.511368] RAX: 0000000000000000 RBX: ffff9e9c46914800 RCX: 0000000000000000 [ 41.511910] RDX: ffff9e9c7ec29c00 RSI: ffff9e9c7ec1c900 RDI: ffff9e9c7ec1c900 [ 41.512445] RBP: ffff9e9c43660c9c R08: 0000000000009ffb R09: 00000000ffffdfff [ 41.512998] R10: 00000000ffffdfff R11: ffffffff8ca58a40 R12: ffff9e9c4339a000 [ 41.513534] R13: 0000000000000001 R14: ffff9e9c438a0000 R15: ffffbda3c026bb48 [ 41.514086] FS: 00007fbc4cda1740(0000) GS:ffff9e9c7ec00000(0000) knlGS:0000000000000000 [ 41.514726] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 41.515176] CR2: 000056233b337d88 CR3: 000000000376e006 CR4: 0000000000370ef0 [ 41.515713] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 41.516252] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 41.516799] Call Trace: [ 41.517037] [ 41.517249] ? __warn+0x7b/0x120 [ 41.517535] ? refcount_warn_saturate+0xa5/0x130 [ 41.517923] ? report_bug+0x164/0x190 [ 41.518240] ? handle_bug+0x3d/0x70 [ 41.518541] ? exc_invalid_op+0x17/0x70 [ 41.520972] ? asm_exc_invalid_op+0x1a/0x20 [ 41.521325] ? refcount_warn_saturate+0xa5/0x130 [ 41.521708] ipv6_get_ifaddr+0xda/0xe0 [ 41.522035] inet6_rtm_getaddr+0x342/0x3f0 [ 41.522376] ? __pfx_inet6_rtm_getaddr+0x10/0x10 [ 41.522758] rtnetlink_rcv_msg+0x334/0x3d0 [ 41.523102] ? netlink_unicast+0x30f/0x390 [ 41.523445] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 41.523832] netlink_rcv_skb+0x53/0x100 [ 41.524157] netlink_unicast+0x23b/0x390 [ 41.524484] netlink_sendmsg+0x1f2/0x440 [ 41.524826] __sys_sendto+0x1d8/0x1f0 [ 41.525145] __x64_sys_sendto+0x1f/0x30 [ 41.525467] do_syscall_64+0xa5/0x1b0 [ 41.525794] entry_SYSCALL_64_after_hwframe+0x72/0x7a [ 41.526213] RIP: 0033:0x7fbc4cfcea9a [ 41.526528] Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 7e c3 0f 1f 44 00 00 41 54 48 83 ec 30 44 89 [ 41.527942] RSP: 002b:00007ffcf54012a8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 41.528593] RAX: ffffffffffffffda RBX: 00007ffcf5401368 RCX: 00007fbc4cfcea9a [ 41.529173] RDX: 000000000000002c RSI: 00007fbc4b9d9bd0 RDI: 0000000000000005 [ 41.529786] RBP: 00007fbc4bafb040 R08: 00007ffcf54013e0 R09: 000000000000000c [ 41.530375] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 41.530977] R13: ffffffffc4653600 R14: 0000000000000001 R15: 00007fbc4ca85d1b [ 41.531573] Fixes: 5c578aedcb21d ("IPv6: convert addrconf hash list to RCU") Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jiri Benc Link: https://lore.kernel.org/r/8ab821e36073a4a406c50ec83c9e8dc586c539e4.1712585809.git.jbenc@redhat.com Signed-off-by: Jakub Kicinski --- include/net/addrconf.h | 4 ++++ net/ipv6/addrconf.c | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 9d06eb945509..62a407db1bf5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -438,6 +438,10 @@ static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) refcount_inc(&ifp->refcnt); } +static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp) +{ + return refcount_inc_not_zero(&ifp->refcnt); +} /* * compute link-local solicited-node multicast address diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 92db9b474f2b..779aa6ecdd49 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2091,9 +2091,10 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add if (ipv6_addr_equal(&ifp->addr, addr)) { if (!dev || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { - result = ifp; - in6_ifa_hold(ifp); - break; + if (in6_ifa_hold_safe(ifp)) { + result = ifp; + break; + } } } } -- cgit v1.2.3 From 8bdfb4ea95ca738d33ef71376c21eba20130f2eb Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Tue, 26 Mar 2024 15:32:46 -0400 Subject: drm/amdkfd: Reset GPU on queue preemption failure Currently, with F32 HWS GPU reset is only when unmap queue fails. However, if compute queue doesn't repond to preemption request in time unmap will return without any error. In this case, only preemption error is logged and Reset is not triggered. Call GPU reset in this case also. Reviewed-by: Alex Deucher Signed-off-by: Harish Kasiviswanathan Reviewed-by: Mukul Joshi Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f4d395e38683..0b655555e167 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2001,6 +2001,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n"); while (halt_if_hws_hang) schedule(); + kfd_hws_hang(dqm); return -ETIME; } -- cgit v1.2.3 From 65ff8092e4802f96d87d3d7cde146961f5228265 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sat, 23 Mar 2024 20:46:53 -0400 Subject: drm/amdgpu: always force full reset for SOC21 There are cases where soft reset seems to succeed, but does not, so always use mode1/2 for now. Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc21.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 581a3bd11481..8526282f4da1 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -457,10 +457,8 @@ static bool soc21_need_full_reset(struct amdgpu_device *adev) { switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(11, 0, 0): - return amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC); case IP_VERSION(11, 0, 2): case IP_VERSION(11, 0, 3): - return false; default: return true; } -- cgit v1.2.3 From 4b18a91faf1752f9bd69a4ed3aed2c8f6e5b0528 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Thu, 21 Mar 2024 17:46:36 +0530 Subject: drm/amdgpu: Refine IB schedule error logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Downgrade to debug information when IBs are skipped. Also, use dev_* to identify the device. Signed-off-by: Lijo Lazar Reviewed-by: Christian König Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 4b3000c21ef2..e4742b65032d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -304,12 +304,15 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) dma_fence_set_error(finished, -ECANCELED); if (finished->error < 0) { - DRM_INFO("Skip scheduling IBs!\n"); + dev_dbg(adev->dev, "Skip scheduling IBs in ring(%s)", + ring->name); } else { r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, &fence); if (r) - DRM_ERROR("Error scheduling IBs (%d)\n", r); + dev_err(adev->dev, + "Error scheduling IBs (%d) in ring(%s)", r, + ring->name); } job->job_run_counter++; -- cgit v1.2.3 From 0f1bbcc2bab25d5fb2dfb1ee3e08131437690d3d Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Mon, 25 Mar 2024 13:24:31 +0800 Subject: drm/amdgpu/umsch: reinitialize write pointer in hw init Otherwise the old one will be used during GPU reset. That's not expected. Signed-off-by: Lang Yu Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c b/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c index 84368cf1e175..bd57896ab85d 100644 --- a/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c @@ -225,6 +225,8 @@ static int umsch_mm_v4_0_ring_start(struct amdgpu_umsch_mm *umsch) WREG32_SOC15(VCN, 0, regVCN_UMSCH_RB_SIZE, ring->ring_size); + ring->wptr = 0; + data = RREG32_SOC15(VCN, 0, regVCN_RB_ENABLE); data &= ~(VCN_RB_ENABLE__AUDIO_RB_EN_MASK); WREG32_SOC15(VCN, 0, regVCN_RB_ENABLE, data); -- cgit v1.2.3 From 8b2be55f4d6c1099d7f629b0ed7535a5be788c83 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Wed, 14 Feb 2024 17:55:54 +0530 Subject: drm/amdgpu: Reset dGPU if suspend got aborted For SOC21 ASICs, there is an issue in re-enabling PM features if a suspend got aborted. In such cases, reset the device during resume phase. This is a workaround till a proper solution is finalized. Signed-off-by: Lijo Lazar Reviewed-by: Alex Deucher Reviewed-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc21.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 8526282f4da1..abe319b0f063 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -867,10 +867,35 @@ static int soc21_common_suspend(void *handle) return soc21_common_hw_fini(adev); } +static bool soc21_need_reset_on_resume(struct amdgpu_device *adev) +{ + u32 sol_reg1, sol_reg2; + + /* Will reset for the following suspend abort cases. + * 1) Only reset dGPU side. + * 2) S3 suspend got aborted and TOS is active. + */ + if (!(adev->flags & AMD_IS_APU) && adev->in_s3 && + !adev->suspend_complete) { + sol_reg1 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81); + msleep(100); + sol_reg2 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81); + + return (sol_reg1 != sol_reg2); + } + + return false; +} + static int soc21_common_resume(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + if (soc21_need_reset_on_resume(adev)) { + dev_info(adev->dev, "S3 suspend aborted, resetting..."); + soc21_asic_reset(adev); + } + return soc21_common_hw_init(adev); } -- cgit v1.2.3 From d4396924c3d44f34d0643f650e70892e07f3677f Mon Sep 17 00:00:00 2001 From: Li Ma Date: Thu, 28 Mar 2024 10:55:10 +0800 Subject: drm/amd/display: add DCN 351 version for microcode load There is a new DCN veriosn 3.5.1 need to load Signed-off-by: Li Ma Reviewed-by: Yifan Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 71d2d44681b2..173438fc7537 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -148,6 +148,9 @@ MODULE_FIRMWARE(FIRMWARE_NAVI12_DMCU); #define FIRMWARE_DCN_35_DMUB "amdgpu/dcn_3_5_dmcub.bin" MODULE_FIRMWARE(FIRMWARE_DCN_35_DMUB); +#define FIRMWARE_DCN_351_DMUB "amdgpu/dcn_3_5_1_dmcub.bin" +MODULE_FIRMWARE(FIRMWARE_DCN_351_DMUB); + /* Number of bytes in PSP header for firmware. */ #define PSP_HEADER_BYTES 0x100 @@ -4820,9 +4823,11 @@ static int dm_init_microcode(struct amdgpu_device *adev) fw_name_dmub = FIRMWARE_DCN_V3_2_1_DMCUB; break; case IP_VERSION(3, 5, 0): - case IP_VERSION(3, 5, 1): fw_name_dmub = FIRMWARE_DCN_35_DMUB; break; + case IP_VERSION(3, 5, 1): + fw_name_dmub = FIRMWARE_DCN_351_DMUB; + break; default: /* ASIC doesn't support DMUB. */ return 0; -- cgit v1.2.3 From 31729e8c21ecfd671458e02b6511eb68c2225113 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Wed, 27 Mar 2024 13:10:37 +0800 Subject: drm/amd/pm: fixes a random hang in S4 for SMU v13.0.4/11 While doing multiple S4 stress tests, GC/RLC/PMFW get into an invalid state resulting into hard hangs. Adding a GFX reset as workaround just before sending the MP1_UNLOAD message avoids this failure. Signed-off-by: Tim Huang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c index bb98156b2fa1..949131bd1ecb 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c @@ -226,8 +226,18 @@ static int smu_v13_0_4_system_features_control(struct smu_context *smu, bool en) struct amdgpu_device *adev = smu->adev; int ret = 0; - if (!en && !adev->in_s0ix) + if (!en && !adev->in_s0ix) { + /* Adds a GFX reset as workaround just before sending the + * MP1_UNLOAD message to prevent GC/RLC/PMFW from entering + * an invalid state. + */ + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, + SMU_RESET_MODE_2, NULL); + if (ret) + return ret; + ret = smu_cmn_send_smc_msg(smu, SMU_MSG_PrepareMp1ForUnload, NULL); + } return ret; } -- cgit v1.2.3 From a3a4c0b12346a2493b41c8790d85141844a04e28 Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Fri, 22 Mar 2024 12:25:16 -0400 Subject: drm/amdgpu : Add mes_log_enable to control mes log feature The MES log might slow down the performance for extra step of log the data, disable it by default and introduce a parameter can enable it when necessary Signed-off-by: shaoyunl Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++++- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 7 +++++-- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9c62552bec34..b3b84647207e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -210,6 +210,7 @@ extern int amdgpu_async_gfx_ring; extern int amdgpu_mcbp; extern int amdgpu_discovery; extern int amdgpu_mes; +extern int amdgpu_mes_log_enable; extern int amdgpu_mes_kiq; extern int amdgpu_noretry; extern int amdgpu_force_asic_type; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 80b9642f2bc4..e4277298cf1a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -195,6 +195,7 @@ int amdgpu_async_gfx_ring = 1; int amdgpu_mcbp = -1; int amdgpu_discovery = -1; int amdgpu_mes; +int amdgpu_mes_log_enable = 0; int amdgpu_mes_kiq; int amdgpu_noretry = -1; int amdgpu_force_asic_type = -1; @@ -667,6 +668,15 @@ MODULE_PARM_DESC(mes, "Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)"); module_param_named(mes, amdgpu_mes, int, 0444); +/** + * DOC: mes_log_enable (int) + * Enable Micro Engine Scheduler log. This is used to enable/disable MES internal log. + * (0 = disabled (default), 1 = enabled) + */ +MODULE_PARM_DESC(mes_log_enable, + "Enable Micro Engine Scheduler log (0 = disabled (default), 1 = enabled)"); +module_param_named(mes_log_enable, amdgpu_mes_log_enable, int, 0444); + /** * DOC: mes_kiq (int) * Enable Micro Engine Scheduler KIQ. This is a new engine pipe for kiq. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index a98e03e0a51f..cfd613ea39b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -102,6 +102,9 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) { int r; + if (!amdgpu_mes_log_enable) + return 0; + r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, &adev->mes.event_log_gpu_obj, @@ -1565,7 +1568,7 @@ void amdgpu_debugfs_mes_event_log_init(struct amdgpu_device *adev) #if defined(CONFIG_DEBUG_FS) struct drm_minor *minor = adev_to_drm(adev)->primary; struct dentry *root = minor->debugfs_root; - if (adev->enable_mes) + if (adev->enable_mes && amdgpu_mes_log_enable) debugfs_create_file("amdgpu_mes_event_log", 0444, root, adev, &amdgpu_debugfs_mes_event_log_fops); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 072c478665ad..63f281a9984d 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -411,8 +411,11 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) mes_set_hw_res_pkt.enable_reg_active_poll = 1; mes_set_hw_res_pkt.enable_level_process_quantum_check = 1; mes_set_hw_res_pkt.oversubscription_timer = 50; - mes_set_hw_res_pkt.enable_mes_event_int_logging = 1; - mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr; + if (amdgpu_mes_log_enable) { + mes_set_hw_res_pkt.enable_mes_event_int_logging = 1; + mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = + mes->event_log_gpu_addr; + } return mes_v11_0_submit_pkt_and_poll_completion(mes, &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), -- cgit v1.2.3 From 5b0cd091d905ee9da0a3ecdf06b9cbdd17ba711d Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Fri, 22 Mar 2024 12:44:55 -0400 Subject: drm/amdgpu : Increase the mes log buffer size as per new MES FW version From MES version 0x54, the log entry increased and require the log buffer size to be increased. The 16k is maximum size agreed Signed-off-by: shaoyunl Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++--- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index cfd613ea39b3..a00cf4756ad0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -105,7 +105,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) if (!amdgpu_mes_log_enable) return 0; - r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, + r = amdgpu_bo_create_kernel(adev, AMDGPU_MES_LOG_BUFFER_SIZE, PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, &adev->mes.event_log_gpu_obj, &adev->mes.event_log_gpu_addr, @@ -1552,12 +1552,11 @@ static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused) uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr); seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4, - mem, PAGE_SIZE, false); + mem, AMDGPU_MES_LOG_BUFFER_SIZE, false); return 0; } - DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 7d4f93fea937..4c8fc3117ef8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -52,6 +52,7 @@ enum amdgpu_mes_priority_level { #define AMDGPU_MES_PROC_CTX_SIZE 0x1000 /* one page area */ #define AMDGPU_MES_GANG_CTX_SIZE 0x1000 /* one page area */ +#define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 /* Maximu log buffer size for MES */ struct amdgpu_mes_funcs; -- cgit v1.2.3 From c5b1ccff26950d50bf2043cb2af9bafb1f08bbaf Mon Sep 17 00:00:00 2001 From: lima1002 Date: Mon, 29 Jan 2024 20:17:54 +0800 Subject: drm/amd/swsmu: Update smu v14.0.0 headers to be 14.0.1 compatible update ppsmc.h pmfw.h and driver_if.h for smu v14_0_1 Reviewed-by: Alex Deucher Signed-off-by: lima1002 Signed-off-by: Alex Deucher --- .../pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h | 33 +- .../amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h | 55 +++- .../amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h | 18 +- drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h | 1 + drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c | 2 +- .../gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c | 347 +++++++++++++++++++-- 6 files changed, 413 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h index 5bb7a63c0602..97522c085258 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0_0.h @@ -144,6 +144,37 @@ typedef struct { uint32_t MaxGfxClk; } DpmClocks_t; +//Freq in MHz +//Voltage in milli volts with 2 fractional bits +typedef struct { + uint32_t DcfClocks[NUM_DCFCLK_DPM_LEVELS]; + uint32_t DispClocks[NUM_DISPCLK_DPM_LEVELS]; + uint32_t DppClocks[NUM_DPPCLK_DPM_LEVELS]; + uint32_t SocClocks[NUM_SOCCLK_DPM_LEVELS]; + uint32_t VClocks0[NUM_VCN_DPM_LEVELS]; + uint32_t VClocks1[NUM_VCN_DPM_LEVELS]; + uint32_t DClocks0[NUM_VCN_DPM_LEVELS]; + uint32_t DClocks1[NUM_VCN_DPM_LEVELS]; + uint32_t VPEClocks[NUM_VPE_DPM_LEVELS]; + uint32_t FclkClocks_Freq[NUM_FCLK_DPM_LEVELS]; + uint32_t FclkClocks_Voltage[NUM_FCLK_DPM_LEVELS]; + uint32_t SocVoltage[NUM_SOC_VOLTAGE_LEVELS]; + MemPstateTable_t MemPstateTable[NUM_MEM_PSTATE_LEVELS]; + + uint8_t NumDcfClkLevelsEnabled; + uint8_t NumDispClkLevelsEnabled; //Applies to both Dispclk and Dppclk + uint8_t NumSocClkLevelsEnabled; + uint8_t Vcn0ClkLevelsEnabled; //Applies to both Vclk0 and Dclk0 + uint8_t Vcn1ClkLevelsEnabled; //Applies to both Vclk1 and Dclk1 + uint8_t VpeClkLevelsEnabled; + uint8_t NumMemPstatesEnabled; + uint8_t NumFclkLevelsEnabled; + uint8_t spare; + + uint32_t MinGfxClk; + uint32_t MaxGfxClk; +} DpmClocks_t_v14_0_1; + typedef struct { uint16_t CoreFrequency[16]; //Target core frequency [MHz] uint16_t CorePower[16]; //CAC calculated core power [mW] @@ -224,7 +255,7 @@ typedef enum { #define TABLE_CUSTOM_DPM 2 // Called by Driver #define TABLE_BIOS_GPIO_CONFIG 3 // Called by BIOS #define TABLE_DPMCLOCKS 4 // Called by Driver and VBIOS -#define TABLE_SPARE0 5 // Unused +#define TABLE_MOMENTARY_PM 5 // Called by Tools #define TABLE_MODERN_STDBY 6 // Called by Tools for Modern Standby Log #define TABLE_SMU_METRICS 7 // Called by Driver and SMF/PMF #define TABLE_COUNT 8 diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h index 356e0f57a426..ddb625860083 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h @@ -42,7 +42,7 @@ #define FEATURE_EDC_BIT 7 #define FEATURE_PLL_POWER_DOWN_BIT 8 #define FEATURE_VDDOFF_BIT 9 -#define FEATURE_VCN_DPM_BIT 10 +#define FEATURE_VCN_DPM_BIT 10 /* this is for both VCN0 and VCN1 */ #define FEATURE_DS_MPM_BIT 11 #define FEATURE_FCLK_DPM_BIT 12 #define FEATURE_SOCCLK_DPM_BIT 13 @@ -56,9 +56,9 @@ #define FEATURE_DS_GFXCLK_BIT 21 #define FEATURE_DS_SOCCLK_BIT 22 #define FEATURE_DS_LCLK_BIT 23 -#define FEATURE_LOW_POWER_DCNCLKS_BIT 24 // for all DISP clks +#define FEATURE_LOW_POWER_DCNCLKS_BIT 24 #define FEATURE_DS_SHUBCLK_BIT 25 -#define FEATURE_SPARE0_BIT 26 //SPARE +#define FEATURE_RESERVED0_BIT 26 #define FEATURE_ZSTATES_BIT 27 #define FEATURE_IOMMUL2_PG_BIT 28 #define FEATURE_DS_FCLK_BIT 29 @@ -66,8 +66,8 @@ #define FEATURE_DS_MP1CLK_BIT 31 #define FEATURE_WHISPER_MODE_BIT 32 #define FEATURE_SMU_LOW_POWER_BIT 33 -#define FEATURE_SMART_L3_RINSER_BIT 34 -#define FEATURE_SPARE1_BIT 35 //SPARE +#define FEATURE_RESERVED1_BIT 34 /* v14_0_0 SMART_L3_RINSER; v14_0_1 RESERVED1 */ +#define FEATURE_GFX_DEM_BIT 35 /* v14_0_0 SPARE; v14_0_1 GFX_DEM */ #define FEATURE_PSI_BIT 36 #define FEATURE_PROCHOT_BIT 37 #define FEATURE_CPUOFF_BIT 38 @@ -77,11 +77,11 @@ #define FEATURE_PERF_LIMIT_BIT 42 #define FEATURE_CORE_DLDO_BIT 43 #define FEATURE_DVO_BIT 44 -#define FEATURE_DS_VCN_BIT 45 +#define FEATURE_DS_VCN_BIT 45 /* v14_0_1 this is for both VCN0 and VCN1 */ #define FEATURE_CPPC_BIT 46 #define FEATURE_CPPC_PREFERRED_CORES 47 #define FEATURE_DF_CSTATES_BIT 48 -#define FEATURE_SPARE2_BIT 49 //SPARE +#define FEATURE_FAST_PSTATE_CLDO_BIT 49 /* v14_0_0 SPARE */ #define FEATURE_ATHUB_PG_BIT 50 #define FEATURE_VDDOFF_ECO_BIT 51 #define FEATURE_ZSTATES_ECO_BIT 52 @@ -93,8 +93,8 @@ #define FEATURE_DS_IPUCLK_BIT 58 #define FEATURE_DS_VPECLK_BIT 59 #define FEATURE_VPE_DPM_BIT 60 -#define FEATURE_SPARE_61 61 -#define FEATURE_FP_DIDT 62 +#define FEATURE_SMART_L3_RINSER_BIT 61 /* v14_0_0 SPARE*/ +#define FEATURE_PCC_BIT 62 /* v14_0_0 FP_DIDT v14_0_1 PCC_BIT */ #define NUM_FEATURES 63 // Firmware Header/Footer @@ -151,6 +151,43 @@ typedef struct { // MP1_EXT_SCRATCH7 = RTOS Current Job } FwStatus_t; +typedef struct { + // MP1_EXT_SCRATCH0 + uint32_t DpmHandlerID : 8; + uint32_t ActivityMonitorID : 8; + uint32_t DpmTimerID : 8; + uint32_t DpmHubID : 4; + uint32_t DpmHubTask : 4; + // MP1_EXT_SCRATCH1 + uint32_t CclkSyncStatus : 8; + uint32_t ZstateStatus : 4; + uint32_t Cpu1VddOff : 4; + uint32_t DstateFun : 4; + uint32_t DstateDev : 4; + uint32_t GfxOffStatus : 2; + uint32_t Cpu0Off : 2; + uint32_t Cpu1Off : 2; + uint32_t Cpu0VddOff : 2; + // MP1_EXT_SCRATCH2 + uint32_t P2JobHandler :32; + // MP1_EXT_SCRATCH3 + uint32_t PostCode :32; + // MP1_EXT_SCRATCH4 + uint32_t MsgPortBusy :15; + uint32_t RsmuPmiP1Pending : 1; + uint32_t RsmuPmiP2PendingCnt : 8; + uint32_t DfCstateExitPending : 1; + uint32_t Pc6EntryPending : 1; + uint32_t Pc6ExitPending : 1; + uint32_t WarmResetPending : 1; + uint32_t Mp0ClkPending : 1; + uint32_t InWhisperMode : 1; + uint32_t spare2 : 2; + // MP1_EXT_SCRATCH5 + uint32_t IdleMask :32; + // MP1_EXT_SCRATCH6 = RTOS threads' status + // MP1_EXT_SCRATCH7 = RTOS Current Job +} FwStatus_t_v14_0_1; #pragma pack(pop) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h index ca7ce4251482..c4dc5881d8df 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v14_0_0_ppsmc.h @@ -72,23 +72,19 @@ #define PPSMC_MSG_SetHardMinSocclkByFreq 0x13 ///< Set hard min for SOC CLK #define PPSMC_MSG_SetSoftMinFclk 0x14 ///< Set hard min for FCLK #define PPSMC_MSG_SetSoftMinVcn0 0x15 ///< Set soft min for VCN0 clocks (VCLK0 and DCLK0) - #define PPSMC_MSG_EnableGfxImu 0x16 ///< Enable GFX IMU - -#define PPSMC_MSG_spare_0x17 0x17 -#define PPSMC_MSG_spare_0x18 0x18 +#define PPSMC_MSG_spare_0x17 0x17 ///< Get GFX clock frequency +#define PPSMC_MSG_spare_0x18 0x18 ///< Get FCLK frequency #define PPSMC_MSG_AllowGfxOff 0x19 ///< Inform PMFW of allowing GFXOFF entry #define PPSMC_MSG_DisallowGfxOff 0x1A ///< Inform PMFW of disallowing GFXOFF entry #define PPSMC_MSG_SetSoftMaxGfxClk 0x1B ///< Set soft max for GFX CLK #define PPSMC_MSG_SetHardMinGfxClk 0x1C ///< Set hard min for GFX CLK - #define PPSMC_MSG_SetSoftMaxSocclkByFreq 0x1D ///< Set soft max for SOC CLK #define PPSMC_MSG_SetSoftMaxFclkByFreq 0x1E ///< Set soft max for FCLK #define PPSMC_MSG_SetSoftMaxVcn0 0x1F ///< Set soft max for VCN0 clocks (VCLK0 and DCLK0) -#define PPSMC_MSG_spare_0x20 0x20 +#define PPSMC_MSG_spare_0x20 0x20 ///< Set power limit percentage #define PPSMC_MSG_PowerDownJpeg0 0x21 ///< Power down Jpeg of VCN0 #define PPSMC_MSG_PowerUpJpeg0 0x22 ///< Power up Jpeg of VCN0; VCN0 is power gated by default - #define PPSMC_MSG_SetHardMinFclkByFreq 0x23 ///< Set hard min for FCLK #define PPSMC_MSG_SetSoftMinSocclkByFreq 0x24 ///< Set soft min for SOC CLK #define PPSMC_MSG_AllowZstates 0x25 ///< Inform PMFM of allowing Zstate entry, i.e. no Miracast activity @@ -99,8 +95,8 @@ #define PPSMC_MSG_PowerUpIspByTile 0x2A ///< This message is used to power up ISP tiles and enable the ISP DPM #define PPSMC_MSG_SetHardMinIspiclkByFreq 0x2B ///< Set HardMin by frequency for ISPICLK #define PPSMC_MSG_SetHardMinIspxclkByFreq 0x2C ///< Set HardMin by frequency for ISPXCLK -#define PPSMC_MSG_PowerDownUmsch 0x2D ///< Power down VCN.UMSCH (aka VSCH) scheduler -#define PPSMC_MSG_PowerUpUmsch 0x2E ///< Power up VCN.UMSCH (aka VSCH) scheduler +#define PPSMC_MSG_PowerDownUmsch 0x2D ///< Power down VCN0.UMSCH (aka VSCH) scheduler +#define PPSMC_MSG_PowerUpUmsch 0x2E ///< Power up VCN0.UMSCH (aka VSCH) scheduler #define PPSMC_Message_IspStutterOn_MmhubPgDis 0x2F ///< ISP StutterOn mmHub PgDis #define PPSMC_Message_IspStutterOff_MmhubPgEn 0x30 ///< ISP StufferOff mmHub PgEn #define PPSMC_MSG_PowerUpVpe 0x31 ///< Power up VPE @@ -110,7 +106,9 @@ #define PPSMC_MSG_DisableLSdma 0x35 ///< Disable LSDMA #define PPSMC_MSG_SetSoftMaxVpe 0x36 ///< #define PPSMC_MSG_SetSoftMinVpe 0x37 ///< -#define PPSMC_Message_Count 0x38 ///< Total number of PPSMC messages +#define PPSMC_MSG_AllocMALLCache 0x38 ///< Allocating MALL Cache +#define PPSMC_MSG_ReleaseMALLCache 0x39 ///< Releasing MALL Cache +#define PPSMC_Message_Count 0x3A ///< Total number of PPSMC messages /** @}*/ /** diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h index 3f7463c1c1a9..4af1985ae446 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v14_0.h @@ -27,6 +27,7 @@ #define SMU14_DRIVER_IF_VERSION_INV 0xFFFFFFFF #define SMU14_DRIVER_IF_VERSION_SMU_V14_0_0 0x7 +#define SMU14_DRIVER_IF_VERSION_SMU_V14_0_1 0x6 #define SMU14_DRIVER_IF_VERSION_SMU_V14_0_2 0x1 #define FEATURE_MASK(feature) (1ULL << feature) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c index 9e39f99154f9..07a65e005785 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c @@ -234,7 +234,7 @@ int smu_v14_0_check_fw_version(struct smu_context *smu) smu->smc_driver_if_version = SMU14_DRIVER_IF_VERSION_SMU_V14_0_0; break; case IP_VERSION(14, 0, 1): - smu->smc_driver_if_version = SMU14_DRIVER_IF_VERSION_SMU_V14_0_0; + smu->smc_driver_if_version = SMU14_DRIVER_IF_VERSION_SMU_V14_0_1; break; default: diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c index d6de6d97286c..63399c00cc28 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c @@ -161,7 +161,7 @@ static int smu_v14_0_0_init_smc_tables(struct smu_context *smu) SMU_TABLE_INIT(tables, SMU_TABLE_WATERMARKS, sizeof(Watermarks_t), PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); - SMU_TABLE_INIT(tables, SMU_TABLE_DPMCLOCKS, sizeof(DpmClocks_t), + SMU_TABLE_INIT(tables, SMU_TABLE_DPMCLOCKS, max(sizeof(DpmClocks_t), sizeof(DpmClocks_t_v14_0_1)), PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); SMU_TABLE_INIT(tables, SMU_TABLE_SMU_METRICS, sizeof(SmuMetrics_t), PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM); @@ -171,7 +171,7 @@ static int smu_v14_0_0_init_smc_tables(struct smu_context *smu) goto err0_out; smu_table->metrics_time = 0; - smu_table->clocks_table = kzalloc(sizeof(DpmClocks_t), GFP_KERNEL); + smu_table->clocks_table = kzalloc(max(sizeof(DpmClocks_t), sizeof(DpmClocks_t_v14_0_1)), GFP_KERNEL); if (!smu_table->clocks_table) goto err1_out; @@ -593,6 +593,60 @@ static int smu_v14_0_0_mode2_reset(struct smu_context *smu) return ret; } +static int smu_v14_0_1_get_dpm_freq_by_index(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t dpm_level, + uint32_t *freq) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + + if (!clk_table || clk_type >= SMU_CLK_COUNT) + return -EINVAL; + + switch (clk_type) { + case SMU_SOCCLK: + if (dpm_level >= clk_table->NumSocClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->SocClocks[dpm_level]; + break; + case SMU_VCLK: + if (dpm_level >= clk_table->Vcn0ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->VClocks0[dpm_level]; + break; + case SMU_DCLK: + if (dpm_level >= clk_table->Vcn0ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->DClocks0[dpm_level]; + break; + case SMU_VCLK1: + if (dpm_level >= clk_table->Vcn1ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->VClocks1[dpm_level]; + break; + case SMU_DCLK1: + if (dpm_level >= clk_table->Vcn1ClkLevelsEnabled) + return -EINVAL; + *freq = clk_table->DClocks1[dpm_level]; + break; + case SMU_UCLK: + case SMU_MCLK: + if (dpm_level >= clk_table->NumMemPstatesEnabled) + return -EINVAL; + *freq = clk_table->MemPstateTable[dpm_level].MemClk; + break; + case SMU_FCLK: + if (dpm_level >= clk_table->NumFclkLevelsEnabled) + return -EINVAL; + *freq = clk_table->FclkClocks_Freq[dpm_level]; + break; + default: + return -EINVAL; + } + + return 0; +} + static int smu_v14_0_0_get_dpm_freq_by_index(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t dpm_level, @@ -637,6 +691,19 @@ static int smu_v14_0_0_get_dpm_freq_by_index(struct smu_context *smu, return 0; } +static int smu_v14_0_common_get_dpm_freq_by_index(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t dpm_level, + uint32_t *freq) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, dpm_level, freq); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_get_dpm_freq_by_index(smu, clk_type, dpm_level, freq); + + return 0; +} + static bool smu_v14_0_0_clk_dpm_is_enabled(struct smu_context *smu, enum smu_clk_type clk_type) { @@ -657,6 +724,8 @@ static bool smu_v14_0_0_clk_dpm_is_enabled(struct smu_context *smu, break; case SMU_VCLK: case SMU_DCLK: + case SMU_VCLK1: + case SMU_DCLK1: feature_id = SMU_FEATURE_VCN_DPM_BIT; break; default: @@ -666,6 +735,126 @@ static bool smu_v14_0_0_clk_dpm_is_enabled(struct smu_context *smu, return smu_cmn_feature_is_enabled(smu, feature_id); } +static int smu_v14_0_1_get_dpm_ultimate_freq(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *min, + uint32_t *max) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + uint32_t clock_limit; + uint32_t max_dpm_level, min_dpm_level; + int ret = 0; + + if (!smu_v14_0_0_clk_dpm_is_enabled(smu, clk_type)) { + switch (clk_type) { + case SMU_MCLK: + case SMU_UCLK: + clock_limit = smu->smu_table.boot_values.uclk; + break; + case SMU_FCLK: + clock_limit = smu->smu_table.boot_values.fclk; + break; + case SMU_GFXCLK: + case SMU_SCLK: + clock_limit = smu->smu_table.boot_values.gfxclk; + break; + case SMU_SOCCLK: + clock_limit = smu->smu_table.boot_values.socclk; + break; + case SMU_VCLK: + case SMU_VCLK1: + clock_limit = smu->smu_table.boot_values.vclk; + break; + case SMU_DCLK: + case SMU_DCLK1: + clock_limit = smu->smu_table.boot_values.dclk; + break; + default: + clock_limit = 0; + break; + } + + /* clock in Mhz unit */ + if (min) + *min = clock_limit / 100; + if (max) + *max = clock_limit / 100; + + return 0; + } + + if (max) { + switch (clk_type) { + case SMU_GFXCLK: + case SMU_SCLK: + *max = clk_table->MaxGfxClk; + break; + case SMU_MCLK: + case SMU_UCLK: + case SMU_FCLK: + max_dpm_level = 0; + break; + case SMU_SOCCLK: + max_dpm_level = clk_table->NumSocClkLevelsEnabled - 1; + break; + case SMU_VCLK: + case SMU_DCLK: + max_dpm_level = clk_table->Vcn0ClkLevelsEnabled - 1; + break; + case SMU_VCLK1: + case SMU_DCLK1: + max_dpm_level = clk_table->Vcn1ClkLevelsEnabled - 1; + break; + default: + ret = -EINVAL; + goto failed; + } + + if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, max_dpm_level, max); + if (ret) + goto failed; + } + } + + if (min) { + switch (clk_type) { + case SMU_GFXCLK: + case SMU_SCLK: + *min = clk_table->MinGfxClk; + break; + case SMU_MCLK: + case SMU_UCLK: + min_dpm_level = clk_table->NumMemPstatesEnabled - 1; + break; + case SMU_FCLK: + min_dpm_level = clk_table->NumFclkLevelsEnabled - 1; + break; + case SMU_SOCCLK: + min_dpm_level = 0; + break; + case SMU_VCLK: + case SMU_DCLK: + case SMU_VCLK1: + case SMU_DCLK1: + min_dpm_level = 0; + break; + default: + ret = -EINVAL; + goto failed; + } + + if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, min_dpm_level, min); + if (ret) + goto failed; + } + } + +failed: + return ret; +} + static int smu_v14_0_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t *min, @@ -736,7 +925,7 @@ static int smu_v14_0_0_get_dpm_ultimate_freq(struct smu_context *smu, } if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, max_dpm_level, max); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, max_dpm_level, max); if (ret) goto failed; } @@ -768,7 +957,7 @@ static int smu_v14_0_0_get_dpm_ultimate_freq(struct smu_context *smu, } if (clk_type != SMU_GFXCLK && clk_type != SMU_SCLK) { - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, min_dpm_level, min); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, min_dpm_level, min); if (ret) goto failed; } @@ -778,6 +967,19 @@ failed: return ret; } +static int smu_v14_0_common_get_dpm_ultimate_freq(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *min, + uint32_t *max) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_get_dpm_ultimate_freq(smu, clk_type, min, max); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_get_dpm_ultimate_freq(smu, clk_type, min, max); + + return 0; +} + static int smu_v14_0_0_get_current_clk_freq(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t *value) @@ -811,6 +1013,37 @@ static int smu_v14_0_0_get_current_clk_freq(struct smu_context *smu, return smu_v14_0_0_get_smu_metrics_data(smu, member_type, value); } +static int smu_v14_0_1_get_dpm_level_count(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *count) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + + switch (clk_type) { + case SMU_SOCCLK: + *count = clk_table->NumSocClkLevelsEnabled; + break; + case SMU_VCLK: + case SMU_DCLK: + *count = clk_table->Vcn0ClkLevelsEnabled; + break; + case SMU_VCLK1: + case SMU_DCLK1: + *count = clk_table->Vcn1ClkLevelsEnabled; + break; + case SMU_MCLK: + *count = clk_table->NumMemPstatesEnabled; + break; + case SMU_FCLK: + *count = clk_table->NumFclkLevelsEnabled; + break; + default: + break; + } + + return 0; +} + static int smu_v14_0_0_get_dpm_level_count(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t *count) @@ -840,6 +1073,18 @@ static int smu_v14_0_0_get_dpm_level_count(struct smu_context *smu, return 0; } +static int smu_v14_0_common_get_dpm_level_count(struct smu_context *smu, + enum smu_clk_type clk_type, + uint32_t *count) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_get_dpm_level_count(smu, clk_type, count); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_get_dpm_level_count(smu, clk_type, count); + + return 0; +} + static int smu_v14_0_0_print_clk_levels(struct smu_context *smu, enum smu_clk_type clk_type, char *buf) { @@ -866,18 +1111,20 @@ static int smu_v14_0_0_print_clk_levels(struct smu_context *smu, case SMU_SOCCLK: case SMU_VCLK: case SMU_DCLK: + case SMU_VCLK1: + case SMU_DCLK1: case SMU_MCLK: case SMU_FCLK: ret = smu_v14_0_0_get_current_clk_freq(smu, clk_type, &cur_value); if (ret) break; - ret = smu_v14_0_0_get_dpm_level_count(smu, clk_type, &count); + ret = smu_v14_0_common_get_dpm_level_count(smu, clk_type, &count); if (ret) break; for (i = 0; i < count; i++) { - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, i, &value); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, i, &value); if (ret) break; @@ -940,8 +1187,13 @@ static int smu_v14_0_0_set_soft_freq_limited_range(struct smu_context *smu, break; case SMU_VCLK: case SMU_DCLK: - msg_set_min = SMU_MSG_SetHardMinVcn; - msg_set_max = SMU_MSG_SetSoftMaxVcn; + msg_set_min = SMU_MSG_SetHardMinVcn0; + msg_set_max = SMU_MSG_SetSoftMaxVcn0; + break; + case SMU_VCLK1: + case SMU_DCLK1: + msg_set_min = SMU_MSG_SetHardMinVcn1; + msg_set_max = SMU_MSG_SetSoftMaxVcn1; break; default: return -EINVAL; @@ -971,11 +1223,11 @@ static int smu_v14_0_0_force_clk_levels(struct smu_context *smu, case SMU_FCLK: case SMU_VCLK: case SMU_DCLK: - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, soft_min_level, &min_freq); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, soft_min_level, &min_freq); if (ret) break; - ret = smu_v14_0_0_get_dpm_freq_by_index(smu, clk_type, soft_max_level, &max_freq); + ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, soft_max_level, &max_freq); if (ret) break; @@ -1000,25 +1252,25 @@ static int smu_v14_0_0_set_performance_level(struct smu_context *smu, switch (level) { case AMD_DPM_FORCED_LEVEL_HIGH: - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SCLK, NULL, &sclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_FCLK, NULL, &fclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SOCCLK, NULL, &socclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, NULL, &sclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, NULL, &fclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SOCCLK, NULL, &socclk_max); sclk_min = sclk_max; fclk_min = fclk_max; socclk_min = socclk_max; break; case AMD_DPM_FORCED_LEVEL_LOW: - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, NULL); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, NULL); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, NULL); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, NULL); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, NULL); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, NULL); sclk_max = sclk_min; fclk_max = fclk_min; socclk_max = socclk_min; break; case AMD_DPM_FORCED_LEVEL_AUTO: - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, &sclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, &fclk_max); - smu_v14_0_0_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, &socclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, &sclk_min, &sclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, &fclk_min, &fclk_max); + smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SOCCLK, &socclk_min, &socclk_max); break; case AMD_DPM_FORCED_LEVEL_PROFILE_STANDARD: case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_SCLK: @@ -1067,6 +1319,18 @@ static int smu_v14_0_0_set_performance_level(struct smu_context *smu, return ret; } +static int smu_v14_0_1_set_fine_grain_gfx_freq_parameters(struct smu_context *smu) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + + smu->gfx_default_hard_min_freq = clk_table->MinGfxClk; + smu->gfx_default_soft_max_freq = clk_table->MaxGfxClk; + smu->gfx_actual_hard_min_freq = 0; + smu->gfx_actual_soft_max_freq = 0; + + return 0; +} + static int smu_v14_0_0_set_fine_grain_gfx_freq_parameters(struct smu_context *smu) { DpmClocks_t *clk_table = smu->smu_table.clocks_table; @@ -1079,6 +1343,16 @@ static int smu_v14_0_0_set_fine_grain_gfx_freq_parameters(struct smu_context *sm return 0; } +static int smu_v14_0_common_set_fine_grain_gfx_freq_parameters(struct smu_context *smu) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_v14_0_0_set_fine_grain_gfx_freq_parameters(smu); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_v14_0_1_set_fine_grain_gfx_freq_parameters(smu); + + return 0; +} + static int smu_v14_0_0_set_vpe_enable(struct smu_context *smu, bool enable) { @@ -1095,6 +1369,25 @@ static int smu_v14_0_0_set_umsch_mm_enable(struct smu_context *smu, 0, NULL); } +static int smu_14_0_1_get_dpm_table(struct smu_context *smu, struct dpm_clocks *clock_table) +{ + DpmClocks_t_v14_0_1 *clk_table = smu->smu_table.clocks_table; + uint8_t idx; + + /* Only the Clock information of SOC and VPE is copied to provide VPE DPM settings for use. */ + for (idx = 0; idx < NUM_SOCCLK_DPM_LEVELS; idx++) { + clock_table->SocClocks[idx].Freq = (idx < clk_table->NumSocClkLevelsEnabled) ? clk_table->SocClocks[idx]:0; + clock_table->SocClocks[idx].Vol = 0; + } + + for (idx = 0; idx < NUM_VPE_DPM_LEVELS; idx++) { + clock_table->VPEClocks[idx].Freq = (idx < clk_table->VpeClkLevelsEnabled) ? clk_table->VPEClocks[idx]:0; + clock_table->VPEClocks[idx].Vol = 0; + } + + return 0; +} + static int smu_14_0_0_get_dpm_table(struct smu_context *smu, struct dpm_clocks *clock_table) { DpmClocks_t *clk_table = smu->smu_table.clocks_table; @@ -1114,6 +1407,16 @@ static int smu_14_0_0_get_dpm_table(struct smu_context *smu, struct dpm_clocks * return 0; } +static int smu_v14_0_common_get_dpm_table(struct smu_context *smu, struct dpm_clocks *clock_table) +{ + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 0)) + smu_14_0_0_get_dpm_table(smu, clock_table); + else if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 1)) + smu_14_0_1_get_dpm_table(smu, clock_table); + + return 0; +} + static const struct pptable_funcs smu_v14_0_0_ppt_funcs = { .check_fw_status = smu_v14_0_check_fw_status, .check_fw_version = smu_v14_0_check_fw_version, @@ -1135,16 +1438,16 @@ static const struct pptable_funcs smu_v14_0_0_ppt_funcs = { .set_driver_table_location = smu_v14_0_set_driver_table_location, .gfx_off_control = smu_v14_0_gfx_off_control, .mode2_reset = smu_v14_0_0_mode2_reset, - .get_dpm_ultimate_freq = smu_v14_0_0_get_dpm_ultimate_freq, + .get_dpm_ultimate_freq = smu_v14_0_common_get_dpm_ultimate_freq, .od_edit_dpm_table = smu_v14_0_od_edit_dpm_table, .print_clk_levels = smu_v14_0_0_print_clk_levels, .force_clk_levels = smu_v14_0_0_force_clk_levels, .set_performance_level = smu_v14_0_0_set_performance_level, - .set_fine_grain_gfx_freq_parameters = smu_v14_0_0_set_fine_grain_gfx_freq_parameters, + .set_fine_grain_gfx_freq_parameters = smu_v14_0_common_set_fine_grain_gfx_freq_parameters, .set_gfx_power_up_by_imu = smu_v14_0_set_gfx_power_up_by_imu, .dpm_set_vpe_enable = smu_v14_0_0_set_vpe_enable, .dpm_set_umsch_mm_enable = smu_v14_0_0_set_umsch_mm_enable, - .get_dpm_clock_table = smu_14_0_0_get_dpm_table, + .get_dpm_clock_table = smu_v14_0_common_get_dpm_table, }; static void smu_v14_0_0_set_smu_mailbox_registers(struct smu_context *smu) -- cgit v1.2.3 From 533eefb9be76c3b23d220ee18edfda8eb56cefff Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Tue, 12 Dec 2023 17:17:05 +0800 Subject: drm/amdgpu: add smu 14.0.1 discovery support This patch to add smu 14.0.1 support Reviewed-by: Alex Deucher Signed-off-by: Yifan Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index fdd36fb027ab..ac5bf01fe8d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -1896,6 +1896,7 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &smu_v13_0_ip_block); break; case IP_VERSION(14, 0, 0): + case IP_VERSION(14, 0, 1): amdgpu_device_ip_block_add(adev, &smu_v14_0_ip_block); break; default: -- cgit v1.2.3 From f886b49feaae30acd599e37d4284836024b0f3ed Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 28 Mar 2024 18:22:10 +0800 Subject: drm/amdgpu: implement IRQ_STATE_ENABLE for SDMA v4.4.2 SDMA_CNTL is not set in some cases, driver configures it by itself. v2: simplify code Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 34237a1b1f2e..82eab49be82b 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1602,19 +1602,9 @@ static int sdma_v4_4_2_set_ecc_irq_state(struct amdgpu_device *adev, u32 sdma_cntl; sdma_cntl = RREG32_SDMA(type, regSDMA_CNTL); - switch (state) { - case AMDGPU_IRQ_STATE_DISABLE: - sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, - DRAM_ECC_INT_ENABLE, 0); - WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl); - break; - /* sdma ecc interrupt is enabled by default - * driver doesn't need to do anything to - * enable the interrupt */ - case AMDGPU_IRQ_STATE_ENABLE: - default: - break; - } + sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, DRAM_ECC_INT_ENABLE, + state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0); + WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl); return 0; } -- cgit v1.2.3 From ecedd99a9369fb5cde601ae9abd58bca2739f1ae Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Fri, 15 Mar 2024 21:25:25 -0600 Subject: drm/amd/display: Skip on writeback when it's not applicable [WHY] dynamic memory safety error detector (KASAN) catches and generates error messages "BUG: KASAN: slab-out-of-bounds" as writeback connector does not support certain features which are not initialized. [HOW] Skip them when connector type is DRM_MODE_CONNECTOR_WRITEBACK. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3199 Reviewed-by: Harry Wentland Reviewed-by: Rodrigo Siqueira Acked-by: Roman Li Signed-off-by: Alex Hung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 173438fc7537..6cf0d5f276fe 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3047,6 +3047,10 @@ static int dm_resume(void *handle) /* Do mst topology probing after resuming cached state*/ drm_connector_list_iter_begin(ddev, &iter); drm_for_each_connector_iter(connector, &iter) { + + if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK) + continue; + aconnector = to_amdgpu_dm_connector(connector); if (aconnector->dc_link->type != dc_connection_mst_branch || aconnector->mst_root) @@ -5926,6 +5930,9 @@ get_highest_refresh_rate_mode(struct amdgpu_dm_connector *aconnector, &aconnector->base.probed_modes : &aconnector->base.modes; + if (aconnector->base.connector_type == DRM_MODE_CONNECTOR_WRITEBACK) + return NULL; + if (aconnector->freesync_vid_base.clock != 0) return &aconnector->freesync_vid_base; @@ -8767,10 +8774,10 @@ static void amdgpu_dm_commit_audio(struct drm_device *dev, if (!drm_atomic_crtc_needs_modeset(new_crtc_state)) continue; +notify: if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK) continue; -notify: aconnector = to_amdgpu_dm_connector(connector); mutex_lock(&adev->dm.audio_lock); -- cgit v1.2.3 From 3818708e9c9712e2ba4006bc23502ee7b031bd3f Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Thu, 28 Mar 2024 11:00:50 +0800 Subject: drm/amd/pm: fix the high voltage issue after unload fix the high voltage issue after unload on smu 13.0.10 Signed-off-by: Kenneth Feng Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 26 ++++++++++++--------- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 27 ++++++++++++++++++++-- drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 1 + .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 8 ++++++- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index aa16d51dd842..7753a2e64d41 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4135,18 +4135,22 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->ip_blocks[i].status.hw = true; } } + } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && + !amdgpu_device_has_display_hardware(adev)) { + r = psp_gpu_reset(adev); } else { - tmp = amdgpu_reset_method; - /* It should do a default reset when loading or reloading the driver, - * regardless of the module parameter reset_method. - */ - amdgpu_reset_method = AMD_RESET_METHOD_NONE; - r = amdgpu_asic_reset(adev); - amdgpu_reset_method = tmp; - if (r) { - dev_err(adev->dev, "asic reset on init failed\n"); - goto failed; - } + tmp = amdgpu_reset_method; + /* It should do a default reset when loading or reloading the driver, + * regardless of the module parameter reset_method. + */ + amdgpu_reset_method = AMD_RESET_METHOD_NONE; + r = amdgpu_asic_reset(adev); + amdgpu_reset_method = tmp; + } + + if (r) { + dev_err(adev->dev, "asic reset on init failed\n"); + goto failed; } } diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 246b211b1e85..65333141b1c1 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -735,7 +735,7 @@ static int smu_early_init(void *handle) smu->adev = adev; smu->pm_enabled = !!amdgpu_dpm; smu->is_apu = false; - smu->smu_baco.state = SMU_BACO_STATE_EXIT; + smu->smu_baco.state = SMU_BACO_STATE_NONE; smu->smu_baco.platform_support = false; smu->user_dpm_profile.fan_mode = -1; @@ -1966,10 +1966,25 @@ static int smu_smc_hw_cleanup(struct smu_context *smu) return 0; } +static int smu_reset_mp1_state(struct smu_context *smu) +{ + struct amdgpu_device *adev = smu->adev; + int ret = 0; + + if ((!adev->in_runpm) && (!adev->in_suspend) && + (!amdgpu_in_reset(adev)) && amdgpu_ip_version(adev, MP1_HWIP, 0) == + IP_VERSION(13, 0, 10) && + !amdgpu_device_has_display_hardware(adev)) + ret = smu_set_mp1_state(smu, PP_MP1_STATE_UNLOAD); + + return ret; +} + static int smu_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct smu_context *smu = adev->powerplay.pp_handle; + int ret; if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) return 0; @@ -1987,7 +2002,15 @@ static int smu_hw_fini(void *handle) adev->pm.dpm_enabled = false; - return smu_smc_hw_cleanup(smu); + ret = smu_smc_hw_cleanup(smu); + if (ret) + return ret; + + ret = smu_reset_mp1_state(smu); + if (ret) + return ret; + + return 0; } static void smu_late_fini(void *handle) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index a870bdd49a4e..1fa81575788c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -424,6 +424,7 @@ enum smu_reset_mode { enum smu_baco_state { SMU_BACO_STATE_ENTER = 0, SMU_BACO_STATE_EXIT, + SMU_BACO_STATE_NONE, }; struct smu_baco_context { diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 9c03296f92cd..67117ced7c6a 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2751,7 +2751,13 @@ static int smu_v13_0_0_set_mp1_state(struct smu_context *smu, switch (mp1_state) { case PP_MP1_STATE_UNLOAD: - ret = smu_cmn_set_mp1_state(smu, mp1_state); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_PrepareMp1ForUnload, + 0x55, NULL); + + if (!ret && smu->smu_baco.state == SMU_BACO_STATE_EXIT) + ret = smu_v13_0_disable_pmfw_state(smu); + break; default: /* Ignore others */ -- cgit v1.2.3 From f7e232de51bb1b45646e5b7dc4ebcf13510f2630 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Wed, 6 Mar 2024 17:05:07 +0530 Subject: drm/amdgpu: Fix VCN allocation in CPX partition VCN need not be shared in CPX mode always for all GFX 9.4.3 SOC SKUs. In certain configs, VCN instance can be exclusively allocated to a partition even under CPX mode. Signed-off-by: Lijo Lazar Reviewed-by: James Zhu Reviewed-by: Asad Kamal Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c index d6f808acfb17..fbb43ae7624f 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c @@ -62,6 +62,11 @@ void aqua_vanjaram_doorbell_index_init(struct amdgpu_device *adev) adev->doorbell_index.max_assignment = AMDGPU_DOORBELL_LAYOUT1_MAX_ASSIGNMENT << 1; } +static bool aqua_vanjaram_xcp_vcn_shared(struct amdgpu_device *adev) +{ + return (adev->xcp_mgr->num_xcps > adev->vcn.num_vcn_inst); +} + static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev, uint32_t inst_idx, struct amdgpu_ring *ring) { @@ -87,7 +92,7 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev, case AMDGPU_RING_TYPE_VCN_ENC: case AMDGPU_RING_TYPE_VCN_JPEG: ip_blk = AMDGPU_XCP_VCN; - if (adev->xcp_mgr->mode == AMDGPU_CPX_PARTITION_MODE) + if (aqua_vanjaram_xcp_vcn_shared(adev)) inst_mask = 1 << (inst_idx * 2); break; default: @@ -140,10 +145,12 @@ static int aqua_vanjaram_xcp_sched_list_update( aqua_vanjaram_xcp_gpu_sched_update(adev, ring, ring->xcp_id); - /* VCN is shared by two partitions under CPX MODE */ + /* VCN may be shared by two partitions under CPX MODE in certain + * configs. + */ if ((ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC || - ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG) && - adev->xcp_mgr->mode == AMDGPU_CPX_PARTITION_MODE) + ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG) && + aqua_vanjaram_xcp_vcn_shared(adev)) aqua_vanjaram_xcp_gpu_sched_update(adev, ring, ring->xcp_id + 1); } -- cgit v1.2.3 From e33997e18d0fddd217a0fce988abbfd015338631 Mon Sep 17 00:00:00 2001 From: ZhenGuo Yin Date: Tue, 2 Apr 2024 11:41:05 +0800 Subject: drm/amdgpu: clear set_q_mode_offs when VM changed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] set_q_mode_offs don't get cleared after GPU reset, nexting SET_Q_MODE packet to init shadow memory will be skiped, hence there has a page fault. [How] VM flush is needed after GPU reset, clear set_q_mode_offs when emitting VM flush. Fixes: 8bc75586ea01 ("drm/amdgpu: workaround to avoid SET_Q_MODE packets v2") Reviewed-by: Christian König Signed-off-by: ZhenGuo Yin Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 1770e496c1b7..2c82e2741619 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -5465,6 +5465,7 @@ static void gfx_v11_0_ring_emit_vm_flush(struct amdgpu_ring *ring, /* Make sure that we can't skip the SET_Q_MODE packets when the VM * changed in any way. */ + ring->set_q_mode_offs = 0; ring->set_q_mode_ptr = NULL; } -- cgit v1.2.3 From d06af584be5a769d124b7302b32a033e9559761d Mon Sep 17 00:00:00 2001 From: Zhigang Luo Date: Mon, 18 Mar 2024 14:13:10 -0400 Subject: amd/amdkfd: sync all devices to wait all processes being evicted If there are more than one device doing reset in parallel, the first device will call kfd_suspend_all_processes() to evict all processes on all devices, this call takes time to finish. other device will start reset and recover without waiting. if the process has not been evicted before doing recover, it will be restored, then caused page fault. Signed-off-by: Zhigang Luo Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 041ec3de55e7..719d6d365e15 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -960,7 +960,6 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) { struct kfd_node *node; int i; - int count; if (!kfd->init_complete) return; @@ -968,12 +967,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) /* for runtime suspend, skip locking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = ++kfd_locked; - mutex_unlock(&kfd_processes_mutex); - /* For first KFD device suspend all the KFD processes */ - if (count == 1) + if (++kfd_locked == 1) kfd_suspend_all_processes(); + mutex_unlock(&kfd_processes_mutex); } for (i = 0; i < kfd->num_nodes; i++) { @@ -984,7 +981,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) { - int ret, count, i; + int ret, i; if (!kfd->init_complete) return 0; @@ -998,12 +995,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) /* for runtime resume, skip unlocking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = --kfd_locked; - mutex_unlock(&kfd_processes_mutex); - - WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); - if (count == 0) + if (--kfd_locked == 0) ret = kfd_resume_all_processes(); + WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error"); + mutex_unlock(&kfd_processes_mutex); } return ret; -- cgit v1.2.3 From 2cc69a10d83180f3de9f5afe3a98e972b1453d4c Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Sat, 23 Mar 2024 12:02:54 -0600 Subject: drm/amd/display: Return max resolution supported by DWB mode_config's max width x height is 4096x2160 and is higher than DWB's max resolution 3840x2160 which is returned instead. Cc: stable@vger.kernel.org Reviewed-by: Harry Wentland Acked-by: Hamza Mahfooz Signed-off-by: Alex Hung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c index 16e72d623630..08c494a7a21b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c @@ -76,10 +76,8 @@ static int amdgpu_dm_wb_encoder_atomic_check(struct drm_encoder *encoder, static int amdgpu_dm_wb_connector_get_modes(struct drm_connector *connector) { - struct drm_device *dev = connector->dev; - - return drm_add_modes_noedid(connector, dev->mode_config.max_width, - dev->mode_config.max_height); + /* Maximum resolution supported by DWB */ + return drm_add_modes_noedid(connector, 3840, 2160); } static int amdgpu_dm_wb_prepare_job(struct drm_writeback_connector *wb_connector, -- cgit v1.2.3 From bbca7f414ae9a12ea231cdbafd79c607e3337ea8 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Wed, 3 Apr 2024 17:28:44 +0800 Subject: drm/amdgpu: fix incorrect number of active RBs for gfx11 The RB bitmap should be global active RB bitmap & active RB bitmap based on active SA. Signed-off-by: Tim Huang Reviewed-by: Yifan Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 2c82e2741619..f7325b02a191 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1635,7 +1635,7 @@ static void gfx_v11_0_setup_rb(struct amdgpu_device *adev) active_rb_bitmap |= (0x3 << (i * rb_bitmap_width_per_sa)); } - active_rb_bitmap |= global_active_rb_bitmap; + active_rb_bitmap &= global_active_rb_bitmap; adev->gfx.config.backend_enable_mask = active_rb_bitmap; adev->gfx.config.num_rbs = hweight32(active_rb_bitmap); } -- cgit v1.2.3 From 81901d8d0472e9a19d294ae1dea76b950548195d Mon Sep 17 00:00:00 2001 From: Wenjing Liu Date: Fri, 22 Mar 2024 15:02:45 -0400 Subject: drm/amd/display: always reset ODM mode in context when adding first plane [why] In current implemenation ODM mode is only reset when the last plane is removed from dc state. For any dc validate we will always remove all current planes and add new planes. However when switching from no planes to 1 plane, ODM mode is not reset because no planes get removed. This has caused an issue where we kept ODM combine when it should have been remove when a plane is added. The change is to reset ODM mode when adding the first plane. Cc: stable@vger.kernel.org Reviewed-by: Alvin Lee Acked-by: Hamza Mahfooz Signed-off-by: Wenjing Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_state.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c b/drivers/gpu/drm/amd/display/dc/core/dc_state.c index 5cc7f8da209c..61986e5cb491 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c @@ -436,6 +436,15 @@ bool dc_state_add_plane( goto out; } + if (stream_status->plane_count == 0 && dc->config.enable_windowed_mpo_odm) + /* ODM combine could prevent us from supporting more planes + * we will reset ODM slice count back to 1 when all planes have + * been removed to maximize the amount of planes supported when + * new planes are added. + */ + resource_update_pipes_for_stream_with_slice_count( + state, dc->current_state, dc->res_pool, stream, 1); + otg_master_pipe = resource_get_otg_master_for_stream( &state->res_ctx, stream); if (otg_master_pipe) -- cgit v1.2.3 From 953927587f37b731abdeabe46ad44a3b3ec67a52 Mon Sep 17 00:00:00 2001 From: Dillon Varone Date: Thu, 21 Mar 2024 13:49:43 -0400 Subject: drm/amd/display: Do not recursively call manual trigger programming [WHY&HOW] We should not be recursively calling the manual trigger programming function when FAMS is not in use. Cc: stable@vger.kernel.org Reviewed-by: Alvin Lee Acked-by: Hamza Mahfooz Signed-off-by: Dillon Varone Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c index f07a4c7e48bc..52eab8fccb7f 100644 --- a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c +++ b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c @@ -267,9 +267,6 @@ static void optc32_setup_manual_trigger(struct timing_generator *optc) OTG_V_TOTAL_MAX_SEL, 1, OTG_FORCE_LOCK_ON_EVENT, 0, OTG_SET_V_TOTAL_MIN_MASK, (1 << 1)); /* TRIGA */ - - // Setup manual flow control for EOF via TRIG_A - optc->funcs->setup_manual_trigger(optc); } } -- cgit v1.2.3 From cf79814cb0bf5749b9f0db53ca231aa540c02768 Mon Sep 17 00:00:00 2001 From: Fudongwang Date: Tue, 26 Mar 2024 16:03:16 +0800 Subject: drm/amd/display: fix disable otg wa logic in DCN316 [Why] Wrong logic cause screen corruption. [How] Port logic from DCN35/314. Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Hamza Mahfooz Signed-off-by: Fudongwang Signed-off-by: Alex Deucher --- .../amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c index 12f3e8aa46d8..6ad4f4efec5d 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c @@ -99,20 +99,25 @@ static int dcn316_get_active_display_cnt_wa( return display_count; } -static void dcn316_disable_otg_wa(struct clk_mgr *clk_mgr_base, struct dc_state *context, bool disable) +static void dcn316_disable_otg_wa(struct clk_mgr *clk_mgr_base, struct dc_state *context, + bool safe_to_lower, bool disable) { struct dc *dc = clk_mgr_base->ctx->dc; int i; for (i = 0; i < dc->res_pool->pipe_count; ++i) { - struct pipe_ctx *pipe = &dc->current_state->res_ctx.pipe_ctx[i]; + struct pipe_ctx *pipe = safe_to_lower + ? &context->res_ctx.pipe_ctx[i] + : &dc->current_state->res_ctx.pipe_ctx[i]; if (pipe->top_pipe || pipe->prev_odm_pipe) continue; - if (pipe->stream && (pipe->stream->dpms_off || pipe->plane_state == NULL || - dc_is_virtual_signal(pipe->stream->signal))) { + if (pipe->stream && (pipe->stream->dpms_off || dc_is_virtual_signal(pipe->stream->signal) || + !pipe->stream->link_enc)) { if (disable) { - pipe->stream_res.tg->funcs->immediate_disable_crtc(pipe->stream_res.tg); + if (pipe->stream_res.tg && pipe->stream_res.tg->funcs->immediate_disable_crtc) + pipe->stream_res.tg->funcs->immediate_disable_crtc(pipe->stream_res.tg); + reset_sync_context_for_pipe(dc, context, i); } else pipe->stream_res.tg->funcs->enable_crtc(pipe->stream_res.tg); @@ -207,11 +212,11 @@ static void dcn316_update_clocks(struct clk_mgr *clk_mgr_base, } if (should_set_clock(safe_to_lower, new_clocks->dispclk_khz, clk_mgr_base->clks.dispclk_khz)) { - dcn316_disable_otg_wa(clk_mgr_base, context, true); + dcn316_disable_otg_wa(clk_mgr_base, context, safe_to_lower, true); clk_mgr_base->clks.dispclk_khz = new_clocks->dispclk_khz; dcn316_smu_set_dispclk(clk_mgr, clk_mgr_base->clks.dispclk_khz); - dcn316_disable_otg_wa(clk_mgr_base, context, false); + dcn316_disable_otg_wa(clk_mgr_base, context, safe_to_lower, false); update_dispclk = true; } -- cgit v1.2.3 From 9e61ef8d219877202d4ee51d0d2ad9072c99a262 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Tue, 12 Mar 2024 11:55:52 -0400 Subject: drm/amd/display: Program VSC SDP colorimetry for all DP sinks >= 1.4 In order for display colorimetry to work correctly on DP displays we need to send the VSC SDP packet. We should only do so for panels with DPCD revision greater or equal to 1.4 as older receivers might have problems with it. Cc: stable@vger.kernel.org Cc: Joshua Ashton Cc: Xaver Hugl Cc: Melissa Wen Cc: Agustin Gutierrez Reviewed-by: Agustin Gutierrez Acked-by: Hamza Mahfooz Signed-off-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 6cf0d5f276fe..d112ce6c812c 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6318,7 +6318,9 @@ create_stream_for_sink(struct drm_connector *connector, if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) mod_build_hf_vsif_infopacket(stream, &stream->vsp_infopacket); - if (stream->link->psr_settings.psr_feature_enabled || stream->link->replay_settings.replay_feature_enabled) { + if (stream->signal == SIGNAL_TYPE_DISPLAY_PORT || + stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST || + stream->signal == SIGNAL_TYPE_EDP) { // // should decide stream support vsc sdp colorimetry capability // before building vsc info packet @@ -6328,7 +6330,8 @@ create_stream_for_sink(struct drm_connector *connector, stream->use_vsc_sdp_for_colorimetry = aconnector->dc_sink->is_vsc_sdp_colorimetry_supported; } else { - if (stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED) + if (stream->link->dpcd_caps.dpcd_rev.raw >= 0x14 && + stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED) stream->use_vsc_sdp_for_colorimetry = true; } if (stream->out_transfer_func->tf == TRANSFER_FUNCTION_GAMMA22) -- cgit v1.2.3 From c3e2a5f2da904a18661335e8be2b961738574998 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Thu, 21 Mar 2024 11:13:38 -0400 Subject: drm/amd/display: Set VSC SDP Colorimetry same way for MST and SST The previous check for the is_vsc_sdp_colorimetry_supported flag for MST sink signals did nothing. Simplify the code and use the same check for MST and SST. Cc: stable@vger.kernel.org Reviewed-by: Agustin Gutierrez Acked-by: Hamza Mahfooz Signed-off-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index d112ce6c812c..6d2f60c61dec 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6325,15 +6325,9 @@ create_stream_for_sink(struct drm_connector *connector, // should decide stream support vsc sdp colorimetry capability // before building vsc info packet // - stream->use_vsc_sdp_for_colorimetry = false; - if (aconnector->dc_sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT_MST) { - stream->use_vsc_sdp_for_colorimetry = - aconnector->dc_sink->is_vsc_sdp_colorimetry_supported; - } else { - if (stream->link->dpcd_caps.dpcd_rev.raw >= 0x14 && - stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED) - stream->use_vsc_sdp_for_colorimetry = true; - } + stream->use_vsc_sdp_for_colorimetry = stream->link->dpcd_caps.dpcd_rev.raw >= 0x14 && + stream->link->dpcd_caps.dprx_feature.bits.VSC_SDP_COLORIMETRY_SUPPORTED; + if (stream->out_transfer_func->tf == TRANSFER_FUNCTION_GAMMA22) tf = TRANSFER_FUNC_GAMMA_22; mod_build_vsc_infopacket(stream, &stream->vsc_infopacket, stream->output_color_space, tf); -- cgit v1.2.3 From e047dd448d2bc12b8c30d7e3e6e98cea1fc28a17 Mon Sep 17 00:00:00 2001 From: Zhongwei Date: Wed, 27 Mar 2024 13:49:40 +0800 Subject: drm/amd/display: Adjust dprefclk by down spread percentage. [Why] OLED panels show no display for large vtotal timings. [How] Check if ss is enabled and read from lut for spread spectrum percentage. Adjust dprefclk as required. DP_DTO adjustment is for edp only. Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Hamza Mahfooz Signed-off-by: Zhongwei Signed-off-by: Alex Deucher --- .../amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 50 ++++++++++++++++++++++ .../gpu/drm/amd/display/dc/dce/dce_clock_source.c | 8 ++-- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c index 101fe96287cb..d9c5692c86c2 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c @@ -73,6 +73,12 @@ #define CLK1_CLK2_BYPASS_CNTL__CLK2_BYPASS_SEL_MASK 0x00000007L #define CLK1_CLK2_BYPASS_CNTL__CLK2_BYPASS_DIV_MASK 0x000F0000L +#define regCLK5_0_CLK5_spll_field_8 0x464b +#define regCLK5_0_CLK5_spll_field_8_BASE_IDX 0 + +#define CLK5_0_CLK5_spll_field_8__spll_ssc_en__SHIFT 0xd +#define CLK5_0_CLK5_spll_field_8__spll_ssc_en_MASK 0x00002000L + #define SMU_VER_THRESHOLD 0x5D4A00 //93.74.0 #define REG(reg_name) \ @@ -411,6 +417,17 @@ static void dcn35_dump_clk_registers(struct clk_state_registers_and_bypass *regs { } +static bool dcn35_is_spll_ssc_enabled(struct clk_mgr *clk_mgr_base) +{ + struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base); + struct dc_context *ctx = clk_mgr->base.ctx; + uint32_t ssc_enable; + + REG_GET(CLK5_0_CLK5_spll_field_8, spll_ssc_en, &ssc_enable); + + return ssc_enable == 1; +} + static void init_clk_states(struct clk_mgr *clk_mgr) { struct clk_mgr_internal *clk_mgr_int = TO_CLK_MGR_INTERNAL(clk_mgr); @@ -428,7 +445,16 @@ static void init_clk_states(struct clk_mgr *clk_mgr) void dcn35_init_clocks(struct clk_mgr *clk_mgr) { + struct clk_mgr_internal *clk_mgr_int = TO_CLK_MGR_INTERNAL(clk_mgr); init_clk_states(clk_mgr); + + // to adjust dp_dto reference clock if ssc is enable otherwise to apply dprefclk + if (dcn35_is_spll_ssc_enabled(clk_mgr)) + clk_mgr->dp_dto_source_clock_in_khz = + dce_adjust_dp_ref_freq_for_ss(clk_mgr_int, clk_mgr->dprefclk_khz); + else + clk_mgr->dp_dto_source_clock_in_khz = clk_mgr->dprefclk_khz; + } static struct clk_bw_params dcn35_bw_params = { .vram_type = Ddr4MemType, @@ -517,6 +543,28 @@ static DpmClocks_t_dcn35 dummy_clocks; static struct dcn35_watermarks dummy_wms = { 0 }; +static struct dcn35_ss_info_table ss_info_table = { + .ss_divider = 1000, + .ss_percentage = {0, 0, 375, 375, 375} +}; + +static void dcn35_read_ss_info_from_lut(struct clk_mgr_internal *clk_mgr) +{ + struct dc_context *ctx = clk_mgr->base.ctx; + uint32_t clock_source; + + REG_GET(CLK1_CLK2_BYPASS_CNTL, CLK2_BYPASS_SEL, &clock_source); + // If it's DFS mode, clock_source is 0. + if (dcn35_is_spll_ssc_enabled(&clk_mgr->base) && (clock_source < ARRAY_SIZE(ss_info_table.ss_percentage))) { + clk_mgr->dprefclk_ss_percentage = ss_info_table.ss_percentage[clock_source]; + + if (clk_mgr->dprefclk_ss_percentage != 0) { + clk_mgr->ss_on_dprefclk = true; + clk_mgr->dprefclk_ss_divider = ss_info_table.ss_divider; + } + } +} + static void dcn35_build_watermark_ranges(struct clk_bw_params *bw_params, struct dcn35_watermarks *table) { int i, num_valid_sets; @@ -1061,6 +1109,8 @@ void dcn35_clk_mgr_construct( dce_clock_read_ss_info(&clk_mgr->base); /*when clk src is from FCH, it could have ss, same clock src as DPREF clk*/ + dcn35_read_ss_info_from_lut(&clk_mgr->base); + clk_mgr->base.base.bw_params = &dcn35_bw_params; if (clk_mgr->base.base.ctx->dc->debug.pstate_enabled) { diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c b/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c index 970644b695cd..b5e0289d2fe8 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c @@ -976,7 +976,10 @@ static bool dcn31_program_pix_clk( struct bp_pixel_clock_parameters bp_pc_params = {0}; enum transmitter_color_depth bp_pc_colour_depth = TRANSMITTER_COLOR_DEPTH_24; - if (clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz != 0) + // Apply ssed(spread spectrum) dpref clock for edp only. + if (clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz != 0 + && pix_clk_params->signal_type == SIGNAL_TYPE_EDP + && encoding == DP_8b_10b_ENCODING) dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz; // For these signal types Driver to program DP_DTO without calling VBIOS Command table if (dc_is_dp_signal(pix_clk_params->signal_type) || dc_is_virtual_signal(pix_clk_params->signal_type)) { @@ -1093,9 +1096,6 @@ static bool get_pixel_clk_frequency_100hz( unsigned int modulo_hz = 0; unsigned int dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dprefclk_khz; - if (clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz != 0) - dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz; - if (clock_source->id == CLOCK_SOURCE_ID_DP_DTO) { clock_hz = REG_READ(PHASE[inst]); -- cgit v1.2.3 From 6dba20d23e85034901ccb765a7ca71199bcca4df Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Sun, 7 Apr 2024 22:01:35 +0800 Subject: drm/amdgpu: differentiate external rev id for gfx 11.5.0 This patch to differentiate external rev id for gfx 11.5.0. Signed-off-by: Yifan Zhang Reviewed-by: Tim Huang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc21.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index abe319b0f063..43ca63fe85ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -720,7 +720,10 @@ static int soc21_common_early_init(void *handle) AMD_PG_SUPPORT_VCN | AMD_PG_SUPPORT_JPEG | AMD_PG_SUPPORT_GFX_PG; - adev->external_rev_id = adev->rev_id + 0x1; + if (adev->rev_id == 0) + adev->external_rev_id = 0x1; + else + adev->external_rev_id = adev->rev_id + 0x10; break; case IP_VERSION(11, 5, 1): adev->cg_flags = -- cgit v1.2.3 From dec8ced871e17eea46f097542dd074d022be4bd1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 5 Mar 2024 22:10:03 -0800 Subject: perf/x86: Fix out of range data On x86 each struct cpu_hw_events maintains a table for counter assignment but it missed to update one for the deleted event in x86_pmu_del(). This can make perf_clear_dirty_counters() reset used counter if it's called before event scheduling or enabling. Then it would return out of range data which doesn't make sense. The following code can reproduce the problem. $ cat repro.c #include #include #include #include #include #include #include #include struct perf_event_attr attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, .disabled = 1, }; void *worker(void *arg) { int cpu = (long)arg; int fd1 = syscall(SYS_perf_event_open, &attr, -1, cpu, -1, 0); int fd2 = syscall(SYS_perf_event_open, &attr, -1, cpu, -1, 0); void *p; do { ioctl(fd1, PERF_EVENT_IOC_ENABLE, 0); p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd1, 0); ioctl(fd2, PERF_EVENT_IOC_ENABLE, 0); ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0); munmap(p, 4096); ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0); } while (1); return NULL; } int main(void) { int i; int n = sysconf(_SC_NPROCESSORS_ONLN); pthread_t *th = calloc(n, sizeof(*th)); for (i = 0; i < n; i++) pthread_create(&th[i], NULL, worker, (void *)(long)i); for (i = 0; i < n; i++) pthread_join(th[i], NULL); free(th); return 0; } And you can see the out of range data using perf stat like this. Probably it'd be easier to see on a large machine. $ gcc -o repro repro.c -pthread $ ./repro & $ sudo perf stat -A -I 1000 2>&1 | awk '{ if (length($3) > 15) print }' 1.001028462 CPU6 196,719,295,683,763 cycles # 194290.996 GHz (71.54%) 1.001028462 CPU3 396,077,485,787,730 branch-misses # 15804359784.80% of all branches (71.07%) 1.001028462 CPU17 197,608,350,727,877 branch-misses # 14594186554.56% of all branches (71.22%) 2.020064073 CPU4 198,372,472,612,140 cycles # 194681.113 GHz (70.95%) 2.020064073 CPU6 199,419,277,896,696 cycles # 195720.007 GHz (70.57%) 2.020064073 CPU20 198,147,174,025,639 cycles # 194474.654 GHz (71.03%) 2.020064073 CPU20 198,421,240,580,145 stalled-cycles-frontend # 100.14% frontend cycles idle (70.93%) 3.037443155 CPU4 197,382,689,923,416 cycles # 194043.065 GHz (71.30%) 3.037443155 CPU20 196,324,797,879,414 cycles # 193003.773 GHz (71.69%) 3.037443155 CPU5 197,679,956,608,205 stalled-cycles-backend # 1315606428.66% backend cycles idle (71.19%) 3.037443155 CPU5 198,571,860,474,851 instructions # 13215422.58 insn per cycle It should move the contents in the cpuc->assign as well. Fixes: 5471eea5d3bf ("perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task") Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240306061003.1894224-1-namhyung@kernel.org --- arch/x86/events/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 09050641ce5d..5b0dd07b1ef1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1644,6 +1644,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; + cpuc->assign[i-1] = cpuc->assign[i]; } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; -- cgit v1.2.3 From 04f4230e2f86a4e961ea5466eda3db8c1762004d Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Tue, 9 Apr 2024 16:08:05 -0700 Subject: x86/bugs: Fix return type of spectre_bhi_state() The definition of spectre_bhi_state() incorrectly returns a const char * const. This causes the a compiler warning when building with W=1: warning: type qualifiers ignored on function return type [-Wignored-qualifiers] 2812 | static const char * const spectre_bhi_state(void) Remove the const qualifier from the pointer. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Reported-by: Sean Christopherson Signed-off-by: Daniel Sneddon Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240409230806.1545822-1-daniel.sneddon@linux.intel.com --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 295463707e68..27f5004a8e24 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2809,7 +2809,7 @@ static char *pbrsb_eibrs_state(void) } } -static const char * const spectre_bhi_state(void) +static const char *spectre_bhi_state(void) { if (!boot_cpu_has_bug(X86_BUG_BHI)) return "; BHI: Not affected"; -- cgit v1.2.3 From 81665adf25d28a00a986533f1d3a5df76b79cad9 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Mon, 8 Apr 2024 09:35:40 -0700 Subject: pds_core: Fix pdsc_check_pci_health function to use work thread When the driver notices fw_status == 0xff it tries to perform a PCI reset on itself via pci_reset_function() in the context of the driver's health thread. However, pdsc_reset_prepare calls pdsc_stop_health_thread(), which attempts to stop/flush the health thread. This results in a deadlock because the stop/flush will never complete since the driver called pci_reset_function() from the health thread context. Fix by changing the pdsc_check_pci_health_function() to queue a newly introduced pdsc_pci_reset_thread() on the pdsc's work queue. Unloading the driver in the fw_down/dead state uncovered another issue, which can be seen in the following trace: WARNING: CPU: 51 PID: 6914 at kernel/workqueue.c:1450 __queue_work+0x358/0x440 [...] RIP: 0010:__queue_work+0x358/0x440 [...] Call Trace: ? __warn+0x85/0x140 ? __queue_work+0x358/0x440 ? report_bug+0xfc/0x1e0 ? handle_bug+0x3f/0x70 ? exc_invalid_op+0x17/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? __queue_work+0x358/0x440 queue_work_on+0x28/0x30 pdsc_devcmd_locked+0x96/0xe0 [pds_core] pdsc_devcmd_reset+0x71/0xb0 [pds_core] pdsc_teardown+0x51/0xe0 [pds_core] pdsc_remove+0x106/0x200 [pds_core] pci_device_remove+0x37/0xc0 device_release_driver_internal+0xae/0x140 driver_detach+0x48/0x90 bus_remove_driver+0x6d/0xf0 pci_unregister_driver+0x2e/0xa0 pdsc_cleanup_module+0x10/0x780 [pds_core] __x64_sys_delete_module+0x142/0x2b0 ? syscall_trace_enter.isra.18+0x126/0x1a0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fbd9d03a14b [...] Fix this by preventing the devcmd reset if the FW is not running. Fixes: d9407ff11809 ("pds_core: Prevent health thread from running during reset/remove") Reviewed-by: Shannon Nelson Signed-off-by: Brett Creeley Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/pds_core/core.c | 13 ++++++++++++- drivers/net/ethernet/amd/pds_core/core.h | 2 ++ drivers/net/ethernet/amd/pds_core/dev.c | 3 +++ drivers/net/ethernet/amd/pds_core/main.c | 1 + 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c index 9662ee72814c..536635e57727 100644 --- a/drivers/net/ethernet/amd/pds_core/core.c +++ b/drivers/net/ethernet/amd/pds_core/core.c @@ -593,6 +593,16 @@ err_out: pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY); } +void pdsc_pci_reset_thread(struct work_struct *work) +{ + struct pdsc *pdsc = container_of(work, struct pdsc, pci_reset_work); + struct pci_dev *pdev = pdsc->pdev; + + pci_dev_get(pdev); + pci_reset_function(pdev); + pci_dev_put(pdev); +} + static void pdsc_check_pci_health(struct pdsc *pdsc) { u8 fw_status; @@ -607,7 +617,8 @@ static void pdsc_check_pci_health(struct pdsc *pdsc) if (fw_status != PDS_RC_BAD_PCI) return; - pci_reset_function(pdsc->pdev); + /* prevent deadlock between pdsc_reset_prepare and pdsc_health_thread */ + queue_work(pdsc->wq, &pdsc->pci_reset_work); } void pdsc_health_thread(struct work_struct *work) diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h index 92d7657dd614..a3e17a0c187a 100644 --- a/drivers/net/ethernet/amd/pds_core/core.h +++ b/drivers/net/ethernet/amd/pds_core/core.h @@ -197,6 +197,7 @@ struct pdsc { struct pdsc_qcq notifyqcq; u64 last_eid; struct pdsc_viftype *viftype_status; + struct work_struct pci_reset_work; }; /** enum pds_core_dbell_bits - bitwise composition of dbell values. @@ -313,5 +314,6 @@ int pdsc_firmware_update(struct pdsc *pdsc, const struct firmware *fw, void pdsc_fw_down(struct pdsc *pdsc); void pdsc_fw_up(struct pdsc *pdsc); +void pdsc_pci_reset_thread(struct work_struct *work); #endif /* _PDSC_H_ */ diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c index e494e1298dc9..495ef4ef8c10 100644 --- a/drivers/net/ethernet/amd/pds_core/dev.c +++ b/drivers/net/ethernet/amd/pds_core/dev.c @@ -229,6 +229,9 @@ int pdsc_devcmd_reset(struct pdsc *pdsc) .reset.opcode = PDS_CORE_CMD_RESET, }; + if (!pdsc_is_fw_running(pdsc)) + return 0; + return pdsc_devcmd(pdsc, &cmd, &comp, pdsc->devcmd_timeout); } diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c index ab6133e7db42..660268ff9562 100644 --- a/drivers/net/ethernet/amd/pds_core/main.c +++ b/drivers/net/ethernet/amd/pds_core/main.c @@ -239,6 +239,7 @@ static int pdsc_init_pf(struct pdsc *pdsc) snprintf(wq_name, sizeof(wq_name), "%s.%d", PDS_CORE_DRV_NAME, pdsc->uid); pdsc->wq = create_singlethread_workqueue(wq_name); INIT_WORK(&pdsc->health_work, pdsc_health_thread); + INIT_WORK(&pdsc->pci_reset_work, pdsc_pci_reset_thread); timer_setup(&pdsc->wdtimer, pdsc_wdtimer_cb, 0); pdsc->wdtimer_period = PDSC_WATCHDOG_SECS * HZ; -- cgit v1.2.3 From c38fa07dc69f0b9e6f43ecab96dc7861a70c827c Mon Sep 17 00:00:00 2001 From: Gil Fine Date: Fri, 1 Mar 2024 15:22:53 +0200 Subject: thunderbolt: Fix wake configurations after device unplug Currently we don't configure correctly the wake events after unplug of device router. What can happen is that the downstream ports of host router will be configured to wake on: USB4-wake and wake-on-disconnect, but not on wake-on-connect. This may cause the later plugged device not to wake the domain and fail in enumeration. Fix this by clearing downstream port's "USB4 Port is Configured" bit, after unplug of a device router. Signed-off-by: Gil Fine Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/switch.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 6ffc4e81ffed..4edfd6e34e31 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -3180,22 +3180,29 @@ void tb_switch_unconfigure_link(struct tb_switch *sw) { struct tb_port *up, *down; - if (sw->is_unplugged) - return; if (!tb_route(sw) || tb_switch_is_icm(sw)) return; + /* + * Unconfigure downstream port so that wake-on-connect can be + * configured after router unplug. No need to unconfigure upstream port + * since its router is unplugged. + */ up = tb_upstream_port(sw); - if (tb_switch_is_usb4(up->sw)) - usb4_port_unconfigure(up); - else - tb_lc_unconfigure_port(up); - down = up->remote; if (tb_switch_is_usb4(down->sw)) usb4_port_unconfigure(down); else tb_lc_unconfigure_port(down); + + if (sw->is_unplugged) + return; + + up = tb_upstream_port(sw); + if (tb_switch_is_usb4(up->sw)) + usb4_port_unconfigure(up); + else + tb_lc_unconfigure_port(up); } static void tb_switch_credits_init(struct tb_switch *sw) -- cgit v1.2.3 From dcd12acaf384c30437fa5a9a1f71df06fc9835fd Mon Sep 17 00:00:00 2001 From: Gil Fine Date: Fri, 1 Mar 2024 15:11:18 +0200 Subject: thunderbolt: Avoid notify PM core about runtime PM resume Currently we notify PM core about occurred wakes after any resume. This is not actually needed after resume from runtime suspend. Hence, notify PM core about occurred wakes only after resume from system sleep. Also, if the wake occurred in USB4 router upstream port, we don't notify the PM core about it since it is not actually needed and can cause unexpected autowake (e.g. if /sys/power/wakeup_count is used). While there add the missing kernel-doc for tb_switch_resume(). Signed-off-by: Gil Fine Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/switch.c | 27 +++++++++++++++++++++++++-- drivers/thunderbolt/tb.c | 4 ++-- drivers/thunderbolt/tb.h | 3 ++- drivers/thunderbolt/usb4.c | 13 +++++++------ 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 4edfd6e34e31..326433df5880 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -3448,7 +3448,26 @@ static int tb_switch_set_wake(struct tb_switch *sw, unsigned int flags) return tb_lc_set_wake(sw, flags); } -int tb_switch_resume(struct tb_switch *sw) +static void tb_switch_check_wakes(struct tb_switch *sw) +{ + if (device_may_wakeup(&sw->dev)) { + if (tb_switch_is_usb4(sw)) + usb4_switch_check_wakes(sw); + } +} + +/** + * tb_switch_resume() - Resume a switch after sleep + * @sw: Switch to resume + * @runtime: Is this resume from runtime suspend or system sleep + * + * Resumes and re-enumerates router (and all its children), if still plugged + * after suspend. Don't enumerate device router whose UID was changed during + * suspend. If this is resume from system sleep, notifies PM core about the + * wakes occurred during suspend. Disables all wakes, except USB4 wake of + * upstream port for USB4 routers that shall be always enabled. + */ +int tb_switch_resume(struct tb_switch *sw, bool runtime) { struct tb_port *port; int err; @@ -3497,6 +3516,9 @@ int tb_switch_resume(struct tb_switch *sw) if (err) return err; + if (!runtime) + tb_switch_check_wakes(sw); + /* Disable wakes */ tb_switch_set_wake(sw, 0); @@ -3526,7 +3548,8 @@ int tb_switch_resume(struct tb_switch *sw) */ if (tb_port_unlock(port)) tb_port_warn(port, "failed to unlock port\n"); - if (port->remote && tb_switch_resume(port->remote->sw)) { + if (port->remote && + tb_switch_resume(port->remote->sw, runtime)) { tb_port_warn(port, "lost during suspend, disconnecting\n"); tb_sw_set_unplugged(port->remote->sw); diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c index 360cb95f39aa..3e44c78ac409 100644 --- a/drivers/thunderbolt/tb.c +++ b/drivers/thunderbolt/tb.c @@ -2942,7 +2942,7 @@ static int tb_resume_noirq(struct tb *tb) if (!tb_switch_is_usb4(tb->root_switch)) tb_switch_reset(tb->root_switch); - tb_switch_resume(tb->root_switch); + tb_switch_resume(tb->root_switch, false); tb_free_invalid_tunnels(tb); tb_free_unplugged_children(tb->root_switch); tb_restore_children(tb->root_switch); @@ -3068,7 +3068,7 @@ static int tb_runtime_resume(struct tb *tb) struct tb_tunnel *tunnel, *n; mutex_lock(&tb->lock); - tb_switch_resume(tb->root_switch); + tb_switch_resume(tb->root_switch, true); tb_free_invalid_tunnels(tb); tb_restore_children(tb->root_switch); list_for_each_entry_safe(tunnel, n, &tcm->tunnel_list, list) diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index feed8ecaf712..18aae4ccaed5 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -827,7 +827,7 @@ int tb_switch_configuration_valid(struct tb_switch *sw); int tb_switch_add(struct tb_switch *sw); void tb_switch_remove(struct tb_switch *sw); void tb_switch_suspend(struct tb_switch *sw, bool runtime); -int tb_switch_resume(struct tb_switch *sw); +int tb_switch_resume(struct tb_switch *sw, bool runtime); int tb_switch_reset(struct tb_switch *sw); int tb_switch_wait_for_bit(struct tb_switch *sw, u32 offset, u32 bit, u32 value, int timeout_msec); @@ -1288,6 +1288,7 @@ static inline bool tb_switch_is_usb4(const struct tb_switch *sw) return usb4_switch_version(sw) > 0; } +void usb4_switch_check_wakes(struct tb_switch *sw); int usb4_switch_setup(struct tb_switch *sw); int usb4_switch_configuration_valid(struct tb_switch *sw); int usb4_switch_read_uid(struct tb_switch *sw, u64 *uid); diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c index 9860b49d7a2b..78b06e922fda 100644 --- a/drivers/thunderbolt/usb4.c +++ b/drivers/thunderbolt/usb4.c @@ -155,7 +155,13 @@ static inline int usb4_switch_op_data(struct tb_switch *sw, u16 opcode, tx_dwords, rx_data, rx_dwords); } -static void usb4_switch_check_wakes(struct tb_switch *sw) +/** + * usb4_switch_check_wakes() - Check for wakes and notify PM core about them + * @sw: Router whose wakes to check + * + * Checks wakes occurred during suspend and notify the PM core about them. + */ +void usb4_switch_check_wakes(struct tb_switch *sw) { bool wakeup_usb4 = false; struct usb4_port *usb4; @@ -163,9 +169,6 @@ static void usb4_switch_check_wakes(struct tb_switch *sw) bool wakeup = false; u32 val; - if (!device_may_wakeup(&sw->dev)) - return; - if (tb_route(sw)) { if (tb_sw_read(sw, &val, TB_CFG_SWITCH, ROUTER_CS_6, 1)) return; @@ -244,8 +247,6 @@ int usb4_switch_setup(struct tb_switch *sw) u32 val = 0; int ret; - usb4_switch_check_wakes(sw); - if (!tb_route(sw)) return 0; -- cgit v1.2.3 From f87cbcb345d059f0377b4fa0ba1b766a17fc3710 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Apr 2024 12:29:12 +0200 Subject: timekeeping: Use READ/WRITE_ONCE() for tick_do_timer_cpu tick_do_timer_cpu is used lockless to check which CPU needs to take care of the per tick timekeeping duty. This is done to avoid a thundering herd problem on jiffies_lock. The read and writes are not annotated so KCSAN complains about data races: BUG: KCSAN: data-race in tick_nohz_idle_stop_tick / tick_nohz_next_event write to 0xffffffff8a2bda30 of 4 bytes by task 0 on cpu 26: tick_nohz_idle_stop_tick+0x3b1/0x4a0 do_idle+0x1e3/0x250 read to 0xffffffff8a2bda30 of 4 bytes by task 0 on cpu 16: tick_nohz_next_event+0xe7/0x1e0 tick_nohz_get_sleep_length+0xa7/0xe0 menu_select+0x82/0xb90 cpuidle_select+0x44/0x60 do_idle+0x1c2/0x250 value changed: 0x0000001a -> 0xffffffff Annotate them with READ/WRITE_ONCE() to document the intentional data race. Reported-by: Mirsad Todorovac Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Tested-by: Sean Anderson Link: https://lore.kernel.org/r/87cyqy7rt3.ffs@tglx --- kernel/time/tick-common.c | 17 +++++++++-------- kernel/time/tick-sched.c | 36 ++++++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index fb0fdec8719a..d88b13076b79 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -7,6 +7,7 @@ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ +#include #include #include #include @@ -84,7 +85,7 @@ int tick_is_oneshot_available(void) */ static void tick_periodic(int cpu) { - if (tick_do_timer_cpu == cpu) { + if (READ_ONCE(tick_do_timer_cpu) == cpu) { raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); @@ -215,8 +216,8 @@ static void tick_setup_device(struct tick_device *td, * If no cpu took the do_timer update, assign it to * this cpu: */ - if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - tick_do_timer_cpu = cpu; + if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) { + WRITE_ONCE(tick_do_timer_cpu, cpu); tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* @@ -232,7 +233,7 @@ static void tick_setup_device(struct tick_device *td, !tick_nohz_full_cpu(cpu)) { tick_take_do_timer_from_boot(); tick_do_timer_boot_cpu = -1; - WARN_ON(tick_do_timer_cpu != cpu); + WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu); #endif } @@ -406,10 +407,10 @@ void tick_assert_timekeeping_handover(void) int tick_cpu_dying(unsigned int dying_cpu) { /* - * If the current CPU is the timekeeper, it's the only one that - * can safely hand over its duty. Also all online CPUs are in - * stop machine, guaranteed not to be idle, therefore it's safe - * to pick any online successor. + * If the current CPU is the timekeeper, it's the only one that can + * safely hand over its duty. Also all online CPUs are in stop + * machine, guaranteed not to be idle, therefore there is no + * concurrency and it's safe to pick any online successor. */ if (tick_do_timer_cpu == dying_cpu) tick_do_timer_cpu = cpumask_first(cpu_online_mask); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1331216a9cae..71a792cd8936 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -8,6 +8,7 @@ * * Started by: Thomas Gleixner and Ingo Molnar */ +#include #include #include #include @@ -204,7 +205,7 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts, static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { - int cpu = smp_processor_id(); + int tick_cpu, cpu = smp_processor_id(); /* * Check if the do_timer duty was dropped. We don't care about @@ -216,16 +217,18 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) * If nohz_full is enabled, this should not happen because the * 'tick_do_timer_cpu' CPU never relinquishes. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && - unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { + tick_cpu = READ_ONCE(tick_do_timer_cpu); + + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif - tick_do_timer_cpu = cpu; + WRITE_ONCE(tick_do_timer_cpu, cpu); + tick_cpu = cpu; } /* Check if jiffies need an update */ - if (tick_do_timer_cpu == cpu) + if (tick_cpu == cpu) tick_do_update_jiffies64(now); /* @@ -610,7 +613,7 @@ bool tick_nohz_cpu_hotpluggable(unsigned int cpu) * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ - if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu) return false; return true; } @@ -891,6 +894,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, delta, expires; unsigned long basejiff; + int tick_cpu; basemono = get_jiffies_update(&basejiff); ts->last_jiffies = basejiff; @@ -947,9 +951,9 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); - if (cpu != tick_do_timer_cpu && - (tick_do_timer_cpu != TICK_DO_TIMER_NONE || - !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) + tick_cpu = READ_ONCE(tick_do_timer_cpu); + if (tick_cpu != cpu && + (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) delta = KTIME_MAX; /* Calculate the next expiry time */ @@ -970,6 +974,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) unsigned long basejiff = ts->last_jiffies; u64 basemono = ts->timer_expires_base; bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); + int tick_cpu; u64 expires; /* Make sure we won't be trying to stop it twice in a row. */ @@ -1007,10 +1012,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ - if (cpu == tick_do_timer_cpu) { - tick_do_timer_cpu = TICK_DO_TIMER_NONE; + tick_cpu = READ_ONCE(tick_do_timer_cpu); + if (tick_cpu == cpu) { + WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE); tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); - } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + } else if (tick_cpu != TICK_DO_TIMER_NONE) { tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); } @@ -1173,15 +1179,17 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; if (tick_nohz_full_enabled()) { + int tick_cpu = READ_ONCE(tick_do_timer_cpu); + /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ - if (tick_do_timer_cpu == cpu) + if (tick_cpu == cpu) return false; /* Should not happen for nohz-full */ - if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) return false; } -- cgit v1.2.3 From 19fa4f2a85d777a8052e869c1b892a2f7556569d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 8 Apr 2024 20:47:40 +0200 Subject: r8169: fix LED-related deadlock on module removal Binding devm_led_classdev_register() to the netdev is problematic because on module removal we get a RTNL-related deadlock. Fix this by avoiding the device-managed LED functions. Note: We can safely call led_classdev_unregister() for a LED even if registering it failed, because led_classdev_unregister() detects this and is a no-op in this case. Fixes: 18764b883e15 ("r8169: add support for LED's on RTL8168/RTL8101") Cc: stable@vger.kernel.org Reported-by: Lukas Wunner Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.h | 6 ++++-- drivers/net/ethernet/realtek/r8169_leds.c | 35 ++++++++++++++++++++----------- drivers/net/ethernet/realtek/r8169_main.c | 8 +++++-- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index 4c043052198d..00882ffc7a02 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -73,6 +73,7 @@ enum mac_version { }; struct rtl8169_private; +struct r8169_led_classdev; void r8169_apply_firmware(struct rtl8169_private *tp); u16 rtl8168h_2_get_adc_bias_ioffset(struct rtl8169_private *tp); @@ -84,7 +85,8 @@ void r8169_get_led_name(struct rtl8169_private *tp, int idx, char *buf, int buf_len); int rtl8168_get_led_mode(struct rtl8169_private *tp); int rtl8168_led_mod_ctrl(struct rtl8169_private *tp, u16 mask, u16 val); -void rtl8168_init_leds(struct net_device *ndev); +struct r8169_led_classdev *rtl8168_init_leds(struct net_device *ndev); int rtl8125_get_led_mode(struct rtl8169_private *tp, int index); int rtl8125_set_led_mode(struct rtl8169_private *tp, int index, u16 mode); -void rtl8125_init_leds(struct net_device *ndev); +struct r8169_led_classdev *rtl8125_init_leds(struct net_device *ndev); +void r8169_remove_leds(struct r8169_led_classdev *leds); diff --git a/drivers/net/ethernet/realtek/r8169_leds.c b/drivers/net/ethernet/realtek/r8169_leds.c index 7c5dc9d0df85..e10bee706bc6 100644 --- a/drivers/net/ethernet/realtek/r8169_leds.c +++ b/drivers/net/ethernet/realtek/r8169_leds.c @@ -146,22 +146,22 @@ static void rtl8168_setup_ldev(struct r8169_led_classdev *ldev, led_cdev->hw_control_get_device = r8169_led_hw_control_get_device; /* ignore errors */ - devm_led_classdev_register(&ndev->dev, led_cdev); + led_classdev_register(&ndev->dev, led_cdev); } -void rtl8168_init_leds(struct net_device *ndev) +struct r8169_led_classdev *rtl8168_init_leds(struct net_device *ndev) { - /* bind resource mgmt to netdev */ - struct device *dev = &ndev->dev; struct r8169_led_classdev *leds; int i; - leds = devm_kcalloc(dev, RTL8168_NUM_LEDS, sizeof(*leds), GFP_KERNEL); + leds = kcalloc(RTL8168_NUM_LEDS + 1, sizeof(*leds), GFP_KERNEL); if (!leds) - return; + return NULL; for (i = 0; i < RTL8168_NUM_LEDS; i++) rtl8168_setup_ldev(leds + i, ndev, i); + + return leds; } static int rtl8125_led_hw_control_is_supported(struct led_classdev *led_cdev, @@ -245,20 +245,31 @@ static void rtl8125_setup_led_ldev(struct r8169_led_classdev *ldev, led_cdev->hw_control_get_device = r8169_led_hw_control_get_device; /* ignore errors */ - devm_led_classdev_register(&ndev->dev, led_cdev); + led_classdev_register(&ndev->dev, led_cdev); } -void rtl8125_init_leds(struct net_device *ndev) +struct r8169_led_classdev *rtl8125_init_leds(struct net_device *ndev) { - /* bind resource mgmt to netdev */ - struct device *dev = &ndev->dev; struct r8169_led_classdev *leds; int i; - leds = devm_kcalloc(dev, RTL8125_NUM_LEDS, sizeof(*leds), GFP_KERNEL); + leds = kcalloc(RTL8125_NUM_LEDS + 1, sizeof(*leds), GFP_KERNEL); if (!leds) - return; + return NULL; for (i = 0; i < RTL8125_NUM_LEDS; i++) rtl8125_setup_led_ldev(leds + i, ndev, i); + + return leds; +} + +void r8169_remove_leds(struct r8169_led_classdev *leds) +{ + if (!leds) + return; + + for (struct r8169_led_classdev *l = leds; l->ndev; l++) + led_classdev_unregister(&l->led); + + kfree(leds); } diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 6f1e6f386b7b..8a27328eae34 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -647,6 +647,8 @@ struct rtl8169_private { const char *fw_name; struct rtl_fw *rtl_fw; + struct r8169_led_classdev *leds; + u32 ocp_base; }; @@ -5044,6 +5046,8 @@ static void rtl_remove_one(struct pci_dev *pdev) cancel_work_sync(&tp->wk.work); + r8169_remove_leds(tp->leds); + unregister_netdev(tp->dev); if (tp->dash_type != RTL_DASH_NONE) @@ -5501,9 +5505,9 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (IS_ENABLED(CONFIG_R8169_LEDS)) { if (rtl_is_8125(tp)) - rtl8125_init_leds(dev); + tp->leds = rtl8125_init_leds(dev); else if (tp->mac_version > RTL_GIGA_MAC_VER_06) - rtl8168_init_leds(dev); + tp->leds = rtl8168_init_leds(dev); } netdev_info(dev, "%s, %pM, XID %03x, IRQ %d\n", -- cgit v1.2.3 From 3bbb331c1d34fdd5520a050fce35f71579430485 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 13 Mar 2024 10:30:04 +0800 Subject: tools/power/turbostat: Introduce BIC_SAM_mc6/BIC_SAMMHz/BIC_SAMACTMHz Graphics driver (i915/Xe) on mordern platforms splits GFX and SA Media information via different sysfs knobs. Existing BIC_GFX_rc6/BIC_GFXMHz/BIC_GFXACTMHz columns can be reused for GFX. Introduce BIC_SAM_mc6/BIC_SAMMHz/BIC_SAMACTMHz columns for SA Media. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.8 | 12 ++++- tools/power/x86/turbostat/turbostat.c | 91 +++++++++++++++++++++++++++++++++-- 2 files changed, 96 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 567327b004e6..0d3672e5d9ed 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -129,9 +129,17 @@ The system configuration dump (if --quiet is not used) is followed by statistics .PP \fBPkgTmp\fP Degrees Celsius reported by the per-package Package Thermal Monitor. .PP -\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms. +\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms or /sys/class/drm/card0/gt/gt0/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used. .PP -\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz. +\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt_cur_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used. +.PP +\fBGFXAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt_act_freq_mhz or /sys/class/drm/card0/gt/gt0/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used. +.PP +\fBSAM%mc6\fP The percentage of time the SA Media is in the "module C6" state, mc6, during the measurement interval. From /sys/class/drm/card0/gt/gt1/rc6_residency_ms or /sys/class/drm/card0/device/tile0/gtN/gtidle/idle_residency_ms depending on the graphics driver being used. +.PP +\fBSAMMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/cur_freq depending on the graphics driver being used. +.PP +\fBSAMAMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/drm/card0/gt/gt1/rps_act_freq_mhz or /sys/class/drm/card0/device/tile0/gtN/freq0/act_freq depending on the graphics driver being used. .PP \fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states. These numbers are from hardware residency counters. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 016a5c7dc9bf..4fa2810da1a3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -133,6 +133,9 @@ struct msr_counter bic[] = { { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAM%mc6", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAMMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SAMAMHz", "", 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -191,11 +194,14 @@ struct msr_counter bic[] = { #define BIC_IPC (1ULL << 52) #define BIC_CORE_THROT_CNT (1ULL << 53) #define BIC_UNCORE_MHZ (1ULL << 54) +#define BIC_SAM_mc6 (1ULL << 55) +#define BIC_SAMMHz (1ULL << 56) +#define BIC_SAMACTMHz (1ULL << 57) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) -#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX) +#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) +#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -277,6 +283,9 @@ enum gfx_sysfs_idx { GFX_rc6, GFX_MHz, GFX_ACTMHz, + SAM_mc6, + SAM_MHz, + SAM_ACTMHz, GFX_MAX }; @@ -1193,6 +1202,9 @@ struct pkg_data { long long gfx_rc6_ms; unsigned int gfx_mhz; unsigned int gfx_act_mhz; + long long sam_mc6_ms; + unsigned int sam_mhz; + unsigned int sam_act_mhz; unsigned int package_id; struct rapl_counter energy_pkg; /* MSR_PKG_ENERGY_STATUS */ struct rapl_counter energy_dram; /* MSR_DRAM_ENERGY_STATUS */ @@ -1844,6 +1856,15 @@ void print_header(char *delim) if (DO_BIC(BIC_GFXACTMHz)) outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_SAM_mc6)) + outp += sprintf(outp, "%sSAM%%mc6", (printed++ ? delim : "")); + + if (DO_BIC(BIC_SAMMHz)) + outp += sprintf(outp, "%sSAMMHz", (printed++ ? delim : "")); + + if (DO_BIC(BIC_SAMACTMHz)) + outp += sprintf(outp, "%sSAMAMHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : "")); if (DO_BIC(BIC_Any_c0)) @@ -2251,6 +2272,24 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_GFXACTMHz)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz); + /* SAMmc6 */ + if (DO_BIC(BIC_SAM_mc6)) { + if (p->sam_mc6_ms == -1) { /* detect GFX counter reset */ + outp += sprintf(outp, "%s**.**", (printed++ ? delim : "")); + } else { + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + p->sam_mc6_ms / 10.0 / interval_float); + } + } + + /* SAMMHz */ + if (DO_BIC(BIC_SAMMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_mhz); + + /* SAMACTMHz */ + if (DO_BIC(BIC_SAMACTMHz)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_act_mhz); + /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */ if (DO_BIC(BIC_Totl_c0)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc); @@ -2437,6 +2476,15 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->gfx_mhz = new->gfx_mhz; old->gfx_act_mhz = new->gfx_act_mhz; + /* flag an error when mc6 counter resets/wraps */ + if (old->sam_mc6_ms > new->sam_mc6_ms) + old->sam_mc6_ms = -1; + else + old->sam_mc6_ms = new->sam_mc6_ms - old->sam_mc6_ms; + + old->sam_mhz = new->sam_mhz; + old->sam_act_mhz = new->sam_act_mhz; + old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value; old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value; old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value; @@ -2661,6 +2709,9 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->uncore_mhz = 0; p->gfx_mhz = 0; p->gfx_act_mhz = 0; + p->sam_mc6_ms = 0; + p->sam_mhz = 0; + p->sam_act_mhz = 0; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) t->counter[i] = 0; @@ -2775,6 +2826,9 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.uncore_mhz = p->uncore_mhz; average.packages.gfx_mhz = p->gfx_mhz; average.packages.gfx_act_mhz = p->gfx_act_mhz; + average.packages.sam_mc6_ms = p->sam_mc6_ms; + average.packages.sam_mhz = p->sam_mhz; + average.packages.sam_act_mhz = p->sam_act_mhz; average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c); @@ -3572,19 +3626,28 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } - if (DO_BIC(BIC_GFX_rc6)) - p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; - /* n.b. assume die0 uncore frequency applies to whole package */ if (DO_BIC(BIC_UNCORE_MHZ)) p->uncore_mhz = get_uncore_mhz(p->package_id, 0); + if (DO_BIC(BIC_GFX_rc6)) + p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull; + if (DO_BIC(BIC_GFXMHz)) p->gfx_mhz = gfx_info[GFX_MHz].val; if (DO_BIC(BIC_GFXACTMHz)) p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val; + if (DO_BIC(BIC_SAM_mc6)) + p->sam_mc6_ms = gfx_info[SAM_mc6].val_ull; + + if (DO_BIC(BIC_SAMMHz)) + p->sam_mhz = gfx_info[SAM_MHz].val; + + if (DO_BIC(BIC_SAMACTMHz)) + p->sam_act_mhz = gfx_info[SAM_ACTMHz].val; + for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &p->counter[i])) return -10; @@ -4634,6 +4697,7 @@ int snapshot_graphics(int idx) switch (idx) { case GFX_rc6: + case SAM_mc6: fp = fopen_or_die(gfx_info[idx].path, "r"); retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull); if (retval != 1) @@ -4642,6 +4706,8 @@ int snapshot_graphics(int idx) return 0; case GFX_MHz: case GFX_ACTMHz: + case SAM_MHz: + case SAM_ACTMHz: if (gfx_info[idx].fp == NULL) { gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r"); } else { @@ -4727,6 +4793,15 @@ int snapshot_proc_sysfs_files(void) if (DO_BIC(BIC_GFXACTMHz)) snapshot_graphics(GFX_ACTMHz); + if (DO_BIC(BIC_SAM_mc6)) + snapshot_graphics(SAM_mc6); + + if (DO_BIC(BIC_SAMMHz)) + snapshot_graphics(SAM_MHz); + + if (DO_BIC(BIC_SAMACTMHz)) + snapshot_graphics(SAM_ACTMHz); + if (DO_BIC(BIC_CPU_LPI)) snapshot_cpu_lpi_us(); @@ -5325,6 +5400,12 @@ static void probe_graphics(void) BIC_PRESENT(BIC_GFXMHz); if (gfx_info[GFX_ACTMHz].path) BIC_PRESENT(BIC_GFXACTMHz); + if (gfx_info[SAM_mc6].path) + BIC_PRESENT(BIC_SAM_mc6); + if (gfx_info[SAM_MHz].path) + BIC_PRESENT(BIC_SAMMHz); + if (gfx_info[SAM_ACTMHz].path) + BIC_PRESENT(BIC_SAMACTMHz); } static void dump_sysfs_cstate_config(void) -- cgit v1.2.3 From dc02dc937a3ef819c5da10e97084af6977be26bf Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 22 Mar 2024 09:52:24 +0800 Subject: tools/power/turbostat: Add support for new i915 sysfs knobs On Meteorlake platform, i915 driver supports the traditional graphics sysfs knobs including /sys/class/drm/card0/power/rc6_residency_ms /sys/class/drm/card0/gt_cur_freq_mhz /sys/class/drm/card0/gt_act_freq_mhz At the same time, it also supports /sys/class/drm/card0/gt/gt0/rc6_residency_ms /sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz /sys/class/drm/card0/gt/gt0/rps_act_freq_mhz /sys/class/drm/card0/gt/gt1/rc6_residency_ms /sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz /sys/class/drm/card0/gt/gt1/rps_act_freq_mhz gt0 is for GFX and gt1 is for SA Media. Enhance turbostat to prefer the i915 new sysfs knobs. Export gt0 via BIC_GFX_rc6/BIC_GFXMHz/BIC_GFXACTMHz. Export gt1 via BIC_SMA_mc6/BIC_SMAMHz/BIC_SMAACTMHz. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4fa2810da1a3..feca7f4cb5cd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5380,6 +5380,29 @@ probe_cluster: static void probe_graphics(void) { + /* New i915 graphics sysfs knobs */ + if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { + gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", R_OK)) + gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", R_OK)) + gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", R_OK)) + gfx_info[SAM_mc6].path = "/sys/class/drm/card0/gt/gt1/rc6_residency_ms"; + + if (!access("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", R_OK)) + gfx_info[SAM_MHz].path = "/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz"; + + if (!access("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", R_OK)) + gfx_info[SAM_ACTMHz].path = "/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz"; + + goto end; + } + + /* Fall back to traditional i915 graphics sysfs knobs */ if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms"; @@ -5394,6 +5417,7 @@ static void probe_graphics(void) else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz"; +end: if (gfx_info[GFX_rc6].path) BIC_PRESENT(BIC_GFX_rc6); if (gfx_info[GFX_MHz].path) -- cgit v1.2.3 From 91a91d389543a86963beec148d98d37875154bd4 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 12 Mar 2024 23:56:02 +0800 Subject: tools/power/turbostat: Add support for Xe sysfs knobs Xe graphics driver uses different graphics sysfs knobs including /sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms /sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq /sys/class/drm/card0/device/tile0/gt0/freq0/act_freq /sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms /sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq /sys/class/drm/card0/device/tile0/gt1/freq0/act_freq Plus that, /sys/class/drm/card0/device/tile0/gt/gtidle/name returns either gt-rc or gt-mc. rc is for GFX and mc is SA Media. Enhance turbostat to prefer the Xe sysfs knobs when they are available. Export gt-rc via BIC_GFX_rc6/BIC_GFXMHz/BIC_GFXACTMHz. Export gt-mc via BIC_SMA_mc6/BIC_SMAMHz/BIC_SMAACTMHz. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 51 +++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index feca7f4cb5cd..bc103851df70 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5380,6 +5380,57 @@ probe_cluster: static void probe_graphics(void) { + /* Xe graphics sysfs knobs */ + if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) { + FILE *fp; + char buf[8]; + bool gt0_is_gt; + int idx; + + fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r"); + if (!fp) + goto next; + + if (!fread(buf, sizeof(char), 7, fp)) { + fclose(fp); + goto next; + } + fclose(fp); + + if (!strncmp(buf, "gt0-rc", strlen("gt0-rc"))) + gt0_is_gt = true; + else if (!strncmp(buf, "gt0-mc", strlen("gt0-mc"))) + gt0_is_gt = false; + else + goto next; + + idx = gt0_is_gt ? GFX_rc6 : SAM_mc6; + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms"; + + idx = gt0_is_gt ? GFX_MHz : SAM_MHz; + if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq"; + + idx = gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz; + if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq"; + + idx = gt0_is_gt ? SAM_mc6 : GFX_rc6; + if (!access("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms"; + + idx = gt0_is_gt ? SAM_MHz : GFX_MHz; + if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq"; + + idx = gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz; + if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", R_OK)) + gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq"; + + goto end; + } + +next: /* New i915 graphics sysfs knobs */ if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) { gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms"; -- cgit v1.2.3 From 3ab7296a7e6aa34634dcc2926af933107a117996 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 8 Apr 2024 19:32:58 -0400 Subject: tools/power turbostat: v2024.04.10 Much of turbostat can now run with perf, rather than using the MSR driver Some of turbostat can now run as a regular non-root user. Add some new output columns for some new GFX hardware. [This patch updates the version, but otherwise changes no function; it touches up some checkpatch issues from previous patches] Signed-off-by: Len Brown --- MAINTAINERS | 1 + tools/power/x86/turbostat/turbostat.c | 41 ++++++++++++++++--------- tools/testing/selftests/turbostat/defcolumns.py | 1 + 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a7c4cf8201e0..b8582a466128 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22116,6 +22116,7 @@ Q: https://patchwork.kernel.org/project/linux-pm/list/ B: https://bugzilla.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git turbostat F: tools/power/x86/turbostat/ +F: tools/testing/selftests/turbostat/ TW5864 VIDEO4LINUX DRIVER M: Bluecherry Maintainers diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bc103851df70..98256468e248 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3,7 +3,7 @@ * turbostat -- show CPU frequency and C-state residency * on modern Intel and AMD processors. * - * Copyright (c) 2023 Intel Corporation. + * Copyright (c) 2024 Intel Corporation. * Len Brown */ @@ -1360,6 +1360,7 @@ struct sys_counters { void free_sys_counters(void) { struct msr_counter *p = sys.tp, *pnext = NULL; + while (p) { pnext = p->next; free(p); @@ -1979,6 +1980,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale; const double energy_scale = c->core_energy.scale; + if (c->core_energy.unit == RAPL_UNIT_JOULES) outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale); @@ -3153,7 +3155,7 @@ static unsigned int read_perf_counter_info_n(const char *const path, const char return v; } -static unsigned read_msr_type(void) +static unsigned int read_msr_type(void) { const char *const path = "/sys/bus/event_source/devices/msr/type"; const char *const format = "%u"; @@ -3161,7 +3163,7 @@ static unsigned read_msr_type(void) return read_perf_counter_info_n(path, format); } -static unsigned read_aperf_config(void) +static unsigned int read_aperf_config(void) { const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; const char *const format = "event=%x"; @@ -3169,7 +3171,7 @@ static unsigned read_aperf_config(void) return read_perf_counter_info_n(path, format); } -static unsigned read_mperf_config(void) +static unsigned int read_mperf_config(void) { const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; const char *const format = "event=%x"; @@ -3177,7 +3179,7 @@ static unsigned read_mperf_config(void) return read_perf_counter_info_n(path, format); } -static unsigned read_perf_type(const char *subsys) +static unsigned int read_perf_type(const char *subsys) { const char *const path_format = "/sys/bus/event_source/devices/%s/type"; const char *const format = "%u"; @@ -3188,7 +3190,7 @@ static unsigned read_perf_type(const char *subsys) return read_perf_counter_info_n(path, format); } -static unsigned read_rapl_config(const char *subsys, const char *event_name) +static unsigned int read_rapl_config(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; const char *const format = "event=%x"; @@ -3199,7 +3201,7 @@ static unsigned read_rapl_config(const char *subsys, const char *event_name) return read_perf_counter_info_n(path, format); } -static unsigned read_perf_rapl_unit(const char *subsys, const char *event_name) +static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit"; const char *const format = "%s"; @@ -3235,7 +3237,7 @@ static struct amperf_group_fd open_amperf_fd(int cpu) const unsigned int msr_type = read_msr_type(); const unsigned int aperf_config = read_aperf_config(); const unsigned int mperf_config = read_mperf_config(); - struct amperf_group_fd fds = {.aperf = -1,.mperf = -1 }; + struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 }; fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); @@ -3277,6 +3279,7 @@ static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) t->tsc = rdtsc(); const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); + if (n != sizeof(cnt.as_array)) return -2; @@ -3371,7 +3374,7 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain]; if (debug) - fprintf(stderr, "get_rapl_counters: cpu%d domain%d\n", cpu, domain); + fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); @@ -3382,8 +3385,9 @@ int get_rapl_counters(int cpu, int domain, struct core_data *c, struct pkg_data size_t num_perf_counters = rapl_counter_info_count_perf(rci); const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data)); + if (actual_read_size != expected_read_size) - err(-1, "get_rapl_counters: failed to read perf_data (%zu %zu)", expected_read_size, + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size); } @@ -3454,7 +3458,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) int status; if (cpu_migrate(cpu)) { - fprintf(outf, "get_counters: Could not migrate to CPU %d\n", cpu); + fprintf(outf, "%s: Could not migrate to CPU %d\n", __func__, cpu); return -1; } @@ -6411,15 +6415,17 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc return -1; const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + if (scale == 0.0) return -1; const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name); + if (unit == RAPL_UNIT_INVALID) return -1; - const unsigned rapl_type = read_perf_type(cai->perf_subsys); - const unsigned rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int rapl_type = read_perf_type(cai->perf_subsys); + const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); @@ -6441,7 +6447,7 @@ int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); if (debug) - fprintf(stderr, "add_rapl_perf_counter: %d (cpu: %d)\n", ret, cpu); + fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); return ret; } @@ -6462,6 +6468,7 @@ void linux_perf_init(void) } const bool aperf_required = is_aperf_access_required(); + if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); if (fd_amperf_percpu == NULL) @@ -6483,6 +6490,7 @@ void rapl_perf_init(void) */ for (int domain_id = 0; domain_id < num_domains; ++domain_id) { struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id]; + rci->fd_perf = -1; for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) { rci->data[i] = 0; @@ -7296,6 +7304,7 @@ static void set_amperf_source(void) amperf_source = AMPERF_SOURCE_PERF; const bool aperf_required = is_aperf_access_required(); + if (no_perf || !aperf_required || !has_amperf_access_via_perf()) amperf_source = AMPERF_SOURCE_MSR; @@ -7373,10 +7382,12 @@ void check_msr_access(void) void check_perf_access(void) { const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC); + if (no_perf || !intrcount_required || !has_instr_count_access()) bic_enabled &= ~BIC_IPC; const bool aperf_required = is_aperf_access_required(); + if (!aperf_required || !has_amperf_access()) { bic_enabled &= ~BIC_Avg_MHz; bic_enabled &= ~BIC_Busy; @@ -7486,7 +7497,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2023.11.07 - Len Brown \n"); + fprintf(outf, "turbostat version 2024.04.08 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 diff --git a/tools/testing/selftests/turbostat/defcolumns.py b/tools/testing/selftests/turbostat/defcolumns.py index 70d3b7780311..d9b042097da7 100755 --- a/tools/testing/selftests/turbostat/defcolumns.py +++ b/tools/testing/selftests/turbostat/defcolumns.py @@ -1,4 +1,5 @@ #!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 import subprocess from shutil import which -- cgit v1.2.3 From 0871bc0129d403747ea0272a4384895d7ad37a6c Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: mm: Move lowmem_page_address() a little later LoongArch will override page_to_virt() which use page_address() in the KFENCE case (by defining WANT_PAGE_VIRTUAL/HASHED_PAGE_VIRTUAL). So move lowmem_page_address() a little later to avoid such build errors: error: implicit declaration of function 'page_address'. Acked-by: Andrew Morton Signed-off-by: Huacai Chen --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0436b919f1c7..7b0ee64225de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2207,11 +2207,6 @@ static inline int arch_make_folio_accessible(struct folio *folio) */ #include -static __always_inline void *lowmem_page_address(const struct page *page) -{ - return page_to_virt(page); -} - #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif @@ -2234,6 +2229,11 @@ void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif +static __always_inline void *lowmem_page_address(const struct page *page) +{ + return page_to_virt(page); +} + #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) -- cgit v1.2.3 From 0ca84aeaee150796d4b5577b1b0ae52a947e7813 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: LoongArch: Make {virt, phys, page, pfn} translation work with KFENCE KFENCE changes virt_to_page() to be able to translate tlb mapped virtual addresses, but forget to change virt_to_phys()/phys_to_virt() and other translation functions as well. This patch fix it, otherwise some drivers (such as nvme and virtio-blk) cannot work with KFENCE. All {virt, phys, page, pfn} translation functions are updated: 1, virt_to_pfn()/pfn_to_virt(); 2, virt_to_page()/page_to_virt(); 3, virt_to_phys()/phys_to_virt(). DMW/TLB mapped addresses are distinguished by comparing the vaddress with vm_map_base in virt_to_xyz(), and we define WANT_PAGE_VIRTUAL in the KFENCE case for the reverse translations, xyz_to_virt(). Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/io.h | 20 +++++++++++++++----- arch/loongarch/include/asm/kfence.h | 9 +++++++++ arch/loongarch/include/asm/page.h | 26 +++++++++++++++++++++++++- arch/loongarch/mm/pgtable.c | 4 ++-- 4 files changed, 51 insertions(+), 8 deletions(-) diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index 4a8adcca329b..c2f9979b2979 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -14,11 +14,6 @@ #include #include -/* - * Change "struct page" to physical address. - */ -#define page_to_phys(page) ((phys_addr_t)page_to_pfn(page) << PAGE_SHIFT) - extern void __init __iomem *early_ioremap(u64 phys_addr, unsigned long size); extern void __init early_iounmap(void __iomem *addr, unsigned long size); @@ -73,6 +68,21 @@ extern void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t #define __io_aw() mmiowb() +#ifdef CONFIG_KFENCE +#define virt_to_phys(kaddr) \ +({ \ + (likely((unsigned long)kaddr < vm_map_base)) ? __pa((unsigned long)kaddr) : \ + page_to_phys(tlb_virt_to_page((unsigned long)kaddr)) + offset_in_page((unsigned long)kaddr);\ +}) + +#define phys_to_virt(paddr) \ +({ \ + extern char *__kfence_pool; \ + (unlikely(__kfence_pool == NULL)) ? __va((unsigned long)paddr) : \ + page_address(phys_to_page((unsigned long)paddr)) + offset_in_page((unsigned long)paddr);\ +}) +#endif + #include #define ARCH_HAS_VALID_PHYS_ADDR_RANGE diff --git a/arch/loongarch/include/asm/kfence.h b/arch/loongarch/include/asm/kfence.h index 6c82aea1c993..a6a5760da3a3 100644 --- a/arch/loongarch/include/asm/kfence.h +++ b/arch/loongarch/include/asm/kfence.h @@ -16,6 +16,7 @@ static inline bool arch_kfence_init_pool(void) { int err; + char *kaddr, *vaddr; char *kfence_pool = __kfence_pool; struct vm_struct *area; @@ -35,6 +36,14 @@ static inline bool arch_kfence_init_pool(void) return false; } + kaddr = kfence_pool; + vaddr = __kfence_pool; + while (kaddr < kfence_pool + KFENCE_POOL_SIZE) { + set_page_address(virt_to_page(kaddr), vaddr); + kaddr += PAGE_SIZE; + vaddr += PAGE_SIZE; + } + return true; } diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 44027060c54a..e85df33f11c7 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -78,7 +78,26 @@ typedef struct { unsigned long pgprot; } pgprot_t; struct page *dmw_virt_to_page(unsigned long kaddr); struct page *tlb_virt_to_page(unsigned long kaddr); -#define virt_to_pfn(kaddr) PFN_DOWN(PHYSADDR(kaddr)) +#define pfn_to_phys(pfn) __pfn_to_phys(pfn) +#define phys_to_pfn(paddr) __phys_to_pfn(paddr) + +#define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) +#define phys_to_page(paddr) pfn_to_page(phys_to_pfn(paddr)) + +#ifndef CONFIG_KFENCE + +#define page_to_virt(page) __va(page_to_phys(page)) +#define virt_to_page(kaddr) phys_to_page(__pa(kaddr)) + +#else + +#define WANT_PAGE_VIRTUAL + +#define page_to_virt(page) \ +({ \ + extern char *__kfence_pool; \ + (__kfence_pool == NULL) ? __va(page_to_phys(page)) : page_address(page); \ +}) #define virt_to_page(kaddr) \ ({ \ @@ -86,6 +105,11 @@ struct page *tlb_virt_to_page(unsigned long kaddr); dmw_virt_to_page((unsigned long)kaddr) : tlb_virt_to_page((unsigned long)kaddr);\ }) +#endif + +#define pfn_to_virt(pfn) page_to_virt(pfn_to_page(pfn)) +#define virt_to_pfn(kaddr) page_to_pfn(virt_to_page(kaddr)) + extern int __virt_addr_valid(volatile void *kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((volatile void *)(kaddr)) diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 2aae72e63871..bda018150000 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -11,13 +11,13 @@ struct page *dmw_virt_to_page(unsigned long kaddr) { - return pfn_to_page(virt_to_pfn(kaddr)); + return phys_to_page(__pa(kaddr)); } EXPORT_SYMBOL(dmw_virt_to_page); struct page *tlb_virt_to_page(unsigned long kaddr) { - return pfn_to_page(pte_pfn(*virt_to_kpte(kaddr))); + return phys_to_page(pfn_to_phys(pte_pfn(*virt_to_kpte(kaddr)))); } EXPORT_SYMBOL(tlb_virt_to_page); -- cgit v1.2.3 From 1a629fe4cca0fc4cf546b4ca2e9d4b75bbfde9ff Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: LoongArch: Make virt_addr_valid()/__virt_addr_valid() work with KFENCE When enabling both CONFIG_KFENCE and CONFIG_DEBUG_SG, I get the following backtraces when running LongArch kernels. [ 2.496257] kernel BUG at include/linux/scatterlist.h:187! ... [ 2.501925] Call Trace: [ 2.501950] [<9000000004ad59c4>] sg_init_one+0xac/0xc0 [ 2.502204] [<9000000004a438f8>] do_test_kpp+0x278/0x6e4 [ 2.502353] [<9000000004a43dd4>] alg_test_kpp+0x70/0xf4 [ 2.502494] [<9000000004a41b48>] alg_test+0x128/0x690 [ 2.502631] [<9000000004a3d898>] cryptomgr_test+0x20/0x40 [ 2.502775] [<90000000041b4508>] kthread+0x138/0x158 [ 2.502912] [<9000000004161c48>] ret_from_kernel_thread+0xc/0xa4 The backtrace is always similar but not exactly the same. It is always triggered from cryptomgr_test, but not always from the same test. Analysis shows that with CONFIG_KFENCE active, the address returned from kmalloc() and friends is not always below vm_map_base. It is allocated by kfence_alloc() which at least sometimes seems to get its memory from an address space above vm_map_base. This causes __virt_addr_valid() to return false for the affected objects. Let __virt_addr_valid() return 1 for kfence pool addresses, this make virt_addr_valid()/__virt_addr_valid() work with KFENCE. Reported-by: Guenter Roeck Suggested-by: Guenter Roeck Signed-off-by: Huacai Chen --- arch/loongarch/mm/mmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/loongarch/mm/mmap.c b/arch/loongarch/mm/mmap.c index a9630a81b38a..89af7c12e8c0 100644 --- a/arch/loongarch/mm/mmap.c +++ b/arch/loongarch/mm/mmap.c @@ -4,6 +4,7 @@ */ #include #include +#include #include #include #include @@ -111,6 +112,9 @@ int __virt_addr_valid(volatile void *kaddr) { unsigned long vaddr = (unsigned long)kaddr; + if (is_kfence_address((void *)kaddr)) + return 1; + if ((vaddr < PAGE_OFFSET) || (vaddr >= vm_map_base)) return 0; -- cgit v1.2.3 From ec2bbc575e44909fd333328537519145852927a6 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: LoongArch: Update dts for Loongson-2K1000 to support ISA/LPC Some Loongson-2K1000 platforms have ISA/LPC devices such as Super-IO, define an ISA node in the dts file to avoid access error. Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k1000.dtsi | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/loongarch/boot/dts/loongson-2k1000.dtsi b/arch/loongarch/boot/dts/loongson-2k1000.dtsi index 49a70f8c3cab..b6aeb1f70e2a 100644 --- a/arch/loongarch/boot/dts/loongson-2k1000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k1000.dtsi @@ -100,6 +100,13 @@ #size-cells = <2>; dma-coherent; + isa@18000000 { + compatible = "isa"; + #size-cells = <1>; + #address-cells = <2>; + ranges = <1 0x0 0x0 0x18000000 0x4000>; + }; + liointc0: interrupt-controller@1fe01400 { compatible = "loongson,liointc-2.0"; reg = <0x0 0x1fe01400 0x0 0x40>, -- cgit v1.2.3 From b07b9f353d750ae503fc9fcbdc5f29dc38553605 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: LoongArch: Update dts for Loongson-2K2000 to support ISA/LPC Some Loongson-2K2000 platforms have ISA/LPC devices such as Super-IO, define an ISA node in the dts file to avoid access error. Also adjust the PCI io resource range to avoid confliction. Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k2000.dtsi | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/boot/dts/loongson-2k2000.dtsi b/arch/loongarch/boot/dts/loongson-2k2000.dtsi index a231949b5f55..dcee9f3783fd 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k2000.dtsi @@ -51,6 +51,13 @@ #address-cells = <2>; #size-cells = <2>; + isa@18400000 { + compatible = "isa"; + #size-cells = <1>; + #address-cells = <2>; + ranges = <1 0x0 0x0 0x18400000 0x4000>; + }; + pmc: power-management@100d0000 { compatible = "loongson,ls2k2000-pmc", "loongson,ls2k0500-pmc", "syscon"; reg = <0x0 0x100d0000 0x0 0x58>; @@ -141,7 +148,7 @@ #size-cells = <2>; device_type = "pci"; bus-range = <0x0 0xff>; - ranges = <0x01000000 0x0 0x00008000 0x0 0x18400000 0x0 0x00008000>, + ranges = <0x01000000 0x0 0x00008000 0x0 0x18408000 0x0 0x00008000>, <0x02000000 0x0 0x60000000 0x0 0x60000000 0x0 0x20000000>; gmac0: ethernet@3,0 { -- cgit v1.2.3 From 84892cebdc7fd5d6d70ba5b667a7440dfed2d032 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: LoongArch: Update dts for Loongson-2K2000 to support PCI-MSI Current dts file for Loongson-2K2000 misses the interrupt-controller & interrupt-cells descriptions in the msi-controller node, and misses the msi-parent link in the pci root node. Add them to support PCI-MSI. Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k2000.dtsi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/loongarch/boot/dts/loongson-2k2000.dtsi b/arch/loongarch/boot/dts/loongson-2k2000.dtsi index dcee9f3783fd..481c6c869a8b 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k2000.dtsi @@ -116,6 +116,8 @@ msi: msi-controller@1fe01140 { compatible = "loongson,pch-msi-1.0"; reg = <0x0 0x1fe01140 0x0 0x8>; + interrupt-controller; + #interrupt-cells = <1>; msi-controller; loongson,msi-base-vec = <64>; loongson,msi-num-vecs = <192>; @@ -147,6 +149,7 @@ #address-cells = <3>; #size-cells = <2>; device_type = "pci"; + msi-parent = <&msi>; bus-range = <0x0 0xff>; ranges = <0x01000000 0x0 0x00008000 0x0 0x18408000 0x0 0x00008000>, <0x02000000 0x0 0x60000000 0x0 0x60000000 0x0 0x20000000>; -- cgit v1.2.3 From 3744e0ee80251149135aac59870147e9ed6faae7 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: LoongArch: Update dts for Loongson-2K2000 to support GMAC/GNET Current dts file for Loongson-2K2000's GMAC/GNET is incomplete, both irq and phy descriptions are missing. Add them to make GMAC/GNET work. Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k2000-ref.dts | 33 +++++++++++++++++++++++++ arch/loongarch/boot/dts/loongson-2k2000.dtsi | 12 ++++++--- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/boot/dts/loongson-2k2000-ref.dts b/arch/loongarch/boot/dts/loongson-2k2000-ref.dts index dca91caf895e..74b99bd234cc 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000-ref.dts +++ b/arch/loongarch/boot/dts/loongson-2k2000-ref.dts @@ -61,12 +61,45 @@ &gmac0 { status = "okay"; + + phy-mode = "gmii"; + phy-handle = <&phy0>; + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + phy0: ethernet-phy@0 { + reg = <2>; + }; + }; }; &gmac1 { status = "okay"; + + phy-mode = "gmii"; + phy-handle = <&phy1>; + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + phy1: ethernet-phy@1 { + reg = <2>; + }; + }; }; &gmac2 { status = "okay"; + + phy-mode = "rgmii"; + phy-handle = <&phy2>; + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + phy2: ethernet-phy@2 { + reg = <0>; + }; + }; }; diff --git a/arch/loongarch/boot/dts/loongson-2k2000.dtsi b/arch/loongarch/boot/dts/loongson-2k2000.dtsi index 481c6c869a8b..9eab2d02cbe8 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k2000.dtsi @@ -156,21 +156,27 @@ gmac0: ethernet@3,0 { reg = <0x1800 0x0 0x0 0x0 0x0>; - interrupts = <12 IRQ_TYPE_LEVEL_HIGH>; + interrupts = <12 IRQ_TYPE_LEVEL_HIGH>, + <13 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq", "eth_lpi"; interrupt-parent = <&pic>; status = "disabled"; }; gmac1: ethernet@3,1 { reg = <0x1900 0x0 0x0 0x0 0x0>; - interrupts = <14 IRQ_TYPE_LEVEL_HIGH>; + interrupts = <14 IRQ_TYPE_LEVEL_HIGH>, + <15 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq", "eth_lpi"; interrupt-parent = <&pic>; status = "disabled"; }; gmac2: ethernet@3,2 { reg = <0x1a00 0x0 0x0 0x0 0x0>; - interrupts = <17 IRQ_TYPE_LEVEL_HIGH>; + interrupts = <17 IRQ_TYPE_LEVEL_HIGH>, + <18 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq", "eth_lpi"; interrupt-parent = <&pic>; status = "disabled"; }; -- cgit v1.2.3 From a07c772fa658645887119184de48b255bf19a46e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: LoongArch: Include linux/sizes.h in addrspace.h to prevent build errors LoongArch's include/asm/addrspace.h uses SZ_32M and SZ_16K, so add to provide those macros to prevent build errors: In file included from ../arch/loongarch/include/asm/io.h:11, from ../include/linux/io.h:13, from ../include/linux/io-64-nonatomic-lo-hi.h:5, from ../drivers/cxl/pci.c:4: ../include/asm-generic/io.h: In function 'ioport_map': ../arch/loongarch/include/asm/addrspace.h:124:25: error: 'SZ_32M' undeclared (first use in this function); did you mean 'PS_32M'? 124 | #define PCI_IOSIZE SZ_32M Signed-off-by: Randy Dunlap Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/addrspace.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/include/asm/addrspace.h b/arch/loongarch/include/asm/addrspace.h index b24437e28c6e..7bd47d65bf7a 100644 --- a/arch/loongarch/include/asm/addrspace.h +++ b/arch/loongarch/include/asm/addrspace.h @@ -11,6 +11,7 @@ #define _ASM_ADDRSPACE_H #include +#include #include -- cgit v1.2.3 From a9025cd1c673a8d6eefc79d911075b8b452eba8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Apr 2024 15:22:01 +0200 Subject: x86/topology: Don't update cpu_possible_map in topo_set_cpuids() topo_set_cpuids() updates cpu_present_map and cpu_possible map. It is invoked during enumeration and "physical hotplug" operations. In the latter case this results in a kernel crash because cpu_possible_map is marked read only after init completes. There is no reason to update cpu_possible_map in that function. During enumeration cpu_possible_map is not relevant and gets fully initialized after enumeration completed. On "physical hotplug" the bit is already set because the kernel allows only CPUs to be plugged which have been enumerated and associated to a CPU number during early boot. Remove the bogus update of cpu_possible_map. Fixes: 0e53e7b656cf ("x86/cpu/topology: Sanitize the APIC admission logic") Reported-by: Jonathan Cameron Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87ttkc6kwx.ffs@tglx --- arch/x86/kernel/cpu/topology.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index aaca8d235dc2..d17c9b71eb4a 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -123,7 +123,6 @@ static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id) early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id; early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id; #endif - set_cpu_possible(cpu, true); set_cpu_present(cpu, true); } @@ -210,7 +209,11 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) topo_info.nr_disabled_cpus++; } - /* Register present and possible CPUs in the domain maps */ + /* + * Register present and possible CPUs in the domain + * maps. cpu_possible_map will be updated in + * topology_init_possible_cpus() after enumeration is done. + */ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map); } -- cgit v1.2.3 From f337a6a21e2fd67eadea471e93d05dd37baaa9be Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Apr 2024 10:51:05 -0700 Subject: x86/cpu: Actually turn off mitigations by default for SPECULATION_MITIGATIONS=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initialize cpu_mitigations to CPU_MITIGATIONS_OFF if the kernel is built with CONFIG_SPECULATION_MITIGATIONS=n, as the help text quite clearly states that disabling SPECULATION_MITIGATIONS is supposed to turn off all mitigations by default. │ If you say N, all mitigations will be disabled. You really │ should know what you are doing to say so. As is, the kernel still defaults to CPU_MITIGATIONS_AUTO, which results in some mitigations being enabled in spite of SPECULATION_MITIGATIONS=n. Fixes: f43b9876e857 ("x86/retbleed: Add fine grained Kconfig knobs") Signed-off-by: Sean Christopherson Signed-off-by: Ingo Molnar Reviewed-by: Daniel Sneddon Cc: stable@vger.kernel.org Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240409175108.1512861-2-seanjc@google.com --- kernel/cpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 8f6affd051f7..07ad53b7f119 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3207,7 +3207,8 @@ enum cpu_mitigations { }; static enum cpu_mitigations cpu_mitigations __ro_after_init = - CPU_MITIGATIONS_AUTO; + IS_ENABLED(CONFIG_SPECULATION_MITIGATIONS) ? CPU_MITIGATIONS_AUTO : + CPU_MITIGATIONS_OFF; static int __init mitigations_parse_cmdline(char *arg) { -- cgit v1.2.3 From 325f3fb551f8cd672dbbfc4cf58b14f9ee3fc9e8 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 10 Apr 2024 09:58:02 +0800 Subject: kprobes: Fix possible use-after-free issue on kprobe registration When unloading a module, its state is changing MODULE_STATE_LIVE -> MODULE_STATE_GOING -> MODULE_STATE_UNFORMED. Each change will take a time. `is_module_text_address()` and `__module_text_address()` works with MODULE_STATE_LIVE and MODULE_STATE_GOING. If we use `is_module_text_address()` and `__module_text_address()` separately, there is a chance that the first one is succeeded but the next one is failed because module->state becomes MODULE_STATE_UNFORMED between those operations. In `check_kprobe_address_safe()`, if the second `__module_text_address()` is failed, that is ignored because it expected a kernel_text address. But it may have failed simply because module->state has been changed to MODULE_STATE_UNFORMED. In this case, arm_kprobe() will try to modify non-exist module text address (use-after-free). To fix this problem, we should not use separated `is_module_text_address()` and `__module_text_address()`, but use only `__module_text_address()` once and do `try_module_get(module)` which is only available with MODULE_STATE_LIVE. Link: https://lore.kernel.org/all/20240410015802.265220-1-zhengyejian1@huawei.com/ Fixes: 28f6c37a2910 ("kprobes: Forbid probing on trampoline and BPF code areas") Cc: stable@vger.kernel.org Signed-off-by: Zheng Yejian Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9d9095e81792..65adc815fc6e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1567,10 +1567,17 @@ static int check_kprobe_address_safe(struct kprobe *p, jump_label_lock(); preempt_disable(); - /* Ensure it is not in reserved area nor out of text */ - if (!(core_kernel_text((unsigned long) p->addr) || - is_module_text_address((unsigned long) p->addr)) || - in_gate_area_no_mm((unsigned long) p->addr) || + /* Ensure the address is in a text area, and find a module if exists. */ + *probed_mod = NULL; + if (!core_kernel_text((unsigned long) p->addr)) { + *probed_mod = __module_text_address((unsigned long) p->addr); + if (!(*probed_mod)) { + ret = -EINVAL; + goto out; + } + } + /* Ensure it is not in reserved area. */ + if (in_gate_area_no_mm((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || @@ -1580,8 +1587,7 @@ static int check_kprobe_address_safe(struct kprobe *p, goto out; } - /* Check if 'p' is probing a module. */ - *probed_mod = __module_text_address((unsigned long) p->addr); + /* Get module refcount and reject __init functions for loaded modules. */ if (*probed_mod) { /* * We must hold a refcount of the probed module while updating -- cgit v1.2.3 From e3ba51ab24fddef79fc212f9840de54db8fd1685 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 5 Apr 2024 13:58:50 +1000 Subject: arm64: tlb: Fix TLBI RANGE operand KVM/arm64 relies on TLBI RANGE feature to flush TLBs when the dirty pages are collected by VMM and the page table entries become write protected during live migration. Unfortunately, the operand passed to the TLBI RANGE instruction isn't correctly sorted out due to the commit 117940aa6e5f ("KVM: arm64: Define kvm_tlb_flush_vmid_range()"). It leads to crash on the destination VM after live migration because TLBs aren't flushed completely and some of the dirty pages are missed. For example, I have a VM where 8GB memory is assigned, starting from 0x40000000 (1GB). Note that the host has 4KB as the base page size. In the middile of migration, kvm_tlb_flush_vmid_range() is executed to flush TLBs. It passes MAX_TLBI_RANGE_PAGES as the argument to __kvm_tlb_flush_vmid_range() and __flush_s2_tlb_range_op(). SCALE#3 and NUM#31, corresponding to MAX_TLBI_RANGE_PAGES, isn't supported by __TLBI_RANGE_NUM(). In this specific case, -1 has been returned from __TLBI_RANGE_NUM() for SCALE#3/2/1/0 and rejected by the loop in the __flush_tlb_range_op() until the variable @scale underflows and becomes -9, 0xffff708000040000 is set as the operand. The operand is wrong since it's sorted out by __TLBI_VADDR_RANGE() according to invalid @scale and @num. Fix it by extending __TLBI_RANGE_NUM() to support the combination of SCALE#3 and NUM#31. With the changes, [-1 31] instead of [-1 30] can be returned from the macro, meaning the TLBs for 0x200000 pages in the above example can be flushed in one shoot with SCALE#3 and NUM#31. The macro TLBI_RANGE_MASK is dropped since no one uses it any more. The comments are also adjusted accordingly. Fixes: 117940aa6e5f ("KVM: arm64: Define kvm_tlb_flush_vmid_range()") Cc: stable@kernel.org # v6.6+ Reported-by: Yihuang Yu Suggested-by: Marc Zyngier Signed-off-by: Gavin Shan Reviewed-by: Catalin Marinas Reviewed-by: Ryan Roberts Reviewed-by: Anshuman Khandual Reviewed-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240405035852.1532010-2-gshan@redhat.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3b0e8248e1a4..a75de2665d84 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -161,12 +161,18 @@ static inline unsigned long get_trans_granule(void) #define MAX_TLBI_RANGE_PAGES __TLBI_RANGE_PAGES(31, 3) /* - * Generate 'num' values from -1 to 30 with -1 rejected by the - * __flush_tlb_range() loop below. + * Generate 'num' values from -1 to 31 with -1 rejected by the + * __flush_tlb_range() loop below. Its return value is only + * significant for a maximum of MAX_TLBI_RANGE_PAGES pages. If + * 'pages' is more than that, you must iterate over the overall + * range. */ -#define TLBI_RANGE_MASK GENMASK_ULL(4, 0) -#define __TLBI_RANGE_NUM(pages, scale) \ - ((((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1) +#define __TLBI_RANGE_NUM(pages, scale) \ + ({ \ + int __pages = min((pages), \ + __TLBI_RANGE_PAGES(31, (scale))); \ + (__pages >> (5 * (scale) + 1)) - 1; \ + }) /* * TLB Invalidation @@ -379,10 +385,6 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * 3. If there is 1 page remaining, flush it through non-range operations. Range * operations can only span an even number of pages. We save this for last to * ensure 64KB start alignment is maintained for the LPA2 case. - * - * Note that certain ranges can be represented by either num = 31 and - * scale or num = 0 and scale + 1. The loop below favours the latter - * since num is limited to 30 by the __TLBI_RANGE_NUM() macro. */ #define __flush_tlb_range_op(op, start, pages, stride, \ asid, tlb_level, tlbi_user, lpa2) \ -- cgit v1.2.3 From b37cab587aa3c9ab29c6b10aa55627dad713011f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 13 Mar 2024 15:43:18 -0400 Subject: Bluetooth: ISO: Don't reject BT_ISO_QOS if parameters are unset Consider certain values (0x00) as unset and load proper default if an application has not set them properly. Fixes: 0fe8c8d07134 ("Bluetooth: Split bt_iso_qos into dedicated structures") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index c8793e57f4b5..d24148ea883c 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1451,8 +1451,8 @@ static bool check_ucast_qos(struct bt_iso_qos *qos) static bool check_bcast_qos(struct bt_iso_qos *qos) { - if (qos->bcast.sync_factor == 0x00) - return false; + if (!qos->bcast.sync_factor) + qos->bcast.sync_factor = 0x01; if (qos->bcast.packing > 0x01) return false; @@ -1475,6 +1475,9 @@ static bool check_bcast_qos(struct bt_iso_qos *qos) if (qos->bcast.skip > 0x01f3) return false; + if (!qos->bcast.sync_timeout) + qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; + if (qos->bcast.sync_timeout < 0x000a || qos->bcast.sync_timeout > 0x4000) return false; @@ -1484,6 +1487,9 @@ static bool check_bcast_qos(struct bt_iso_qos *qos) if (qos->bcast.mse > 0x1f) return false; + if (!qos->bcast.timeout) + qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; + if (qos->bcast.timeout < 0x000a || qos->bcast.timeout > 0x4000) return false; -- cgit v1.2.3 From 53cb4197e63ab2363aa28c3029061e4d516e7626 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 28 Mar 2024 15:58:10 -0400 Subject: Bluetooth: hci_sync: Fix using the same interval and window for Coded PHY Coded PHY recommended intervals are 3 time bigger than the 1M PHY so this aligns with that by multiplying by 3 the values given to 1M PHY since the code already used recommended values for that. Fixes: 288c90224eec ("Bluetooth: Enable all supported LE PHY by default") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 8fe02921adf1..c5d8799046cc 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2814,8 +2814,8 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, if (qos->bcast.in.phy & BT_ISO_PHY_CODED) { cp->scanning_phys |= LE_SCAN_PHY_CODED; hci_le_scan_phy_params(phy, type, - interval, - window); + interval * 3, + window * 3); num_phy++; phy++; } @@ -2835,7 +2835,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, if (scan_coded(hdev)) { cp->scanning_phys |= LE_SCAN_PHY_CODED; - hci_le_scan_phy_params(phy, type, interval, window); + hci_le_scan_phy_params(phy, type, interval * 3, window * 3); num_phy++; phy++; } -- cgit v1.2.3 From 45d355a926ab40f3ae7bc0b0a00cb0e3e8a5a810 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 2 Apr 2024 14:32:05 +0300 Subject: Bluetooth: Fix memory leak in hci_req_sync_complete() In 'hci_req_sync_complete()', always free the previous sync request state before assigning reference to a new one. Reported-by: syzbot+39ec16ff6cc18b1d066d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=39ec16ff6cc18b1d066d Cc: stable@vger.kernel.org Fixes: f60cb30579d3 ("Bluetooth: Convert hci_req_sync family of function to new request API") Signed-off-by: Dmitry Antipov Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_request.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 00e02138003e..efea25eb56ce 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -105,8 +105,10 @@ void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, if (hdev->req_status == HCI_REQ_PEND) { hdev->req_result = result; hdev->req_status = HCI_REQ_DONE; - if (skb) + if (skb) { + kfree_skb(hdev->req_skb); hdev->req_skb = skb_get(skb); + } wake_up_interruptible(&hdev->req_wait_q); } } -- cgit v1.2.3 From 51eda36d33e43201e7a4fd35232e069b2c850b01 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:41:52 -0400 Subject: Bluetooth: SCO: Fix not validating setsockopt user input syzbot reported sco_sock_setsockopt() is copying data without checking user input length. BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in sco_sock_setsockopt+0xc0b/0xf90 net/bluetooth/sco.c:893 Read of size 4 at addr ffff88805f7b15a3 by task syz-executor.5/12578 Fixes: ad10b1a48754 ("Bluetooth: Add Bluetooth socket voice option") Fixes: b96e9c671b05 ("Bluetooth: Add BT_DEFER_SETUP option to sco socket") Fixes: 00398e1d5183 ("Bluetooth: Add support for BT_PKT_STATUS CMSG data for SCO connections") Fixes: f6873401a608 ("Bluetooth: Allow setting of codec for HFP offload use case") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 9 +++++++++ net/bluetooth/sco.c | 23 ++++++++++------------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 9fe95a22abeb..eaec5d6caa29 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -585,6 +585,15 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk, return skb; } +static inline int bt_copy_from_sockptr(void *dst, size_t dst_size, + sockptr_t src, size_t src_size) +{ + if (dst_size > src_size) + return -EINVAL; + + return copy_from_sockptr(dst, src, dst_size); +} + int bt_to_errno(u16 code); __u8 bt_status(int err); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 43daf965a01e..368e026f4d15 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -824,7 +824,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; - int len, err = 0; + int err = 0; struct bt_voice voice; u32 opt; struct bt_codecs *codecs; @@ -843,10 +843,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); @@ -863,11 +862,10 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, voice.setting = sco_pi(sk)->setting; - len = min_t(unsigned int, sizeof(voice), optlen); - if (copy_from_sockptr(&voice, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&voice, sizeof(voice), optval, + optlen); + if (err) break; - } /* Explicitly check for these values */ if (voice.setting != BT_VOICE_TRANSPARENT && @@ -890,10 +888,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_PKT_STATUS: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); @@ -934,9 +931,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(buffer, optval, optlen)) { + err = bt_copy_from_sockptr(buffer, optlen, optval, optlen); + if (err) { hci_dev_put(hdev); - err = -EFAULT; break; } -- cgit v1.2.3 From a97de7bff13b1cc825c1b1344eaed8d6c2d3e695 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:43:45 -0400 Subject: Bluetooth: RFCOMM: Fix not validating setsockopt user input syzbot reported rfcomm_sock_setsockopt_old() is copying data without checking user input length. BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in rfcomm_sock_setsockopt_old net/bluetooth/rfcomm/sock.c:632 [inline] BUG: KASAN: slab-out-of-bounds in rfcomm_sock_setsockopt+0x893/0xa70 net/bluetooth/rfcomm/sock.c:673 Read of size 4 at addr ffff8880209a8bc3 by task syz-executor632/5064 Fixes: 9f2c8a03fbb3 ("Bluetooth: Replace RFCOMM link mode with security level") Fixes: bb23c0ab8246 ("Bluetooth: Add support for deferring RFCOMM connection setup") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/rfcomm/sock.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index b54e8a530f55..29aa07e9db9d 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -629,7 +629,7 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, switch (optname) { case RFCOMM_LM: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { + if (bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen)) { err = -EFAULT; break; } @@ -664,7 +664,6 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct bt_security sec; int err = 0; - size_t len; u32 opt; BT_DBG("sk %p", sk); @@ -686,11 +685,9 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; - len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_sockptr(&sec, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&sec, sizeof(sec), optval, optlen); + if (err) break; - } if (sec.level > BT_SECURITY_HIGH) { err = -EINVAL; @@ -706,10 +703,9 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); -- cgit v1.2.3 From 4f3951242ace5efc7131932e2e01e6ac6baed846 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:50:47 -0400 Subject: Bluetooth: L2CAP: Fix not validating setsockopt user input Check user input length before copying data. Fixes: 33575df7be67 ("Bluetooth: move l2cap_sock_setsockopt() to l2cap_sock.c") Fixes: 3ee7b7cd8390 ("Bluetooth: Add BT_MODE socket option") Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_sock.c | 52 ++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 4287aa6cc988..e7d810b23082 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -727,7 +727,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct l2cap_options opts; - int len, err = 0; + int err = 0; u32 opt; BT_DBG("sk %p", sk); @@ -754,11 +754,9 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, opts.max_tx = chan->max_tx; opts.txwin_size = chan->tx_win; - len = min_t(unsigned int, sizeof(opts), optlen); - if (copy_from_sockptr(&opts, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opts, sizeof(opts), optval, optlen); + if (err) break; - } if (opts.txwin_size > L2CAP_DEFAULT_EXT_WINDOW) { err = -EINVAL; @@ -801,10 +799,9 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; case L2CAP_LM: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt & L2CAP_LM_FIPS) { err = -EINVAL; @@ -885,7 +882,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, struct bt_security sec; struct bt_power pwr; struct l2cap_conn *conn; - int len, err = 0; + int err = 0; u32 opt; u16 mtu; u8 mode; @@ -911,11 +908,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; - len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_sockptr(&sec, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&sec, sizeof(sec), optval, optlen); + if (err) break; - } if (sec.level < BT_SECURITY_LOW || sec.level > BT_SECURITY_FIPS) { @@ -960,10 +955,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) { set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); @@ -975,10 +969,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_FLUSHABLE: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt > BT_FLUSHABLE_ON) { err = -EINVAL; @@ -1010,11 +1003,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, pwr.force_active = BT_POWER_FORCE_ACTIVE_ON; - len = min_t(unsigned int, sizeof(pwr), optlen); - if (copy_from_sockptr(&pwr, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&pwr, sizeof(pwr), optval, optlen); + if (err) break; - } if (pwr.force_active) set_bit(FLAG_FORCE_ACTIVE, &chan->flags); @@ -1023,10 +1014,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_CHANNEL_POLICY: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } err = -EOPNOTSUPP; break; @@ -1055,10 +1045,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&mtu, optval, sizeof(u16))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&mtu, sizeof(mtu), optval, optlen); + if (err) break; - } if (chan->mode == L2CAP_MODE_EXT_FLOWCTL && sk->sk_state == BT_CONNECTED) @@ -1086,10 +1075,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&mode, optval, sizeof(u8))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&mode, sizeof(mode), optval, optlen); + if (err) break; - } BT_DBG("mode %u", mode); -- cgit v1.2.3 From 9e8742cdfc4b0e65266bb4a901a19462bda9285e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:56:50 -0400 Subject: Bluetooth: ISO: Fix not validating setsockopt user input Check user input length before copying data. Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Fixes: 0731c5ab4d51 ("Bluetooth: ISO: Add support for BT_PKT_STATUS") Fixes: f764a6c2c1e4 ("Bluetooth: ISO: Add broadcast support") Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index d24148ea883c..ef0cc80b4c0c 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1500,7 +1500,7 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; - int len, err = 0; + int err = 0; struct bt_iso_qos qos = default_qos; u32 opt; @@ -1515,10 +1515,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); @@ -1527,10 +1526,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_PKT_STATUS: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); @@ -1545,17 +1543,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; } - len = min_t(unsigned int, sizeof(qos), optlen); - - if (copy_from_sockptr(&qos, optval, len)) { - err = -EFAULT; - break; - } - - if (len == sizeof(qos.ucast) && !check_ucast_qos(&qos)) { - err = -EINVAL; + err = bt_copy_from_sockptr(&qos, sizeof(qos), optval, optlen); + if (err) break; - } iso_pi(sk)->qos = qos; iso_pi(sk)->qos_user_set = true; @@ -1570,18 +1560,16 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, } if (optlen > sizeof(iso_pi(sk)->base)) { - err = -EOVERFLOW; + err = -EINVAL; break; } - len = min_t(unsigned int, sizeof(iso_pi(sk)->base), optlen); - - if (copy_from_sockptr(iso_pi(sk)->base, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(iso_pi(sk)->base, optlen, optval, + optlen); + if (err) break; - } - iso_pi(sk)->base_len = len; + iso_pi(sk)->base_len = optlen; break; -- cgit v1.2.3 From b2186061d6043d6345a97100460363e990af0d46 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 16:46:50 -0400 Subject: Bluetooth: hci_sock: Fix not validating setsockopt user input Check user input length before copying data. Fixes: 09572fca7223 ("Bluetooth: hci_sock: Add support for BT_{SND,RCV}BUF") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sock.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 4ee1b976678b..703b84bd48d5 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1946,10 +1946,9 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, switch (optname) { case HCI_DATA_DIR: - if (copy_from_sockptr(&opt, optval, sizeof(opt))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + if (err) break; - } if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR; @@ -1958,10 +1957,9 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, break; case HCI_TIME_STAMP: - if (copy_from_sockptr(&opt, optval, sizeof(opt))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + if (err) break; - } if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP; @@ -1979,11 +1977,9 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, uf.event_mask[1] = *((u32 *) f->event_mask + 1); } - len = min_t(unsigned int, len, sizeof(uf)); - if (copy_from_sockptr(&uf, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&uf, sizeof(uf), optval, len); + if (err) break; - } if (!capable(CAP_NET_RAW)) { uf.type_mask &= hci_sec_filter.type_mask; @@ -2042,10 +2038,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, goto done; } - if (copy_from_sockptr(&opt, optval, sizeof(opt))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + if (err) break; - } hci_pi(sk)->mtu = opt; break; -- cgit v1.2.3 From 600b0bbe73d3a9a264694da0e4c2c0800309141e Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Thu, 4 Apr 2024 18:50:23 +0800 Subject: Bluetooth: l2cap: Don't double set the HCI_CONN_MGMT_CONNECTED bit The bit is set and tested inside mgmt_device_connected(), therefore we must not set it just outside the function. Fixes: eeda1bf97bb5 ("Bluetooth: hci_event: Fix not indicating new connection for BIG Sync") Signed-off-by: Archie Pusaka Reviewed-by: Manish Mandlik Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 467b242d8be0..dc0897408793 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4054,8 +4054,7 @@ static int l2cap_connect_req(struct l2cap_conn *conn, return -EPROTO; hci_dev_lock(hdev); - if (hci_dev_test_flag(hdev, HCI_MGMT) && - !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &hcon->flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_device_connected(hdev, hcon, NULL, 0); hci_dev_unlock(hdev); -- cgit v1.2.3 From 5284984a4fbacb0883bfebe905902cdda2891a07 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Apr 2024 18:32:12 +0300 Subject: bug: Fix no-return-statement warning with !CONFIG_BUG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BUG() does not return, and arch implementations of BUG() use unreachable() or other non-returning code. However with !CONFIG_BUG, the default implementation is often used instead, and that does not do that. x86 always uses its own implementation, but powerpc with !CONFIG_BUG gives a build error: kernel/time/timekeeping.c: In function ‘timekeeping_debug_get_ns’: kernel/time/timekeeping.c:286:1: error: no return statement in function returning non-void [-Werror=return-type] Add unreachable() to default !CONFIG_BUG BUG() implementation. Fixes: e8e9d21a5df6 ("timekeeping: Refactor timekeeping helpers") Reported-by: Naresh Kamboju Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Tested-by: Linux Kernel Functional Testing Link: https://lore.kernel.org/r/20240410153212.127477-1-adrian.hunter@intel.com Closes: https://lore.kernel.org/all/CA+G9fYvjdZCW=7ZGxS6A_3bysjQ56YF7S-+PNLQ_8a4DKh1Bhg@mail.gmail.com/ --- include/asm-generic/bug.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 6e794420bd39..b7de3a4eade1 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -156,7 +156,10 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #else /* !CONFIG_BUG */ #ifndef HAVE_ARCH_BUG -#define BUG() do {} while (1) +#define BUG() do { \ + do {} while (1); \ + unreachable(); \ +} while (0) #endif #ifndef HAVE_ARCH_BUG_ON -- cgit v1.2.3 From 076361362122a6d8a4c45f172ced5576b2d4a50d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 9 Apr 2024 13:22:12 -0700 Subject: selftests: timers: Fix valid-adjtimex signed left-shift undefined behavior The struct adjtimex freq field takes a signed value who's units are in shifted (<<16) parts-per-million. Unfortunately for negative adjustments, the straightforward use of: freq = ppm << 16 trips undefined behavior warnings with clang: valid-adjtimex.c:66:6: warning: shifting a negative signed value is undefined [-Wshift-negative-value] -499<<16, ~~~~^ valid-adjtimex.c:67:6: warning: shifting a negative signed value is undefined [-Wshift-negative-value] -450<<16, ~~~~^ .. Fix it by using a multiply by (1 << 16) instead of shifting negative values in the valid-adjtimex test case. Align the values for better readability. Reported-by: Lee Jones Reported-by: Muhammad Usama Anjum Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240409202222.2830476-1-jstultz@google.com Link: https://lore.kernel.org/lkml/0c6d4f0d-2064-4444-986b-1d1ed782135f@collabora.com/ --- tools/testing/selftests/timers/valid-adjtimex.c | 73 ++++++++++++------------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index 48b9a803235a..d13ebde20322 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -21,9 +21,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ - - - #include #include #include @@ -62,45 +59,47 @@ int clear_time_state(void) #define NUM_FREQ_OUTOFRANGE 4 #define NUM_FREQ_INVALID 2 +#define SHIFTED_PPM (1 << 16) + long valid_freq[NUM_FREQ_VALID] = { - -499<<16, - -450<<16, - -400<<16, - -350<<16, - -300<<16, - -250<<16, - -200<<16, - -150<<16, - -100<<16, - -75<<16, - -50<<16, - -25<<16, - -10<<16, - -5<<16, - -1<<16, + -499 * SHIFTED_PPM, + -450 * SHIFTED_PPM, + -400 * SHIFTED_PPM, + -350 * SHIFTED_PPM, + -300 * SHIFTED_PPM, + -250 * SHIFTED_PPM, + -200 * SHIFTED_PPM, + -150 * SHIFTED_PPM, + -100 * SHIFTED_PPM, + -75 * SHIFTED_PPM, + -50 * SHIFTED_PPM, + -25 * SHIFTED_PPM, + -10 * SHIFTED_PPM, + -5 * SHIFTED_PPM, + -1 * SHIFTED_PPM, -1000, - 1<<16, - 5<<16, - 10<<16, - 25<<16, - 50<<16, - 75<<16, - 100<<16, - 150<<16, - 200<<16, - 250<<16, - 300<<16, - 350<<16, - 400<<16, - 450<<16, - 499<<16, + 1 * SHIFTED_PPM, + 5 * SHIFTED_PPM, + 10 * SHIFTED_PPM, + 25 * SHIFTED_PPM, + 50 * SHIFTED_PPM, + 75 * SHIFTED_PPM, + 100 * SHIFTED_PPM, + 150 * SHIFTED_PPM, + 200 * SHIFTED_PPM, + 250 * SHIFTED_PPM, + 300 * SHIFTED_PPM, + 350 * SHIFTED_PPM, + 400 * SHIFTED_PPM, + 450 * SHIFTED_PPM, + 499 * SHIFTED_PPM, }; long outofrange_freq[NUM_FREQ_OUTOFRANGE] = { - -1000<<16, - -550<<16, - 550<<16, - 1000<<16, + -1000 * SHIFTED_PPM, + -550 * SHIFTED_PPM, + 550 * SHIFTED_PPM, + 1000 * SHIFTED_PPM, }; #define LONG_MAX (~0UL>>1) -- cgit v1.2.3 From d9ea7a3f66a5c7e1a2f73cf4b20f5eff3ced4ff8 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 19 Mar 2024 11:43:50 +0800 Subject: hv: vmbus: Convert sprintf() family to sysfs_emit() family Per filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Coccinelle complains that there are still a couple of functions that use snprintf(). Convert them to sysfs_emit(). sprintf() and scnprintf() will be converted as well if these files have such abused cases. This patch is generated by make coccicheck M= MODE=patch \ COCCI=scripts/coccinelle/api/device_attr_show.cocci No functional change intended. CC: "K. Y. Srinivasan" CC: Haiyang Zhang CC: Wei Liu CC: Dexuan Cui CC: linux-hyperv@vger.kernel.org Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20240319034350.1574454-1-lizhijian@fujitsu.com Signed-off-by: Wei Liu Message-ID: <20240319034350.1574454-1-lizhijian@fujitsu.com> --- drivers/hv/vmbus_drv.c | 94 ++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 52 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 7f7965f3d187..121f1ab32b51 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -131,7 +131,7 @@ static ssize_t id_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.child_relid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.child_relid); } static DEVICE_ATTR_RO(id); @@ -142,7 +142,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->state); + return sysfs_emit(buf, "%d\n", hv_dev->channel->state); } static DEVICE_ATTR_RO(state); @@ -153,7 +153,7 @@ static ssize_t monitor_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.monitorid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.monitorid); } static DEVICE_ATTR_RO(monitor_id); @@ -164,8 +164,8 @@ static ssize_t class_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_type); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_type); } static DEVICE_ATTR_RO(class_id); @@ -176,8 +176,8 @@ static ssize_t device_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_instance); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_instance); } static DEVICE_ATTR_RO(device_id); @@ -186,7 +186,7 @@ static ssize_t modalias_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); + return sysfs_emit(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); } static DEVICE_ATTR_RO(modalias); @@ -199,7 +199,7 @@ static ssize_t numa_node_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); + return sysfs_emit(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); } static DEVICE_ATTR_RO(numa_node); #endif @@ -212,9 +212,8 @@ static ssize_t server_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_pending); @@ -226,9 +225,8 @@ static ssize_t client_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_pending); @@ -240,9 +238,8 @@ static ssize_t server_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_latency); @@ -254,9 +251,8 @@ static ssize_t client_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_latency); @@ -268,9 +264,8 @@ static ssize_t server_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_conn_id); @@ -282,9 +277,8 @@ static ssize_t client_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_conn_id); @@ -303,7 +297,7 @@ static ssize_t out_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", outbound.current_interrupt_mask); } static DEVICE_ATTR_RO(out_intr_mask); @@ -321,7 +315,7 @@ static ssize_t out_read_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_read_index); + return sysfs_emit(buf, "%d\n", outbound.current_read_index); } static DEVICE_ATTR_RO(out_read_index); @@ -340,7 +334,7 @@ static ssize_t out_write_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_write_index); + return sysfs_emit(buf, "%d\n", outbound.current_write_index); } static DEVICE_ATTR_RO(out_write_index); @@ -359,7 +353,7 @@ static ssize_t out_read_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_toread); } static DEVICE_ATTR_RO(out_read_bytes_avail); @@ -378,7 +372,7 @@ static ssize_t out_write_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(out_write_bytes_avail); @@ -396,7 +390,7 @@ static ssize_t in_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", inbound.current_interrupt_mask); } static DEVICE_ATTR_RO(in_intr_mask); @@ -414,7 +408,7 @@ static ssize_t in_read_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_read_index); + return sysfs_emit(buf, "%d\n", inbound.current_read_index); } static DEVICE_ATTR_RO(in_read_index); @@ -432,7 +426,7 @@ static ssize_t in_write_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_write_index); + return sysfs_emit(buf, "%d\n", inbound.current_write_index); } static DEVICE_ATTR_RO(in_write_index); @@ -451,7 +445,7 @@ static ssize_t in_read_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_toread); } static DEVICE_ATTR_RO(in_read_bytes_avail); @@ -470,7 +464,7 @@ static ssize_t in_write_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(in_write_bytes_avail); @@ -480,7 +474,7 @@ static ssize_t channel_vp_mapping_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); struct vmbus_channel *channel = hv_dev->channel, *cur_sc; - int buf_size = PAGE_SIZE, n_written, tot_written; + int n_written; struct list_head *cur; if (!channel) @@ -488,25 +482,21 @@ static ssize_t channel_vp_mapping_show(struct device *dev, mutex_lock(&vmbus_connection.channel_mutex); - tot_written = snprintf(buf, buf_size, "%u:%u\n", - channel->offermsg.child_relid, channel->target_cpu); + n_written = sysfs_emit(buf, "%u:%u\n", + channel->offermsg.child_relid, + channel->target_cpu); list_for_each(cur, &channel->sc_list) { - if (tot_written >= buf_size - 1) - break; cur_sc = list_entry(cur, struct vmbus_channel, sc_list); - n_written = scnprintf(buf + tot_written, - buf_size - tot_written, - "%u:%u\n", - cur_sc->offermsg.child_relid, - cur_sc->target_cpu); - tot_written += n_written; + n_written += sysfs_emit_at(buf, n_written, "%u:%u\n", + cur_sc->offermsg.child_relid, + cur_sc->target_cpu); } mutex_unlock(&vmbus_connection.channel_mutex); - return tot_written; + return n_written; } static DEVICE_ATTR_RO(channel_vp_mapping); @@ -516,7 +506,7 @@ static ssize_t vendor_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->vendor_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->vendor_id); } static DEVICE_ATTR_RO(vendor); @@ -526,7 +516,7 @@ static ssize_t device_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->device_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->device_id); } static DEVICE_ATTR_RO(device); @@ -551,7 +541,7 @@ static ssize_t driver_override_show(struct device *dev, ssize_t len; device_lock(dev); - len = snprintf(buf, PAGE_SIZE, "%s\n", hv_dev->driver_override); + len = sysfs_emit(buf, "%s\n", hv_dev->driver_override); device_unlock(dev); return len; -- cgit v1.2.3 From f971f6dd3742d22dd13710306fb4365ea7bcb536 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Fri, 22 Mar 2024 06:46:02 -0700 Subject: hv/hv_kvp_daemon: Handle IPv4 and Ipv6 combination for keyfile format If the network configuration strings are passed as a combination of IPv4 and IPv6 addresses, the current KVP daemon does not handle processing for the keyfile configuration format. With these changes, the keyfile config generation logic scans through the list twice to generate IPv4 and IPv6 sections for the configuration files to handle this support. Testcases ran:Rhel 9, Hyper-V VMs (IPv4 only, IPv6 only, IPv4 and IPv6 combination) Co-developed-by: Ani Sinha Signed-off-by: Ani Sinha Signed-off-by: Shradha Gupta Reviewed-by: Easwar Hariharan Tested-by: Ani Sinha Reviewed-by: Ani Sinha Link: https://lore.kernel.org/r/1711115162-11629-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711115162-11629-1-git-send-email-shradhagupta@linux.microsoft.com> --- tools/hv/hv_kvp_daemon.c | 213 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 172 insertions(+), 41 deletions(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 318e2dad27e0..ae57bf69ad4a 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -76,6 +76,12 @@ enum { DNS }; +enum { + IPV4 = 1, + IPV6, + IP_TYPE_MAX +}; + static int in_hand_shake; static char *os_name = ""; @@ -102,6 +108,11 @@ static struct utsname uts_buf; #define MAX_FILE_NAME 100 #define ENTRIES_PER_BLOCK 50 +/* + * Change this entry if the number of addresses increases in future + */ +#define MAX_IP_ENTRIES 64 +#define OUTSTR_BUF_SIZE ((INET6_ADDRSTRLEN + 1) * MAX_IP_ENTRIES) struct kvp_record { char key[HV_KVP_EXCHANGE_MAX_KEY_SIZE]; @@ -1171,6 +1182,18 @@ static int process_ip_string(FILE *f, char *ip_string, int type) return 0; } +int ip_version_check(const char *input_addr) +{ + struct in6_addr addr; + + if (inet_pton(AF_INET, input_addr, &addr)) + return IPV4; + else if (inet_pton(AF_INET6, input_addr, &addr)) + return IPV6; + + return -EINVAL; +} + /* * Only IPv4 subnet strings needs to be converted to plen * For IPv6 the subnet is already privided in plen format @@ -1197,14 +1220,75 @@ static int kvp_subnet_to_plen(char *subnet_addr_str) return plen; } +static int process_dns_gateway_nm(FILE *f, char *ip_string, int type, + int ip_sec) +{ + char addr[INET6_ADDRSTRLEN], *output_str; + int ip_offset = 0, error = 0, ip_ver; + char *param_name; + + if (type == DNS) + param_name = "dns"; + else if (type == GATEWAY) + param_name = "gateway"; + else + return -EINVAL; + + output_str = (char *)calloc(OUTSTR_BUF_SIZE, sizeof(char)); + if (!output_str) + return -ENOMEM; + + while (1) { + memset(addr, 0, sizeof(addr)); + + if (!parse_ip_val_buffer(ip_string, &ip_offset, addr, + (MAX_IP_ADDR_SIZE * 2))) + break; + + ip_ver = ip_version_check(addr); + if (ip_ver < 0) + continue; + + if ((ip_ver == IPV4 && ip_sec == IPV4) || + (ip_ver == IPV6 && ip_sec == IPV6)) { + /* + * do a bound check to avoid out-of bound writes + */ + if ((OUTSTR_BUF_SIZE - strlen(output_str)) > + (strlen(addr) + 1)) { + strncat(output_str, addr, + OUTSTR_BUF_SIZE - + strlen(output_str) - 1); + strncat(output_str, ",", + OUTSTR_BUF_SIZE - + strlen(output_str) - 1); + } + } else { + continue; + } + } + + if (strlen(output_str)) { + /* + * This is to get rid of that extra comma character + * in the end of the string + */ + output_str[strlen(output_str) - 1] = '\0'; + error = fprintf(f, "%s=%s\n", param_name, output_str); + } + + free(output_str); + return error; +} + static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, - int is_ipv6) + int ip_sec) { char addr[INET6_ADDRSTRLEN]; char subnet_addr[INET6_ADDRSTRLEN]; - int error, i = 0; + int error = 0, i = 0; int ip_offset = 0, subnet_offset = 0; - int plen; + int plen, ip_ver; memset(addr, 0, sizeof(addr)); memset(subnet_addr, 0, sizeof(subnet_addr)); @@ -1216,10 +1300,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, subnet_addr, (MAX_IP_ADDR_SIZE * 2))) { - if (!is_ipv6) + ip_ver = ip_version_check(addr); + if (ip_ver < 0) + continue; + + if (ip_ver == IPV4 && ip_sec == IPV4) plen = kvp_subnet_to_plen((char *)subnet_addr); - else + else if (ip_ver == IPV6 && ip_sec == IPV6) plen = atoi(subnet_addr); + else + continue; if (plen < 0) return plen; @@ -1233,17 +1323,16 @@ static int process_ip_string_nm(FILE *f, char *ip_string, char *subnet, memset(subnet_addr, 0, sizeof(subnet_addr)); } - return 0; + return error; } static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) { - int error = 0; + int error = 0, ip_ver; char if_filename[PATH_MAX]; char nm_filename[PATH_MAX]; FILE *ifcfg_file, *nmfile; char cmd[PATH_MAX]; - int is_ipv6 = 0; char *mac_addr; int str_len; @@ -1421,52 +1510,94 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) if (error) goto setval_error; - if (new_val->addr_family & ADDR_FAMILY_IPV6) { - error = fprintf(nmfile, "\n[ipv6]\n"); - if (error < 0) - goto setval_error; - is_ipv6 = 1; - } else { - error = fprintf(nmfile, "\n[ipv4]\n"); - if (error < 0) - goto setval_error; - } - /* * Now we populate the keyfile format + * + * The keyfile format expects the IPv6 and IPv4 configuration in + * different sections. Therefore we iterate through the list twice, + * once to populate the IPv4 section and the next time for IPv6 */ + ip_ver = IPV4; + do { + if (ip_ver == IPV4) { + error = fprintf(nmfile, "\n[ipv4]\n"); + if (error < 0) + goto setval_error; + } else { + error = fprintf(nmfile, "\n[ipv6]\n"); + if (error < 0) + goto setval_error; + } - if (new_val->dhcp_enabled) { - error = kvp_write_file(nmfile, "method", "", "auto"); - if (error < 0) - goto setval_error; - } else { - error = kvp_write_file(nmfile, "method", "", "manual"); + /* + * Write the configuration for ipaddress, netmask, gateway and + * name services + */ + error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr, + (char *)new_val->sub_net, + ip_ver); if (error < 0) goto setval_error; - } - /* - * Write the configuration for ipaddress, netmask, gateway and - * name services - */ - error = process_ip_string_nm(nmfile, (char *)new_val->ip_addr, - (char *)new_val->sub_net, is_ipv6); - if (error < 0) - goto setval_error; + /* + * As dhcp_enabled is only valid for ipv4, we do not set dhcp + * methods for ipv6 based on dhcp_enabled flag. + * + * For ipv4, set method to manual only when dhcp_enabled is + * false and specific ipv4 addresses are configured. If neither + * dhcp_enabled is true and no ipv4 addresses are configured, + * set method to 'disabled'. + * + * For ipv6, set method to manual when we configure ipv6 + * addresses. Otherwise set method to 'auto' so that SLAAC from + * RA may be used. + */ + if (ip_ver == IPV4) { + if (new_val->dhcp_enabled) { + error = kvp_write_file(nmfile, "method", "", + "auto"); + if (error < 0) + goto setval_error; + } else if (error) { + error = kvp_write_file(nmfile, "method", "", + "manual"); + if (error < 0) + goto setval_error; + } else { + error = kvp_write_file(nmfile, "method", "", + "disabled"); + if (error < 0) + goto setval_error; + } + } else if (ip_ver == IPV6) { + if (error) { + error = kvp_write_file(nmfile, "method", "", + "manual"); + if (error < 0) + goto setval_error; + } else { + error = kvp_write_file(nmfile, "method", "", + "auto"); + if (error < 0) + goto setval_error; + } + } - /* we do not want ipv4 addresses in ipv6 section and vice versa */ - if (is_ipv6 != is_ipv4((char *)new_val->gate_way)) { - error = fprintf(nmfile, "gateway=%s\n", (char *)new_val->gate_way); + error = process_dns_gateway_nm(nmfile, + (char *)new_val->gate_way, + GATEWAY, ip_ver); if (error < 0) goto setval_error; - } - if (is_ipv6 != is_ipv4((char *)new_val->dns_addr)) { - error = fprintf(nmfile, "dns=%s\n", (char *)new_val->dns_addr); + error = process_dns_gateway_nm(nmfile, + (char *)new_val->dns_addr, DNS, + ip_ver); if (error < 0) goto setval_error; - } + + ip_ver++; + } while (ip_ver < IP_TYPE_MAX); + fclose(nmfile); fclose(ifcfg_file); -- cgit v1.2.3 From 03f5a999adba062456c8c818a683beb1b498983a Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:54 -0700 Subject: Drivers: hv: vmbus: Leak pages if set_memory_encrypted() fails In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. VMBus code could free decrypted pages if set_memory_encrypted()/decrypted() fails. Leak the pages if this happens. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-2-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-2-mhklinux@outlook.com> --- drivers/hv/connection.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 3cabeeabb1ca..f001ae880e1d 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -237,8 +237,17 @@ int vmbus_connect(void) vmbus_connection.monitor_pages[0], 1); ret |= set_memory_decrypted((unsigned long) vmbus_connection.monitor_pages[1], 1); - if (ret) + if (ret) { + /* + * If set_memory_decrypted() fails, the encryption state + * of the memory is unknown. So leak the memory instead + * of risking returning decrypted memory to the free list. + * For simplicity, always handle both pages the same. + */ + vmbus_connection.monitor_pages[0] = NULL; + vmbus_connection.monitor_pages[1] = NULL; goto cleanup; + } /* * Set_memory_decrypted() will change the memory contents if @@ -337,13 +346,19 @@ void vmbus_disconnect(void) vmbus_connection.int_page = NULL; } - set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[0], 1); - set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[1], 1); + if (vmbus_connection.monitor_pages[0]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[0], 1)) + hv_free_hyperv_page(vmbus_connection.monitor_pages[0]); + vmbus_connection.monitor_pages[0] = NULL; + } - hv_free_hyperv_page(vmbus_connection.monitor_pages[0]); - hv_free_hyperv_page(vmbus_connection.monitor_pages[1]); - vmbus_connection.monitor_pages[0] = NULL; - vmbus_connection.monitor_pages[1] = NULL; + if (vmbus_connection.monitor_pages[1]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[1], 1)) + hv_free_hyperv_page(vmbus_connection.monitor_pages[1]); + vmbus_connection.monitor_pages[1] = NULL; + } } /* -- cgit v1.2.3 From 211f514ebf1ef5de37b1cf6df9d28a56cfd242ca Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:55 -0700 Subject: Drivers: hv: vmbus: Track decrypted status in vmbus_gpadl In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. In order to make sure callers of vmbus_establish_gpadl() and vmbus_teardown_gpadl() don't return decrypted/shared pages to allocators, add a field in struct vmbus_gpadl to keep track of the decryption status of the buffers. This will allow the callers to know if they should free or leak the pages. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-3-mhklinux@outlook.com> --- drivers/hv/channel.c | 25 +++++++++++++++++++++---- include/linux/hyperv.h | 1 + 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index adbf674355b2..98259b492502 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -436,9 +436,18 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, (atomic_inc_return(&vmbus_connection.next_gpadl_handle) - 1); ret = create_gpadl_header(type, kbuffer, size, send_offset, &msginfo); - if (ret) + if (ret) { + gpadl->decrypted = false; return ret; + } + /* + * Set the "decrypted" flag to true for the set_memory_decrypted() + * success case. In the failure case, the encryption state of the + * memory is unknown. Leave "decrypted" as true to ensure the + * memory will be leaked instead of going back on the free list. + */ + gpadl->decrypted = true; ret = set_memory_decrypted((unsigned long)kbuffer, PFN_UP(size)); if (ret) { @@ -527,9 +536,15 @@ cleanup: kfree(msginfo); - if (ret) - set_memory_encrypted((unsigned long)kbuffer, - PFN_UP(size)); + if (ret) { + /* + * If set_memory_encrypted() fails, the decrypted flag is + * left as true so the memory is leaked instead of being + * put back on the free list. + */ + if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) + gpadl->decrypted = false; + } return ret; } @@ -850,6 +865,8 @@ post_msg_err: if (ret) pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret); + gpadl->decrypted = ret; + return ret; } EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 6ef0557b4bff..96ceb4095425 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -832,6 +832,7 @@ struct vmbus_gpadl { u32 gpadl_handle; u32 size; void *buffer; + bool decrypted; }; struct vmbus_channel { -- cgit v1.2.3 From bbf9ac34677b57506a13682b31a2a718934c0e31 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:56 -0700 Subject: hv_netvsc: Don't free decrypted memory In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. The netvsc driver could free decrypted/shared pages if set_memory_decrypted() fails. Check the decrypted field in the gpadl to decide whether to free the memory. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-4-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-4-mhklinux@outlook.com> --- drivers/net/hyperv/netvsc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index a6fcbda64ecc..2b6ec979a62f 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -154,8 +154,11 @@ static void free_netvsc_device(struct rcu_head *head) int i; kfree(nvdev->extension); - vfree(nvdev->recv_buf); - vfree(nvdev->send_buf); + + if (!nvdev->recv_buf_gpadl_handle.decrypted) + vfree(nvdev->recv_buf); + if (!nvdev->send_buf_gpadl_handle.decrypted) + vfree(nvdev->send_buf); bitmap_free(nvdev->send_section_map); for (i = 0; i < VRSS_CHANNEL_MAX; i++) { -- cgit v1.2.3 From 3d788b2fbe6a1a1a9e3db09742b90809d51638b7 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:57 -0700 Subject: uio_hv_generic: Don't free decrypted memory In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. The VMBus device UIO driver could free decrypted/shared pages if set_memory_decrypted() fails. Check the decrypted field in the gpadl to decide whether to free the memory. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-5-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-5-mhklinux@outlook.com> --- drivers/uio/uio_hv_generic.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/uio/uio_hv_generic.c b/drivers/uio/uio_hv_generic.c index 20d9762331bd..6be3462b109f 100644 --- a/drivers/uio/uio_hv_generic.c +++ b/drivers/uio/uio_hv_generic.c @@ -181,12 +181,14 @@ hv_uio_cleanup(struct hv_device *dev, struct hv_uio_private_data *pdata) { if (pdata->send_gpadl.gpadl_handle) { vmbus_teardown_gpadl(dev->channel, &pdata->send_gpadl); - vfree(pdata->send_buf); + if (!pdata->send_gpadl.decrypted) + vfree(pdata->send_buf); } if (pdata->recv_gpadl.gpadl_handle) { vmbus_teardown_gpadl(dev->channel, &pdata->recv_gpadl); - vfree(pdata->recv_buf); + if (!pdata->recv_gpadl.decrypted) + vfree(pdata->recv_buf); } } @@ -295,7 +297,8 @@ hv_uio_probe(struct hv_device *dev, ret = vmbus_establish_gpadl(channel, pdata->recv_buf, RECV_BUFFER_SIZE, &pdata->recv_gpadl); if (ret) { - vfree(pdata->recv_buf); + if (!pdata->recv_gpadl.decrypted) + vfree(pdata->recv_buf); goto fail_close; } @@ -317,7 +320,8 @@ hv_uio_probe(struct hv_device *dev, ret = vmbus_establish_gpadl(channel, pdata->send_buf, SEND_BUFFER_SIZE, &pdata->send_gpadl); if (ret) { - vfree(pdata->send_buf); + if (!pdata->send_gpadl.decrypted) + vfree(pdata->send_buf); goto fail_close; } -- cgit v1.2.3 From 30d18df6567be09c1433e81993e35e3da573ac48 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 11 Mar 2024 09:15:58 -0700 Subject: Drivers: hv: vmbus: Don't free ring buffers that couldn't be re-encrypted In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. The VMBus ring buffer code could free decrypted/shared pages if set_memory_decrypted() fails. Check the decrypted field in the struct vmbus_gpadl for the ring buffers to decide whether to free the memory. Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-6-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-6-mhklinux@outlook.com> --- drivers/hv/channel.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 98259b492502..fb8cd8469328 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -153,7 +153,9 @@ void vmbus_free_ring(struct vmbus_channel *channel) hv_ringbuffer_cleanup(&channel->inbound); if (channel->ringbuffer_page) { - __free_pages(channel->ringbuffer_page, + /* In a CoCo VM leak the memory if it didn't get re-encrypted */ + if (!channel->ringbuffer_gpadlhandle.decrypted) + __free_pages(channel->ringbuffer_page, get_order(channel->ringbuffer_pagecount << PAGE_SHIFT)); channel->ringbuffer_page = NULL; -- cgit v1.2.3 From a4833e3abae132d613ce7da0e0c9a9465d1681fa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 10 Apr 2024 12:38:13 -0400 Subject: SUNRPC: Fix rpcgss_context trace event acceptor field The rpcgss_context trace event acceptor field is a dynamically sized string that records the "data" parameter. But this parameter is also dependent on the "len" field to determine the size of the data. It needs to use __string_len() helper macro where the length can be passed in. It also incorrectly uses strncpy() to save it instead of __assign_str(). As these macros can change, it is not wise to open code them in trace events. As of commit c759e609030c ("tracing: Remove __assign_str_len()"), __assign_str() can be used for both __string() and __string_len() fields. Before that commit, __assign_str_len() is required to be used. This needs to be noted for backporting. (In actuality, commit c1fa617caeb0 ("tracing: Rework __assign_str() and __string() to not duplicate getting the string") is the commit that makes __string_str_len() obsolete). Cc: stable@vger.kernel.org Fixes: 0c77668ddb4e ("SUNRPC: Introduce trace points in rpc_auth_gss.ko") Signed-off-by: Steven Rostedt (Google) Signed-off-by: Chuck Lever --- include/trace/events/rpcgss.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index ba2d96a1bc2f..f50fcafc69de 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -609,7 +609,7 @@ TRACE_EVENT(rpcgss_context, __field(unsigned int, timeout) __field(u32, window_size) __field(int, len) - __string(acceptor, data) + __string_len(acceptor, data, len) ), TP_fast_assign( @@ -618,7 +618,7 @@ TRACE_EVENT(rpcgss_context, __entry->timeout = timeout; __entry->window_size = window_size; __entry->len = len; - strncpy(__get_str(acceptor), data, len); + __assign_str(acceptor, data); ), TP_printk("win_size=%u expiry=%lu now=%lu timeout=%u acceptor=%.*s", -- cgit v1.2.3 From ec4535b2a1d709d3a1fbec26739c672f13c98a7b Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 8 Apr 2024 18:32:17 -0300 Subject: smb: client: fix NULL ptr deref in cifs_mark_open_handles_for_deleted_file() cifs_get_fattr() may be called with a NULL inode, so check for a non-NULL inode before calling cifs_mark_open_handles_for_deleted_file(). This fixes the following oops: mount.cifs //srv/share /mnt -o ...,vers=3.1.1 cd /mnt touch foo; tail -f foo & rm foo cat foo BUG: kernel NULL pointer dereference, address: 00000000000005c0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 2 PID: 696 Comm: cat Not tainted 6.9.0-rc2 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39 04/01/2014 RIP: 0010:__lock_acquire+0x5d/0x1c70 Code: 00 00 44 8b a4 24 a0 00 00 00 45 85 f6 0f 84 bb 06 00 00 8b 2d 48 e2 95 01 45 89 c3 41 89 d2 45 89 c8 85 ed 0 0 <48> 81 3f 40 7a 76 83 44 0f 44 d8 83 fe 01 0f 86 1b 03 00 00 31 d2 RSP: 0018:ffffc90000b37490 EFLAGS: 00010002 RAX: 0000000000000000 RBX: ffff888110021ec0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000000005c0 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000200 FS: 00007f2a1fa08740(0000) GS:ffff888157a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000005c0 CR3: 000000011ac7c000 CR4: 0000000000750ef0 PKRU: 55555554 Call Trace: ? __die+0x23/0x70 ? page_fault_oops+0x180/0x490 ? srso_alias_return_thunk+0x5/0xfbef5 ? exc_page_fault+0x70/0x230 ? asm_exc_page_fault+0x26/0x30 ? __lock_acquire+0x5d/0x1c70 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 lock_acquire+0xc0/0x2d0 ? cifs_mark_open_handles_for_deleted_file+0x3a/0x100 [cifs] ? srso_alias_return_thunk+0x5/0xfbef5 ? kmem_cache_alloc+0x2d9/0x370 _raw_spin_lock+0x34/0x80 ? cifs_mark_open_handles_for_deleted_file+0x3a/0x100 [cifs] cifs_mark_open_handles_for_deleted_file+0x3a/0x100 [cifs] cifs_get_fattr+0x24c/0x940 [cifs] ? srso_alias_return_thunk+0x5/0xfbef5 cifs_get_inode_info+0x96/0x120 [cifs] cifs_lookup+0x16e/0x800 [cifs] cifs_atomic_open+0xc7/0x5d0 [cifs] ? lookup_open.isra.0+0x3ce/0x5f0 ? __pfx_cifs_atomic_open+0x10/0x10 [cifs] lookup_open.isra.0+0x3ce/0x5f0 path_openat+0x42b/0xc30 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 do_filp_open+0xc4/0x170 do_sys_openat2+0xab/0xe0 __x64_sys_openat+0x57/0xa0 do_syscall_64+0xc1/0x1e0 entry_SYSCALL_64_after_hwframe+0x72/0x7a Fixes: ffceb7640cbf ("smb: client: do not defer close open handles to deleted files") Reviewed-by: Meetakshi Setiya Reviewed-by: Bharath SM Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 91b07ef9e25c..60afab5c83d4 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1105,7 +1105,8 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, } else { cifs_open_info_to_fattr(fattr, data, sb); } - if (!rc && fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) + if (!rc && *inode && + (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING)) cifs_mark_open_handles_for_deleted_file(*inode, full_path); break; case -EREMOTE: -- cgit v1.2.3 From 06dfcd4098cfdc4d4577d94793a4f9125386da8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Mon, 8 Apr 2024 10:08:53 +0300 Subject: net: dsa: mt7530: fix enabling EEE on MT7531 switch on all boards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 40b5d2f15c09 ("net: dsa: mt7530: Add support for EEE features") brought EEE support but did not enable EEE on MT7531 switch MACs. EEE is enabled on MT7531 switch MACs by pulling the LAN2LED0 pin low on the board (bootstrapping), unsetting the EEE_DIS bit on the trap register, or setting the internal EEE switch bit on the CORE_PLL_GROUP4 register. Thanks to SkyLake Huang (黃啟澤) from MediaTek for providing information on the internal EEE switch bit. There are existing boards that were not designed to pull the pin low. Because of that, the EEE status currently depends on the board design. The EEE_DIS bit on the trap pertains to the LAN2LED0 pin which is usually used to control an LED. Once the bit is unset, the pin will be low. That will make the active low LED turn on. The pin is controlled by the switch PHY. It seems that the PHY controls the pin in the way that it inverts the pin state. That means depending on the wiring of the LED connected to LAN2LED0 on the board, the LED may be on without an active link. To not cause this unwanted behaviour whilst enabling EEE on all boards, set the internal EEE switch bit on the CORE_PLL_GROUP4 register. My testing on MT7531 shows a certain amount of traffic loss when EEE is enabled. That said, I haven't come across a board that enables EEE. So enable EEE on the switch MACs but disable EEE advertisement on the switch PHYs. This way, we don't change the behaviour of the majority of the boards that have this switch. The mediatek-ge PHY driver already disables EEE advertisement on the switch PHYs but my testing shows that it is somehow enabled afterwards. Disabling EEE advertisement before the PHY driver initialises keeps it off. With this change, EEE can now be enabled using ethtool. Fixes: 40b5d2f15c09 ("net: dsa: mt7530: Add support for EEE features") Reviewed-by: Florian Fainelli Signed-off-by: Arınç ÜNAL Tested-by: Daniel Golle Reviewed-by: Daniel Golle Link: https://lore.kernel.org/r/20240408-for-net-mt7530-fix-eee-for-mt7531-mt7988-v3-1-84fdef1f008b@arinc9.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mt7530.c | 17 ++++++++++++----- drivers/net/dsa/mt7530.h | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 1035820c2377..451ff0620c2e 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2505,18 +2505,25 @@ mt7531_setup(struct dsa_switch *ds) mt7530_rmw(priv, MT7531_GPIO_MODE0, MT7531_GPIO0_MASK, MT7531_GPIO0_INTERRUPT); - /* Enable PHY core PLL, since phy_device has not yet been created - * provided for phy_[read,write]_mmd_indirect is called, we provide - * our own mt7531_ind_mmd_phy_[read,write] to complete this - * function. + /* Enable Energy-Efficient Ethernet (EEE) and PHY core PLL, since + * phy_device has not yet been created provided for + * phy_[read,write]_mmd_indirect is called, we provide our own + * mt7531_ind_mmd_phy_[read,write] to complete this function. */ val = mt7531_ind_c45_phy_read(priv, MT753X_CTRL_PHY_ADDR, MDIO_MMD_VEND2, CORE_PLL_GROUP4); - val |= MT7531_PHY_PLL_BYPASS_MODE; + val |= MT7531_RG_SYSPLL_DMY2 | MT7531_PHY_PLL_BYPASS_MODE; val &= ~MT7531_PHY_PLL_OFF; mt7531_ind_c45_phy_write(priv, MT753X_CTRL_PHY_ADDR, MDIO_MMD_VEND2, CORE_PLL_GROUP4, val); + /* Disable EEE advertisement on the switch PHYs. */ + for (i = MT753X_CTRL_PHY_ADDR; + i < MT753X_CTRL_PHY_ADDR + MT7530_NUM_PHYS; i++) { + mt7531_ind_c45_phy_write(priv, i, MDIO_MMD_AN, MDIO_AN_EEE_ADV, + 0); + } + mt7531_setup_common(ds); /* Setup VLAN ID 0 for VLAN-unaware bridges */ diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index d17b318e6ee4..9439db495001 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -616,6 +616,7 @@ enum mt7531_clk_skew { #define RG_SYSPLL_DDSFBK_EN BIT(12) #define RG_SYSPLL_BIAS_EN BIT(11) #define RG_SYSPLL_BIAS_LPF_EN BIT(10) +#define MT7531_RG_SYSPLL_DMY2 BIT(6) #define MT7531_PHY_PLL_OFF BIT(5) #define MT7531_PHY_PLL_BYPASS_MODE BIT(4) -- cgit v1.2.3 From 5e700b384ec13f5bcac9855cb28fcc674f1d3593 Mon Sep 17 00:00:00 2001 From: Noah Loomans Date: Wed, 10 Apr 2024 20:26:19 +0200 Subject: platform/chrome: cros_ec_uart: properly fix race condition The cros_ec_uart_probe() function calls devm_serdev_device_open() before it calls serdev_device_set_client_ops(). This can trigger a NULL pointer dereference: BUG: kernel NULL pointer dereference, address: 0000000000000000 ... Call Trace: ... ? ttyport_receive_buf A simplified version of crashing code is as follows: static inline size_t serdev_controller_receive_buf(struct serdev_controller *ctrl, const u8 *data, size_t count) { struct serdev_device *serdev = ctrl->serdev; if (!serdev || !serdev->ops->receive_buf) // CRASH! return 0; return serdev->ops->receive_buf(serdev, data, count); } It assumes that if SERPORT_ACTIVE is set and serdev exists, serdev->ops will also exist. This conflicts with the existing cros_ec_uart_probe() logic, as it first calls devm_serdev_device_open() (which sets SERPORT_ACTIVE), and only later sets serdev->ops via serdev_device_set_client_ops(). Commit 01f95d42b8f4 ("platform/chrome: cros_ec_uart: fix race condition") attempted to fix a similar race condition, but while doing so, made the window of error for this race condition to happen much wider. Attempt to fix the race condition again, making sure we fully setup before calling devm_serdev_device_open(). Fixes: 01f95d42b8f4 ("platform/chrome: cros_ec_uart: fix race condition") Cc: stable@vger.kernel.org Signed-off-by: Noah Loomans Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20240410182618.169042-2-noah@noahloomans.com Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec_uart.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_uart.c b/drivers/platform/chrome/cros_ec_uart.c index 68d80559fddc..eb5eddeb73f7 100644 --- a/drivers/platform/chrome/cros_ec_uart.c +++ b/drivers/platform/chrome/cros_ec_uart.c @@ -263,12 +263,6 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) if (!ec_dev) return -ENOMEM; - ret = devm_serdev_device_open(dev, serdev); - if (ret) { - dev_err(dev, "Unable to open UART device"); - return ret; - } - serdev_device_set_drvdata(serdev, ec_dev); init_waitqueue_head(&ec_uart->response.wait_queue); @@ -280,14 +274,6 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) return ret; } - ret = serdev_device_set_baudrate(serdev, ec_uart->baudrate); - if (ret < 0) { - dev_err(dev, "Failed to set up host baud rate (%d)", ret); - return ret; - } - - serdev_device_set_flow_control(serdev, ec_uart->flowcontrol); - /* Initialize ec_dev for cros_ec */ ec_dev->phys_name = dev_name(dev); ec_dev->dev = dev; @@ -301,6 +287,20 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) serdev_device_set_client_ops(serdev, &cros_ec_uart_client_ops); + ret = devm_serdev_device_open(dev, serdev); + if (ret) { + dev_err(dev, "Unable to open UART device"); + return ret; + } + + ret = serdev_device_set_baudrate(serdev, ec_uart->baudrate); + if (ret < 0) { + dev_err(dev, "Failed to set up host baud rate (%d)", ret); + return ret; + } + + serdev_device_set_flow_control(serdev, ec_uart->flowcontrol); + return cros_ec_register(ec_dev); } -- cgit v1.2.3 From 97e176fcbbf3c0f2bd410c9b241177c051f57176 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 10 Apr 2024 15:11:28 +0200 Subject: r8169: add missing conditional compiling for call to r8169_remove_leds Add missing dependency on CONFIG_R8169_LEDS. As-is a link error occurs if config option CONFIG_R8169_LEDS isn't enabled. Fixes: 19fa4f2a85d7 ("r8169: fix LED-related deadlock on module removal") Reported-by: Venkat Rao Bagalkote Signed-off-by: Heiner Kallweit Tested-By: Venkat Rao Bagalkote Link: https://lore.kernel.org/r/d080038c-eb6b-45ac-9237-b8c1cdd7870f@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 8a27328eae34..0fc5fe564ae5 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5046,7 +5046,8 @@ static void rtl_remove_one(struct pci_dev *pdev) cancel_work_sync(&tp->wk.work); - r8169_remove_leds(tp->leds); + if (IS_ENABLED(CONFIG_R8169_LEDS)) + r8169_remove_leds(tp->leds); unregister_netdev(tp->dev); -- cgit v1.2.3 From beccf29114886f1604e26f739cd108f048878ca8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Apr 2024 12:53:28 -0400 Subject: bcachefs: Fix a race in btree_update_nodes_written() One btree update might have terminated in a node update, and then while it is in flight another btree update might free that original node. This race has to be handled in btree_update_nodes_written() - we were missing a READ_ONCE(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a4a63e363047..c4a5e83a56a4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -704,9 +704,13 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "%s", bch2_err_str(ret)); err: - if (as->b) { - - b = as->b; + /* + * We have to be careful because another thread might be getting ready + * to free as->b and calling btree_update_reparent() on us - we'll + * recheck under btree_update_lock below: + */ + b = READ_ONCE(as->b); + if (b) { btree_path_idx_t path_idx = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); struct btree_path *path = trans->paths + path_idx; -- cgit v1.2.3 From 517236cb3e2f77bc785f06802dfbcca19dffd9ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Apr 2024 00:10:18 -0400 Subject: bcachefs: Kill read lock dropping in bch2_btree_node_lock_write_nofail() dropping read locks in bch2_btree_node_lock_write_nofail() dates from before we had the cycle detector; we can now tell the cycle detector directly when taking a lock may not fail because we can't handle transaction restarts. This is needed for adding should_be_locked asserts. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index b9b151e693ed..f2caf491957e 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -440,33 +440,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b) { - struct btree_path *linked; - unsigned i, iter; - int ret; - - /* - * XXX BIG FAT NOTICE - * - * Drop all read locks before taking a write lock: - * - * This is a hack, because bch2_btree_node_lock_write_nofail() is a - * hack - but by dropping read locks first, this should never fail, and - * we only use this in code paths where whatever read locks we've - * already taken are no longer needed: - */ - - trans_for_each_path(trans, linked, iter) { - if (!linked->nodes_locked) - continue; - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_read_locked(linked, i)) { - btree_node_unlock(trans, linked, i); - btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK); - } - } - - ret = __btree_node_lock_write(trans, path, b, true); + int ret = __btree_node_lock_write(trans, path, b, true); BUG_ON(ret); } -- cgit v1.2.3 From 1189bdda6c991cbf9342d84410042dd5f3a792e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Apr 2024 01:30:22 -0400 Subject: bcachefs: Fix __bch2_btree_and_journal_iter_init_node_iter() We weren't respecting trans->journal_replay_not_finished - we shouldn't be searching the journal keys unless we have a ref on them. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index a6413a8747d3..1e8cf49a6935 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -469,10 +469,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, iter->trans = trans; iter->b = b; iter->node_iter = node_iter; - bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); - INIT_LIST_HEAD(&iter->journal.list); iter->pos = b->data->min_key; iter->at_end = false; + INIT_LIST_HEAD(&iter->journal.list); + + if (trans->journal_replay_not_finished) { + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); + if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags)) + list_add(&iter->journal.list, &trans->c->journal_iters); + } } /* @@ -487,9 +492,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, bch2_btree_node_iter_init_from_start(&node_iter, b); __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); - if (trans->journal_replay_not_finished && - !test_bit(BCH_FS_may_go_rw, &trans->c->flags)) - list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ -- cgit v1.2.3 From 65acf6e0501ac8880a4f73980d01b5d27648b956 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Apr 2024 12:07:41 +0000 Subject: netfilter: complete validation of user input In my recent commit, I missed that do_replace() handlers use copy_from_sockptr() (which I fixed), followed by unsafe copy_from_sockptr_offset() calls. In all functions, we can perform the @optlen validation before even calling xt_alloc_table_info() with the following check: if ((u64)optlen < (u64)tmp.size + sizeof(tmp)) return -EINVAL; Fixes: 0c83842df40f ("netfilter: validate user input for expected length") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Pablo Neira Ayuso Link: https://lore.kernel.org/r/20240409120741.3538135-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/netfilter/arp_tables.c | 4 ++++ net/ipv4/netfilter/ip_tables.c | 4 ++++ net/ipv6/netfilter/ip6_tables.c | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b150c9929b12..14365b20f1c5 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -966,6 +966,8 @@ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; @@ -1266,6 +1268,8 @@ static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 487670759578..fe89a056eb06 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1118,6 +1118,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; @@ -1504,6 +1506,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 636b360311c5..131f7bb2110d 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1135,6 +1135,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; @@ -1513,6 +1515,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; -- cgit v1.2.3 From 0553e753ea9ee724acaf6b3dfc7354702af83567 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 9 Apr 2024 22:08:09 +0300 Subject: net/mlx5: E-switch, store eswitch pointer before registering devlink_param Next patch will move devlink register to be first. Therefore, whenever mlx5 will register a param, the user will be notified. In order to notify the user, devlink is using the get() callback of the param. Hence, resources that are being used by the get() callback must be set before the devlink param is registered. Therefore, store eswitch pointer inside mdev before registering the param. Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 9 +++------ drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++++ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 3047d7015c52..1789800faaeb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1868,6 +1868,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) if (err) goto abort; + dev->priv.eswitch = esw; err = esw_offloads_init(esw); if (err) goto reps_err; @@ -1892,11 +1893,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC; else esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE; - if (MLX5_ESWITCH_MANAGER(dev) && - mlx5_esw_vport_match_metadata_supported(esw)) - esw->flags |= MLX5_ESWITCH_VPORT_MATCH_METADATA; - - dev->priv.eswitch = esw; BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head); esw_info(dev, @@ -1908,6 +1904,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) reps_err: mlx5_esw_vports_cleanup(esw); + dev->priv.eswitch = NULL; abort: if (esw->work_queue) destroy_workqueue(esw->work_queue); @@ -1926,7 +1923,6 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) esw_info(esw->dev, "cleanup\n"); - esw->dev->priv.eswitch = NULL; destroy_workqueue(esw->work_queue); WARN_ON(refcount_read(&esw->qos.refcnt)); mutex_destroy(&esw->state_lock); @@ -1937,6 +1933,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) mutex_destroy(&esw->offloads.encap_tbl_lock); mutex_destroy(&esw->offloads.decap_tbl_lock); esw_offloads_cleanup(esw); + esw->dev->priv.eswitch = NULL; mlx5_esw_vports_cleanup(esw); debugfs_remove_recursive(esw->debugfs_root); devl_params_unregister(priv_to_devlink(esw->dev), mlx5_eswitch_params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index baaae628b0a0..e3cce110e52f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -2476,6 +2476,10 @@ int esw_offloads_init(struct mlx5_eswitch *esw) if (err) return err; + if (MLX5_ESWITCH_MANAGER(esw->dev) && + mlx5_esw_vport_match_metadata_supported(esw)) + esw->flags |= MLX5_ESWITCH_VPORT_MATCH_METADATA; + err = devl_params_register(priv_to_devlink(esw->dev), esw_devlink_params, ARRAY_SIZE(esw_devlink_params)); -- cgit v1.2.3 From c6e77aa9dd82bc18a89bf49418f8f7e961cfccc8 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 9 Apr 2024 22:08:10 +0300 Subject: net/mlx5: Register devlink first under devlink lock In case device is having a non fatal FW error during probe, the driver will report the error to user via devlink. This will trigger a WARN_ON, since mlx5 is calling devlink_register() last. In order to avoid the WARN_ON[1], change mlx5 to invoke devl_register() first under devlink lock. [1] WARNING: CPU: 5 PID: 227 at net/devlink/health.c:483 devlink_recover_notify.constprop.0+0xb8/0xc0 CPU: 5 PID: 227 Comm: kworker/u16:3 Not tainted 6.4.0-rc5_for_upstream_min_debug_2023_06_12_12_38 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: mlx5_health0000:08:00.0 mlx5_fw_reporter_err_work [mlx5_core] RIP: 0010:devlink_recover_notify.constprop.0+0xb8/0xc0 Call Trace: ? __warn+0x79/0x120 ? devlink_recover_notify.constprop.0+0xb8/0xc0 ? report_bug+0x17c/0x190 ? handle_bug+0x3c/0x60 ? exc_invalid_op+0x14/0x70 ? asm_exc_invalid_op+0x16/0x20 ? devlink_recover_notify.constprop.0+0xb8/0xc0 devlink_health_report+0x4a/0x1c0 mlx5_fw_reporter_err_work+0xa4/0xd0 [mlx5_core] process_one_work+0x1bb/0x3c0 ? process_one_work+0x3c0/0x3c0 worker_thread+0x4d/0x3c0 ? process_one_work+0x3c0/0x3c0 kthread+0xc6/0xf0 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 Fixes: cf530217408e ("devlink: Notify users when objects are accessible") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 37 ++++++++++++---------- .../ethernet/mellanox/mlx5/core/sf/dev/driver.c | 1 - 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c2593625c09a..59806553889e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1480,6 +1480,14 @@ int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev) if (err) goto err_register; + err = mlx5_crdump_enable(dev); + if (err) + mlx5_core_err(dev, "mlx5_crdump_enable failed with error code %d\n", err); + + err = mlx5_hwmon_dev_register(dev); + if (err) + mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err); + mutex_unlock(&dev->intf_state_mutex); return 0; @@ -1505,7 +1513,10 @@ int mlx5_init_one(struct mlx5_core_dev *dev) int err; devl_lock(devlink); + devl_register(devlink); err = mlx5_init_one_devl_locked(dev); + if (err) + devl_unregister(devlink); devl_unlock(devlink); return err; } @@ -1517,6 +1528,8 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev) devl_lock(devlink); mutex_lock(&dev->intf_state_mutex); + mlx5_hwmon_dev_unregister(dev); + mlx5_crdump_disable(dev); mlx5_unregister_device(dev); if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { @@ -1534,6 +1547,7 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev) mlx5_function_teardown(dev, true); out: mutex_unlock(&dev->intf_state_mutex); + devl_unregister(devlink); devl_unlock(devlink); } @@ -1680,16 +1694,20 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev) } devl_lock(devlink); + devl_register(devlink); + err = mlx5_devlink_params_register(priv_to_devlink(dev)); - devl_unlock(devlink); if (err) { mlx5_core_warn(dev, "mlx5_devlink_param_reg err = %d\n", err); goto query_hca_caps_err; } + devl_unlock(devlink); return 0; query_hca_caps_err: + devl_unregister(devlink); + devl_unlock(devlink); mlx5_function_disable(dev, true); out: dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; @@ -1702,6 +1720,7 @@ void mlx5_uninit_one_light(struct mlx5_core_dev *dev) devl_lock(devlink); mlx5_devlink_params_unregister(priv_to_devlink(dev)); + devl_unregister(devlink); devl_unlock(devlink); if (dev->state != MLX5_DEVICE_STATE_UP) return; @@ -1943,16 +1962,7 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id) goto err_init_one; } - err = mlx5_crdump_enable(dev); - if (err) - dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err); - - err = mlx5_hwmon_dev_register(dev); - if (err) - mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err); - pci_save_state(pdev); - devlink_register(devlink); return 0; err_init_one: @@ -1973,16 +1983,9 @@ static void remove_one(struct pci_dev *pdev) struct devlink *devlink = priv_to_devlink(dev); set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state); - /* mlx5_drain_fw_reset() and mlx5_drain_health_wq() are using - * devlink notify APIs. - * Hence, we must drain them before unregistering the devlink. - */ mlx5_drain_fw_reset(dev); mlx5_drain_health_wq(dev); - devlink_unregister(devlink); mlx5_sriov_disable(pdev, false); - mlx5_hwmon_dev_unregister(dev); - mlx5_crdump_disable(dev); mlx5_uninit_one(dev); mlx5_pci_close(dev); mlx5_mdev_uninit(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index bc863e1f062e..e3bf8c7e4baa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -101,7 +101,6 @@ static void mlx5_sf_dev_remove(struct auxiliary_device *adev) devlink = priv_to_devlink(mdev); set_bit(MLX5_BREAK_FW_WAIT, &mdev->intf_state); mlx5_drain_health_wq(mdev); - devlink_unregister(devlink); if (mlx5_dev_is_lightweight(mdev)) mlx5_uninit_one_light(mdev); else -- cgit v1.2.3 From 9f7e8fbb91f8fa29548e2f6ab50c03b628c67ede Mon Sep 17 00:00:00 2001 From: Michael Liang Date: Tue, 9 Apr 2024 22:08:11 +0300 Subject: net/mlx5: offset comp irq index in name by one The mlx5 comp irq name scheme is changed a little bit between commit 3663ad34bc70 ("net/mlx5: Shift control IRQ to the last index") and commit 3354822cde5a ("net/mlx5: Use dynamic msix vectors allocation"). The index in the comp irq name used to start from 0 but now it starts from 1. There is nothing critical here, but it's harmless to change back to the old behavior, a.k.a starting from 0. Fixes: 3354822cde5a ("net/mlx5: Use dynamic msix vectors allocation") Reviewed-by: Mohamed Khalfella Reviewed-by: Yuanyuan Zhong Signed-off-by: Michael Liang Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 4dcf995cb1a2..6bac8ad70ba6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -19,6 +19,7 @@ #define MLX5_IRQ_CTRL_SF_MAX 8 /* min num of vectors for SFs to be enabled */ #define MLX5_IRQ_VEC_COMP_BASE_SF 2 +#define MLX5_IRQ_VEC_COMP_BASE 1 #define MLX5_EQ_SHARE_IRQ_MAX_COMP (8) #define MLX5_EQ_SHARE_IRQ_MAX_CTRL (UINT_MAX) @@ -246,6 +247,7 @@ static void irq_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx) return; } + vecidx -= MLX5_IRQ_VEC_COMP_BASE; snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", vecidx); } @@ -585,7 +587,7 @@ struct mlx5_irq *mlx5_irq_request_vector(struct mlx5_core_dev *dev, u16 cpu, struct mlx5_irq_table *table = mlx5_irq_table_get(dev); struct mlx5_irq_pool *pool = table->pcif_pool; struct irq_affinity_desc af_desc; - int offset = 1; + int offset = MLX5_IRQ_VEC_COMP_BASE; if (!pool->xa_num_irqs.max) offset = 0; -- cgit v1.2.3 From 7c6782ad4911cbee874e85630226ed389ff2e453 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 9 Apr 2024 22:08:12 +0300 Subject: net/mlx5: Properly link new fs rules into the tree Previously, add_rule_fg would only add newly created rules from the handle into the tree when they had a refcount of 1. On the other hand, create_flow_handle tries hard to find and reference already existing identical rules instead of creating new ones. These two behaviors can result in a situation where create_flow_handle 1) creates a new rule and references it, then 2) in a subsequent step during the same handle creation references it again, resulting in a rule with a refcount of 2 that is not linked into the tree, will have a NULL parent and root and will result in a crash when the flow group is deleted because del_sw_hw_rule, invoked on rule deletion, assumes node->parent is != NULL. This happened in the wild, due to another bug related to incorrect handling of duplicate pkt_reformat ids, which lead to the code in create_flow_handle incorrectly referencing a just-added rule in the same flow handle, resulting in the problem described above. Full details are at [1]. This patch changes add_rule_fg to add new rules without parents into the tree, properly initializing them and avoiding the crash. This makes it more consistent with how rules are added to an FTE in create_flow_handle. Fixes: 74491de93712 ("net/mlx5: Add multi dest support") Link: https://lore.kernel.org/netdev/ea5264d6-6b55-4449-a602-214c6f509c1e@163.com/T/#u [1] Signed-off-by: Cosmin Ratiu Reviewed-by: Tariq Toukan Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index e6bfa7e4f146..2a9421342a50 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1808,8 +1808,9 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg, } trace_mlx5_fs_set_fte(fte, false); + /* Link newly added rules into the tree. */ for (i = 0; i < handle->num_rules; i++) { - if (refcount_read(&handle->rule[i]->node.refcount) == 1) { + if (!handle->rule[i]->node.parent) { tree_add_node(&handle->rule[i]->node, &fte->node); trace_mlx5_fs_add_rule(handle->rule[i]); } -- cgit v1.2.3 From 9eca93f4d5ab03905516a68683674d9c50ff95bd Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 9 Apr 2024 22:08:13 +0300 Subject: net/mlx5: Correctly compare pkt reformat ids struct mlx5_pkt_reformat contains a naked union of a u32 id and a dr_action pointer which is used when the action is SW-managed (when pkt_reformat.owner is set to MLX5_FLOW_RESOURCE_OWNER_SW). Using id directly in that case is incorrect, as it maps to the least significant 32 bits of the 64-bit pointer in mlx5_fs_dr_action and not to the pkt reformat id allocated in firmware. For the purpose of comparing whether two rules are identical, interpreting the least significant 32 bits of the mlx5_fs_dr_action pointer as an id mostly works... until it breaks horribly and produces the outcome described in [1]. This patch fixes mlx5_flow_dests_cmp to correctly compare ids using mlx5_fs_dr_action_get_pkt_reformat_id for the SW-managed rules. Link: https://lore.kernel.org/netdev/ea5264d6-6b55-4449-a602-214c6f509c1e@163.com/T/#u [1] Fixes: 6a48faeeca10 ("net/mlx5: Add direct rule fs_cmd implementation") Signed-off-by: Cosmin Ratiu Reviewed-by: Mark Bloch Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 2a9421342a50..cf085a478e3e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1664,6 +1664,16 @@ static int create_auto_flow_group(struct mlx5_flow_table *ft, return err; } +static bool mlx5_pkt_reformat_cmp(struct mlx5_pkt_reformat *p1, + struct mlx5_pkt_reformat *p2) +{ + return p1->owner == p2->owner && + (p1->owner == MLX5_FLOW_RESOURCE_OWNER_FW ? + p1->id == p2->id : + mlx5_fs_dr_action_get_pkt_reformat_id(p1) == + mlx5_fs_dr_action_get_pkt_reformat_id(p2)); +} + static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, struct mlx5_flow_destination *d2) { @@ -1675,8 +1685,8 @@ static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, ((d1->vport.flags & MLX5_FLOW_DEST_VPORT_VHCA_ID) ? (d1->vport.vhca_id == d2->vport.vhca_id) : true) && ((d1->vport.flags & MLX5_FLOW_DEST_VPORT_REFORMAT_ID) ? - (d1->vport.pkt_reformat->id == - d2->vport.pkt_reformat->id) : true)) || + mlx5_pkt_reformat_cmp(d1->vport.pkt_reformat, + d2->vport.pkt_reformat) : true)) || (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && d1->ft == d2->ft) || (d1->type == MLX5_FLOW_DESTINATION_TYPE_TIR && -- cgit v1.2.3 From ee3572409f74a838154af74ce1e56e62c17786a8 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:14 +0300 Subject: net/mlx5e: RSS, Block changing channels number when RXFH is configured Changing the channels number after configuring the receive flow hash indirection table may affect the RSS table size. The previous configuration may no longer be compatible with the new receive flow hash indirection table. Block changing the channels number when RXFH is configured and changing the channels number requires resizing the RSS table size. Fixes: 74a8dadac17e ("net/mlx5e: Preparations for supporting larger number of channels") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index cc51ce16df14..93461b0c5703 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -451,6 +451,23 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, mutex_lock(&priv->state_lock); + /* If RXFH is configured, changing the channels number is allowed only if + * it does not require resizing the RSS table. This is because the previous + * configuration may no longer be compatible with the new RSS table. + */ + if (netif_is_rxfh_configured(priv->netdev)) { + int cur_rqt_size = mlx5e_rqt_size(priv->mdev, cur_params->num_channels); + int new_rqt_size = mlx5e_rqt_size(priv->mdev, count); + + if (new_rqt_size != cur_rqt_size) { + err = -EINVAL; + netdev_err(priv->netdev, + "%s: RXFH is configured, block changing channels number that affects RSS table size (new: %d, current: %d)\n", + __func__, new_rqt_size, cur_rqt_size); + goto out; + } + } + /* Don't allow changing the number of channels if HTB offload is active, * because the numeration of the QoS SQs will change, while per-queue * qdiscs are attached. -- cgit v1.2.3 From ecb829459a841198e142f72fadab56424ae96519 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:15 +0300 Subject: net/mlx5e: Fix mlx5e_priv_init() cleanup flow When mlx5e_priv_init() fails, the cleanup flow calls mlx5e_selq_cleanup which calls mlx5e_selq_apply() that assures that the `priv->state_lock` is held using lockdep_is_held(). Acquire the state_lock in mlx5e_selq_cleanup(). Kernel log: ============================= WARNING: suspicious RCU usage 6.8.0-rc3_net_next_841a9b5 #1 Not tainted ----------------------------- drivers/net/ethernet/mellanox/mlx5/core/en/selq.c:124 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by systemd-modules/293: #0: ffffffffa05067b0 (devices_rwsem){++++}-{3:3}, at: ib_register_client+0x109/0x1b0 [ib_core] #1: ffff8881096c65c0 (&device->client_data_rwsem){++++}-{3:3}, at: add_client_context+0x104/0x1c0 [ib_core] stack backtrace: CPU: 4 PID: 293 Comm: systemd-modules Not tainted 6.8.0-rc3_net_next_841a9b5 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x8a/0xa0 lockdep_rcu_suspicious+0x154/0x1a0 mlx5e_selq_apply+0x94/0xa0 [mlx5_core] mlx5e_selq_cleanup+0x3a/0x60 [mlx5_core] mlx5e_priv_init+0x2be/0x2f0 [mlx5_core] mlx5_rdma_setup_rn+0x7c/0x1a0 [mlx5_core] rdma_init_netdev+0x4e/0x80 [ib_core] ? mlx5_rdma_netdev_free+0x70/0x70 [mlx5_core] ipoib_intf_init+0x64/0x550 [ib_ipoib] ipoib_intf_alloc+0x4e/0xc0 [ib_ipoib] ipoib_add_one+0xb0/0x360 [ib_ipoib] add_client_context+0x112/0x1c0 [ib_core] ib_register_client+0x166/0x1b0 [ib_core] ? 0xffffffffa0573000 ipoib_init_module+0xeb/0x1a0 [ib_ipoib] do_one_initcall+0x61/0x250 do_init_module+0x8a/0x270 init_module_from_file+0x8b/0xd0 idempotent_init_module+0x17d/0x230 __x64_sys_finit_module+0x61/0xb0 do_syscall_64+0x71/0x140 entry_SYSCALL_64_after_hwframe+0x46/0x4e Fixes: 8bf30be75069 ("net/mlx5e: Introduce select queue parameters") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-8-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/selq.c | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c b/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c index f675b1926340..f66bbc846464 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/selq.c @@ -57,6 +57,7 @@ int mlx5e_selq_init(struct mlx5e_selq *selq, struct mutex *state_lock) void mlx5e_selq_cleanup(struct mlx5e_selq *selq) { + mutex_lock(selq->state_lock); WARN_ON_ONCE(selq->is_prepared); kvfree(selq->standby); @@ -67,6 +68,7 @@ void mlx5e_selq_cleanup(struct mlx5e_selq *selq) kvfree(selq->standby); selq->standby = NULL; + mutex_unlock(selq->state_lock); } void mlx5e_selq_prepare_params(struct mlx5e_selq *selq, struct mlx5e_params *params) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 91848eae4565..b375ef268671 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5726,9 +5726,7 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv) kfree(priv->tx_rates); kfree(priv->txq2sq); destroy_workqueue(priv->wq); - mutex_lock(&priv->state_lock); mlx5e_selq_cleanup(&priv->selq); - mutex_unlock(&priv->state_lock); free_cpumask_var(priv->scratchpad.cpumask); for (i = 0; i < priv->htb_max_qos_sqs; i++) -- cgit v1.2.3 From 2f436f1869771d46e1a9f85738d5a1a7c5653a4e Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:16 +0300 Subject: net/mlx5e: HTB, Fix inconsistencies with QoS SQs number When creating a new HTB class while the interface is down, the variable that follows the number of QoS SQs (htb_max_qos_sqs) may not be consistent with the number of HTB classes. Previously, we compared these two values to ensure that the node_qid is lower than the number of QoS SQs, and we allocated stats for that SQ when they are equal. Change the check to compare the node_qid with the current number of leaf nodes and fix the checking conditions to ensure allocation of stats_list and stats for each node. Fixes: 214baf22870c ("net/mlx5e: Support HTB offload") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-9-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/qos.c | 33 ++++++++++++------------ 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c index e87e26f2c669..6743806b8480 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c @@ -83,24 +83,25 @@ int mlx5e_open_qos_sq(struct mlx5e_priv *priv, struct mlx5e_channels *chs, txq_ix = mlx5e_qid_from_qos(chs, node_qid); - WARN_ON(node_qid > priv->htb_max_qos_sqs); - if (node_qid == priv->htb_max_qos_sqs) { - struct mlx5e_sq_stats *stats, **stats_list = NULL; - - if (priv->htb_max_qos_sqs == 0) { - stats_list = kvcalloc(mlx5e_qos_max_leaf_nodes(priv->mdev), - sizeof(*stats_list), - GFP_KERNEL); - if (!stats_list) - return -ENOMEM; - } + WARN_ON(node_qid >= mlx5e_htb_cur_leaf_nodes(priv->htb)); + if (!priv->htb_qos_sq_stats) { + struct mlx5e_sq_stats **stats_list; + + stats_list = kvcalloc(mlx5e_qos_max_leaf_nodes(priv->mdev), + sizeof(*stats_list), GFP_KERNEL); + if (!stats_list) + return -ENOMEM; + + WRITE_ONCE(priv->htb_qos_sq_stats, stats_list); + } + + if (!priv->htb_qos_sq_stats[node_qid]) { + struct mlx5e_sq_stats *stats; + stats = kzalloc(sizeof(*stats), GFP_KERNEL); - if (!stats) { - kvfree(stats_list); + if (!stats) return -ENOMEM; - } - if (stats_list) - WRITE_ONCE(priv->htb_qos_sq_stats, stats_list); + WRITE_ONCE(priv->htb_qos_sq_stats[node_qid], stats); /* Order htb_max_qos_sqs increment after writing the array pointer. * Pairs with smp_load_acquire in en_stats.c. -- cgit v1.2.3 From 86b0ca5b118d3a0bae5e5645a13e66f8a4f6c525 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 9 Apr 2024 22:08:17 +0300 Subject: net/mlx5e: Do not produce metadata freelist entries in Tx port ts WQE xmit Free Tx port timestamping metadata entries in the NAPI poll context and consume metadata enties in the WQE xmit path. Do not free a Tx port timestamping metadata entry in the WQE xmit path even in the error path to avoid a race between two metadata entry producers. Fixes: 3178308ad4ca ("net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-10-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h | 8 +++++++- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 7 +++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h index 86f1854698b4..883c044852f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h @@ -95,9 +95,15 @@ static inline void mlx5e_ptp_metadata_fifo_push(struct mlx5e_ptp_metadata_fifo * } static inline u8 +mlx5e_ptp_metadata_fifo_peek(struct mlx5e_ptp_metadata_fifo *fifo) +{ + return fifo->data[fifo->mask & fifo->cc]; +} + +static inline void mlx5e_ptp_metadata_fifo_pop(struct mlx5e_ptp_metadata_fifo *fifo) { - return fifo->data[fifo->mask & fifo->cc++]; + fifo->cc++; } static inline void diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 2fa076b23fbe..e21a3b4128ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -398,6 +398,8 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb, (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) { u8 metadata_index = be32_to_cpu(eseg->flow_table_metadata); + mlx5e_ptp_metadata_fifo_pop(&sq->ptpsq->metadata_freelist); + mlx5e_skb_cb_hwtstamp_init(skb); mlx5e_ptp_metadata_map_put(&sq->ptpsq->metadata_map, skb, metadata_index); @@ -496,9 +498,6 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, err_drop: stats->dropped++; - if (unlikely(sq->ptpsq && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) - mlx5e_ptp_metadata_fifo_push(&sq->ptpsq->metadata_freelist, - be32_to_cpu(eseg->flow_table_metadata)); dev_kfree_skb_any(skb); mlx5e_tx_flush(sq); } @@ -657,7 +656,7 @@ static void mlx5e_cqe_ts_id_eseg(struct mlx5e_ptpsq *ptpsq, struct sk_buff *skb, { if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) eseg->flow_table_metadata = - cpu_to_be32(mlx5e_ptp_metadata_fifo_pop(&ptpsq->metadata_freelist)); + cpu_to_be32(mlx5e_ptp_metadata_fifo_peek(&ptpsq->metadata_freelist)); } static void mlx5e_txwqe_build_eseg(struct mlx5e_priv *priv, struct mlx5e_txqsq *sq, -- cgit v1.2.3 From 49e6c9387051716169ff6a6c5ddd4d9f358db2e9 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 9 Apr 2024 22:08:18 +0300 Subject: net/mlx5e: RSS, Block XOR hash with over 128 channels When supporting more than 128 channels, the RQT size is calculated by multiplying the number of channels by 2 and rounding up to the nearest power of 2. The index of the RQT is derived from the RSS hash calculations. If XOR8 is used as the RSS hash function, there are only 256 possible hash results, and therefore, only 256 indexes can be reached in the RQT. Block setting the RSS hash function to XOR when the number of channels exceeds 128. Fixes: 74a8dadac17e ("net/mlx5e: Preparations for supporting larger number of channels") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240409190820.227554-11-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c | 7 ++++++ drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h | 1 + .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 28 ++++++++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c index bcafb4bf9415..8d9a3b5ec973 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c @@ -179,6 +179,13 @@ u32 mlx5e_rqt_size(struct mlx5_core_dev *mdev, unsigned int num_channels) return min_t(u32, rqt_size, max_cap_rqt_size); } +#define MLX5E_MAX_RQT_SIZE_ALLOWED_WITH_XOR8_HASH 256 + +unsigned int mlx5e_rqt_max_num_channels_allowed_for_xor8(void) +{ + return MLX5E_MAX_RQT_SIZE_ALLOWED_WITH_XOR8_HASH / MLX5E_UNIFORM_SPREAD_RQT_FACTOR; +} + void mlx5e_rqt_destroy(struct mlx5e_rqt *rqt) { mlx5_core_destroy_rqt(rqt->mdev, rqt->rqtn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h index e0bc30308c77..2f9e04a8418f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h @@ -38,6 +38,7 @@ static inline u32 mlx5e_rqt_get_rqtn(struct mlx5e_rqt *rqt) } u32 mlx5e_rqt_size(struct mlx5_core_dev *mdev, unsigned int num_channels); +unsigned int mlx5e_rqt_max_num_channels_allowed_for_xor8(void); int mlx5e_rqt_redirect_direct(struct mlx5e_rqt *rqt, u32 rqn, u32 *vhca_id); int mlx5e_rqt_redirect_indir(struct mlx5e_rqt *rqt, u32 *rqns, u32 *vhca_ids, unsigned int num_rqns, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 93461b0c5703..8f101181648c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -451,6 +451,17 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, mutex_lock(&priv->state_lock); + if (mlx5e_rx_res_get_current_hash(priv->rx_res).hfunc == ETH_RSS_HASH_XOR) { + unsigned int xor8_max_channels = mlx5e_rqt_max_num_channels_allowed_for_xor8(); + + if (count > xor8_max_channels) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: Requested number of channels (%d) exceeds the maximum allowed by the XOR8 RSS hfunc (%d)\n", + __func__, count, xor8_max_channels); + goto out; + } + } + /* If RXFH is configured, changing the channels number is allowed only if * it does not require resizing the RSS table. This is because the previous * configuration may no longer be compatible with the new RSS table. @@ -1298,17 +1309,30 @@ int mlx5e_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, struct mlx5e_priv *priv = netdev_priv(dev); u32 *rss_context = &rxfh->rss_context; u8 hfunc = rxfh->hfunc; + unsigned int count; int err; mutex_lock(&priv->state_lock); + + count = priv->channels.params.num_channels; + + if (hfunc == ETH_RSS_HASH_XOR) { + unsigned int xor8_max_channels = mlx5e_rqt_max_num_channels_allowed_for_xor8(); + + if (count > xor8_max_channels) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: Cannot set RSS hash function to XOR, current number of channels (%d) exceeds the maximum allowed for XOR8 RSS hfunc (%d)\n", + __func__, count, xor8_max_channels); + goto unlock; + } + } + if (*rss_context && rxfh->rss_delete) { err = mlx5e_rx_res_rss_destroy(priv->rx_res, *rss_context); goto unlock; } if (*rss_context == ETH_RXFH_CONTEXT_ALLOC) { - unsigned int count = priv->channels.params.num_channels; - err = mlx5e_rx_res_rss_init(priv->rx_res, rss_context, count); if (err) goto unlock; -- cgit v1.2.3 From 7772dc7460e8ef359f3eee88c3b708cb403e19af Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 9 Apr 2024 22:08:19 +0300 Subject: net/mlx5: Disallow SRIOV switchdev mode when in multi-PF netdev Adaptations need to be made for the auxiliary device management in the core driver level. Block this combination for now. Fixes: 678eb448055a ("net/mlx5: SD, Implement basic query and instantiation") Signed-off-by: Tariq Toukan Reviewed-by: Dragos Tatulea Reviewed-by: Gal Pressman Link: https://lore.kernel.org/r/20240409190820.227554-12-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index e3cce110e52f..1f60954c12f7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -43,6 +43,7 @@ #include "rdma.h" #include "en.h" #include "fs_core.h" +#include "lib/mlx5.h" #include "lib/devcom.h" #include "lib/eq.h" #include "lib/fs_chains.h" @@ -3711,6 +3712,12 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, if (esw_mode_from_devlink(mode, &mlx5_mode)) return -EINVAL; + if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV && mlx5_get_sd(esw->dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Can't change E-Switch mode to switchdev when multi-PF netdev (Socket Direct) is configured."); + return -EPERM; + } + mlx5_lag_disable_change(esw->dev); err = mlx5_esw_try_lock(esw); if (err < 0) { -- cgit v1.2.3 From fe87922cee6161f066f4b9dd542033e048eeedaf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2024 09:41:10 +0200 Subject: net/mlx5: fix possible stack overflows A couple of debug functions use a 512 byte temporary buffer and call another function that has another buffer of the same size, which in turn exceeds the usual warning limit for excessive stack usage: drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c:1073:1: error: stack frame size (1448) exceeds limit (1024) in 'dr_dump_start' [-Werror,-Wframe-larger-than] dr_dump_start(struct seq_file *file, loff_t *pos) drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c:1009:1: error: stack frame size (1120) exceeds limit (1024) in 'dr_dump_domain' [-Werror,-Wframe-larger-than] dr_dump_domain(struct seq_file *file, struct mlx5dr_domain *dmn) drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c:705:1: error: stack frame size (1104) exceeds limit (1024) in 'dr_dump_matcher_rx_tx' [-Werror,-Wframe-larger-than] dr_dump_matcher_rx_tx(struct seq_file *file, bool is_rx, Rework these so that each of the various code paths only ever has one of these buffers in it, and exactly the functions that declare one have the 'noinline_for_stack' annotation that prevents them from all being inlined into the same caller. Fixes: 917d1e799ddf ("net/mlx5: DR, Change SWS usage to debug fs seq_file interface") Reviewed-by: Simon Horman Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/all/20240219100506.648089-1-arnd@kernel.org/ Signed-off-by: Arnd Bergmann Acked-by: Tariq Toukan Link: https://lore.kernel.org/r/20240408074142.3007036-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/steering/dr_dbg.c | 82 +++++++++++----------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c index 64f4cc284aea..030a5776c937 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c @@ -205,12 +205,11 @@ dr_dump_hex_print(char hex[DR_HEX_SIZE], char *src, u32 size) } static int -dr_dump_rule_action_mem(struct seq_file *file, const u64 rule_id, +dr_dump_rule_action_mem(struct seq_file *file, char *buff, const u64 rule_id, struct mlx5dr_rule_action_member *action_mem) { struct mlx5dr_action *action = action_mem->action; const u64 action_id = DR_DBG_PTR_TO_ID(action); - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; u64 hit_tbl_ptr, miss_tbl_ptr; u32 hit_tbl_id, miss_tbl_id; int ret; @@ -488,10 +487,9 @@ dr_dump_rule_action_mem(struct seq_file *file, const u64 rule_id, } static int -dr_dump_rule_mem(struct seq_file *file, struct mlx5dr_ste *ste, +dr_dump_rule_mem(struct seq_file *file, char *buff, struct mlx5dr_ste *ste, bool is_rx, const u64 rule_id, u8 format_ver) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; char hw_ste_dump[DR_HEX_SIZE]; u32 mem_rec_type; int ret; @@ -522,7 +520,8 @@ dr_dump_rule_mem(struct seq_file *file, struct mlx5dr_ste *ste, } static int -dr_dump_rule_rx_tx(struct seq_file *file, struct mlx5dr_rule_rx_tx *rule_rx_tx, +dr_dump_rule_rx_tx(struct seq_file *file, char *buff, + struct mlx5dr_rule_rx_tx *rule_rx_tx, bool is_rx, const u64 rule_id, u8 format_ver) { struct mlx5dr_ste *ste_arr[DR_RULE_MAX_STES + DR_ACTION_MAX_STES]; @@ -533,7 +532,7 @@ dr_dump_rule_rx_tx(struct seq_file *file, struct mlx5dr_rule_rx_tx *rule_rx_tx, return 0; while (i--) { - ret = dr_dump_rule_mem(file, ste_arr[i], is_rx, rule_id, + ret = dr_dump_rule_mem(file, buff, ste_arr[i], is_rx, rule_id, format_ver); if (ret < 0) return ret; @@ -542,7 +541,8 @@ dr_dump_rule_rx_tx(struct seq_file *file, struct mlx5dr_rule_rx_tx *rule_rx_tx, return 0; } -static int dr_dump_rule(struct seq_file *file, struct mlx5dr_rule *rule) +static noinline_for_stack int +dr_dump_rule(struct seq_file *file, struct mlx5dr_rule *rule) { struct mlx5dr_rule_action_member *action_mem; const u64 rule_id = DR_DBG_PTR_TO_ID(rule); @@ -565,19 +565,19 @@ static int dr_dump_rule(struct seq_file *file, struct mlx5dr_rule *rule) return ret; if (rx->nic_matcher) { - ret = dr_dump_rule_rx_tx(file, rx, true, rule_id, format_ver); + ret = dr_dump_rule_rx_tx(file, buff, rx, true, rule_id, format_ver); if (ret < 0) return ret; } if (tx->nic_matcher) { - ret = dr_dump_rule_rx_tx(file, tx, false, rule_id, format_ver); + ret = dr_dump_rule_rx_tx(file, buff, tx, false, rule_id, format_ver); if (ret < 0) return ret; } list_for_each_entry(action_mem, &rule->rule_actions_list, list) { - ret = dr_dump_rule_action_mem(file, rule_id, action_mem); + ret = dr_dump_rule_action_mem(file, buff, rule_id, action_mem); if (ret < 0) return ret; } @@ -586,10 +586,10 @@ static int dr_dump_rule(struct seq_file *file, struct mlx5dr_rule *rule) } static int -dr_dump_matcher_mask(struct seq_file *file, struct mlx5dr_match_param *mask, +dr_dump_matcher_mask(struct seq_file *file, char *buff, + struct mlx5dr_match_param *mask, u8 criteria, const u64 matcher_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; char dump[DR_HEX_SIZE]; int ret; @@ -681,10 +681,10 @@ dr_dump_matcher_mask(struct seq_file *file, struct mlx5dr_match_param *mask, } static int -dr_dump_matcher_builder(struct seq_file *file, struct mlx5dr_ste_build *builder, +dr_dump_matcher_builder(struct seq_file *file, char *buff, + struct mlx5dr_ste_build *builder, u32 index, bool is_rx, const u64 matcher_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; int ret; ret = snprintf(buff, MLX5DR_DEBUG_DUMP_BUFF_LENGTH, @@ -702,11 +702,10 @@ dr_dump_matcher_builder(struct seq_file *file, struct mlx5dr_ste_build *builder, } static int -dr_dump_matcher_rx_tx(struct seq_file *file, bool is_rx, +dr_dump_matcher_rx_tx(struct seq_file *file, char *buff, bool is_rx, struct mlx5dr_matcher_rx_tx *matcher_rx_tx, const u64 matcher_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; enum dr_dump_rec_type rec_type; u64 s_icm_addr, e_icm_addr; int i, ret; @@ -731,7 +730,7 @@ dr_dump_matcher_rx_tx(struct seq_file *file, bool is_rx, return ret; for (i = 0; i < matcher_rx_tx->num_of_builders; i++) { - ret = dr_dump_matcher_builder(file, + ret = dr_dump_matcher_builder(file, buff, &matcher_rx_tx->ste_builder[i], i, is_rx, matcher_id); if (ret < 0) @@ -741,7 +740,7 @@ dr_dump_matcher_rx_tx(struct seq_file *file, bool is_rx, return 0; } -static int +static noinline_for_stack int dr_dump_matcher(struct seq_file *file, struct mlx5dr_matcher *matcher) { struct mlx5dr_matcher_rx_tx *rx = &matcher->rx; @@ -763,19 +762,19 @@ dr_dump_matcher(struct seq_file *file, struct mlx5dr_matcher *matcher) if (ret) return ret; - ret = dr_dump_matcher_mask(file, &matcher->mask, + ret = dr_dump_matcher_mask(file, buff, &matcher->mask, matcher->match_criteria, matcher_id); if (ret < 0) return ret; if (rx->nic_tbl) { - ret = dr_dump_matcher_rx_tx(file, true, rx, matcher_id); + ret = dr_dump_matcher_rx_tx(file, buff, true, rx, matcher_id); if (ret < 0) return ret; } if (tx->nic_tbl) { - ret = dr_dump_matcher_rx_tx(file, false, tx, matcher_id); + ret = dr_dump_matcher_rx_tx(file, buff, false, tx, matcher_id); if (ret < 0) return ret; } @@ -803,11 +802,10 @@ dr_dump_matcher_all(struct seq_file *file, struct mlx5dr_matcher *matcher) } static int -dr_dump_table_rx_tx(struct seq_file *file, bool is_rx, +dr_dump_table_rx_tx(struct seq_file *file, char *buff, bool is_rx, struct mlx5dr_table_rx_tx *table_rx_tx, const u64 table_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; enum dr_dump_rec_type rec_type; u64 s_icm_addr; int ret; @@ -829,7 +827,8 @@ dr_dump_table_rx_tx(struct seq_file *file, bool is_rx, return 0; } -static int dr_dump_table(struct seq_file *file, struct mlx5dr_table *table) +static noinline_for_stack int +dr_dump_table(struct seq_file *file, struct mlx5dr_table *table) { struct mlx5dr_table_rx_tx *rx = &table->rx; struct mlx5dr_table_rx_tx *tx = &table->tx; @@ -848,14 +847,14 @@ static int dr_dump_table(struct seq_file *file, struct mlx5dr_table *table) return ret; if (rx->nic_dmn) { - ret = dr_dump_table_rx_tx(file, true, rx, + ret = dr_dump_table_rx_tx(file, buff, true, rx, DR_DBG_PTR_TO_ID(table)); if (ret < 0) return ret; } if (tx->nic_dmn) { - ret = dr_dump_table_rx_tx(file, false, tx, + ret = dr_dump_table_rx_tx(file, buff, false, tx, DR_DBG_PTR_TO_ID(table)); if (ret < 0) return ret; @@ -881,10 +880,10 @@ static int dr_dump_table_all(struct seq_file *file, struct mlx5dr_table *tbl) } static int -dr_dump_send_ring(struct seq_file *file, struct mlx5dr_send_ring *ring, +dr_dump_send_ring(struct seq_file *file, char *buff, + struct mlx5dr_send_ring *ring, const u64 domain_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; int ret; ret = snprintf(buff, MLX5DR_DEBUG_DUMP_BUFF_LENGTH, @@ -902,13 +901,13 @@ dr_dump_send_ring(struct seq_file *file, struct mlx5dr_send_ring *ring, return 0; } -static noinline_for_stack int +static int dr_dump_domain_info_flex_parser(struct seq_file *file, + char *buff, const char *flex_parser_name, const u8 flex_parser_value, const u64 domain_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; int ret; ret = snprintf(buff, MLX5DR_DEBUG_DUMP_BUFF_LENGTH, @@ -925,11 +924,11 @@ dr_dump_domain_info_flex_parser(struct seq_file *file, return 0; } -static noinline_for_stack int -dr_dump_domain_info_caps(struct seq_file *file, struct mlx5dr_cmd_caps *caps, +static int +dr_dump_domain_info_caps(struct seq_file *file, char *buff, + struct mlx5dr_cmd_caps *caps, const u64 domain_id) { - char buff[MLX5DR_DEBUG_DUMP_BUFF_LENGTH]; struct mlx5dr_cmd_vport_cap *vport_caps; unsigned long i, vports_num; int ret; @@ -969,34 +968,35 @@ dr_dump_domain_info_caps(struct seq_file *file, struct mlx5dr_cmd_caps *caps, } static int -dr_dump_domain_info(struct seq_file *file, struct mlx5dr_domain_info *info, +dr_dump_domain_info(struct seq_file *file, char *buff, + struct mlx5dr_domain_info *info, const u64 domain_id) { int ret; - ret = dr_dump_domain_info_caps(file, &info->caps, domain_id); + ret = dr_dump_domain_info_caps(file, buff, &info->caps, domain_id); if (ret < 0) return ret; - ret = dr_dump_domain_info_flex_parser(file, "icmp_dw0", + ret = dr_dump_domain_info_flex_parser(file, buff, "icmp_dw0", info->caps.flex_parser_id_icmp_dw0, domain_id); if (ret < 0) return ret; - ret = dr_dump_domain_info_flex_parser(file, "icmp_dw1", + ret = dr_dump_domain_info_flex_parser(file, buff, "icmp_dw1", info->caps.flex_parser_id_icmp_dw1, domain_id); if (ret < 0) return ret; - ret = dr_dump_domain_info_flex_parser(file, "icmpv6_dw0", + ret = dr_dump_domain_info_flex_parser(file, buff, "icmpv6_dw0", info->caps.flex_parser_id_icmpv6_dw0, domain_id); if (ret < 0) return ret; - ret = dr_dump_domain_info_flex_parser(file, "icmpv6_dw1", + ret = dr_dump_domain_info_flex_parser(file, buff, "icmpv6_dw1", info->caps.flex_parser_id_icmpv6_dw1, domain_id); if (ret < 0) @@ -1032,12 +1032,12 @@ dr_dump_domain(struct seq_file *file, struct mlx5dr_domain *dmn) if (ret) return ret; - ret = dr_dump_domain_info(file, &dmn->info, domain_id); + ret = dr_dump_domain_info(file, buff, &dmn->info, domain_id); if (ret < 0) return ret; if (dmn->info.supp_sw_steering) { - ret = dr_dump_send_ring(file, dmn->send_ring, domain_id); + ret = dr_dump_send_ring(file, buff, dmn->send_ring, domain_id); if (ret < 0) return ret; } -- cgit v1.2.3 From 2f7b1d8b5505efb0057cd1ab85fca206063ea4c3 Mon Sep 17 00:00:00 2001 From: Pin-yen Lin Date: Tue, 12 Mar 2024 19:51:55 +0800 Subject: clk: mediatek: Do a runtime PM get on controllers during probe mt8183-mfgcfg has a mutual dependency with genpd during the probing stage, which leads to a deadlock in the following call stack: CPU0: genpd_lock --> clk_prepare_lock genpd_power_off_work_fn() genpd_lock() generic_pm_domain::power_off() clk_unprepare() clk_prepare_lock() CPU1: clk_prepare_lock --> genpd_lock clk_register() __clk_core_init() clk_prepare_lock() clk_pm_runtime_get() genpd_lock() Do a runtime PM get at the probe function to make sure clk_register() won't acquire the genpd lock. Instead of only modifying mt8183-mfgcfg, do this on all mediatek clock controller probings because we don't believe this would cause any regression. Verified on MT8183 and MT8192 Chromebooks. Fixes: acddfc2c261b ("clk: mediatek: Add MT8183 clock support") Signed-off-by: Pin-yen Lin Link: https://lore.kernel.org/r/20240312115249.3341654-1-treapking@chromium.org Reviewed-by: AngeloGioacchino Del Regno Tested-by: AngeloGioacchino Del Regno Signed-off-by: Stephen Boyd --- drivers/clk/mediatek/clk-mtk.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/clk/mediatek/clk-mtk.c b/drivers/clk/mediatek/clk-mtk.c index 2e55368dc4d8..bd37ab4d1a9b 100644 --- a/drivers/clk/mediatek/clk-mtk.c +++ b/drivers/clk/mediatek/clk-mtk.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "clk-mtk.h" @@ -494,6 +495,16 @@ static int __mtk_clk_simple_probe(struct platform_device *pdev, return IS_ERR(base) ? PTR_ERR(base) : -ENOMEM; } + + devm_pm_runtime_enable(&pdev->dev); + /* + * Do a pm_runtime_resume_and_get() to workaround a possible + * deadlock between clk_register() and the genpd framework. + */ + r = pm_runtime_resume_and_get(&pdev->dev); + if (r) + return r; + /* Calculate how many clk_hw_onecell_data entries to allocate */ num_clks = mcd->num_clks + mcd->num_composite_clks; num_clks += mcd->num_fixed_clks + mcd->num_factor_clks; @@ -574,6 +585,8 @@ static int __mtk_clk_simple_probe(struct platform_device *pdev, goto unregister_clks; } + pm_runtime_put(&pdev->dev); + return r; unregister_clks: @@ -604,6 +617,8 @@ free_data: free_base: if (mcd->shared_io && base) iounmap(base); + + pm_runtime_put(&pdev->dev); return r; } -- cgit v1.2.3 From d3e8a91a848a5941e3c31ecebd6b2612b37e01a6 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Wed, 13 Mar 2024 22:05:37 +0000 Subject: clk: mediatek: mt7988-infracfg: fix clocks for 2nd PCIe port Due to what seems to be an undocumented oddity in MediaTek's MT7988 SoC design the CLK_INFRA_PCIE_PERI_26M_CK_P2 clock requires CLK_INFRA_PCIE_PERI_26M_CK_P3 to be enabled. This currently leads to PCIe port 2 not working in Linux. Reflect the apparent relationship in the clk driver to make sure PCIe port 2 of the MT7988 SoC works. Fixes: 4b4719437d85f ("clk: mediatek: add drivers for MT7988 SoC") Suggested-by: Sam Shih Signed-off-by: Daniel Golle Link: https://lore.kernel.org/r/1da2506a51f970706bf4ec9509dd04e0471065e5.1710367453.git.daniel@makrotopia.org Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Stephen Boyd --- drivers/clk/mediatek/clk-mt7988-infracfg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/mediatek/clk-mt7988-infracfg.c b/drivers/clk/mediatek/clk-mt7988-infracfg.c index 449041f8abbc..c8c023afe3e5 100644 --- a/drivers/clk/mediatek/clk-mt7988-infracfg.c +++ b/drivers/clk/mediatek/clk-mt7988-infracfg.c @@ -156,7 +156,7 @@ static const struct mtk_gate infra_clks[] = { GATE_INFRA0(CLK_INFRA_PCIE_PERI_26M_CK_P1, "infra_pcie_peri_ck_26m_ck_p1", "csw_infra_f26m_sel", 8), GATE_INFRA0(CLK_INFRA_PCIE_PERI_26M_CK_P2, "infra_pcie_peri_ck_26m_ck_p2", - "csw_infra_f26m_sel", 9), + "infra_pcie_peri_ck_26m_ck_p3", 9), GATE_INFRA0(CLK_INFRA_PCIE_PERI_26M_CK_P3, "infra_pcie_peri_ck_26m_ck_p3", "csw_infra_f26m_sel", 10), /* INFRA1 */ -- cgit v1.2.3 From 33623113a48ea906f1955cbf71094f6aa4462e8f Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Tue, 9 Apr 2024 12:41:59 +0200 Subject: net: sparx5: fix wrong config being used when reconfiguring PCS The wrong port config is being used if the PCS is reconfigured. Fix this by correctly using the new config instead of the old one. Fixes: 946e7fd5053a ("net: sparx5: add port module support") Signed-off-by: Daniel Machon Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20240409-link-mode-reconfiguration-fix-v2-1-db6a507f3627@microchip.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microchip/sparx5/sparx5_port.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c index 3a1b1a1f5a19..60dd2fd603a8 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c @@ -731,7 +731,7 @@ static int sparx5_port_pcs_low_set(struct sparx5 *sparx5, bool sgmii = false, inband_aneg = false; int err; - if (port->conf.inband) { + if (conf->inband) { if (conf->portmode == PHY_INTERFACE_MODE_SGMII || conf->portmode == PHY_INTERFACE_MODE_QSGMII) inband_aneg = true; /* Cisco-SGMII in-band-aneg */ @@ -948,7 +948,7 @@ int sparx5_port_pcs_set(struct sparx5 *sparx5, if (err) return -EINVAL; - if (port->conf.inband) { + if (conf->inband) { /* Enable/disable 1G counters in ASM */ spx5_rmw(ASM_PORT_CFG_CSC_STAT_DIS_SET(high_speed_dev), ASM_PORT_CFG_CSC_STAT_DIS, -- cgit v1.2.3 From d51dc8dd6ab6f93a894ff8b38d3b8d02c98eb9fb Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Tue, 9 Apr 2024 13:37:53 +0200 Subject: Revert "s390/ism: fix receive message buffer allocation" This reverts commit 58effa3476536215530c9ec4910ffc981613b413. Review was not finished on this patch. So it's not ready for upstreaming. Signed-off-by: Gerd Bayer Link: https://lore.kernel.org/r/20240409113753.2181368-1-gbayer@linux.ibm.com Fixes: 58effa347653 ("s390/ism: fix receive message buffer allocation") Signed-off-by: Paolo Abeni --- drivers/s390/net/ism_drv.c | 38 +++++++++----------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index affb05521e14..2c8e964425dc 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -14,8 +14,6 @@ #include #include #include -#include -#include #include "ism.h" @@ -294,15 +292,13 @@ out: static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); - dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, - DMA_FROM_DEVICE); - folio_put(virt_to_folio(dmb->cpu_addr)); + dma_free_coherent(&ism->pdev->dev, dmb->dmb_len, + dmb->cpu_addr, dmb->dma_addr); } static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { unsigned long bit; - int rc; if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; @@ -319,30 +315,14 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) return -EINVAL; - dmb->cpu_addr = - folio_address(folio_alloc(GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_NORETRY, - get_order(dmb->dmb_len))); + dmb->cpu_addr = dma_alloc_coherent(&ism->pdev->dev, dmb->dmb_len, + &dmb->dma_addr, + GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!dmb->cpu_addr) + clear_bit(dmb->sba_idx, ism->sba_bitmap); - if (!dmb->cpu_addr) { - rc = -ENOMEM; - goto out_bit; - } - dmb->dma_addr = dma_map_page(&ism->pdev->dev, - virt_to_page(dmb->cpu_addr), 0, - dmb->dmb_len, DMA_FROM_DEVICE); - if (dma_mapping_error(&ism->pdev->dev, dmb->dma_addr)) { - rc = -ENOMEM; - goto out_free; - } - - return 0; - -out_free: - kfree(dmb->cpu_addr); -out_bit: - clear_bit(dmb->sba_idx, ism->sba_bitmap); - return rc; + return dmb->cpu_addr ? 0 : -ENOMEM; } int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, -- cgit v1.2.3 From 17c560113231ddc20088553c7b499b289b664311 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Tue, 9 Apr 2024 18:01:14 +0300 Subject: net: dsa: mt7530: trap link-local frames regardless of ST Port State MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Clause 5 of IEEE Std 802-2014, two sublayers of the data link layer (DLL) of the Open Systems Interconnection basic reference model (OSI/RM) are described; the medium access control (MAC) and logical link control (LLC) sublayers. The MAC sublayer is the one facing the physical layer. In 8.2 of IEEE Std 802.1Q-2022, the Bridge architecture is described. A Bridge component comprises a MAC Relay Entity for interconnecting the Ports of the Bridge, at least two Ports, and higher layer entities with at least a Spanning Tree Protocol Entity included. Each Bridge Port also functions as an end station and shall provide the MAC Service to an LLC Entity. Each instance of the MAC Service is provided to a distinct LLC Entity that supports protocol identification, multiplexing, and demultiplexing, for protocol data unit (PDU) transmission and reception by one or more higher layer entities. It is described in 8.13.9 of IEEE Std 802.1Q-2022 that in a Bridge, the LLC Entity associated with each Bridge Port is modeled as being directly connected to the attached Local Area Network (LAN). On the switch with CPU port architecture, CPU port functions as Management Port, and the Management Port functionality is provided by software which functions as an end station. Software is connected to an IEEE 802 LAN that is wholly contained within the system that incorporates the Bridge. Software provides access to the LLC Entity associated with each Bridge Port by the value of the source port field on the special tag on the frame received by software. We call frames that carry control information to determine the active topology and current extent of each Virtual Local Area Network (VLAN), i.e., spanning tree or Shortest Path Bridging (SPB) and Multiple VLAN Registration Protocol Data Units (MVRPDUs), and frames from other link constrained protocols, such as Extensible Authentication Protocol over LAN (EAPOL) and Link Layer Discovery Protocol (LLDP), link-local frames. They are not forwarded by a Bridge. Permanently configured entries in the filtering database (FDB) ensure that such frames are discarded by the Forwarding Process. In 8.6.3 of IEEE Std 802.1Q-2022, this is described in detail: Each of the reserved MAC addresses specified in Table 8-1 (01-80-C2-00-00-[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F]) shall be permanently configured in the FDB in C-VLAN components and ERs. Each of the reserved MAC addresses specified in Table 8-2 (01-80-C2-00-00-[01,02,03,04,05,06,07,08,09,0A,0E]) shall be permanently configured in the FDB in S-VLAN components. Each of the reserved MAC addresses specified in Table 8-3 (01-80-C2-00-00-[01,02,04,0E]) shall be permanently configured in the FDB in TPMR components. The FDB entries for reserved MAC addresses shall specify filtering for all Bridge Ports and all VIDs. Management shall not provide the capability to modify or remove entries for reserved MAC addresses. The addresses in Table 8-1, Table 8-2, and Table 8-3 determine the scope of propagation of PDUs within a Bridged Network, as follows: The Nearest Bridge group address (01-80-C2-00-00-0E) is an address that no conformant Two-Port MAC Relay (TPMR) component, Service VLAN (S-VLAN) component, Customer VLAN (C-VLAN) component, or MAC Bridge can forward. PDUs transmitted using this destination address, or any other addresses that appear in Table 8-1, Table 8-2, and Table 8-3 (01-80-C2-00-00-[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F]), can therefore travel no further than those stations that can be reached via a single individual LAN from the originating station. The Nearest non-TPMR Bridge group address (01-80-C2-00-00-03), is an address that no conformant S-VLAN component, C-VLAN component, or MAC Bridge can forward; however, this address is relayed by a TPMR component. PDUs using this destination address, or any of the other addresses that appear in both Table 8-1 and Table 8-2 but not in Table 8-3 (01-80-C2-00-00-[00,03,05,06,07,08,09,0A,0B,0C,0D,0F]), will be relayed by any TPMRs but will propagate no further than the nearest S-VLAN component, C-VLAN component, or MAC Bridge. The Nearest Customer Bridge group address (01-80-C2-00-00-00) is an address that no conformant C-VLAN component, MAC Bridge can forward; however, it is relayed by TPMR components and S-VLAN components. PDUs using this destination address, or any of the other addresses that appear in Table 8-1 but not in either Table 8-2 or Table 8-3 (01-80-C2-00-00-[00,0B,0C,0D,0F]), will be relayed by TPMR components and S-VLAN components but will propagate no further than the nearest C-VLAN component or MAC Bridge. Because the LLC Entity associated with each Bridge Port is provided via CPU port, we must not filter these frames but forward them to CPU port. In a Bridge, the transmission Port is majorly decided by ingress and egress rules, FDB, and spanning tree Port State functions of the Forwarding Process. For link-local frames, only CPU port should be designated as destination port in the FDB, and the other functions of the Forwarding Process must not interfere with the decision of the transmission Port. We call this process trapping frames to CPU port. Therefore, on the switch with CPU port architecture, link-local frames must be trapped to CPU port, and certain link-local frames received by a Port of a Bridge comprising a TPMR component or an S-VLAN component must be excluded from it. A Bridge of the switch with CPU port architecture cannot comprise a Two-Port MAC Relay (TPMR) component as a TPMR component supports only a subset of the functionality of a MAC Bridge. A Bridge comprising two Ports (Management Port doesn't count) of this architecture will either function as a standard MAC Bridge or a standard VLAN Bridge. Therefore, a Bridge of this architecture can only comprise S-VLAN components, C-VLAN components, or MAC Bridge components. Since there's no TPMR component, we don't need to relay PDUs using the destination addresses specified on the Nearest non-TPMR section, and the proportion of the Nearest Customer Bridge section where they must be relayed by TPMR components. One option to trap link-local frames to CPU port is to add static FDB entries with CPU port designated as destination port. However, because that Independent VLAN Learning (IVL) is being used on every VID, each entry only applies to a single VLAN Identifier (VID). For a Bridge comprising a MAC Bridge component or a C-VLAN component, there would have to be 16 times 4096 entries. This switch intellectual property can only hold a maximum of 2048 entries. Using this option, there also isn't a mechanism to prevent link-local frames from being discarded when the spanning tree Port State of the reception Port is discarding. The remaining option is to utilise the BPC, RGAC1, RGAC2, RGAC3, and RGAC4 registers. Whilst this applies to every VID, it doesn't contain all of the reserved MAC addresses without affecting the remaining Standard Group MAC Addresses. The REV_UN frame tag utilised using the RGAC4 register covers the remaining 01-80-C2-00-00-[04,05,06,07,08,09,0A,0B,0C,0D,0F] destination addresses. It also includes the 01-80-C2-00-00-22 to 01-80-C2-00-00-FF destination addresses which may be relayed by MAC Bridges or VLAN Bridges. The latter option provides better but not complete conformance. This switch intellectual property also does not provide a mechanism to trap link-local frames with specific destination addresses to CPU port by Bridge, to conform to the filtering rules for the distinct Bridge components. Therefore, regardless of the type of the Bridge component, link-local frames with these destination addresses will be trapped to CPU port: 01-80-C2-00-00-[00,01,02,03,0E] In a Bridge comprising a MAC Bridge component or a C-VLAN component: Link-local frames with these destination addresses won't be trapped to CPU port which won't conform to IEEE Std 802.1Q-2022: 01-80-C2-00-00-[04,05,06,07,08,09,0A,0B,0C,0D,0F] In a Bridge comprising an S-VLAN component: Link-local frames with these destination addresses will be trapped to CPU port which won't conform to IEEE Std 802.1Q-2022: 01-80-C2-00-00-00 Link-local frames with these destination addresses won't be trapped to CPU port which won't conform to IEEE Std 802.1Q-2022: 01-80-C2-00-00-[04,05,06,07,08,09,0A] Currently on this switch intellectual property, if the spanning tree Port State of the reception Port is discarding, link-local frames will be discarded. To trap link-local frames regardless of the spanning tree Port State, make the switch regard them as Bridge Protocol Data Units (BPDUs). This switch intellectual property only lets the frames regarded as BPDUs bypass the spanning tree Port State function of the Forwarding Process. With this change, the only remaining interference is the ingress rules. When the reception Port has no PVID assigned on software, VLAN-untagged frames won't be allowed in. There doesn't seem to be a mechanism on the switch intellectual property to have link-local frames bypass this function of the Forwarding Process. Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Reviewed-by: Daniel Golle Signed-off-by: Arınç ÜNAL Link: https://lore.kernel.org/r/20240409-b4-for-net-mt7530-fix-link-local-when-stp-discarding-v2-1-07b1150164ac@arinc9.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/mt7530.c | 229 ++++++++++++++++++++++++++++++++++++++++------- drivers/net/dsa/mt7530.h | 5 ++ 2 files changed, 200 insertions(+), 34 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 451ff0620c2e..c0d0bce0b594 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -950,20 +950,173 @@ static void mt7530_setup_port5(struct dsa_switch *ds, phy_interface_t interface) mutex_unlock(&priv->reg_mutex); } -/* On page 205, section "8.6.3 Frame filtering" of the active standard, IEEE Std - * 802.1Q™-2022, it is stated that frames with 01:80:C2:00:00:00-0F as MAC DA - * must only be propagated to C-VLAN and MAC Bridge components. That means - * VLAN-aware and VLAN-unaware bridges. On the switch designs with CPU ports, - * these frames are supposed to be processed by the CPU (software). So we make - * the switch only forward them to the CPU port. And if received from a CPU - * port, forward to a single port. The software is responsible of making the - * switch conform to the latter by setting a single port as destination port on - * the special tag. +/* In Clause 5 of IEEE Std 802-2014, two sublayers of the data link layer (DLL) + * of the Open Systems Interconnection basic reference model (OSI/RM) are + * described; the medium access control (MAC) and logical link control (LLC) + * sublayers. The MAC sublayer is the one facing the physical layer. * - * This switch intellectual property cannot conform to this part of the standard - * fully. Whilst the REV_UN frame tag covers the remaining :04-0D and :0F MAC - * DAs, it also includes :22-FF which the scope of propagation is not supposed - * to be restricted for these MAC DAs. + * In 8.2 of IEEE Std 802.1Q-2022, the Bridge architecture is described. A + * Bridge component comprises a MAC Relay Entity for interconnecting the Ports + * of the Bridge, at least two Ports, and higher layer entities with at least a + * Spanning Tree Protocol Entity included. + * + * Each Bridge Port also functions as an end station and shall provide the MAC + * Service to an LLC Entity. Each instance of the MAC Service is provided to a + * distinct LLC Entity that supports protocol identification, multiplexing, and + * demultiplexing, for protocol data unit (PDU) transmission and reception by + * one or more higher layer entities. + * + * It is described in 8.13.9 of IEEE Std 802.1Q-2022 that in a Bridge, the LLC + * Entity associated with each Bridge Port is modeled as being directly + * connected to the attached Local Area Network (LAN). + * + * On the switch with CPU port architecture, CPU port functions as Management + * Port, and the Management Port functionality is provided by software which + * functions as an end station. Software is connected to an IEEE 802 LAN that is + * wholly contained within the system that incorporates the Bridge. Software + * provides access to the LLC Entity associated with each Bridge Port by the + * value of the source port field on the special tag on the frame received by + * software. + * + * We call frames that carry control information to determine the active + * topology and current extent of each Virtual Local Area Network (VLAN), i.e., + * spanning tree or Shortest Path Bridging (SPB) and Multiple VLAN Registration + * Protocol Data Units (MVRPDUs), and frames from other link constrained + * protocols, such as Extensible Authentication Protocol over LAN (EAPOL) and + * Link Layer Discovery Protocol (LLDP), link-local frames. They are not + * forwarded by a Bridge. Permanently configured entries in the filtering + * database (FDB) ensure that such frames are discarded by the Forwarding + * Process. In 8.6.3 of IEEE Std 802.1Q-2022, this is described in detail: + * + * Each of the reserved MAC addresses specified in Table 8-1 + * (01-80-C2-00-00-[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F]) shall be + * permanently configured in the FDB in C-VLAN components and ERs. + * + * Each of the reserved MAC addresses specified in Table 8-2 + * (01-80-C2-00-00-[01,02,03,04,05,06,07,08,09,0A,0E]) shall be permanently + * configured in the FDB in S-VLAN components. + * + * Each of the reserved MAC addresses specified in Table 8-3 + * (01-80-C2-00-00-[01,02,04,0E]) shall be permanently configured in the FDB in + * TPMR components. + * + * The FDB entries for reserved MAC addresses shall specify filtering for all + * Bridge Ports and all VIDs. Management shall not provide the capability to + * modify or remove entries for reserved MAC addresses. + * + * The addresses in Table 8-1, Table 8-2, and Table 8-3 determine the scope of + * propagation of PDUs within a Bridged Network, as follows: + * + * The Nearest Bridge group address (01-80-C2-00-00-0E) is an address that no + * conformant Two-Port MAC Relay (TPMR) component, Service VLAN (S-VLAN) + * component, Customer VLAN (C-VLAN) component, or MAC Bridge can forward. + * PDUs transmitted using this destination address, or any other addresses + * that appear in Table 8-1, Table 8-2, and Table 8-3 + * (01-80-C2-00-00-[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F]), can + * therefore travel no further than those stations that can be reached via a + * single individual LAN from the originating station. + * + * The Nearest non-TPMR Bridge group address (01-80-C2-00-00-03), is an + * address that no conformant S-VLAN component, C-VLAN component, or MAC + * Bridge can forward; however, this address is relayed by a TPMR component. + * PDUs using this destination address, or any of the other addresses that + * appear in both Table 8-1 and Table 8-2 but not in Table 8-3 + * (01-80-C2-00-00-[00,03,05,06,07,08,09,0A,0B,0C,0D,0F]), will be relayed by + * any TPMRs but will propagate no further than the nearest S-VLAN component, + * C-VLAN component, or MAC Bridge. + * + * The Nearest Customer Bridge group address (01-80-C2-00-00-00) is an address + * that no conformant C-VLAN component, MAC Bridge can forward; however, it is + * relayed by TPMR components and S-VLAN components. PDUs using this + * destination address, or any of the other addresses that appear in Table 8-1 + * but not in either Table 8-2 or Table 8-3 (01-80-C2-00-00-[00,0B,0C,0D,0F]), + * will be relayed by TPMR components and S-VLAN components but will propagate + * no further than the nearest C-VLAN component or MAC Bridge. + * + * Because the LLC Entity associated with each Bridge Port is provided via CPU + * port, we must not filter these frames but forward them to CPU port. + * + * In a Bridge, the transmission Port is majorly decided by ingress and egress + * rules, FDB, and spanning tree Port State functions of the Forwarding Process. + * For link-local frames, only CPU port should be designated as destination port + * in the FDB, and the other functions of the Forwarding Process must not + * interfere with the decision of the transmission Port. We call this process + * trapping frames to CPU port. + * + * Therefore, on the switch with CPU port architecture, link-local frames must + * be trapped to CPU port, and certain link-local frames received by a Port of a + * Bridge comprising a TPMR component or an S-VLAN component must be excluded + * from it. + * + * A Bridge of the switch with CPU port architecture cannot comprise a Two-Port + * MAC Relay (TPMR) component as a TPMR component supports only a subset of the + * functionality of a MAC Bridge. A Bridge comprising two Ports (Management Port + * doesn't count) of this architecture will either function as a standard MAC + * Bridge or a standard VLAN Bridge. + * + * Therefore, a Bridge of this architecture can only comprise S-VLAN components, + * C-VLAN components, or MAC Bridge components. Since there's no TPMR component, + * we don't need to relay PDUs using the destination addresses specified on the + * Nearest non-TPMR section, and the proportion of the Nearest Customer Bridge + * section where they must be relayed by TPMR components. + * + * One option to trap link-local frames to CPU port is to add static FDB entries + * with CPU port designated as destination port. However, because that + * Independent VLAN Learning (IVL) is being used on every VID, each entry only + * applies to a single VLAN Identifier (VID). For a Bridge comprising a MAC + * Bridge component or a C-VLAN component, there would have to be 16 times 4096 + * entries. This switch intellectual property can only hold a maximum of 2048 + * entries. Using this option, there also isn't a mechanism to prevent + * link-local frames from being discarded when the spanning tree Port State of + * the reception Port is discarding. + * + * The remaining option is to utilise the BPC, RGAC1, RGAC2, RGAC3, and RGAC4 + * registers. Whilst this applies to every VID, it doesn't contain all of the + * reserved MAC addresses without affecting the remaining Standard Group MAC + * Addresses. The REV_UN frame tag utilised using the RGAC4 register covers the + * remaining 01-80-C2-00-00-[04,05,06,07,08,09,0A,0B,0C,0D,0F] destination + * addresses. It also includes the 01-80-C2-00-00-22 to 01-80-C2-00-00-FF + * destination addresses which may be relayed by MAC Bridges or VLAN Bridges. + * The latter option provides better but not complete conformance. + * + * This switch intellectual property also does not provide a mechanism to trap + * link-local frames with specific destination addresses to CPU port by Bridge, + * to conform to the filtering rules for the distinct Bridge components. + * + * Therefore, regardless of the type of the Bridge component, link-local frames + * with these destination addresses will be trapped to CPU port: + * + * 01-80-C2-00-00-[00,01,02,03,0E] + * + * In a Bridge comprising a MAC Bridge component or a C-VLAN component: + * + * Link-local frames with these destination addresses won't be trapped to CPU + * port which won't conform to IEEE Std 802.1Q-2022: + * + * 01-80-C2-00-00-[04,05,06,07,08,09,0A,0B,0C,0D,0F] + * + * In a Bridge comprising an S-VLAN component: + * + * Link-local frames with these destination addresses will be trapped to CPU + * port which won't conform to IEEE Std 802.1Q-2022: + * + * 01-80-C2-00-00-00 + * + * Link-local frames with these destination addresses won't be trapped to CPU + * port which won't conform to IEEE Std 802.1Q-2022: + * + * 01-80-C2-00-00-[04,05,06,07,08,09,0A] + * + * To trap link-local frames to CPU port as conformant as this switch + * intellectual property can allow, link-local frames are made to be regarded as + * Bridge Protocol Data Units (BPDUs). This is because this switch intellectual + * property only lets the frames regarded as BPDUs bypass the spanning tree Port + * State function of the Forwarding Process. + * + * The only remaining interference is the ingress rules. When the reception Port + * has no PVID assigned on software, VLAN-untagged frames won't be allowed in. + * There doesn't seem to be a mechanism on the switch intellectual property to + * have link-local frames bypass this function of the Forwarding Process. */ static void mt753x_trap_frames(struct mt7530_priv *priv) @@ -971,35 +1124,43 @@ mt753x_trap_frames(struct mt7530_priv *priv) /* Trap 802.1X PAE frames and BPDUs to the CPU port(s) and egress them * VLAN-untagged. */ - mt7530_rmw(priv, MT753X_BPC, MT753X_PAE_EG_TAG_MASK | - MT753X_PAE_PORT_FW_MASK | MT753X_BPDU_EG_TAG_MASK | - MT753X_BPDU_PORT_FW_MASK, - MT753X_PAE_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_PAE_PORT_FW(MT753X_BPDU_CPU_ONLY) | - MT753X_BPDU_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_BPDU_CPU_ONLY); + mt7530_rmw(priv, MT753X_BPC, + MT753X_PAE_BPDU_FR | MT753X_PAE_EG_TAG_MASK | + MT753X_PAE_PORT_FW_MASK | MT753X_BPDU_EG_TAG_MASK | + MT753X_BPDU_PORT_FW_MASK, + MT753X_PAE_BPDU_FR | + MT753X_PAE_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_PAE_PORT_FW(MT753X_BPDU_CPU_ONLY) | + MT753X_BPDU_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_BPDU_CPU_ONLY); /* Trap frames with :01 and :02 MAC DAs to the CPU port(s) and egress * them VLAN-untagged. */ - mt7530_rmw(priv, MT753X_RGAC1, MT753X_R02_EG_TAG_MASK | - MT753X_R02_PORT_FW_MASK | MT753X_R01_EG_TAG_MASK | - MT753X_R01_PORT_FW_MASK, - MT753X_R02_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_R02_PORT_FW(MT753X_BPDU_CPU_ONLY) | - MT753X_R01_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_BPDU_CPU_ONLY); + mt7530_rmw(priv, MT753X_RGAC1, + MT753X_R02_BPDU_FR | MT753X_R02_EG_TAG_MASK | + MT753X_R02_PORT_FW_MASK | MT753X_R01_BPDU_FR | + MT753X_R01_EG_TAG_MASK | MT753X_R01_PORT_FW_MASK, + MT753X_R02_BPDU_FR | + MT753X_R02_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_R02_PORT_FW(MT753X_BPDU_CPU_ONLY) | + MT753X_R01_BPDU_FR | + MT753X_R01_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_BPDU_CPU_ONLY); /* Trap frames with :03 and :0E MAC DAs to the CPU port(s) and egress * them VLAN-untagged. */ - mt7530_rmw(priv, MT753X_RGAC2, MT753X_R0E_EG_TAG_MASK | - MT753X_R0E_PORT_FW_MASK | MT753X_R03_EG_TAG_MASK | - MT753X_R03_PORT_FW_MASK, - MT753X_R0E_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_R0E_PORT_FW(MT753X_BPDU_CPU_ONLY) | - MT753X_R03_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | - MT753X_BPDU_CPU_ONLY); + mt7530_rmw(priv, MT753X_RGAC2, + MT753X_R0E_BPDU_FR | MT753X_R0E_EG_TAG_MASK | + MT753X_R0E_PORT_FW_MASK | MT753X_R03_BPDU_FR | + MT753X_R03_EG_TAG_MASK | MT753X_R03_PORT_FW_MASK, + MT753X_R0E_BPDU_FR | + MT753X_R0E_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_R0E_PORT_FW(MT753X_BPDU_CPU_ONLY) | + MT753X_R03_BPDU_FR | + MT753X_R03_EG_TAG(MT7530_VLAN_EG_UNTAGGED) | + MT753X_BPDU_CPU_ONLY); } static void diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 9439db495001..585db03c0548 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -65,6 +65,7 @@ enum mt753x_id { /* Registers for BPDU and PAE frame control*/ #define MT753X_BPC 0x24 +#define MT753X_PAE_BPDU_FR BIT(25) #define MT753X_PAE_EG_TAG_MASK GENMASK(24, 22) #define MT753X_PAE_EG_TAG(x) FIELD_PREP(MT753X_PAE_EG_TAG_MASK, x) #define MT753X_PAE_PORT_FW_MASK GENMASK(18, 16) @@ -75,20 +76,24 @@ enum mt753x_id { /* Register for :01 and :02 MAC DA frame control */ #define MT753X_RGAC1 0x28 +#define MT753X_R02_BPDU_FR BIT(25) #define MT753X_R02_EG_TAG_MASK GENMASK(24, 22) #define MT753X_R02_EG_TAG(x) FIELD_PREP(MT753X_R02_EG_TAG_MASK, x) #define MT753X_R02_PORT_FW_MASK GENMASK(18, 16) #define MT753X_R02_PORT_FW(x) FIELD_PREP(MT753X_R02_PORT_FW_MASK, x) +#define MT753X_R01_BPDU_FR BIT(9) #define MT753X_R01_EG_TAG_MASK GENMASK(8, 6) #define MT753X_R01_EG_TAG(x) FIELD_PREP(MT753X_R01_EG_TAG_MASK, x) #define MT753X_R01_PORT_FW_MASK GENMASK(2, 0) /* Register for :03 and :0E MAC DA frame control */ #define MT753X_RGAC2 0x2c +#define MT753X_R0E_BPDU_FR BIT(25) #define MT753X_R0E_EG_TAG_MASK GENMASK(24, 22) #define MT753X_R0E_EG_TAG(x) FIELD_PREP(MT753X_R0E_EG_TAG_MASK, x) #define MT753X_R0E_PORT_FW_MASK GENMASK(18, 16) #define MT753X_R0E_PORT_FW(x) FIELD_PREP(MT753X_R0E_PORT_FW_MASK, x) +#define MT753X_R03_BPDU_FR BIT(9) #define MT753X_R03_EG_TAG_MASK GENMASK(8, 6) #define MT753X_R03_EG_TAG(x) FIELD_PREP(MT753X_R03_EG_TAG_MASK, x) #define MT753X_R03_PORT_FW_MASK GENMASK(2, 0) -- cgit v1.2.3 From 47d8ac011fe1c9251070e1bd64cb10b48193ec51 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 9 Apr 2024 22:09:39 +0200 Subject: af_unix: Fix garbage collector racing against connect() Garbage collector does not take into account the risk of embryo getting enqueued during the garbage collection. If such embryo has a peer that carries SCM_RIGHTS, two consecutive passes of scan_children() may see a different set of children. Leading to an incorrectly elevated inflight count, and then a dangling pointer within the gc_inflight_list. sockets are AF_UNIX/SOCK_STREAM S is an unconnected socket L is a listening in-flight socket bound to addr, not in fdtable V's fd will be passed via sendmsg(), gets inflight count bumped connect(S, addr) sendmsg(S, [V]); close(V) __unix_gc() ---------------- ------------------------- ----------- NS = unix_create1() skb1 = sock_wmalloc(NS) L = unix_find_other(addr) unix_state_lock(L) unix_peer(S) = NS // V count=1 inflight=0 NS = unix_peer(S) skb2 = sock_alloc() skb_queue_tail(NS, skb2[V]) // V became in-flight // V count=2 inflight=1 close(V) // V count=1 inflight=1 // GC candidate condition met for u in gc_inflight_list: if (total_refs == inflight_refs) add u to gc_candidates // gc_candidates={L, V} for u in gc_candidates: scan_children(u, dec_inflight) // embryo (skb1) was not // reachable from L yet, so V's // inflight remains unchanged __skb_queue_tail(L, skb1) unix_state_unlock(L) for u in gc_candidates: if (u.inflight) scan_children(u, inc_inflight_move_tail) // V count=1 inflight=2 (!) If there is a GC-candidate listening socket, lock/unlock its state. This makes GC wait until the end of any ongoing connect() to that socket. After flipping the lock, a possibly SCM-laden embryo is already enqueued. And if there is another embryo coming, it can not possibly carry SCM_RIGHTS. At this point, unix_inflight() can not happen because unix_gc_lock is already taken. Inflight graph remains unaffected. Fixes: 1fd05ba5a2f2 ("[AF_UNIX]: Rewrite garbage collector, fixes race.") Signed-off-by: Michal Luczaj Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240409201047.1032217-1-mhal@rbox.co Signed-off-by: Paolo Abeni --- net/unix/garbage.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index fa39b6265238..6433a414acf8 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -274,11 +274,22 @@ static void __unix_gc(struct work_struct *work) * receive queues. Other, non candidate sockets _can_ be * added to queue, so we must make sure only to touch * candidates. + * + * Embryos, though never candidates themselves, affect which + * candidates are reachable by the garbage collector. Before + * being added to a listener's queue, an embryo may already + * receive data carrying SCM_RIGHTS, potentially making the + * passed socket a candidate that is not yet reachable by the + * collector. It becomes reachable once the embryo is + * enqueued. Therefore, we must ensure that no SCM-laden + * embryo appears in a (candidate) listener's queue between + * consecutive scan_children() calls. */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { + struct sock *sk = &u->sk; long total_refs; - total_refs = file_count(u->sk.sk_socket->file); + total_refs = file_count(sk->sk_socket->file); WARN_ON_ONCE(!u->inflight); WARN_ON_ONCE(total_refs < u->inflight); @@ -286,6 +297,11 @@ static void __unix_gc(struct work_struct *work) list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); + + if (sk->sk_state == TCP_LISTEN) { + unix_state_lock(sk); + unix_state_unlock(sk); + } } } -- cgit v1.2.3 From dfe648903f42296866d79f10d03f8c85c9dfba30 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:45 -0700 Subject: x86/bugs: Fix BHI documentation Fix up some inaccuracies in the BHI documentation. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/8c84f7451bfe0dd08543c6082a383f390d4aa7e2.1712813475.git.jpoimboe@kernel.org --- Documentation/admin-guide/hw-vuln/spectre.rst | 15 ++++++++------- Documentation/admin-guide/kernel-parameters.txt | 12 +++++++----- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index b70b1d8bd8e6..3cf18e4a1d9a 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -439,11 +439,11 @@ The possible values in this file are: - System is protected by retpoline * - BHI: BHI_DIS_S - System is protected by BHI_DIS_S - * - BHI: SW loop; KVM SW loop + * - BHI: SW loop, KVM SW loop - System is protected by software clearing sequence * - BHI: Syscall hardening - Syscalls are hardened against BHI - * - BHI: Syscall hardening; KVM: SW loop + * - BHI: Syscall hardening, KVM: SW loop - System is protected from userspace attacks by syscall hardening; KVM is protected by software clearing sequence Full mitigation might require a microcode update from the CPU @@ -666,13 +666,14 @@ kernel command line. of the HW BHI control and the SW BHB clearing sequence. on - unconditionally enable. + (default) Enable the HW or SW mitigation as + needed. off - unconditionally disable. + Disable the mitigation. auto - enable if hardware mitigation - control(BHI_DIS_S) is available, otherwise - enable alternate mitigation in KVM. + Enable the HW mitigation if needed, but + *don't* enable the SW mitigation except for KVM. + The system may be vulnerable. For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 70046a019d42..a029ad6c4963 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3444,6 +3444,7 @@ retbleed=off [X86] spec_rstack_overflow=off [X86] spec_store_bypass_disable=off [X86,PPC] + spectre_bhi=off [X86] spectre_v2_user=off [X86] srbds=off [X86,INTEL] ssbd=force-off [ARM64] @@ -6069,11 +6070,12 @@ deployment of the HW BHI control and the SW BHB clearing sequence. - on - unconditionally enable. - off - unconditionally disable. - auto - (default) enable hardware mitigation - (BHI_DIS_S) if available, otherwise enable - alternate mitigation in KVM. + on - (default) Enable the HW or SW mitigation + as needed. + off - Disable the mitigation. + auto - Enable the HW mitigation if needed, but + *don't* enable the SW mitigation except + for KVM. The system may be vulnerable. spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. -- cgit v1.2.3 From cb2db5bb04d7f778fbc1a1ea2507aab436f1bff3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:46 -0700 Subject: x86/bugs: Cache the value of MSR_IA32_ARCH_CAPABILITIES There's no need to keep reading MSR_IA32_ARCH_CAPABILITIES over and over. It's even read in the BHI sysfs function which is a big no-no. Just read it once and cache it. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/9592a18a814368e75f8f4b9d74d3883aa4fd1eaf.1712813475.git.jpoimboe@kernel.org --- arch/x86/kernel/cpu/bugs.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 27f5004a8e24..ff59fa8bb610 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -61,6 +61,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); +static u64 __ro_after_init ia32_cap; + static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; @@ -144,6 +146,8 @@ void __init cpu_select_mitigations(void) x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; } + ia32_cap = x86_read_arch_cap_msr(); + /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); @@ -301,8 +305,6 @@ static const char * const taa_strings[] = { static void __init taa_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_TAA)) { taa_mitigation = TAA_MITIGATION_OFF; return; @@ -341,7 +343,6 @@ static void __init taa_select_mitigation(void) * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode * update is required. */ - ia32_cap = x86_read_arch_cap_msr(); if ( (ia32_cap & ARCH_CAP_MDS_NO) && !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; @@ -401,8 +402,6 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || cpu_mitigations_off()) { @@ -413,8 +412,6 @@ static void __init mmio_select_mitigation(void) if (mmio_mitigation == MMIO_MITIGATION_OFF) return; - ia32_cap = x86_read_arch_cap_msr(); - /* * Enable CPU buffer clear mitigation for host and VMM, if also affected * by MDS or TAA. Otherwise, enable mitigation for VMM only. @@ -508,7 +505,7 @@ static void __init rfds_select_mitigation(void) if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (x86_read_arch_cap_msr() & ARCH_CAP_RFDS_CLEAR) + if (ia32_cap & ARCH_CAP_RFDS_CLEAR) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; @@ -659,8 +656,6 @@ void update_srbds_msr(void) static void __init srbds_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_SRBDS)) return; @@ -669,7 +664,6 @@ static void __init srbds_select_mitigation(void) * are only exposed to SRBDS when TSX is enabled or when CPU is affected * by Processor MMIO Stale Data vulnerability. */ - ia32_cap = x86_read_arch_cap_msr(); if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; @@ -813,7 +807,7 @@ static void __init gds_select_mitigation(void) /* Will verify below that mitigation _can_ be disabled */ /* No microcode */ - if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { + if (!(ia32_cap & ARCH_CAP_GDS_CTRL)) { if (gds_mitigation == GDS_MITIGATION_FORCE) { /* * This only needs to be done on the boot CPU so do it @@ -1908,8 +1902,6 @@ static void update_indir_branch_cond(void) /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { - u64 ia32_cap = x86_read_arch_cap_msr(); - /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. @@ -2818,7 +2810,7 @@ static const char *spectre_bhi_state(void) else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && - !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) + !(ia32_cap & ARCH_CAP_RRSBA)) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Syscall hardening, KVM: SW loop"; -- cgit v1.2.3 From d0485730d2189ffe5d986d4e9e191f1e4d5ffd24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Apr 2024 09:25:36 +0200 Subject: x86/bugs: Rename various 'ia32_cap' variables to 'x86_arch_cap_msr' So we are using the 'ia32_cap' value in a number of places, which got its name from MSR_IA32_ARCH_CAPABILITIES MSR register. But there's very little 'IA32' about it - this isn't 32-bit only code, nor does it originate from there, it's just a historic quirk that many Intel MSR names are prefixed with IA32_. This is already clear from the helper method around the MSR: x86_read_arch_cap_msr(), which doesn't have the IA32 prefix. So rename 'ia32_cap' to 'x86_arch_cap_msr' to be consistent with its role and with the naming of the helper function. Signed-off-by: Ingo Molnar Cc: Josh Poimboeuf Cc: Nikolay Borisov Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/9592a18a814368e75f8f4b9d74d3883aa4fd1eaf.1712813475.git.jpoimboe@kernel.org --- arch/x86/kernel/apic/apic.c | 6 +++--- arch/x86/kernel/cpu/bugs.c | 30 +++++++++++++-------------- arch/x86/kernel/cpu/common.c | 48 ++++++++++++++++++++++---------------------- 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a42d8a6f7149..c342c4aa9c68 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1687,11 +1687,11 @@ static int x2apic_state; static bool x2apic_hw_locked(void) { - u64 ia32_cap; + u64 x86_arch_cap_msr; u64 msr; - ia32_cap = x86_read_arch_cap_msr(); - if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { + x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) { rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); return (msr & LEGACY_XAPIC_DISABLED); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ff59fa8bb610..1b0cfc136432 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -61,7 +61,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); -static u64 __ro_after_init ia32_cap; +static u64 __ro_after_init x86_arch_cap_msr; static DEFINE_MUTEX(spec_ctrl_mutex); @@ -146,7 +146,7 @@ void __init cpu_select_mitigations(void) x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; } - ia32_cap = x86_read_arch_cap_msr(); + x86_arch_cap_msr = x86_read_arch_cap_msr(); /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); @@ -343,8 +343,8 @@ static void __init taa_select_mitigation(void) * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode * update is required. */ - if ( (ia32_cap & ARCH_CAP_MDS_NO) && - !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) && + !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; /* @@ -434,7 +434,7 @@ static void __init mmio_select_mitigation(void) * be propagated to uncore buffers, clearing the Fill buffers on idle * is required irrespective of SMT state. */ - if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); /* @@ -444,10 +444,10 @@ static void __init mmio_select_mitigation(void) * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS * affected systems. */ - if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || (boot_cpu_has(X86_FEATURE_MD_CLEAR) && boot_cpu_has(X86_FEATURE_FLUSH_L1D) && - !(ia32_cap & ARCH_CAP_MDS_NO))) + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))) mmio_mitigation = MMIO_MITIGATION_VERW; else mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; @@ -505,7 +505,7 @@ static void __init rfds_select_mitigation(void) if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (ia32_cap & ARCH_CAP_RFDS_CLEAR) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; @@ -664,7 +664,7 @@ static void __init srbds_select_mitigation(void) * are only exposed to SRBDS when TSX is enabled or when CPU is affected * by Processor MMIO Stale Data vulnerability. */ - if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) @@ -807,7 +807,7 @@ static void __init gds_select_mitigation(void) /* Will verify below that mitigation _can_ be disabled */ /* No microcode */ - if (!(ia32_cap & ARCH_CAP_GDS_CTRL)) { + if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { if (gds_mitigation == GDS_MITIGATION_FORCE) { /* * This only needs to be done on the boot CPU so do it @@ -1541,14 +1541,14 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) /* Disable in-kernel use of non-RSB RET predictors */ static void __init spec_ctrl_disable_kernel_rrsba(void) { - u64 ia32_cap; + u64 x86_arch_cap_msr; if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) return; - ia32_cap = x86_read_arch_cap_msr(); + x86_arch_cap_msr = x86_read_arch_cap_msr(); - if (ia32_cap & ARCH_CAP_RRSBA) { + if (x86_arch_cap_msr & ARCH_CAP_RRSBA) { x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; update_spec_ctrl(x86_spec_ctrl_base); } @@ -1916,7 +1916,7 @@ static void update_mds_branch_idle(void) if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); } else if (mmio_mitigation == MMIO_MITIGATION_OFF || - (ia32_cap & ARCH_CAP_FBSDP_NO)) { + (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); } } @@ -2810,7 +2810,7 @@ static const char *spectre_bhi_state(void) else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && - !(ia32_cap & ARCH_CAP_RRSBA)) + !(x86_arch_cap_msr & ARCH_CAP_RRSBA)) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Syscall hardening, KVM: SW loop"; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 754d91857d63..605c26c009c8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1284,25 +1284,25 @@ static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long whi u64 x86_read_arch_cap_msr(void) { - u64 ia32_cap = 0; + u64 x86_arch_cap_msr = 0; if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); - return ia32_cap; + return x86_arch_cap_msr; } -static bool arch_cap_mmio_immune(u64 ia32_cap) +static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr) { - return (ia32_cap & ARCH_CAP_FBSDP_NO && - ia32_cap & ARCH_CAP_PSDP_NO && - ia32_cap & ARCH_CAP_SBDR_SSDP_NO); + return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO && + x86_arch_cap_msr & ARCH_CAP_PSDP_NO && + x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO); } -static bool __init vulnerable_to_rfds(u64 ia32_cap) +static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) { /* The "immunity" bit trumps everything else: */ - if (ia32_cap & ARCH_CAP_RFDS_NO) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO) return false; /* @@ -1310,7 +1310,7 @@ static bool __init vulnerable_to_rfds(u64 ia32_cap) * indicate that mitigation is needed because guest is running on a * vulnerable hardware or may migrate to such hardware: */ - if (ia32_cap & ARCH_CAP_RFDS_CLEAR) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) return true; /* Only consult the blacklist when there is no enumeration: */ @@ -1319,11 +1319,11 @@ static bool __init vulnerable_to_rfds(u64 ia32_cap) static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { - u64 ia32_cap = x86_read_arch_cap_msr(); + u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && - !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) @@ -1335,7 +1335,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && - !(ia32_cap & ARCH_CAP_SSB_NO) && + !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); @@ -1346,17 +1346,17 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Don't use AutoIBRS when SNP is enabled because it degrades host * userspace indirect branch performance. */ - if ((ia32_cap & ARCH_CAP_IBRS_ALL) || + if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) || (cpu_has(c, X86_FEATURE_AUTOIBRS) && !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO)) setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); } if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && - !(ia32_cap & ARCH_CAP_MDS_NO)) { + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); @@ -1375,9 +1375,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * TSX_CTRL check alone is not sufficient for cases when the microcode * update is not present or running as guest that don't get TSX_CTRL. */ - if (!(ia32_cap & ARCH_CAP_TAA_NO) && + if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) && (cpu_has(c, X86_FEATURE_RTM) || - (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); /* @@ -1403,7 +1403,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. */ - if (!arch_cap_mmio_immune(ia32_cap)) { + if (!arch_cap_mmio_immune(x86_arch_cap_msr)) { if (cpu_matches(cpu_vuln_blacklist, MMIO)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) @@ -1411,7 +1411,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { - if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA)) setup_force_cpu_bug(X86_BUG_RETBLEED); } @@ -1429,15 +1429,15 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * disabling AVX2. The only way to do this in HW is to clear XCR0[2], * which means that AVX will be disabled. */ - if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) && boot_cpu_has(X86_FEATURE_AVX)) setup_force_cpu_bug(X86_BUG_GDS); - if (vulnerable_to_rfds(ia32_cap)) + if (vulnerable_to_rfds(x86_arch_cap_msr)) setup_force_cpu_bug(X86_BUG_RFDS); /* When virtualized, eIBRS could be hidden, assume vulnerable */ - if (!(ia32_cap & ARCH_CAP_BHI_NO) && + if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) && !cpu_matches(cpu_vuln_whitelist, NO_BHI) && (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || boot_cpu_has(X86_FEATURE_HYPERVISOR))) @@ -1447,7 +1447,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) return; /* Rogue Data Cache Load? No! */ - if (ia32_cap & ARCH_CAP_RDCL_NO) + if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO) return; setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); -- cgit v1.2.3 From 1cea8a280dfd1016148a3820676f2f03e3f5b898 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:47 -0700 Subject: x86/bugs: Fix BHI handling of RRSBA The ARCH_CAP_RRSBA check isn't correct: RRSBA may have already been disabled by the Spectre v2 mitigation (or can otherwise be disabled by the BHI mitigation itself if needed). In that case retpolines are fine. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/6f56f13da34a0834b69163467449be7f58f253dc.1712813475.git.jpoimboe@kernel.org --- arch/x86/kernel/cpu/bugs.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1b0cfc136432..08dfb94fcb3a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1538,20 +1538,25 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_RETPOLINE; } +static bool __ro_after_init rrsba_disabled; + /* Disable in-kernel use of non-RSB RET predictors */ static void __init spec_ctrl_disable_kernel_rrsba(void) { - u64 x86_arch_cap_msr; + if (rrsba_disabled) + return; - if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) { + rrsba_disabled = true; return; + } - x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; - if (x86_arch_cap_msr & ARCH_CAP_RRSBA) { - x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; - update_spec_ctrl(x86_spec_ctrl_base); - } + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + rrsba_disabled = true; } static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) @@ -1652,9 +1657,11 @@ static void __init bhi_select_mitigation(void) return; /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !(x86_read_arch_cap_msr() & ARCH_CAP_RRSBA)) - return; + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { + spec_ctrl_disable_kernel_rrsba(); + if (rrsba_disabled) + return; + } if (spec_ctrl_bhi_dis()) return; @@ -2809,8 +2816,7 @@ static const char *spectre_bhi_state(void) return "; BHI: BHI_DIS_S"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; - else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && - !(x86_arch_cap_msr & ARCH_CAP_RRSBA)) + else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && rrsba_disabled) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Syscall hardening, KVM: SW loop"; -- cgit v1.2.3 From 5f882f3b0a8bf0788d5a0ee44b1191de5319bb8a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:48 -0700 Subject: x86/bugs: Clarify that syscall hardening isn't a BHI mitigation While syscall hardening helps prevent some BHI attacks, there's still other low-hanging fruit remaining. Don't classify it as a mitigation and make it clear that the system may still be vulnerable if it doesn't have a HW or SW mitigation enabled. Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Sean Christopherson Link: https://lore.kernel.org/r/b5951dae3fdee7f1520d5136a27be3bdfe95f88b.1712813475.git.jpoimboe@kernel.org --- Documentation/admin-guide/hw-vuln/spectre.rst | 11 +++++------ Documentation/admin-guide/kernel-parameters.txt | 3 +-- arch/x86/kernel/cpu/bugs.c | 6 +++--- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 3cf18e4a1d9a..5a39acf82483 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -441,10 +441,10 @@ The possible values in this file are: - System is protected by BHI_DIS_S * - BHI: SW loop, KVM SW loop - System is protected by software clearing sequence - * - BHI: Syscall hardening - - Syscalls are hardened against BHI - * - BHI: Syscall hardening, KVM: SW loop - - System is protected from userspace attacks by syscall hardening; KVM is protected by software clearing sequence + * - BHI: Vulnerable + - System is vulnerable to BHI + * - BHI: Vulnerable, KVM: SW loop + - System is vulnerable; KVM is protected by software clearing sequence Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will @@ -661,8 +661,7 @@ kernel command line. spectre_bhi= [X86] Control mitigation of Branch History Injection - (BHI) vulnerability. Syscalls are hardened against BHI - regardless of this setting. This setting affects the deployment + (BHI) vulnerability. This setting affects the deployment of the HW BHI control and the SW BHB clearing sequence. on diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a029ad6c4963..a3874cc97892 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6065,8 +6065,7 @@ See Documentation/admin-guide/laptops/sonypi.rst spectre_bhi= [X86] Control mitigation of Branch History Injection - (BHI) vulnerability. Syscalls are hardened against BHI - reglardless of this setting. This setting affects the + (BHI) vulnerability. This setting affects the deployment of the HW BHI control and the SW BHB clearing sequence. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 08dfb94fcb3a..9eeb60f5fbb3 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2818,10 +2818,10 @@ static const char *spectre_bhi_state(void) return "; BHI: SW loop, KVM: SW loop"; else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && rrsba_disabled) return "; BHI: Retpoline"; - else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) - return "; BHI: Syscall hardening, KVM: SW loop"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + return "; BHI: Vulnerable, KVM: SW loop"; - return "; BHI: Vulnerable (Syscall hardening enabled)"; + return "; BHI: Vulnerable"; } static ssize_t spectre_v2_show_state(char *buf) -- cgit v1.2.3 From 713a85195aad25d8a26786a37b674e3e5ec09e3c Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Wed, 10 Apr 2024 09:13:55 +0000 Subject: net: ena: Fix potential sign extension issue Small unsigned types are promoted to larger signed types in the case of multiplication, the result of which may overflow. In case the result of such a multiplication has its MSB turned on, it will be sign extended with '1's. This changes the multiplication result. Code example of the phenomenon: ------------------------------- u16 x, y; size_t z1, z2; x = y = 0xffff; printk("x=%x y=%x\n",x,y); z1 = x*y; z2 = (size_t)x*y; printk("z1=%lx z2=%lx\n", z1, z2); Output: ------- x=ffff y=ffff z1=fffffffffffe0001 z2=fffe0001 The expected result of ffff*ffff is fffe0001, and without the explicit casting to avoid the unwanted sign extension we got fffffffffffe0001. This commit adds an explicit casting to avoid the sign extension issue. Fixes: 689b2bdaaa14 ("net: ena: add functions for handling Low Latency Queues in ena_com") Signed-off-by: Arthur Kiyanovski Signed-off-by: David Arinzon Reviewed-by: Shannon Nelson Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amazon/ena/ena_com.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 9e9e4a03f1a8..2d8a66ea82fa 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -351,7 +351,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev, ENA_COM_BOUNCE_BUFFER_CNTRL_CNT; io_sq->bounce_buf_ctrl.next_to_use = 0; - size = io_sq->bounce_buf_ctrl.buffer_size * + size = (size_t)io_sq->bounce_buf_ctrl.buffer_size * io_sq->bounce_buf_ctrl.buffers_num; dev_node = dev_to_node(ena_dev->dmadev); -- cgit v1.2.3 From f7e417180665234fdb7af2ebe33d89aaa434d16f Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Wed, 10 Apr 2024 09:13:56 +0000 Subject: net: ena: Wrong missing IO completions check order Missing IO completions check is called every second (HZ jiffies). This commit fixes several issues with this check: 1. Duplicate queues check: Max of 4 queues are scanned on each check due to monitor budget. Once reaching the budget, this check exits under the assumption that the next check will continue to scan the remainder of the queues, but in practice, next check will first scan the last already scanned queue which is not necessary and may cause the full queue scan to last a couple of seconds longer. The fix is to start every check with the next queue to scan. For example, on 8 IO queues: Bug: [0,1,2,3], [3,4,5,6], [6,7] Fix: [0,1,2,3], [4,5,6,7] 2. Unbalanced queues check: In case the number of active IO queues is not a multiple of budget, there will be checks which don't utilize the full budget because the full scan exits when reaching the last queue id. The fix is to run every TX completion check with exact queue budget regardless of the queue id. For example, on 7 IO queues: Bug: [0,1,2,3], [4,5,6], [0,1,2,3] Fix: [0,1,2,3], [4,5,6,0], [1,2,3,4] The budget may be lowered in case the number of IO queues is less than the budget (4) to make sure there are no duplicate queues on the same check. For example, on 3 IO queues: Bug: [0,1,2,0], [1,2,0,1] Fix: [0,1,2], [0,1,2] Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Amit Bernstein Signed-off-by: David Arinzon Reviewed-by: Shannon Nelson Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 09e7da1a69c9..59befc0f463c 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -3481,10 +3481,11 @@ static void check_for_missing_completions(struct ena_adapter *adapter) { struct ena_ring *tx_ring; struct ena_ring *rx_ring; - int i, budget, rc; + int qid, budget, rc; int io_queue_count; io_queue_count = adapter->xdp_num_queues + adapter->num_io_queues; + /* Make sure the driver doesn't turn the device in other process */ smp_rmb(); @@ -3497,27 +3498,29 @@ static void check_for_missing_completions(struct ena_adapter *adapter) if (adapter->missing_tx_completion_to == ENA_HW_HINTS_NO_TIMEOUT) return; - budget = ENA_MONITORED_TX_QUEUES; + budget = min_t(u32, io_queue_count, ENA_MONITORED_TX_QUEUES); - for (i = adapter->last_monitored_tx_qid; i < io_queue_count; i++) { - tx_ring = &adapter->tx_ring[i]; - rx_ring = &adapter->rx_ring[i]; + qid = adapter->last_monitored_tx_qid; + + while (budget) { + qid = (qid + 1) % io_queue_count; + + tx_ring = &adapter->tx_ring[qid]; + rx_ring = &adapter->rx_ring[qid]; rc = check_missing_comp_in_tx_queue(adapter, tx_ring); if (unlikely(rc)) return; - rc = !ENA_IS_XDP_INDEX(adapter, i) ? + rc = !ENA_IS_XDP_INDEX(adapter, qid) ? check_for_rx_interrupt_queue(adapter, rx_ring) : 0; if (unlikely(rc)) return; budget--; - if (!budget) - break; } - adapter->last_monitored_tx_qid = i % io_queue_count; + adapter->last_monitored_tx_qid = qid; } /* trigger napi schedule after 2 consecutive detections */ -- cgit v1.2.3 From bf02d9fe00632d22fa91d34749c7aacf397b6cde Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Wed, 10 Apr 2024 09:13:57 +0000 Subject: net: ena: Fix incorrect descriptor free behavior ENA has two types of TX queues: - queues which only process TX packets arriving from the network stack - queues which only process TX packets forwarded to it by XDP_REDIRECT or XDP_TX instructions The ena_free_tx_bufs() cycles through all descriptors in a TX queue and unmaps + frees every descriptor that hasn't been acknowledged yet by the device (uncompleted TX transactions). The function assumes that the processed TX queue is necessarily from the first category listed above and ends up using napi_consume_skb() for descriptors belonging to an XDP specific queue. This patch solves a bug in which, in case of a VF reset, the descriptors aren't freed correctly, leading to crashes. Fixes: 548c4940b9f1 ("net: ena: Implement XDP_TX action") Signed-off-by: Shay Agroskin Signed-off-by: David Arinzon Reviewed-by: Shannon Nelson Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 59befc0f463c..be5acfa41ee0 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -718,8 +718,11 @@ void ena_unmap_tx_buff(struct ena_ring *tx_ring, static void ena_free_tx_bufs(struct ena_ring *tx_ring) { bool print_once = true; + bool is_xdp_ring; u32 i; + is_xdp_ring = ENA_IS_XDP_INDEX(tx_ring->adapter, tx_ring->qid); + for (i = 0; i < tx_ring->ring_size; i++) { struct ena_tx_buffer *tx_info = &tx_ring->tx_buffer_info[i]; @@ -739,10 +742,15 @@ static void ena_free_tx_bufs(struct ena_ring *tx_ring) ena_unmap_tx_buff(tx_ring, tx_info); - dev_kfree_skb_any(tx_info->skb); + if (is_xdp_ring) + xdp_return_frame(tx_info->xdpf); + else + dev_kfree_skb_any(tx_info->skb); } - netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, - tx_ring->qid)); + + if (!is_xdp_ring) + netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev, + tx_ring->qid)); } static void ena_free_all_tx_bufs(struct ena_adapter *adapter) -- cgit v1.2.3 From 36a1ca01f0452f2549420e7279c2588729bd94df Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Wed, 10 Apr 2024 09:13:58 +0000 Subject: net: ena: Set tx_info->xdpf value to NULL The patch mentioned in the `Fixes` tag removed the explicit assignment of tx_info->xdpf to NULL with the justification that there's no need to set tx_info->xdpf to NULL and tx_info->num_of_bufs to 0 in case of a mapping error. Both values won't be used once the mapping function returns an error, and their values would be overridden by the next transmitted packet. While both values do indeed get overridden in the next transmission call, the value of tx_info->xdpf is also used to check whether a TX descriptor's transmission has been completed (i.e. a completion for it was polled). An example scenario: 1. Mapping failed, tx_info->xdpf wasn't set to NULL 2. A VF reset occurred leading to IO resource destruction and a call to ena_free_tx_bufs() function 3. Although the descriptor whose mapping failed was freed by the transmission function, it still passes the check if (!tx_info->skb) (skb and xdp_frame are in a union) 4. The xdp_frame associated with the descriptor is freed twice This patch returns the assignment of NULL to tx_info->xdpf to make the cleaning function knows that the descriptor is already freed. Fixes: 504fd6a5390c ("net: ena: fix DMA mapping function issues in XDP") Signed-off-by: Shay Agroskin Signed-off-by: David Arinzon Reviewed-by: Shannon Nelson Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amazon/ena/ena_xdp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_xdp.c b/drivers/net/ethernet/amazon/ena/ena_xdp.c index 337c435d3ce9..5b175e7e92a1 100644 --- a/drivers/net/ethernet/amazon/ena/ena_xdp.c +++ b/drivers/net/ethernet/amazon/ena/ena_xdp.c @@ -89,7 +89,7 @@ int ena_xdp_xmit_frame(struct ena_ring *tx_ring, rc = ena_xdp_tx_map_frame(tx_ring, tx_info, xdpf, &ena_tx_ctx); if (unlikely(rc)) - return rc; + goto err; ena_tx_ctx.req_id = req_id; @@ -112,7 +112,9 @@ int ena_xdp_xmit_frame(struct ena_ring *tx_ring, error_unmap_dma: ena_unmap_tx_buff(tx_ring, tx_info); +err: tx_info->xdpf = NULL; + return rc; } -- cgit v1.2.3 From f969eb84ce482331a991079ab7a5c4dc3b7f89bf Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 7 Apr 2024 14:56:04 +0800 Subject: netfilter: nf_tables: Fix potential data-race in __nft_expr_type_get() nft_unregister_expr() can concurrent with __nft_expr_type_get(), and there is not any protection when iterate over nf_tables_expressions list in __nft_expr_type_get(). Therefore, there is potential data-race of nf_tables_expressions list entry. Use list_for_each_entry_rcu() to iterate over nf_tables_expressions list in __nft_expr_type_get(), and use rcu_read_lock() in the caller nft_expr_type_get() to protect the entire type query process. Fixes: ef1f7df9170d ("netfilter: nf_tables: expression ops overloading") Signed-off-by: Ziyang Xuan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d89d77946719..53b8c00863ad 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3060,7 +3060,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, { const struct nft_expr_type *type, *candidate = NULL; - list_for_each_entry(type, &nf_tables_expressions, list) { + list_for_each_entry_rcu(type, &nf_tables_expressions, list) { if (!nla_strcmp(nla, type->name)) { if (!type->family && !candidate) candidate = type; @@ -3092,9 +3092,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); + rcu_read_lock(); type = __nft_expr_type_get(family, nla); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES -- cgit v1.2.3 From d78d867dcea69c328db30df665be5be7d0148484 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sun, 7 Apr 2024 14:56:05 +0800 Subject: netfilter: nf_tables: Fix potential data-race in __nft_obj_type_get() nft_unregister_obj() can concurrent with __nft_obj_type_get(), and there is not any protection when iterate over nf_tables_objects list in __nft_obj_type_get(). Therefore, there is potential data-race of nf_tables_objects list entry. Use list_for_each_entry_rcu() to iterate over nf_tables_objects list in __nft_obj_type_get(), and use rcu_read_lock() in the caller nft_obj_type_get() to protect the entire type query process. Fixes: e50092404c1b ("netfilter: nf_tables: add stateful objects") Signed-off-by: Ziyang Xuan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 53b8c00863ad..f11d0c0a2c73 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7611,7 +7611,7 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family) { const struct nft_object_type *type; - list_for_each_entry(type, &nf_tables_objects, list) { + list_for_each_entry_rcu(type, &nf_tables_objects, list) { if (type->family != NFPROTO_UNSPEC && type->family != family) continue; @@ -7627,9 +7627,13 @@ nft_obj_type_get(struct net *net, u32 objtype, u8 family) { const struct nft_object_type *type; + rcu_read_lock(); type = __nft_obj_type_get(objtype, family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES -- cgit v1.2.3 From 751de2012eafa4d46d8081056761fa0e9cc8a178 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Apr 2024 11:24:59 +0200 Subject: netfilter: br_netfilter: skip conntrack input hook for promisc packets For historical reasons, when bridge device is in promisc mode, packets that are directed to the taps follow bridge input hook path. This patch adds a workaround to reset conntrack for these packets. Jianbo Liu reports warning splats in their test infrastructure where cloned packets reach the br_netfilter input hook to confirm the conntrack object. Scratch one bit from BR_INPUT_SKB_CB to annotate that this packet has reached the input hook because it is passed up to the bridge device to reach the taps. [ 57.571874] WARNING: CPU: 1 PID: 0 at net/bridge/br_netfilter_hooks.c:616 br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.572749] Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_isc si ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5ctl mlx5_core [ 57.575158] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.8.0+ #19 [ 57.575700] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 57.576662] RIP: 0010:br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.577195] Code: fe ff ff 41 bd 04 00 00 00 be 04 00 00 00 e9 4a ff ff ff be 04 00 00 00 48 89 ef e8 f3 a9 3c e1 66 83 ad b4 00 00 00 04 eb 91 <0f> 0b e9 f1 fe ff ff 0f 0b e9 df fe ff ff 48 89 df e8 b3 53 47 e1 [ 57.578722] RSP: 0018:ffff88885f845a08 EFLAGS: 00010202 [ 57.579207] RAX: 0000000000000002 RBX: ffff88812dfe8000 RCX: 0000000000000000 [ 57.579830] RDX: ffff88885f845a60 RSI: ffff8881022dc300 RDI: 0000000000000000 [ 57.580454] RBP: ffff88885f845a60 R08: 0000000000000001 R09: 0000000000000003 [ 57.581076] R10: 00000000ffff1300 R11: 0000000000000002 R12: 0000000000000000 [ 57.581695] R13: ffff8881047ffe00 R14: ffff888108dbee00 R15: ffff88814519b800 [ 57.582313] FS: 0000000000000000(0000) GS:ffff88885f840000(0000) knlGS:0000000000000000 [ 57.583040] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 57.583564] CR2: 000000c4206aa000 CR3: 0000000103847001 CR4: 0000000000370eb0 [ 57.584194] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 57.584820] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 57.585440] Call Trace: [ 57.585721] [ 57.585976] ? __warn+0x7d/0x130 [ 57.586323] ? br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.586811] ? report_bug+0xf1/0x1c0 [ 57.587177] ? handle_bug+0x3f/0x70 [ 57.587539] ? exc_invalid_op+0x13/0x60 [ 57.587929] ? asm_exc_invalid_op+0x16/0x20 [ 57.588336] ? br_nf_local_in+0x157/0x180 [br_netfilter] [ 57.588825] nf_hook_slow+0x3d/0xd0 [ 57.589188] ? br_handle_vlan+0x4b/0x110 [ 57.589579] br_pass_frame_up+0xfc/0x150 [ 57.589970] ? br_port_flags_change+0x40/0x40 [ 57.590396] br_handle_frame_finish+0x346/0x5e0 [ 57.590837] ? ipt_do_table+0x32e/0x430 [ 57.591221] ? br_handle_local_finish+0x20/0x20 [ 57.591656] br_nf_hook_thresh+0x4b/0xf0 [br_netfilter] [ 57.592286] ? br_handle_local_finish+0x20/0x20 [ 57.592802] br_nf_pre_routing_finish+0x178/0x480 [br_netfilter] [ 57.593348] ? br_handle_local_finish+0x20/0x20 [ 57.593782] ? nf_nat_ipv4_pre_routing+0x25/0x60 [nf_nat] [ 57.594279] br_nf_pre_routing+0x24c/0x550 [br_netfilter] [ 57.594780] ? br_nf_hook_thresh+0xf0/0xf0 [br_netfilter] [ 57.595280] br_handle_frame+0x1f3/0x3d0 [ 57.595676] ? br_handle_local_finish+0x20/0x20 [ 57.596118] ? br_handle_frame_finish+0x5e0/0x5e0 [ 57.596566] __netif_receive_skb_core+0x25b/0xfc0 [ 57.597017] ? __napi_build_skb+0x37/0x40 [ 57.597418] __netif_receive_skb_list_core+0xfb/0x220 Fixes: 62e7151ae3eb ("netfilter: bridge: confirm multicast packets before passing them up the stack") Reported-by: Jianbo Liu Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_input.c | 15 +++++++++++---- net/bridge/br_netfilter_hooks.c | 6 ++++++ net/bridge/br_private.h | 1 + net/bridge/netfilter/nf_conntrack_bridge.c | 14 ++++++++++---- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index f21097e73482..ceaa5a89b947 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -30,7 +30,7 @@ br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) return netif_receive_skb(skb); } -static int br_pass_frame_up(struct sk_buff *skb) +static int br_pass_frame_up(struct sk_buff *skb, bool promisc) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); @@ -65,6 +65,8 @@ static int br_pass_frame_up(struct sk_buff *skb) br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), BR_MCAST_DIR_TX); + BR_INPUT_SKB_CB(skb)->promisc = promisc; + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, br_netif_receive_skb); @@ -82,6 +84,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_mcast *brmctx; struct net_bridge_vlan *vlan; struct net_bridge *br; + bool promisc; u16 vid = 0; u8 state; @@ -137,7 +140,9 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); - local_rcv = !!(br->dev->flags & IFF_PROMISC); + promisc = !!(br->dev->flags & IFF_PROMISC); + local_rcv = promisc; + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { @@ -200,7 +205,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb unsigned long now = jiffies; if (test_bit(BR_FDB_LOCAL, &dst->flags)) - return br_pass_frame_up(skb); + return br_pass_frame_up(skb, false); if (now != dst->used) dst->used = now; @@ -213,7 +218,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } if (local_rcv) - return br_pass_frame_up(skb); + return br_pass_frame_up(skb, promisc); out: return 0; @@ -386,6 +391,8 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) goto forward; } + BR_INPUT_SKB_CB(skb)->promisc = false; + /* The else clause should be hit when nf_hook(): * - returns < 0 (drop/error) * - returns = 0 (stolen/nf_queue) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 35e10c5a766d..22e35623c148 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -600,11 +600,17 @@ static unsigned int br_nf_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + bool promisc = BR_INPUT_SKB_CB(skb)->promisc; struct nf_conntrack *nfct = skb_nfct(skb); const struct nf_ct_hook *ct_hook; struct nf_conn *ct; int ret; + if (promisc) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 86ea5e6689b5..d4bedc87b1d8 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -589,6 +589,7 @@ struct br_input_skb_cb { #endif u8 proxyarp_replied:1; u8 src_port_isolated:1; + u8 promisc:1; #ifdef CONFIG_BRIDGE_VLAN_FILTERING u8 vlan_filtered:1; #endif diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 6f877e31709b..c3c51b9a6826 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -294,18 +294,24 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - enum ip_conntrack_info ctinfo; + bool promisc = BR_INPUT_SKB_CB(skb)->promisc; + struct nf_conntrack *nfct = skb_nfct(skb); struct nf_conn *ct; - if (skb->pkt_type == PACKET_HOST) + if (promisc) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + + if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; /* nf_conntrack_confirm() cannot handle concurrent clones, * this happens for broad/multicast frames with e.g. macvlan on top * of the bridge device. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) + ct = container_of(nfct, struct nf_conn, ct_general); + if (nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) return NF_ACCEPT; /* let inet prerouting call conntrack again */ -- cgit v1.2.3 From 29b359cf6d95fd60730533f7f10464e95bd17c73 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 10 Apr 2024 18:50:45 +0200 Subject: netfilter: nft_set_pipapo: walk over current view on netlink dump The generation mask can be updated while netlink dump is in progress. The pipapo set backend walk iterator cannot rely on it to infer what view of the datastructure is to be used. Add notation to specify if user wants to read/update the set. Based on patch from Florian Westphal. Fixes: 2b84e215f874 ("netfilter: nft_set_pipapo: .walk does not deal with generations") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 14 ++++++++++++++ net/netfilter/nf_tables_api.c | 6 ++++++ net/netfilter/nft_set_pipapo.c | 5 +++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e27c28b612e4..3f1ed467f951 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -307,9 +307,23 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) return (void *)priv; } + +/** + * enum nft_iter_type - nftables set iterator type + * + * @NFT_ITER_READ: read-only iteration over set elements + * @NFT_ITER_UPDATE: iteration under mutex to update set element state + */ +enum nft_iter_type { + NFT_ITER_UNSPEC, + NFT_ITER_READ, + NFT_ITER_UPDATE, +}; + struct nft_set; struct nft_set_iter { u8 genmask; + enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f11d0c0a2c73..a7a34db62ea9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -626,6 +626,7 @@ static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_deactivate, }; @@ -5445,6 +5446,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, } iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; @@ -5518,6 +5520,7 @@ static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_activate, }; @@ -5892,6 +5895,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.skb = skb; args.reset = dump_ctx->reset; args.iter.genmask = nft_genmask_cur(net); + args.iter.type = NFT_ITER_READ; args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; @@ -7376,6 +7380,7 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { struct nft_set_iter iter = { .genmask = genmask, + .type = NFT_ITER_UPDATE, .fn = nft_setelem_flush, }; @@ -10879,6 +10884,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, continue; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index df8de5090246..11e44e4dfb1f 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2115,13 +2115,14 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); - struct net *net = read_pnet(&set->net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; unsigned int i, r; + WARN_ON_ONCE(iter->type == NFT_ITER_UNSPEC); + rcu_read_lock(); - if (iter->genmask == nft_genmask_cur(net)) + if (iter->type == NFT_ITER_READ) m = rcu_dereference(priv->match); else m = priv->clone; -- cgit v1.2.3 From 3cfc9ec039af60dbd8965ae085b2c2ccdcfbe1cc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Apr 2024 21:05:13 +0200 Subject: netfilter: nft_set_pipapo: do not free live element Pablo reports a crash with large batches of elements with a back-to-back add/remove pattern. Quoting Pablo: add_elem("00000000") timeout 100 ms ... add_elem("0000000X") timeout 100 ms del_elem("0000000X") <---------------- delete one that was just added ... add_elem("00005000") timeout 100 ms 1) nft_pipapo_remove() removes element 0000000X Then, KASAN shows a splat. Looking at the remove function there is a chance that we will drop a rule that maps to a non-deactivated element. Removal happens in two steps, first we do a lookup for key k and return the to-be-removed element and mark it as inactive in the next generation. Then, in a second step, the element gets removed from the set/map. The _remove function does not work correctly if we have more than one element that share the same key. This can happen if we insert an element into a set when the set already holds an element with same key, but the element mapping to the existing key has timed out or is not active in the next generation. In such case its possible that removal will unmap the wrong element. If this happens, we will leak the non-deactivated element, it becomes unreachable. The element that got deactivated (and will be freed later) will remain reachable in the set data structure, this can result in a crash when such an element is retrieved during lookup (stale pointer). Add a check that the fully matching key does in fact map to the element that we have marked as inactive in the deactivation step. If not, we need to continue searching. Add a bug/warn trap at the end of the function as well, the remove function must not ever be called with an invisible/unreachable/non-existent element. v2: avoid uneeded temporary variable (Stefano) Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Reported-by: Pablo Neira Ayuso Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 11e44e4dfb1f..eeaf05ffba95 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2077,6 +2077,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + if (!pipapo_match_field(f, start, rules_fx, match_start, match_end)) break; @@ -2089,16 +2091,18 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); - } - if (i == m->field_count) { - priv->dirty = true; - pipapo_drop(m, rulemap); - return; + if (last && f->mt[rulemap[i].to].e == e) { + priv->dirty = true; + pipapo_drop(m, rulemap); + return; + } } first_rule += rules_f0; } + + WARN_ON_ONCE(1); /* elem_priv not found */ } /** -- cgit v1.2.3 From 87b3593bed1868b2d9fe096c01bcdf0ea86cbebf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Apr 2024 13:47:33 +0200 Subject: netfilter: flowtable: validate pppoe header Ensure there is sufficient room to access the protocol field of the PPPoe header. Validate it once before the flowtable lookup, then use a helper function to access protocol field. Reported-by: syzbot+b6f07e1c07ef40199081@syzkaller.appspotmail.com Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 12 +++++++++++- net/netfilter/nf_flow_table_inet.c | 3 ++- net/netfilter/nf_flow_table_ip.c | 8 +++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a763dd327c6e..9abb7ee40d72 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -336,7 +336,7 @@ int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); -static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) +static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb) { __be16 proto; @@ -352,6 +352,16 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) return 0; } +static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto) +{ + if (!pskb_may_pull(skb, PPPOE_SES_HLEN)) + return false; + + *inner_proto = __nf_flow_pppoe_proto(skb); + + return true; +} + #define NF_FLOW_TABLE_STAT_INC(net, count) __this_cpu_inc((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_DEC(net, count) __this_cpu_dec((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count) \ diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 9505f9d188ff..6eef15648b7b 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -21,7 +21,8 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, proto = veth->h_vlan_encapsulated_proto; break; case htons(ETH_P_PPP_SES): - proto = nf_flow_pppoe_proto(skb); + if (!nf_flow_pppoe_proto(skb, &proto)) + return NF_ACCEPT; break; default: proto = skb->protocol; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index e45fade76409..9e9e105052da 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -273,10 +273,11 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, +static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; + __be16 inner_proto; switch (skb->protocol) { case htons(ETH_P_8021Q): @@ -287,7 +288,8 @@ static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, } break; case htons(ETH_P_PPP_SES): - if (nf_flow_pppoe_proto(skb) == proto) { + if (nf_flow_pppoe_proto(skb, &inner_proto) && + inner_proto == proto) { *offset += PPPOE_SES_HLEN; return true; } @@ -316,7 +318,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb, skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): - skb->protocol = nf_flow_pppoe_proto(skb); + skb->protocol = __nf_flow_pppoe_proto(skb); skb_pull(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; -- cgit v1.2.3 From 6db5dc7b351b9569940cd1cf445e237c42cd6d27 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Apr 2024 00:09:00 +0200 Subject: netfilter: flowtable: incorrect pppoe tuple pppoe traffic reaching ingress path does not match the flowtable entry because the pppoe header is expected to be at the network header offset. This bug causes a mismatch in the flow table lookup, so pppoe packets enter the classical forwarding path. Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 9e9e105052da..5383bed3d3e0 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -157,7 +157,7 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, tuple->encap[i].proto = skb->protocol; break; case htons(ETH_P_PPP_SES): - phdr = (struct pppoe_hdr *)skb_mac_header(skb); + phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; break; -- cgit v1.2.3 From 1aa4ad4eb695bac1b0a7ba542a16d6833c9c8dd8 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 11 Apr 2024 08:58:45 +0300 Subject: serial: core: Fix missing shutdown and startup for serial base port We are seeing start_tx being called after port shutdown as noted by Jiri. This happens because we are missing the startup and shutdown related functions for the serial base port. Let's fix the issue by adding startup and shutdown functions for the serial base port to block tx flushing for the serial base port when the port is not in use. Fixes: 84a9582fd203 ("serial: core: Start managing serial controllers to enable runtime PM") Cc: stable Reported-by: Jiri Slaby Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20240411055848.38190-1-tony@atomide.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_base.h | 4 ++++ drivers/tty/serial/serial_core.c | 20 +++++++++++++++++--- drivers/tty/serial/serial_port.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/drivers/tty/serial/serial_base.h b/drivers/tty/serial/serial_base.h index c74c548f0db6..b6c38d2edfd4 100644 --- a/drivers/tty/serial/serial_base.h +++ b/drivers/tty/serial/serial_base.h @@ -22,6 +22,7 @@ struct serial_ctrl_device { struct serial_port_device { struct device dev; struct uart_port *port; + unsigned int tx_enabled:1; }; int serial_base_ctrl_init(void); @@ -30,6 +31,9 @@ void serial_base_ctrl_exit(void); int serial_base_port_init(void); void serial_base_port_exit(void); +void serial_base_port_startup(struct uart_port *port); +void serial_base_port_shutdown(struct uart_port *port); + int serial_base_driver_register(struct device_driver *driver); void serial_base_driver_unregister(struct device_driver *driver); diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 2247efe97250..c476d884356d 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -323,16 +323,26 @@ static int uart_startup(struct tty_struct *tty, struct uart_state *state, bool init_hw) { struct tty_port *port = &state->port; + struct uart_port *uport; int retval; if (tty_port_initialized(port)) - return 0; + goto out_base_port_startup; retval = uart_port_startup(tty, state, init_hw); - if (retval) + if (retval) { set_bit(TTY_IO_ERROR, &tty->flags); + return retval; + } - return retval; +out_base_port_startup: + uport = uart_port_check(state); + if (!uport) + return -EIO; + + serial_base_port_startup(uport); + + return 0; } /* @@ -355,6 +365,9 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state) if (tty) set_bit(TTY_IO_ERROR, &tty->flags); + if (uport) + serial_base_port_shutdown(uport); + if (tty_port_initialized(port)) { tty_port_set_initialized(port, false); @@ -1775,6 +1788,7 @@ static void uart_tty_port_shutdown(struct tty_port *port) uport->ops->stop_rx(uport); uart_port_unlock_irq(uport); + serial_base_port_shutdown(uport); uart_port_shutdown(port); /* diff --git a/drivers/tty/serial/serial_port.c b/drivers/tty/serial/serial_port.c index 22b9eeb23e68..7e3a1c7b097c 100644 --- a/drivers/tty/serial/serial_port.c +++ b/drivers/tty/serial/serial_port.c @@ -39,8 +39,12 @@ static int serial_port_runtime_resume(struct device *dev) /* Flush any pending TX for the port */ uart_port_lock_irqsave(port, &flags); + if (!port_dev->tx_enabled) + goto unlock; if (__serial_port_busy(port)) port->ops->start_tx(port); + +unlock: uart_port_unlock_irqrestore(port, flags); out: @@ -60,6 +64,11 @@ static int serial_port_runtime_suspend(struct device *dev) return 0; uart_port_lock_irqsave(port, &flags); + if (!port_dev->tx_enabled) { + uart_port_unlock_irqrestore(port, flags); + return 0; + } + busy = __serial_port_busy(port); if (busy) port->ops->start_tx(port); @@ -71,6 +80,31 @@ static int serial_port_runtime_suspend(struct device *dev) return busy ? -EBUSY : 0; } +static void serial_base_port_set_tx(struct uart_port *port, + struct serial_port_device *port_dev, + bool enabled) +{ + unsigned long flags; + + uart_port_lock_irqsave(port, &flags); + port_dev->tx_enabled = enabled; + uart_port_unlock_irqrestore(port, flags); +} + +void serial_base_port_startup(struct uart_port *port) +{ + struct serial_port_device *port_dev = port->port_dev; + + serial_base_port_set_tx(port, port_dev, true); +} + +void serial_base_port_shutdown(struct uart_port *port) +{ + struct serial_port_device *port_dev = port->port_dev; + + serial_base_port_set_tx(port, port_dev, false); +} + static DEFINE_RUNTIME_DEV_PM_OPS(serial_port_pm, serial_port_runtime_suspend, serial_port_runtime_resume, NULL); -- cgit v1.2.3 From 34b990e9bb54d20b9675ca9483be8668eed374d8 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Tue, 9 Apr 2024 13:29:10 -0300 Subject: usb: misc: onboard_usb_hub: Disable the USB hub clock on failure In case regulator_bulk_enable() fails, the previously enabled USB hub clock should be disabled. Fix it accordingly. Fixes: 65e62b8a955a ("usb: misc: onboard_usb_hub: Add support for clock input") Cc: stable Signed-off-by: Fabio Estevam Reviewed-by: Frieder Schrempf Acked-by: Matthias Kaehlcke Link: https://lore.kernel.org/r/20240409162910.2061640-1-festevam@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/onboard_usb_hub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/misc/onboard_usb_hub.c b/drivers/usb/misc/onboard_usb_hub.c index c6101ed2d9d4..d8049275a023 100644 --- a/drivers/usb/misc/onboard_usb_hub.c +++ b/drivers/usb/misc/onboard_usb_hub.c @@ -78,7 +78,7 @@ static int onboard_hub_power_on(struct onboard_hub *hub) err = regulator_bulk_enable(hub->pdata->num_supplies, hub->supplies); if (err) { dev_err(hub->dev, "failed to enable supplies: %pe\n", ERR_PTR(err)); - return err; + goto disable_clk; } fsleep(hub->pdata->reset_us); @@ -87,6 +87,10 @@ static int onboard_hub_power_on(struct onboard_hub *hub) hub->is_powered_on = true; return 0; + +disable_clk: + clk_disable_unprepare(hub->clk); + return err; } static int onboard_hub_power_off(struct onboard_hub *hub) -- cgit v1.2.3 From c8d2f34ea96ea3bce6ba2535f867f0d4ee3b22e1 Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Sat, 23 Mar 2024 17:48:43 +0100 Subject: speakup: Avoid crash on very long word In case a console is set up really large and contains a really long word (> 256 characters), we have to stop before the length of the word buffer. Signed-off-by: Samuel Thibault Fixes: c6e3fd22cd538 ("Staging: add speakup to the staging directory") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240323164843.1426997-1-samuel.thibault@ens-lyon.org Signed-off-by: Greg Kroah-Hartman --- drivers/accessibility/speakup/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accessibility/speakup/main.c b/drivers/accessibility/speakup/main.c index 1fbc9b921c4f..736c2eb8c0f3 100644 --- a/drivers/accessibility/speakup/main.c +++ b/drivers/accessibility/speakup/main.c @@ -574,7 +574,7 @@ static u_long get_word(struct vc_data *vc) } attr_ch = get_char(vc, (u_short *)tmp_pos, &spk_attr); buf[cnt++] = attr_ch; - while (tmpx < vc->vc_cols - 1) { + while (tmpx < vc->vc_cols - 1 && cnt < sizeof(buf) - 1) { tmp_pos += 2; tmpx++; ch = get_char(vc, (u_short *)tmp_pos, &temp); -- cgit v1.2.3 From 0dc04112bee6fdd6eb847ccb32214703022c0269 Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Tue, 12 Mar 2024 07:19:58 +0200 Subject: mei: me: disable RPL-S on SPS and IGN firmwares Extend the quirk to disable MEI interface on Intel PCH Ignition (IGN) and SPS firmwares for RPL-S devices. These firmwares do not support the MEI protocol. Fixes: 3ed8c7d39cfe ("mei: me: add raptor lake point S DID") Cc: stable@vger.kernel.org Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Link: https://lore.kernel.org/r/20240312051958.118478-1-tomas.winkler@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/pci-me.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index b5757993c9b2..c39718042e2e 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -116,7 +116,7 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_ADP_P, MEI_ME_PCH15_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_ADP_N, MEI_ME_PCH15_CFG)}, - {MEI_PCI_DEVICE(MEI_DEV_ID_RPL_S, MEI_ME_PCH15_CFG)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_RPL_S, MEI_ME_PCH15_SPS_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_MTL_M, MEI_ME_PCH15_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_ARL_S, MEI_ME_PCH15_CFG)}, -- cgit v1.2.3 From 26ac2df47d4c58f17210b7a59037e40f7eca693e Mon Sep 17 00:00:00 2001 From: Ricky Wu Date: Thu, 14 Mar 2024 14:51:13 +0800 Subject: misc: rtsx: Fix rts5264 driver status incorrect when card removed rts5264 driver not clean express link error and set EXTRA_CAPS_SD_EXPRESS capability back when card removed Fixes: 6a511c9b3a0d ("misc: rtsx: add to support new card reader rts5264") Cc: stable Signed-off-by: Ricky Wu Link: https://lore.kernel.org/r/20240314065113.5962-1-ricky_wu@realtek.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cardreader/rtsx_pcr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/cardreader/rtsx_pcr.c b/drivers/misc/cardreader/rtsx_pcr.c index 1a64364700eb..0ad2ff9065aa 100644 --- a/drivers/misc/cardreader/rtsx_pcr.c +++ b/drivers/misc/cardreader/rtsx_pcr.c @@ -1002,7 +1002,7 @@ static irqreturn_t rtsx_pci_isr(int irq, void *dev_id) } else { pcr->card_removed |= SD_EXIST; pcr->card_inserted &= ~SD_EXIST; - if (PCI_PID(pcr) == PID_5261) { + if ((PCI_PID(pcr) == PID_5261) || (PCI_PID(pcr) == PID_5264)) { rtsx_pci_write_register(pcr, RTS5261_FW_STATUS, RTS5261_EXPRESS_LINK_FAIL_MASK, 0); pcr->extra_caps |= EXTRA_CAPS_SD_EXPRESS; -- cgit v1.2.3 From e3dc66d998d2b0c2734db9ca1d6c94c97349529a Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 3 Apr 2024 13:13:40 +0800 Subject: Revert "mei: vsc: Call wake_up() in the threaded IRQ handler" This reverts commit 058a38acba15fd8e7b262ec6e17c4204cb15f984. It's not necessary to avoid a spinlock, a sleeping lock on PREEMPT_RT, in an interrupt handler as the interrupt handler itself would be called in a process context if PREEMPT_RT is enabled. So revert the patch. Cc: stable@vger.kernel.org # for 6.8 Signed-off-by: Sakari Ailus Acked-by: Tomas Winkler Link: https://lore.kernel.org/r/20240403051341.3534650-1-wentong.wu@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/vsc-tp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/mei/vsc-tp.c b/drivers/misc/mei/vsc-tp.c index ecfb70cd057c..968a92a7425d 100644 --- a/drivers/misc/mei/vsc-tp.c +++ b/drivers/misc/mei/vsc-tp.c @@ -419,6 +419,8 @@ static irqreturn_t vsc_tp_isr(int irq, void *data) atomic_inc(&tp->assert_cnt); + wake_up(&tp->xfer_wait); + return IRQ_WAKE_THREAD; } @@ -426,8 +428,6 @@ static irqreturn_t vsc_tp_thread_isr(int irq, void *data) { struct vsc_tp *tp = data; - wake_up(&tp->xfer_wait); - if (tp->event_notify) tp->event_notify(tp->event_notify_context); -- cgit v1.2.3 From f6085a96c97387154be7eaebd1a5420eb3cd55dc Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 3 Apr 2024 13:13:41 +0800 Subject: mei: vsc: Unregister interrupt handler for system suspend Unregister the MEI VSC interrupt handler before system suspend and re-register it at system resume time. This mirrors implementation of other MEI devices. This patch fixes the bug that causes continuous stream of MEI VSC errors after system resume. Fixes: 386a766c4169 ("mei: Add MEI hardware support for IVSC device") Cc: stable@vger.kernel.org # for 6.8 Reported-by: Dominik Brodowski Signed-off-by: Wentong Wu Signed-off-by: Sakari Ailus Acked-by: Tomas Winkler Link: https://lore.kernel.org/r/20240403051341.3534650-2-wentong.wu@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/platform-vsc.c | 17 ++++++++- drivers/misc/mei/vsc-tp.c | 84 +++++++++++++++++++++++++++++------------ drivers/misc/mei/vsc-tp.h | 3 ++ 3 files changed, 78 insertions(+), 26 deletions(-) diff --git a/drivers/misc/mei/platform-vsc.c b/drivers/misc/mei/platform-vsc.c index 6c9f00bcb94b..b543e6b9f3cf 100644 --- a/drivers/misc/mei/platform-vsc.c +++ b/drivers/misc/mei/platform-vsc.c @@ -400,25 +400,40 @@ static void mei_vsc_remove(struct platform_device *pdev) static int mei_vsc_suspend(struct device *dev) { struct mei_device *mei_dev = dev_get_drvdata(dev); + struct mei_vsc_hw *hw = mei_dev_to_vsc_hw(mei_dev); mei_stop(mei_dev); + mei_disable_interrupts(mei_dev); + + vsc_tp_free_irq(hw->tp); + return 0; } static int mei_vsc_resume(struct device *dev) { struct mei_device *mei_dev = dev_get_drvdata(dev); + struct mei_vsc_hw *hw = mei_dev_to_vsc_hw(mei_dev); int ret; - ret = mei_restart(mei_dev); + ret = vsc_tp_request_irq(hw->tp); if (ret) return ret; + ret = mei_restart(mei_dev); + if (ret) + goto err_free; + /* start timer if stopped in suspend */ schedule_delayed_work(&mei_dev->timer_work, HZ); return 0; + +err_free: + vsc_tp_free_irq(hw->tp); + + return ret; } static DEFINE_SIMPLE_DEV_PM_OPS(mei_vsc_pm_ops, mei_vsc_suspend, mei_vsc_resume); diff --git a/drivers/misc/mei/vsc-tp.c b/drivers/misc/mei/vsc-tp.c index 968a92a7425d..e6a98dba8a73 100644 --- a/drivers/misc/mei/vsc-tp.c +++ b/drivers/misc/mei/vsc-tp.c @@ -94,6 +94,27 @@ static const struct acpi_gpio_mapping vsc_tp_acpi_gpios[] = { {} }; +static irqreturn_t vsc_tp_isr(int irq, void *data) +{ + struct vsc_tp *tp = data; + + atomic_inc(&tp->assert_cnt); + + wake_up(&tp->xfer_wait); + + return IRQ_WAKE_THREAD; +} + +static irqreturn_t vsc_tp_thread_isr(int irq, void *data) +{ + struct vsc_tp *tp = data; + + if (tp->event_notify) + tp->event_notify(tp->event_notify_context); + + return IRQ_HANDLED; +} + /* wakeup firmware and wait for response */ static int vsc_tp_wakeup_request(struct vsc_tp *tp) { @@ -383,6 +404,37 @@ int vsc_tp_register_event_cb(struct vsc_tp *tp, vsc_tp_event_cb_t event_cb, } EXPORT_SYMBOL_NS_GPL(vsc_tp_register_event_cb, VSC_TP); +/** + * vsc_tp_request_irq - request irq for vsc_tp device + * @tp: vsc_tp device handle + */ +int vsc_tp_request_irq(struct vsc_tp *tp) +{ + struct spi_device *spi = tp->spi; + struct device *dev = &spi->dev; + int ret; + + irq_set_status_flags(spi->irq, IRQ_DISABLE_UNLAZY); + ret = request_threaded_irq(spi->irq, vsc_tp_isr, vsc_tp_thread_isr, + IRQF_TRIGGER_FALLING | IRQF_ONESHOT, + dev_name(dev), tp); + if (ret) + return ret; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(vsc_tp_request_irq, VSC_TP); + +/** + * vsc_tp_free_irq - free irq for vsc_tp device + * @tp: vsc_tp device handle + */ +void vsc_tp_free_irq(struct vsc_tp *tp) +{ + free_irq(tp->spi->irq, tp); +} +EXPORT_SYMBOL_NS_GPL(vsc_tp_free_irq, VSC_TP); + /** * vsc_tp_intr_synchronize - synchronize vsc_tp interrupt * @tp: vsc_tp device handle @@ -413,27 +465,6 @@ void vsc_tp_intr_disable(struct vsc_tp *tp) } EXPORT_SYMBOL_NS_GPL(vsc_tp_intr_disable, VSC_TP); -static irqreturn_t vsc_tp_isr(int irq, void *data) -{ - struct vsc_tp *tp = data; - - atomic_inc(&tp->assert_cnt); - - wake_up(&tp->xfer_wait); - - return IRQ_WAKE_THREAD; -} - -static irqreturn_t vsc_tp_thread_isr(int irq, void *data) -{ - struct vsc_tp *tp = data; - - if (tp->event_notify) - tp->event_notify(tp->event_notify_context); - - return IRQ_HANDLED; -} - static int vsc_tp_match_any(struct acpi_device *adev, void *data) { struct acpi_device **__adev = data; @@ -490,10 +521,9 @@ static int vsc_tp_probe(struct spi_device *spi) tp->spi = spi; irq_set_status_flags(spi->irq, IRQ_DISABLE_UNLAZY); - ret = devm_request_threaded_irq(dev, spi->irq, vsc_tp_isr, - vsc_tp_thread_isr, - IRQF_TRIGGER_FALLING | IRQF_ONESHOT, - dev_name(dev), tp); + ret = request_threaded_irq(spi->irq, vsc_tp_isr, vsc_tp_thread_isr, + IRQF_TRIGGER_FALLING | IRQF_ONESHOT, + dev_name(dev), tp); if (ret) return ret; @@ -522,6 +552,8 @@ static int vsc_tp_probe(struct spi_device *spi) err_destroy_lock: mutex_destroy(&tp->mutex); + free_irq(spi->irq, tp); + return ret; } @@ -532,6 +564,8 @@ static void vsc_tp_remove(struct spi_device *spi) platform_device_unregister(tp->pdev); mutex_destroy(&tp->mutex); + + free_irq(spi->irq, tp); } static const struct acpi_device_id vsc_tp_acpi_ids[] = { diff --git a/drivers/misc/mei/vsc-tp.h b/drivers/misc/mei/vsc-tp.h index f9513ddc3e40..14ca195cbddc 100644 --- a/drivers/misc/mei/vsc-tp.h +++ b/drivers/misc/mei/vsc-tp.h @@ -37,6 +37,9 @@ int vsc_tp_xfer(struct vsc_tp *tp, u8 cmd, const void *obuf, size_t olen, int vsc_tp_register_event_cb(struct vsc_tp *tp, vsc_tp_event_cb_t event_cb, void *context); +int vsc_tp_request_irq(struct vsc_tp *tp); +void vsc_tp_free_irq(struct vsc_tp *tp); + void vsc_tp_intr_enable(struct vsc_tp *tp); void vsc_tp_intr_disable(struct vsc_tp *tp); void vsc_tp_intr_synchronize(struct vsc_tp *tp); -- cgit v1.2.3 From d1718530e3f640b7d5f0050e725216eab57a85d8 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Mon, 8 Apr 2024 10:16:33 -0700 Subject: comedi: vmk80xx: fix incomplete endpoint checking While vmk80xx does have endpoint checking implemented, some things can fall through the cracks. Depending on the hardware model, URBs can have either bulk or interrupt type, and current version of vmk80xx_find_usb_endpoints() function does not take that fully into account. While this warning does not seem to be too harmful, at the very least it will crash systems with 'panic_on_warn' set on them. Fix the issue found by Syzkaller [1] by somewhat simplifying the endpoint checking process with usb_find_common_endpoints() and ensuring that only expected endpoint types are present. This patch has not been tested on real hardware. [1] Syzkaller report: usb 1-1: BOGUS urb xfer, pipe 1 != type 3 WARNING: CPU: 0 PID: 781 at drivers/usb/core/urb.c:504 usb_submit_urb+0xc4e/0x18c0 drivers/usb/core/urb.c:503 ... Call Trace: usb_start_wait_urb+0x113/0x520 drivers/usb/core/message.c:59 vmk80xx_reset_device drivers/comedi/drivers/vmk80xx.c:227 [inline] vmk80xx_auto_attach+0xa1c/0x1a40 drivers/comedi/drivers/vmk80xx.c:818 comedi_auto_config+0x238/0x380 drivers/comedi/drivers.c:1067 usb_probe_interface+0x5cd/0xb00 drivers/usb/core/driver.c:399 ... Similar issue also found by Syzkaller: Link: https://syzkaller.appspot.com/bug?extid=5205eb2f17de3e01946e Reported-and-tested-by: syzbot+5f29dc6a889fc42bd896@syzkaller.appspotmail.com Cc: stable Fixes: 49253d542cc0 ("staging: comedi: vmk80xx: factor out usb endpoint detection") Reviewed-by: Ian Abbott Signed-off-by: Nikita Zhandarovich Link: https://lore.kernel.org/r/20240408171633.31649-1-n.zhandarovich@fintech.ru Signed-off-by: Greg Kroah-Hartman --- drivers/comedi/drivers/vmk80xx.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/drivers/comedi/drivers/vmk80xx.c b/drivers/comedi/drivers/vmk80xx.c index 4536ed43f65b..84dce5184a77 100644 --- a/drivers/comedi/drivers/vmk80xx.c +++ b/drivers/comedi/drivers/vmk80xx.c @@ -641,33 +641,22 @@ static int vmk80xx_find_usb_endpoints(struct comedi_device *dev) struct vmk80xx_private *devpriv = dev->private; struct usb_interface *intf = comedi_to_usb_interface(dev); struct usb_host_interface *iface_desc = intf->cur_altsetting; - struct usb_endpoint_descriptor *ep_desc; - int i; - - if (iface_desc->desc.bNumEndpoints != 2) - return -ENODEV; - - for (i = 0; i < iface_desc->desc.bNumEndpoints; i++) { - ep_desc = &iface_desc->endpoint[i].desc; - - if (usb_endpoint_is_int_in(ep_desc) || - usb_endpoint_is_bulk_in(ep_desc)) { - if (!devpriv->ep_rx) - devpriv->ep_rx = ep_desc; - continue; - } + struct usb_endpoint_descriptor *ep_rx_desc, *ep_tx_desc; + int ret; - if (usb_endpoint_is_int_out(ep_desc) || - usb_endpoint_is_bulk_out(ep_desc)) { - if (!devpriv->ep_tx) - devpriv->ep_tx = ep_desc; - continue; - } - } + if (devpriv->model == VMK8061_MODEL) + ret = usb_find_common_endpoints(iface_desc, &ep_rx_desc, + &ep_tx_desc, NULL, NULL); + else + ret = usb_find_common_endpoints(iface_desc, NULL, NULL, + &ep_rx_desc, &ep_tx_desc); - if (!devpriv->ep_rx || !devpriv->ep_tx) + if (ret) return -ENODEV; + devpriv->ep_rx = ep_rx_desc; + devpriv->ep_tx = ep_tx_desc; + if (!usb_endpoint_maxp(devpriv->ep_rx) || !usb_endpoint_maxp(devpriv->ep_tx)) return -EINVAL; -- cgit v1.2.3 From a90bca2228c0646fc29a72689d308e5fe03e6d78 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 13 Mar 2024 17:43:41 -0400 Subject: fs: sysfs: Fix reference leak in sysfs_break_active_protection() The sysfs_break_active_protection() routine has an obvious reference leak in its error path. If the call to kernfs_find_and_get() fails then kn will be NULL, so the companion sysfs_unbreak_active_protection() routine won't get called (and would only cause an access violation by trying to dereference kn->parent if it was called). As a result, the reference to kobj acquired at the start of the function will never be released. Fix the leak by adding an explicit kobject_put() call when kn is NULL. Signed-off-by: Alan Stern Fixes: 2afc9166f79b ("scsi: sysfs: Introduce sysfs_{un,}break_active_protection()") Cc: Bart Van Assche Cc: stable@vger.kernel.org Reviewed-by: Bart Van Assche Acked-by: Tejun Heo Link: https://lore.kernel.org/r/8a4d3f0f-c5e3-4b70-a188-0ca433f9e6f9@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 6b7652fb8050..7cd64021d453 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -463,6 +463,8 @@ struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, kn = kernfs_find_and_get(kobj->sd, attr->name); if (kn) kernfs_break_active_protection(kn); + else + kobject_put(kobj); return kn; } EXPORT_SYMBOL_GPL(sysfs_break_active_protection); -- cgit v1.2.3 From aaef73821a3b0194a01bd23ca77774f704a04d40 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Sat, 30 Mar 2024 19:01:14 +0000 Subject: binder: check offset alignment in binder_get_object() Commit 6d98eb95b450 ("binder: avoid potential data leakage when copying txn") introduced changes to how binder objects are copied. In doing so, it unintentionally removed an offset alignment check done through calls to binder_alloc_copy_from_buffer() -> check_buffer(). These calls were replaced in binder_get_object() with copy_from_user(), so now an explicit offset alignment check is needed here. This avoids later complications when unwinding the objects gets harder. It is worth noting this check existed prior to commit 7a67a39320df ("binder: add function to copy binder object from buffer"), likely removed due to redundancy at the time. Fixes: 6d98eb95b450 ("binder: avoid potential data leakage when copying txn") Cc: stable@vger.kernel.org Signed-off-by: Carlos Llamas Acked-by: Todd Kjos Link: https://lore.kernel.org/r/20240330190115.1877819-1-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index bad28cf42010..dd6923d37931 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1708,8 +1708,10 @@ static size_t binder_get_object(struct binder_proc *proc, size_t object_size = 0; read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset); - if (offset > buffer->data_size || read_size < sizeof(*hdr)) + if (offset > buffer->data_size || read_size < sizeof(*hdr) || + !IS_ALIGNED(offset, sizeof(u32))) return 0; + if (u) { if (copy_from_user(object, u + offset, read_size)) return 0; -- cgit v1.2.3 From f488138b526715c6d2568d7329c4477911be4210 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 11 Apr 2024 11:45:57 +0200 Subject: NFSD: fix endianness issue in nfsd4_encode_fattr4 The nfs4 mount fails with EIO on 64-bit big endian architectures since v6.7. The issue arises from employing a union in the nfsd4_encode_fattr4() function to overlay a 32-bit array with a 64-bit values based bitmap, which does not function as intended. Address the endianness issue by utilizing bitmap_from_arr32() to copy 32-bit attribute masks into a bitmap in an endianness-agnostic manner. Cc: stable@vger.kernel.org Fixes: fce7913b13d0 ("NFSD: Use a bitmask loop to encode FATTR4 results") Link: https://bugs.launchpad.net/ubuntu/+source/nfs-utils/+bug/2060217 Signed-off-by: Vasily Gorbik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index fac938f563ad..1955481832e0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3490,11 +3490,13 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct dentry *dentry, const u32 *bmval, int ignore_crossmnt) { + DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); struct nfsd4_fattr_args args; struct svc_fh *tempfh = NULL; int starting_len = xdr->buf->len; __be32 *attrlen_p, status; int attrlen_offset; + u32 attrmask[3]; int err; struct nfsd4_compoundres *resp = rqstp->rq_resp; u32 minorversion = resp->cstate.minorversion; @@ -3502,10 +3504,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, .mnt = exp->ex_path.mnt, .dentry = dentry, }; - union { - u32 attrmask[3]; - unsigned long mask[2]; - } u; unsigned long bit; bool file_modified = false; u64 size = 0; @@ -3521,20 +3519,19 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, /* * Make a local copy of the attribute bitmap that can be modified. */ - memset(&u, 0, sizeof(u)); - u.attrmask[0] = bmval[0]; - u.attrmask[1] = bmval[1]; - u.attrmask[2] = bmval[2]; + attrmask[0] = bmval[0]; + attrmask[1] = bmval[1]; + attrmask[2] = bmval[2]; args.rdattr_err = 0; if (exp->ex_fslocs.migrated) { - status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1], - &u.attrmask[2], &args.rdattr_err); + status = fattr_handle_absent_fs(&attrmask[0], &attrmask[1], + &attrmask[2], &args.rdattr_err); if (status) goto out; } args.size = 0; - if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { + if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), &file_modified, &size); if (status) @@ -3553,16 +3550,16 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (!(args.stat.result_mask & STATX_BTIME)) /* underlying FS does not offer btime so we can't share it */ - u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE; - if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE; + if ((attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || - (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | + (attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { err = vfs_statfs(&path, &args.statfs); if (err) goto out_nfserr; } - if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && + if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); status = nfserr_jukebox; @@ -3577,10 +3574,10 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.fhp = fhp; args.acl = NULL; - if (u.attrmask[0] & FATTR4_WORD0_ACL) { + if (attrmask[0] & FATTR4_WORD0_ACL) { err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl); if (err == -EOPNOTSUPP) - u.attrmask[0] &= ~FATTR4_WORD0_ACL; + attrmask[0] &= ~FATTR4_WORD0_ACL; else if (err == -EINVAL) { status = nfserr_attrnotsupp; goto out; @@ -3592,17 +3589,17 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL args.context = NULL; - if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || - u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { + if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || + attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) err = security_inode_getsecctx(d_inode(dentry), &args.context, &args.contextlen); else err = -EOPNOTSUPP; args.contextsupport = (err == 0); - if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) { + if (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) { if (err == -EOPNOTSUPP) - u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; else if (err) goto out_nfserr; } @@ -3610,8 +3607,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ /* attrmask */ - status = nfsd4_encode_bitmap4(xdr, u.attrmask[0], - u.attrmask[1], u.attrmask[2]); + status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1], + attrmask[2]); if (status) goto out; @@ -3620,7 +3617,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, attrlen_p = xdr_reserve_space(xdr, XDR_UNIT); if (!attrlen_p) goto out_resource; - for_each_set_bit(bit, (const unsigned long *)&u.mask, + bitmap_from_arr32(attr_bitmap, attrmask, + ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); + for_each_set_bit(bit, attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) { status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args); if (status != nfs_ok) -- cgit v1.2.3 From 156539fd65019e8ed6b9fbac0583cf519cdbb227 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 22 Mar 2024 21:38:40 +1100 Subject: Documentation: embargoed-hardware-issues.rst: Add myself for Power Unfortunately Anton has left IBM. Add myself as the contact for Power, until someone else volunteers. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20240322103840.668746-1-mpe@ellerman.id.au Signed-off-by: Greg Kroah-Hartman --- Documentation/process/embargoed-hardware-issues.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index bb2100228cc7..6e9a4597bf2c 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -252,7 +252,7 @@ an involved disclosed party. The current ambassadors list: AMD Tom Lendacky Ampere Darren Hart ARM Catalin Marinas - IBM Power Anton Blanchard + IBM Power Michael Ellerman IBM Z Christian Borntraeger Intel Tony Luck Qualcomm Trilok Soni -- cgit v1.2.3 From 50a9b7fc151e67b9e642232d32e8c5a5ac13e64a Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 5 Apr 2024 13:07:11 -0700 Subject: drm/xe/display: Fix double mutex initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All of these mutexes are already initialized by the display side since commit 3fef3e6ff86a ("drm/i915: move display mutex inits to display code"), so the xe shouldn´t initialize them. Fixes: 44e694958b95 ("drm/xe/display: Implement display support") Cc: Jani Nikula Cc: Arun R Murthy Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20240405200711.2041428-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 117de185edf2c5767f03575219bf7a43b161ff0d) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/display/xe_display.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index e4db069f0db3..6ec375c1c4b6 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -108,11 +108,6 @@ int xe_display_create(struct xe_device *xe) xe->display.hotplug.dp_wq = alloc_ordered_workqueue("xe-dp", 0); drmm_mutex_init(&xe->drm, &xe->sb_lock); - drmm_mutex_init(&xe->drm, &xe->display.backlight.lock); - drmm_mutex_init(&xe->drm, &xe->display.audio.mutex); - drmm_mutex_init(&xe->drm, &xe->display.wm.wm_mutex); - drmm_mutex_init(&xe->drm, &xe->display.pps.mutex); - drmm_mutex_init(&xe->drm, &xe->display.hdcp.hdcp_mutex); xe->enabled_irq_mask = ~0; err = drmm_add_action_or_reset(&xe->drm, display_destroy, NULL); -- cgit v1.2.3 From a8ad8715472bb8f6a2ea8b4072a28151eb9f4f24 Mon Sep 17 00:00:00 2001 From: Karthik Poosa Date: Fri, 5 Apr 2024 18:31:27 +0530 Subject: drm/xe/hwmon: Cast result to output precision on left shift of operand Address potential overflow in result of left shift of a lower precision (u32) operand before assignment to higher precision (u64) variable. v2: - Update commit message. (Himal) Fixes: 4446fcf220ce ("drm/xe/hwmon: Expose power1_max_interval") Signed-off-by: Karthik Poosa Reviewed-by: Anshuman Gupta Cc: Badal Nilawar Link: https://patchwork.freedesktop.org/patch/msgid/20240405130127.1392426-5-karthik.poosa@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 883232b47b81108b0252197c747f396ecd51455a) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_hwmon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index b82233a41606..9ac7fbe201b3 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -290,7 +290,7 @@ xe_hwmon_power1_max_interval_show(struct device *dev, struct device_attribute *a * As y can be < 2, we compute tau4 = (4 | x) << y * and then add 2 when doing the final right shift to account for units */ - tau4 = ((1 << x_w) | x) << y; + tau4 = (u64)((1 << x_w) | x) << y; /* val in hwmon interface units (millisec) */ out = mul_u64_u32_shr(tau4, SF_TIME, hwmon->scl_shift_time + x_w); @@ -330,7 +330,7 @@ xe_hwmon_power1_max_interval_store(struct device *dev, struct device_attribute * r = FIELD_PREP(PKG_MAX_WIN, PKG_MAX_WIN_DEFAULT); x = REG_FIELD_GET(PKG_MAX_WIN_X, r); y = REG_FIELD_GET(PKG_MAX_WIN_Y, r); - tau4 = ((1 << x_w) | x) << y; + tau4 = (u64)((1 << x_w) | x) << y; max_win = mul_u64_u32_shr(tau4, SF_TIME, hwmon->scl_shift_time + x_w); if (val > max_win) -- cgit v1.2.3 From 9cb46b31f3d08ed3fce86349e8c12f96d7c88717 Mon Sep 17 00:00:00 2001 From: Himal Prasad Ghimiray Date: Mon, 1 Apr 2024 23:23:00 +0530 Subject: drm/xe/xe_migrate: Cast to output precision before multiplying operands Addressing potential overflow in result of multiplication of two lower precision (u32) operands before widening it to higher precision (u64). -v2 Fix commit message and description. (Rodrigo) Cc: Rodrigo Vivi Signed-off-by: Himal Prasad Ghimiray Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240401175300.3823653-1-himal.prasad.ghimiray@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 34820967ae7b45411f8f4f737c2d63b0c608e0d7) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_migrate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index ee1bb938c493..2ba4fb9511f6 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -227,7 +227,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, if (vm->flags & XE_VM_FLAG_64K && level == 1) flags = XE_PDE_64K; - entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (level - 1) * + entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (u64)(level - 1) * XE_PAGE_SIZE, pat_index); xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64, entry | flags); @@ -235,7 +235,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, /* Write PDE's that point to our BO. */ for (i = 0; i < num_entries - num_level; i++) { - entry = vm->pt_ops->pde_encode_bo(bo, i * XE_PAGE_SIZE, + entry = vm->pt_ops->pde_encode_bo(bo, (u64)i * XE_PAGE_SIZE, pat_index); xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE + @@ -291,7 +291,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, #define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE) #define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64)) drm_suballoc_manager_init(&m->vm_update_sa, - (map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) * + (size_t)(map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) * NUM_VMUSA_UNIT_PER_PAGE, 0); m->pt_bo = bo; @@ -490,7 +490,7 @@ static void emit_pte(struct xe_migrate *m, struct xe_vm *vm = m->q->vm; u16 pat_index; u32 ptes; - u64 ofs = at_pt * XE_PAGE_SIZE; + u64 ofs = (u64)at_pt * XE_PAGE_SIZE; u64 cur_ofs; /* Indirect access needs compression enabled uncached PAT index */ -- cgit v1.2.3 From f76646c83f028c62853c23dac49204232e903597 Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Thu, 4 Apr 2024 09:12:56 -0700 Subject: drm/xe: Label RING_CONTEXT_CONTROL as masked RING_CONTEXT_CONTROL is a masked register. v2: Also clean up setting register value (Lucas) Reviewed-by: Matt Roper Reviewed-by: Lucas De Marchi Signed-off-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20240404161256.3852502-1-ashutosh.dixit@intel.com (cherry picked from commit dc30c6e7149baaae4288c742de95212b31f07438) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/regs/xe_engine_regs.h | 2 +- drivers/gpu/drm/xe/xe_lrc.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index 0b1266c88a6a..deddc8be48c0 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -125,7 +125,7 @@ #define RING_EXECLIST_STATUS_LO(base) XE_REG((base) + 0x234) #define RING_EXECLIST_STATUS_HI(base) XE_REG((base) + 0x234 + 4) -#define RING_CONTEXT_CONTROL(base) XE_REG((base) + 0x244) +#define RING_CONTEXT_CONTROL(base) XE_REG((base) + 0x244, XE_REG_OPTION_MASKED) #define CTX_CTRL_INHIBIT_SYN_CTX_SWITCH REG_BIT(3) #define CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT REG_BIT(0) diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index 1426febe86eb..57066faf575e 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -525,9 +525,8 @@ static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class) static void set_context_control(u32 *regs, struct xe_hw_engine *hwe) { - regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH) | - _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) | - CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; + regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH | + CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); /* TODO: Timestamp */ } -- cgit v1.2.3 From ebaed6d4def877d2035786ff318379eb750044c8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Mar 2024 11:29:10 -0700 Subject: peci: linux/peci.h: fix Excess kernel-doc description warning Remove the @controller: line to prevent the kernel-doc warning: include/linux/peci.h:84: warning: Excess struct member 'controller' description in 'peci_device' Signed-off-by: Randy Dunlap Cc: Iwona Winiarska Cc: openbmc@lists.ozlabs.org Reviewed-by: Iwona Winiarska Fixes: 6523d3b2ffa2 ("peci: Add core infrastructure") Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240329182910.29495-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/peci.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/peci.h b/include/linux/peci.h index 9b3d36aff431..90e241458ef6 100644 --- a/include/linux/peci.h +++ b/include/linux/peci.h @@ -58,7 +58,6 @@ static inline struct peci_controller *to_peci_controller(void *d) /** * struct peci_device - PECI device * @dev: device object to register PECI device to the device model - * @controller: manages the bus segment hosting this PECI device * @info: PECI device characteristics * @info.family: device family * @info.model: device model -- cgit v1.2.3 From fd706c9b1674e2858766bfbf7430534c2b26fbef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Apr 2024 16:55:54 -0700 Subject: KVM: x86: Snapshot if a vCPU's vendor model is AMD vs. Intel compatible Add kvm_vcpu_arch.is_amd_compatible to cache if a vCPU's vendor model is compatible with AMD, i.e. if the vCPU vendor is AMD or Hygon, along with helpers to check if a vCPU is compatible AMD vs. Intel. To handle Intel vs. AMD behavior related to masking the LVTPC entry, KVM will need to check for vendor compatibility on every PMI injection, i.e. querying for AMD will soon be a moderately hot path. Note! This subtly (or maybe not-so-subtly) makes "Intel compatible" KVM's default behavior, both if userspace omits (or never sets) CPUID 0x0 and if userspace sets a completely unknown vendor. One could argue that KVM should treat such vCPUs as not being compatible with Intel *or* AMD, but that would add useless complexity to KVM. KVM needs to do *something* in the face of vendor specific behavior, and so unless KVM conjured up a magic third option, choosing to treat unknown vendors as neither Intel nor AMD means that checks on AMD compatibility would yield Intel behavior, and checks for Intel compatibility would yield AMD behavior. And that's far worse as it would effectively yield random behavior depending on whether KVM checked for AMD vs. Intel vs. !AMD vs. !Intel. And practically speaking, all x86 CPUs follow either Intel or AMD architecture, i.e. "supporting" an unknown third architecture adds no value. Deliberately don't convert any of the existing guest_cpuid_is_intel() checks, as the Intel side of things is messier due to some flows explicitly checking for exactly vendor==Intel, versus some flows assuming anything that isn't "AMD compatible" gets Intel behavior. The Intel code will be cleaned up in the future. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20240405235603.1173076-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/cpuid.h | 10 ++++++++++ arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/x86.c | 2 +- 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 16e07a2eee19..6efd1497b026 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -855,6 +855,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; struct kvm_hypervisor_cpuid kvm_cpuid; + bool is_amd_compatible; /* * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bfc0bfcb2bc6..77352a4abd87 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -376,6 +376,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_update_pv_runtime(vcpu); + vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 856e3037e74f..23dbb9eb277c 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -120,6 +120,16 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); } +static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.is_amd_compatible; +} + +static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu) +{ + return !guest_cpuid_is_amd_compatible(vcpu); +} + static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 992e651540e8..bf4de6d7e39c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4935,7 +4935,7 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, context->cpu_role.base.level, is_efer_nx(context), guest_can_use(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), - guest_cpuid_is_amd_or_hygon(vcpu)); + guest_cpuid_is_amd_compatible(vcpu)); } static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 47d9f03b7778..ebcc12d1e1de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3470,7 +3470,7 @@ static bool is_mci_status_msr(u32 msr) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd_or_hygon(vcpu)) + if (guest_cpuid_is_amd_compatible(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; -- cgit v1.2.3 From 49ff3b4aec51e3abfc9369997cc603319b02af9a Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 5 Apr 2024 16:55:55 -0700 Subject: KVM: x86/pmu: Do not mask LVTPC when handling a PMI on AMD platforms On AMD and Hygon platforms, the local APIC does not automatically set the mask bit of the LVTPC register when handling a PMI and there is no need to clear it in the kernel's PMI handler. For guests, the mask bit is currently set by kvm_apic_local_deliver() and unless it is cleared by the guest kernel's PMI handler, PMIs stop arriving and break use-cases like sampling with perf record. This does not affect non-PerfMonV2 guests because PMIs are handled in the guest kernel by x86_pmu_handle_irq() which always clears the LVTPC mask bit irrespective of the vendor. Before: $ perf record -e cycles:u true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.001 MB perf.data (1 samples) ] After: $ perf record -e cycles:u true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.002 MB perf.data (19 samples) ] Fixes: a16eb25b09c0 ("KVM: x86: Mask LVTPC when handling a PMI") Cc: stable@vger.kernel.org Signed-off-by: Sandipan Das Reviewed-by: Jim Mattson [sean: use is_intel_compatible instead of !is_amd_or_hygon()] Signed-off-by: Sean Christopherson Message-ID: <20240405235603.1173076-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cf37586f0466..ebf41023be38 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2776,7 +2776,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); - if (r && lvt_type == APIC_LVTPC) + if (r && lvt_type == APIC_LVTPC && + guest_cpuid_is_intel_compatible(apic->vcpu)) kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); return r; } -- cgit v1.2.3 From 2b8dbf69ec60faf6c7db49e57d7f316409ccec92 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 5 Apr 2024 14:17:57 -0700 Subject: perf annotate: Make sure to call symbol__annotate2() in TUI The symbol__annotate2() initializes some data structures needed by TUI. It has a logic to prevent calling it multiple times by checking if it has the annotated source. But data type profiling uses a different code (symbol__annotate) to allocate the annotated lines in advance. So TUI missed to call symbol__annotate2() when it shows the annotation browser. Make symbol__annotate() reentrant and handle that situation properly. This fixes a crash in the annotation browser started by perf report in TUI like below. $ perf report -s type,sym --tui # and press 'a' key and then move down Fixes: 81e57deec325 ("perf report: Support data type profiling") Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240405211800.1412920-2-namhyung@kernel.org --- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/util/annotate.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ec5e21932876..4790c735599b 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -970,7 +970,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, if (dso->annotate_warned) return -1; - if (not_annotated) { + if (not_annotated || !sym->annotate2) { err = symbol__annotate2(ms, evsel, &browser.arch); if (err) { char msg[BUFSIZ]; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ac002d907d81..50ca92255ff6 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2461,6 +2461,9 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, if (parch) *parch = arch; + if (!list_empty(¬es->src->source)) + return 0; + args.arch = arch; args.ms = *ms; if (annotate_opts.full_addr) -- cgit v1.2.3 From b372e96bd0a32729d55d27f613c8bc80708a82e1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 25 Mar 2024 09:21:20 +1100 Subject: ceph: redirty page before returning AOP_WRITEPAGE_ACTIVATE The page has been marked clean before writepage is called. If we don't redirty it before postponing the write, it might never get written. Cc: stable@vger.kernel.org Fixes: 503d4fa6ee28 ("ceph: remove reliance on bdi congestion") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1340d77124ae..ee9caf7916fb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -795,8 +795,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) ihold(inode); if (wbc->sync_mode == WB_SYNC_NONE && - ceph_inode_to_fs_client(inode)->write_congested) + ceph_inode_to_fs_client(inode)->write_congested) { + redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; + } wait_on_page_fscache(page); -- cgit v1.2.3 From f3408580bac8ce5cd76e7391e529c0a22e7c7eb2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 9 Apr 2024 15:55:42 -0700 Subject: perf lock contention: Add a missing NULL check I got a report for a failure in BPF verifier on a recent kernel with perf lock contention command. It checks task->sighand->siglock without checking if sighand is NULL or not. Let's add one. ; if (&curr->sighand->siglock == (void *)lock) 265: (79) r1 = *(u64 *)(r0 +2624) ; frame1: R0_w=trusted_ptr_task_struct(off=0,imm=0) ; R1_w=rcu_ptr_or_null_sighand_struct(off=0,imm=0) 266: (b7) r2 = 0 ; frame1: R2_w=0 267: (0f) r1 += r2 R1 pointer arithmetic on rcu_ptr_or_null_ prohibited, null-check it first processed 164 insns (limit 1000000) max_states_per_insn 1 total_states 15 peak_states 15 mark_read 5 -- END PROG LOAD LOG -- libbpf: prog 'contention_end': failed to load: -13 libbpf: failed to load object 'lock_contention_bpf' libbpf: failed to load BPF skeleton 'lock_contention_bpf': -13 Failed to load lock-contention BPF skeleton lock contention BPF setup failed lock contention did not detect any lock contention Fixes: 1811e82767dcc ("perf lock contention: Track and show siglock with address") Reviewed-by: Ian Rogers Acked-by: Arnaldo Carvalho de Melo Cc: Song Liu Cc: bpf@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240409225542.1870999-1-namhyung@kernel.org --- tools/perf/util/bpf_skel/lock_contention.bpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c index fb54bd38e7d0..d931a898c434 100644 --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -284,6 +284,7 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags) struct task_struct *curr; struct mm_struct___old *mm_old; struct mm_struct___new *mm_new; + struct sighand_struct *sighand; switch (flags) { case LCB_F_READ: /* rwsem */ @@ -305,7 +306,9 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags) break; case LCB_F_SPIN: /* spinlock */ curr = bpf_get_current_task_btf(); - if (&curr->sighand->siglock == (void *)lock) + sighand = curr->sighand; + + if (sighand && &sighand->siglock == (void *)lock) return LCD_F_SIGHAND_LOCK; break; default: -- cgit v1.2.3 From 3ef842a77e7cdf757fe3f1d2999aa2cc88eb53ba Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:12 -0700 Subject: tools/include: Sync uapi/drm/i915_drm.h with the kernel sources To pick up changes from: b112364867499 ("drm/i915: Add GuC submission interface version query") 5cf0fbf763741 ("drm/i915: Add some boring kerneldoc") This should be used to beautify DRM syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-2-namhyung@kernel.org --- tools/include/uapi/drm/i915_drm.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index fd4f9574d177..2ee338860b7e 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -3013,6 +3013,7 @@ struct drm_i915_query_item { * - %DRM_I915_QUERY_MEMORY_REGIONS (see struct drm_i915_query_memory_regions) * - %DRM_I915_QUERY_HWCONFIG_BLOB (see `GuC HWCONFIG blob uAPI`) * - %DRM_I915_QUERY_GEOMETRY_SUBSLICES (see struct drm_i915_query_topology_info) + * - %DRM_I915_QUERY_GUC_SUBMISSION_VERSION (see struct drm_i915_query_guc_submission_version) */ __u64 query_id; #define DRM_I915_QUERY_TOPOLOGY_INFO 1 @@ -3021,6 +3022,7 @@ struct drm_i915_query_item { #define DRM_I915_QUERY_MEMORY_REGIONS 4 #define DRM_I915_QUERY_HWCONFIG_BLOB 5 #define DRM_I915_QUERY_GEOMETRY_SUBSLICES 6 +#define DRM_I915_QUERY_GUC_SUBMISSION_VERSION 7 /* Must be kept compact -- no holes and well documented */ /** @@ -3566,6 +3568,20 @@ struct drm_i915_query_memory_regions { struct drm_i915_memory_region_info regions[]; }; +/** + * struct drm_i915_query_guc_submission_version - query GuC submission interface version + */ +struct drm_i915_query_guc_submission_version { + /** @branch: Firmware branch version. */ + __u32 branch; + /** @major: Firmware major version. */ + __u32 major; + /** @minor: Firmware minor version. */ + __u32 minor; + /** @patch: Firmware patch version. */ + __u32 patch; +}; + /** * DOC: GuC HWCONFIG blob uAPI * -- cgit v1.2.3 From 4cfa8a873d3e3a87894f8de056ee69a857b5adcd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:13 -0700 Subject: tools/include: Sync uapi/linux/fs.h with the kernel sources To pick up the changes from: 41bcbe59c3b3f ("fs: FS_IOC_GETUUID") ae8c511757304 ("fs: add FS_IOC_GETFSSYSFSPATH") 73fa7547c70b3 ("vfs: add RWF_NOAPPEND flag for pwritev2") This should be used to beautify fs syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/fs.h include/uapi/linux/fs.h Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-3-namhyung@kernel.org --- tools/include/uapi/linux/fs.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index 48ad69f7722e..45e4e64fd664 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -64,6 +64,24 @@ struct fstrim_range { __u64 minlen; }; +/* + * We include a length field because some filesystems (vfat) have an identifier + * that we do want to expose as a UUID, but doesn't have the standard length. + * + * We use a fixed size buffer beacuse this interface will, by fiat, never + * support "UUIDs" longer than 16 bytes; we don't want to force all downstream + * users to have to deal with that. + */ +struct fsuuid2 { + __u8 len; + __u8 uuid[16]; +}; + +struct fs_sysfs_path { + __u8 len; + __u8 name[128]; +}; + /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ #define FILE_DEDUPE_RANGE_SAME 0 #define FILE_DEDUPE_RANGE_DIFFERS 1 @@ -215,6 +233,13 @@ struct fsxattr { #define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) #define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) +/* Returns the external filesystem UUID, the same one blkid returns */ +#define FS_IOC_GETFSUUID _IOR(0x15, 0, struct fsuuid2) +/* + * Returns the path component under /sys/fs/ that refers to this filesystem; + * also /sys/kernel/debug/ for filesystems with debugfs exports + */ +#define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) @@ -301,9 +326,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* per-IO negation of O_APPEND */ +#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_NOAPPEND) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) -- cgit v1.2.3 From bee3b820c66a6aae0e16d0ac47f9744446f33bff Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:14 -0700 Subject: tools/include: Sync uapi/linux/kvm.h and asm/kvm.h with the kernel sources To pick up the changes from: 6bda055d6258 ("KVM: define __KVM_HAVE_GUEST_DEBUG unconditionally") 5d9cb71642db ("KVM: arm64: move ARM-specific defines to uapi/asm/kvm.h") 71cd774ad2f9 ("KVM: s390: move s390-specific structs to uapi/asm/kvm.h") d750951c9ed7 ("KVM: powerpc: move powerpc-specific structs to uapi/asm/kvm.h") bcac0477277e ("KVM: x86: move x86-specific structs to uapi/asm/kvm.h") c0a411904e15 ("KVM: remove more traces of device assignment UAPI") f3c80061c0d3 ("KVM: SEV: fix compat ABI for KVM_MEMORY_ENCRYPT_OP") That should be used to beautify the KVM arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h diff -u tools/arch/powerpc/include/uapi/asm/kvm.h arch/powerpc/include/uapi/asm/kvm.h diff -u tools/arch/s390/include/uapi/asm/kvm.h arch/s390/include/uapi/asm/kvm.h diff -u tools/arch/arm64/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-4-namhyung@kernel.org --- tools/arch/arm64/include/uapi/asm/kvm.h | 15 +- tools/arch/powerpc/include/uapi/asm/kvm.h | 45 +- tools/arch/s390/include/uapi/asm/kvm.h | 315 +++++++++++++- tools/arch/x86/include/uapi/asm/kvm.h | 308 ++++++++++++- tools/include/uapi/linux/kvm.h | 689 +----------------------------- 5 files changed, 672 insertions(+), 700 deletions(-) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 89d2fc872d9f..964df31da975 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -37,9 +37,7 @@ #include #include -#define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_READONLY_MEM #define __KVM_HAVE_VCPU_EVENTS #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -76,11 +74,11 @@ struct kvm_regs { /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ #define KVM_ARM_DEVICE_TYPE_SHIFT 0 -#define KVM_ARM_DEVICE_TYPE_MASK GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \ - KVM_ARM_DEVICE_TYPE_SHIFT) +#define KVM_ARM_DEVICE_TYPE_MASK __GENMASK(KVM_ARM_DEVICE_TYPE_SHIFT + 15, \ + KVM_ARM_DEVICE_TYPE_SHIFT) #define KVM_ARM_DEVICE_ID_SHIFT 16 -#define KVM_ARM_DEVICE_ID_MASK GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \ - KVM_ARM_DEVICE_ID_SHIFT) +#define KVM_ARM_DEVICE_ID_MASK __GENMASK(KVM_ARM_DEVICE_ID_SHIFT + 15, \ + KVM_ARM_DEVICE_ID_SHIFT) /* Supported device IDs */ #define KVM_ARM_DEVICE_VGIC_V2 0 @@ -162,6 +160,11 @@ struct kvm_sync_regs { __u64 device_irq_level; }; +/* Bits for run->s.regs.device_irq_level */ +#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) +#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) +#define KVM_ARM_DEV_PMU (1 << 2) + /* * PMU filter structure. Describe a range of events with a particular * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER. diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 9f18fa090f1f..1691297a766a 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -28,7 +28,6 @@ #define __KVM_HAVE_PPC_SMT #define __KVM_HAVE_IRQCHIP #define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_GUEST_DEBUG /* Not always available, but if it is, this is the correct offset. */ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -733,4 +732,48 @@ struct kvm_ppc_xive_eq { #define KVM_XIVE_TIMA_PAGE_OFFSET 0 #define KVM_XIVE_ESB_PAGE_OFFSET 4 +/* for KVM_PPC_GET_PVINFO */ + +#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) + +struct kvm_ppc_pvinfo { + /* out */ + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; +}; + +/* for KVM_PPC_GET_SMMU_INFO */ +#define KVM_PPC_PAGE_SIZES_MAX_SZ 8 + +struct kvm_ppc_one_page_size { + __u32 page_shift; /* Page shift (or 0) */ + __u32 pte_enc; /* Encoding in the HPTE (>>12) */ +}; + +struct kvm_ppc_one_seg_page_size { + __u32 page_shift; /* Base page shift of segment (or 0) */ + __u32 slb_enc; /* SLB encoding for BookS */ + struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + +#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 +#define KVM_PPC_1T_SEGMENTS 0x00000002 +#define KVM_PPC_NO_HASH 0x00000004 + +struct kvm_ppc_smmu_info { + __u64 flags; + __u32 slb_size; + __u16 data_keys; /* # storage keys supported for data */ + __u16 instr_keys; /* # storage keys supported for instructions */ + struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + +/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index abe926d43cbe..05eaf6db3ad4 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -12,7 +12,320 @@ #include #define __KVM_S390 -#define __KVM_HAVE_GUEST_DEBUG + +struct kvm_s390_skeys { + __u64 start_gfn; + __u64 count; + __u64 skeydata_addr; + __u32 flags; + __u32 reserved[9]; +}; + +#define KVM_S390_CMMA_PEEK (1 << 0) + +/** + * kvm_s390_cmma_log - Used for CMMA migration. + * + * Used both for input and output. + * + * @start_gfn: Guest page number to start from. + * @count: Size of the result buffer. + * @flags: Control operation mode via KVM_S390_CMMA_* flags + * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty + * pages are still remaining. + * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set + * in the PGSTE. + * @values: Pointer to the values buffer. + * + * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. + */ +struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; +}; + +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + +/* for KVM_S390_MEM_OP */ +struct kvm_s390_mem_op { + /* in */ + __u64 gaddr; /* the guest address */ + __u64 flags; /* flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + union { + struct { + __u8 ar; /* the access register number */ + __u8 key; /* access key, ignored if flag unset */ + __u8 pad1[6]; /* ignored */ + __u64 old_addr; /* ignored if cmpxchg flag unset */ + }; + __u32 sida_offset; /* offset into the sida */ + __u8 reserved[32]; /* ignored */ + }; +}; +/* types for kvm_s390_mem_op->op */ +#define KVM_S390_MEMOP_LOGICAL_READ 0 +#define KVM_S390_MEMOP_LOGICAL_WRITE 1 +#define KVM_S390_MEMOP_SIDA_READ 2 +#define KVM_S390_MEMOP_SIDA_WRITE 3 +#define KVM_S390_MEMOP_ABSOLUTE_READ 4 +#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 +#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG 6 + +/* flags for kvm_s390_mem_op->flags */ +#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) +#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) +#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) + +/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */ +#define KVM_S390_MEMOP_EXTENSION_CAP_BASE (1 << 0) +#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG (1 << 1) + +struct kvm_s390_psw { + __u64 mask; + __u64 addr; +}; + +/* valid values for type in kvm_s390_interrupt */ +#define KVM_S390_SIGP_STOP 0xfffe0000u +#define KVM_S390_PROGRAM_INT 0xfffe0001u +#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u +#define KVM_S390_RESTART 0xfffe0003u +#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u +#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u +#define KVM_S390_MCHK 0xfffe1000u +#define KVM_S390_INT_CLOCK_COMP 0xffff1004u +#define KVM_S390_INT_CPU_TIMER 0xffff1005u +#define KVM_S390_INT_VIRTIO 0xffff2603u +#define KVM_S390_INT_SERVICE 0xffff2401u +#define KVM_S390_INT_EMERGENCY 0xffff1201u +#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u +/* Anything below 0xfffe0000u is taken by INT_IO */ +#define KVM_S390_INT_IO(ai,cssid,ssid,schid) \ + (((schid)) | \ + ((ssid) << 16) | \ + ((cssid) << 18) | \ + ((ai) << 26)) +#define KVM_S390_INT_IO_MIN 0x00000000u +#define KVM_S390_INT_IO_MAX 0xfffdffffu +#define KVM_S390_INT_IO_AI_MASK 0x04000000u + + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; + +struct kvm_s390_io_info { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; +}; + +struct kvm_s390_ext_info { + __u32 ext_params; + __u32 pad; + __u64 ext_params2; +}; + +struct kvm_s390_pgm_info { + __u64 trans_exc_code; + __u64 mon_code; + __u64 per_address; + __u32 data_exc_code; + __u16 code; + __u16 mon_class_nr; + __u8 per_code; + __u8 per_atmid; + __u8 exc_access_id; + __u8 per_access_id; + __u8 op_access_id; +#define KVM_S390_PGM_FLAGS_ILC_VALID 0x01 +#define KVM_S390_PGM_FLAGS_ILC_0 0x02 +#define KVM_S390_PGM_FLAGS_ILC_1 0x04 +#define KVM_S390_PGM_FLAGS_ILC_MASK 0x06 +#define KVM_S390_PGM_FLAGS_NO_REWIND 0x08 + __u8 flags; + __u8 pad[2]; +}; + +struct kvm_s390_prefix_info { + __u32 address; +}; + +struct kvm_s390_extcall_info { + __u16 code; +}; + +struct kvm_s390_emerg_info { + __u16 code; +}; + +#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01 +struct kvm_s390_stop_info { + __u32 flags; +}; + +struct kvm_s390_mchk_info { + __u64 cr14; + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 pad; + __u8 fixed_logout[16]; +}; + +struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; +}; + +struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; /* will stay unused for compatibility reasons */ + __u32 len; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ +}; + +struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; +}; + +struct kvm_s390_pv_sec_parm { + __u64 origin; + __u64 length; +}; + +struct kvm_s390_pv_unp { + __u64 addr; + __u64 size; + __u64 tweak; +}; + +enum pv_cmd_dmp_id { + KVM_PV_DUMP_INIT, + KVM_PV_DUMP_CONFIG_STOR_STATE, + KVM_PV_DUMP_COMPLETE, + KVM_PV_DUMP_CPU, +}; + +struct kvm_s390_pv_dmp { + __u64 subcmd; + __u64 buff_addr; + __u64 buff_len; + __u64 gaddr; /* For dump storage state */ + __u64 reserved[4]; +}; + +enum pv_cmd_info_id { + KVM_PV_INFO_VM, + KVM_PV_INFO_DUMP, +}; + +struct kvm_s390_pv_info_dump { + __u64 dump_cpu_buffer_len; + __u64 dump_config_mem_buffer_per_1m; + __u64 dump_config_finalize_len; +}; + +struct kvm_s390_pv_info_vm { + __u64 inst_calls_list[4]; + __u64 max_cpus; + __u64 max_guests; + __u64 max_guest_addr; + __u64 feature_indication; +}; + +struct kvm_s390_pv_info_header { + __u32 id; + __u32 len_max; + __u32 len_written; + __u32 reserved; +}; + +struct kvm_s390_pv_info { + struct kvm_s390_pv_info_header header; + union { + struct kvm_s390_pv_info_dump dump; + struct kvm_s390_pv_info_vm vm; + }; +}; + +enum pv_cmd_id { + KVM_PV_ENABLE, + KVM_PV_DISABLE, + KVM_PV_SET_SEC_PARMS, + KVM_PV_UNPACK, + KVM_PV_VERIFY, + KVM_PV_PREP_RESET, + KVM_PV_UNSHARE_ALL, + KVM_PV_INFO, + KVM_PV_DUMP, + KVM_PV_ASYNC_CLEANUP_PREPARE, + KVM_PV_ASYNC_CLEANUP_PERFORM, +}; + +struct kvm_pv_cmd { + __u32 cmd; /* Command to be executed */ + __u16 rc; /* Ultravisor return code */ + __u16 rrc; /* Ultravisor return reason code */ + __u64 data; /* Data or address */ + __u32 flags; /* flags for future extensions. Must be 0 for now */ + __u32 reserved[3]; +}; + +struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; +}; + +/* types for kvm_s390_zpci_op->op */ +#define KVM_S390_ZPCIOP_REG_AEN 0 +#define KVM_S390_ZPCIOP_DEREG_AEN 1 + +/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ +#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) /* Device control API: s390-specific devices */ #define KVM_DEV_FLIC_GET_ALL_IRQS 1 diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index a448d0964fc0..ef11aa4cab42 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -7,6 +7,8 @@ * */ +#include +#include #include #include #include @@ -40,7 +42,6 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI -#define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 @@ -49,7 +50,6 @@ #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE #define __KVM_HAVE_XCRS -#define __KVM_HAVE_READONLY_MEM /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -526,9 +526,301 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 -#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0) #define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) +/* for KVM_CAP_MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; + +/* for KVM_CAP_XEN_HVM */ +#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) +#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) +#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) +#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) +#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) + +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; + +struct kvm_xen_hvm_attr { + __u16 type; + __u16 pad[3]; + union { + __u8 long_mode; + __u8 vector; + __u8 runstate_update_flag; + union { + __u64 gfn; +#define KVM_XEN_INVALID_GFN ((__u64)-1) + __u64 hva; + } shared_info; + struct { + __u32 send_port; + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ + __u32 flags; +#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) +#define KVM_XEN_EVTCHN_UPDATE (1 << 1) +#define KVM_XEN_EVTCHN_RESET (1 << 2) + /* + * Events sent by the guest are either looped back to + * the guest itself (potentially on a different port#) + * or signalled via an eventfd. + */ + union { + struct { + __u32 port; + __u32 vcpu; + __u32 priority; + } port; + struct { + __u32 port; /* Zero for eventfd */ + __s32 fd; + } eventfd; + __u32 padding[4]; + } deliver; + } evtchn; + __u32 xen_version; + __u64 pad[8]; + } u; +}; + + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 +#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 +#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 +#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ +#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6 + +struct kvm_xen_vcpu_attr { + __u16 type; + __u16 pad[3]; + union { + __u64 gpa; +#define KVM_XEN_INVALID_GPA ((__u64)-1) + __u64 hva; + __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; + __u32 vcpu_id; + struct { + __u32 port; + __u32 priority; + __u64 expires_ns; + } timer; + __u8 vector; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 +#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 +#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9 + +/* Secure Encrypted Virtualization command */ +enum sev_cmd_id { + /* Guest initialization commands */ + KVM_SEV_INIT = 0, + KVM_SEV_ES_INIT, + /* Guest launch commands */ + KVM_SEV_LAUNCH_START, + KVM_SEV_LAUNCH_UPDATE_DATA, + KVM_SEV_LAUNCH_UPDATE_VMSA, + KVM_SEV_LAUNCH_SECRET, + KVM_SEV_LAUNCH_MEASURE, + KVM_SEV_LAUNCH_FINISH, + /* Guest migration commands (outgoing) */ + KVM_SEV_SEND_START, + KVM_SEV_SEND_UPDATE_DATA, + KVM_SEV_SEND_UPDATE_VMSA, + KVM_SEV_SEND_FINISH, + /* Guest migration commands (incoming) */ + KVM_SEV_RECEIVE_START, + KVM_SEV_RECEIVE_UPDATE_DATA, + KVM_SEV_RECEIVE_UPDATE_VMSA, + KVM_SEV_RECEIVE_FINISH, + /* Guest status and debug commands */ + KVM_SEV_GUEST_STATUS, + KVM_SEV_DBG_DECRYPT, + KVM_SEV_DBG_ENCRYPT, + /* Guest certificates commands */ + KVM_SEV_CERT_EXPORT, + /* Attestation report */ + KVM_SEV_GET_ATTESTATION_REPORT, + /* Guest Migration Extension */ + KVM_SEV_SEND_CANCEL, + + KVM_SEV_NR_MAX, +}; + +struct kvm_sev_cmd { + __u32 id; + __u32 pad0; + __u64 data; + __u32 error; + __u32 sev_fd; +}; + +struct kvm_sev_launch_start { + __u32 handle; + __u32 policy; + __u64 dh_uaddr; + __u32 dh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_launch_update_data { + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + + +struct kvm_sev_launch_secret { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_launch_measure { + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_guest_status { + __u32 handle; + __u32 policy; + __u32 state; +}; + +struct kvm_sev_dbg { + __u64 src_uaddr; + __u64 dst_uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_attestation_report { + __u8 mnonce[16]; + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_send_start { + __u32 policy; + __u32 pad0; + __u64 pdh_cert_uaddr; + __u32 pdh_cert_len; + __u32 pad1; + __u64 plat_certs_uaddr; + __u32 plat_certs_len; + __u32 pad2; + __u64 amd_certs_uaddr; + __u32 amd_certs_len; + __u32 pad3; + __u64 session_uaddr; + __u32 session_len; + __u32 pad4; +}; + +struct kvm_sev_send_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_receive_start { + __u32 handle; + __u32 policy; + __u64 pdh_uaddr; + __u32 pdh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_receive_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +struct kvm_hyperv_eventfd { + __u32 conn_id; + __s32 fd; + __u32 flags; + __u32 padding[3]; +}; + +#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff +#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + /* * Masked event layout. * Bits Description @@ -549,10 +841,10 @@ struct kvm_pmu_event_filter { ((__u64)(!!(exclude)) << 55)) #define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ - (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) -#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) + (__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (__GENMASK_ULL(63, 56)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (__GENMASK_ULL(15, 8)) +#define KVM_PMU_MASKED_ENTRY_EXCLUDE (_BITULL(55)) #define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ @@ -560,7 +852,7 @@ struct kvm_pmu_event_filter { #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ /* x86-specific KVM_EXIT_HYPERCALL flags. */ -#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) +#define KVM_EXIT_HYPERCALL_LONG_MODE _BITULL(0) #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index c3308536482b..2190adbe3002 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -16,6 +16,11 @@ #define KVM_API_VERSION 12 +/* + * Backwards-compatible definitions. + */ +#define __KVM_HAVE_GUEST_DEBUG + /* for KVM_SET_USER_MEMORY_REGION */ struct kvm_userspace_memory_region { __u32 slot; @@ -85,43 +90,6 @@ struct kvm_pit_config { #define KVM_PIT_SPEAKER_DUMMY 1 -struct kvm_s390_skeys { - __u64 start_gfn; - __u64 count; - __u64 skeydata_addr; - __u32 flags; - __u32 reserved[9]; -}; - -#define KVM_S390_CMMA_PEEK (1 << 0) - -/** - * kvm_s390_cmma_log - Used for CMMA migration. - * - * Used both for input and output. - * - * @start_gfn: Guest page number to start from. - * @count: Size of the result buffer. - * @flags: Control operation mode via KVM_S390_CMMA_* flags - * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty - * pages are still remaining. - * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set - * in the PGSTE. - * @values: Pointer to the values buffer. - * - * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls. - */ -struct kvm_s390_cmma_log { - __u64 start_gfn; - __u32 count; - __u32 flags; - union { - __u64 remaining; - __u64 mask; - }; - __u64 values; -}; - struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 @@ -315,11 +283,6 @@ struct kvm_run { __u32 ipb; } s390_sieic; /* KVM_EXIT_S390_RESET */ -#define KVM_S390_RESET_POR 1 -#define KVM_S390_RESET_CLEAR 2 -#define KVM_S390_RESET_SUBSYSTEM 4 -#define KVM_S390_RESET_CPU_INIT 8 -#define KVM_S390_RESET_IPL 16 __u64 s390_reset_flags; /* KVM_EXIT_S390_UCONTROL */ struct { @@ -536,43 +499,6 @@ struct kvm_translation { __u8 pad[5]; }; -/* for KVM_S390_MEM_OP */ -struct kvm_s390_mem_op { - /* in */ - __u64 gaddr; /* the guest address */ - __u64 flags; /* flags */ - __u32 size; /* amount of bytes */ - __u32 op; /* type of operation */ - __u64 buf; /* buffer in userspace */ - union { - struct { - __u8 ar; /* the access register number */ - __u8 key; /* access key, ignored if flag unset */ - __u8 pad1[6]; /* ignored */ - __u64 old_addr; /* ignored if cmpxchg flag unset */ - }; - __u32 sida_offset; /* offset into the sida */ - __u8 reserved[32]; /* ignored */ - }; -}; -/* types for kvm_s390_mem_op->op */ -#define KVM_S390_MEMOP_LOGICAL_READ 0 -#define KVM_S390_MEMOP_LOGICAL_WRITE 1 -#define KVM_S390_MEMOP_SIDA_READ 2 -#define KVM_S390_MEMOP_SIDA_WRITE 3 -#define KVM_S390_MEMOP_ABSOLUTE_READ 4 -#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 -#define KVM_S390_MEMOP_ABSOLUTE_CMPXCHG 6 - -/* flags for kvm_s390_mem_op->flags */ -#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) -#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) -#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) - -/* flags specifying extension support via KVM_CAP_S390_MEM_OP_EXTENSION */ -#define KVM_S390_MEMOP_EXTENSION_CAP_BASE (1 << 0) -#define KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG (1 << 1) - /* for KVM_INTERRUPT */ struct kvm_interrupt { /* in */ @@ -637,124 +563,6 @@ struct kvm_mp_state { __u32 mp_state; }; -struct kvm_s390_psw { - __u64 mask; - __u64 addr; -}; - -/* valid values for type in kvm_s390_interrupt */ -#define KVM_S390_SIGP_STOP 0xfffe0000u -#define KVM_S390_PROGRAM_INT 0xfffe0001u -#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u -#define KVM_S390_RESTART 0xfffe0003u -#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u -#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u -#define KVM_S390_MCHK 0xfffe1000u -#define KVM_S390_INT_CLOCK_COMP 0xffff1004u -#define KVM_S390_INT_CPU_TIMER 0xffff1005u -#define KVM_S390_INT_VIRTIO 0xffff2603u -#define KVM_S390_INT_SERVICE 0xffff2401u -#define KVM_S390_INT_EMERGENCY 0xffff1201u -#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u -/* Anything below 0xfffe0000u is taken by INT_IO */ -#define KVM_S390_INT_IO(ai,cssid,ssid,schid) \ - (((schid)) | \ - ((ssid) << 16) | \ - ((cssid) << 18) | \ - ((ai) << 26)) -#define KVM_S390_INT_IO_MIN 0x00000000u -#define KVM_S390_INT_IO_MAX 0xfffdffffu -#define KVM_S390_INT_IO_AI_MASK 0x04000000u - - -struct kvm_s390_interrupt { - __u32 type; - __u32 parm; - __u64 parm64; -}; - -struct kvm_s390_io_info { - __u16 subchannel_id; - __u16 subchannel_nr; - __u32 io_int_parm; - __u32 io_int_word; -}; - -struct kvm_s390_ext_info { - __u32 ext_params; - __u32 pad; - __u64 ext_params2; -}; - -struct kvm_s390_pgm_info { - __u64 trans_exc_code; - __u64 mon_code; - __u64 per_address; - __u32 data_exc_code; - __u16 code; - __u16 mon_class_nr; - __u8 per_code; - __u8 per_atmid; - __u8 exc_access_id; - __u8 per_access_id; - __u8 op_access_id; -#define KVM_S390_PGM_FLAGS_ILC_VALID 0x01 -#define KVM_S390_PGM_FLAGS_ILC_0 0x02 -#define KVM_S390_PGM_FLAGS_ILC_1 0x04 -#define KVM_S390_PGM_FLAGS_ILC_MASK 0x06 -#define KVM_S390_PGM_FLAGS_NO_REWIND 0x08 - __u8 flags; - __u8 pad[2]; -}; - -struct kvm_s390_prefix_info { - __u32 address; -}; - -struct kvm_s390_extcall_info { - __u16 code; -}; - -struct kvm_s390_emerg_info { - __u16 code; -}; - -#define KVM_S390_STOP_FLAG_STORE_STATUS 0x01 -struct kvm_s390_stop_info { - __u32 flags; -}; - -struct kvm_s390_mchk_info { - __u64 cr14; - __u64 mcic; - __u64 failing_storage_address; - __u32 ext_damage_code; - __u32 pad; - __u8 fixed_logout[16]; -}; - -struct kvm_s390_irq { - __u64 type; - union { - struct kvm_s390_io_info io; - struct kvm_s390_ext_info ext; - struct kvm_s390_pgm_info pgm; - struct kvm_s390_emerg_info emerg; - struct kvm_s390_extcall_info extcall; - struct kvm_s390_prefix_info prefix; - struct kvm_s390_stop_info stop; - struct kvm_s390_mchk_info mchk; - char reserved[64]; - } u; -}; - -struct kvm_s390_irq_state { - __u64 buf; - __u32 flags; /* will stay unused for compatibility reasons */ - __u32 len; - __u32 reserved[4]; /* will stay unused for compatibility reasons */ -}; - /* for KVM_SET_GUEST_DEBUG */ #define KVM_GUESTDBG_ENABLE 0x00000001 @@ -810,50 +618,6 @@ struct kvm_enable_cap { __u8 pad[64]; }; -/* for KVM_PPC_GET_PVINFO */ - -#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) - -struct kvm_ppc_pvinfo { - /* out */ - __u32 flags; - __u32 hcall[4]; - __u8 pad[108]; -}; - -/* for KVM_PPC_GET_SMMU_INFO */ -#define KVM_PPC_PAGE_SIZES_MAX_SZ 8 - -struct kvm_ppc_one_page_size { - __u32 page_shift; /* Page shift (or 0) */ - __u32 pte_enc; /* Encoding in the HPTE (>>12) */ -}; - -struct kvm_ppc_one_seg_page_size { - __u32 page_shift; /* Base page shift of segment (or 0) */ - __u32 slb_enc; /* SLB encoding for BookS */ - struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; -}; - -#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 -#define KVM_PPC_1T_SEGMENTS 0x00000002 -#define KVM_PPC_NO_HASH 0x00000004 - -struct kvm_ppc_smmu_info { - __u64 flags; - __u32 slb_size; - __u16 data_keys; /* # storage keys supported for data */ - __u16 instr_keys; /* # storage keys supported for instructions */ - struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; -}; - -/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */ -struct kvm_ppc_resize_hpt { - __u64 flags; - __u32 shift; - __u32 pad; -}; - #define KVMIO 0xAE /* machine type bits, to be used as argument to KVM_CREATE_VM */ @@ -923,9 +687,7 @@ struct kvm_ppc_resize_hpt { /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 #define KVM_CAP_USER_NMI 22 -#ifdef __KVM_HAVE_GUEST_DEBUG #define KVM_CAP_SET_GUEST_DEBUG 23 -#endif #ifdef __KVM_HAVE_PIT #define KVM_CAP_REINJECT_CONTROL 24 #endif @@ -1156,8 +918,6 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 -#ifdef KVM_CAP_IRQ_ROUTING - struct kvm_irq_routing_irqchip { __u32 irqchip; __u32 pin; @@ -1222,42 +982,6 @@ struct kvm_irq_routing { struct kvm_irq_routing_entry entries[]; }; -#endif - -#ifdef KVM_CAP_MCE -/* x86 MCE */ -struct kvm_x86_mce { - __u64 status; - __u64 addr; - __u64 misc; - __u64 mcg_status; - __u8 bank; - __u8 pad1[7]; - __u64 pad2[3]; -}; -#endif - -#ifdef KVM_CAP_XEN_HVM -#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) -#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) -#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) -#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) -#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) -#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) -#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) -#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) - -struct kvm_xen_hvm_config { - __u32 flags; - __u32 msr; - __u64 blob_addr_32; - __u64 blob_addr_64; - __u8 blob_size_32; - __u8 blob_size_64; - __u8 pad2[30]; -}; -#endif - #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) /* * Available with KVM_CAP_IRQFD_RESAMPLE @@ -1442,11 +1166,6 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region2) /* enable ucontrol for s390 */ -struct kvm_s390_ucas_mapping { - __u64 user_addr; - __u64 vcpu_addr; - __u64 length; -}; #define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping) #define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping) #define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long) @@ -1641,89 +1360,6 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) -struct kvm_s390_pv_sec_parm { - __u64 origin; - __u64 length; -}; - -struct kvm_s390_pv_unp { - __u64 addr; - __u64 size; - __u64 tweak; -}; - -enum pv_cmd_dmp_id { - KVM_PV_DUMP_INIT, - KVM_PV_DUMP_CONFIG_STOR_STATE, - KVM_PV_DUMP_COMPLETE, - KVM_PV_DUMP_CPU, -}; - -struct kvm_s390_pv_dmp { - __u64 subcmd; - __u64 buff_addr; - __u64 buff_len; - __u64 gaddr; /* For dump storage state */ - __u64 reserved[4]; -}; - -enum pv_cmd_info_id { - KVM_PV_INFO_VM, - KVM_PV_INFO_DUMP, -}; - -struct kvm_s390_pv_info_dump { - __u64 dump_cpu_buffer_len; - __u64 dump_config_mem_buffer_per_1m; - __u64 dump_config_finalize_len; -}; - -struct kvm_s390_pv_info_vm { - __u64 inst_calls_list[4]; - __u64 max_cpus; - __u64 max_guests; - __u64 max_guest_addr; - __u64 feature_indication; -}; - -struct kvm_s390_pv_info_header { - __u32 id; - __u32 len_max; - __u32 len_written; - __u32 reserved; -}; - -struct kvm_s390_pv_info { - struct kvm_s390_pv_info_header header; - union { - struct kvm_s390_pv_info_dump dump; - struct kvm_s390_pv_info_vm vm; - }; -}; - -enum pv_cmd_id { - KVM_PV_ENABLE, - KVM_PV_DISABLE, - KVM_PV_SET_SEC_PARMS, - KVM_PV_UNPACK, - KVM_PV_VERIFY, - KVM_PV_PREP_RESET, - KVM_PV_UNSHARE_ALL, - KVM_PV_INFO, - KVM_PV_DUMP, - KVM_PV_ASYNC_CLEANUP_PREPARE, - KVM_PV_ASYNC_CLEANUP_PERFORM, -}; - -struct kvm_pv_cmd { - __u32 cmd; /* Command to be executed */ - __u16 rc; /* Ultravisor return code */ - __u16 rrc; /* Ultravisor return reason code */ - __u64 data; /* Data or address */ - __u32 flags; /* flags for future extensions. Must be 0 for now */ - __u32 reserved[3]; -}; - /* Available with KVM_CAP_S390_PROTECTED */ #define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd) @@ -1737,58 +1373,6 @@ struct kvm_pv_cmd { #define KVM_XEN_HVM_GET_ATTR _IOWR(KVMIO, 0xc8, struct kvm_xen_hvm_attr) #define KVM_XEN_HVM_SET_ATTR _IOW(KVMIO, 0xc9, struct kvm_xen_hvm_attr) -struct kvm_xen_hvm_attr { - __u16 type; - __u16 pad[3]; - union { - __u8 long_mode; - __u8 vector; - __u8 runstate_update_flag; - struct { - __u64 gfn; -#define KVM_XEN_INVALID_GFN ((__u64)-1) - } shared_info; - struct { - __u32 send_port; - __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ - __u32 flags; -#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) -#define KVM_XEN_EVTCHN_UPDATE (1 << 1) -#define KVM_XEN_EVTCHN_RESET (1 << 2) - /* - * Events sent by the guest are either looped back to - * the guest itself (potentially on a different port#) - * or signalled via an eventfd. - */ - union { - struct { - __u32 port; - __u32 vcpu; - __u32 priority; - } port; - struct { - __u32 port; /* Zero for eventfd */ - __s32 fd; - } eventfd; - __u32 padding[4]; - } deliver; - } evtchn; - __u32 xen_version; - __u64 pad[8]; - } u; -}; - - -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ -#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 -#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 -#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ -#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 -#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ -#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 - /* Per-vCPU Xen attributes */ #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) #define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr) @@ -1799,242 +1383,6 @@ struct kvm_xen_hvm_attr { #define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) #define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2) -struct kvm_xen_vcpu_attr { - __u16 type; - __u16 pad[3]; - union { - __u64 gpa; -#define KVM_XEN_INVALID_GPA ((__u64)-1) - __u64 pad[8]; - struct { - __u64 state; - __u64 state_entry_time; - __u64 time_running; - __u64 time_runnable; - __u64 time_blocked; - __u64 time_offline; - } runstate; - __u32 vcpu_id; - struct { - __u32 port; - __u32 priority; - __u64 expires_ns; - } timer; - __u8 vector; - } u; -}; - -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 -#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 -/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ -#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 -#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 -#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 - -/* Secure Encrypted Virtualization command */ -enum sev_cmd_id { - /* Guest initialization commands */ - KVM_SEV_INIT = 0, - KVM_SEV_ES_INIT, - /* Guest launch commands */ - KVM_SEV_LAUNCH_START, - KVM_SEV_LAUNCH_UPDATE_DATA, - KVM_SEV_LAUNCH_UPDATE_VMSA, - KVM_SEV_LAUNCH_SECRET, - KVM_SEV_LAUNCH_MEASURE, - KVM_SEV_LAUNCH_FINISH, - /* Guest migration commands (outgoing) */ - KVM_SEV_SEND_START, - KVM_SEV_SEND_UPDATE_DATA, - KVM_SEV_SEND_UPDATE_VMSA, - KVM_SEV_SEND_FINISH, - /* Guest migration commands (incoming) */ - KVM_SEV_RECEIVE_START, - KVM_SEV_RECEIVE_UPDATE_DATA, - KVM_SEV_RECEIVE_UPDATE_VMSA, - KVM_SEV_RECEIVE_FINISH, - /* Guest status and debug commands */ - KVM_SEV_GUEST_STATUS, - KVM_SEV_DBG_DECRYPT, - KVM_SEV_DBG_ENCRYPT, - /* Guest certificates commands */ - KVM_SEV_CERT_EXPORT, - /* Attestation report */ - KVM_SEV_GET_ATTESTATION_REPORT, - /* Guest Migration Extension */ - KVM_SEV_SEND_CANCEL, - - KVM_SEV_NR_MAX, -}; - -struct kvm_sev_cmd { - __u32 id; - __u64 data; - __u32 error; - __u32 sev_fd; -}; - -struct kvm_sev_launch_start { - __u32 handle; - __u32 policy; - __u64 dh_uaddr; - __u32 dh_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_launch_update_data { - __u64 uaddr; - __u32 len; -}; - - -struct kvm_sev_launch_secret { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -struct kvm_sev_launch_measure { - __u64 uaddr; - __u32 len; -}; - -struct kvm_sev_guest_status { - __u32 handle; - __u32 policy; - __u32 state; -}; - -struct kvm_sev_dbg { - __u64 src_uaddr; - __u64 dst_uaddr; - __u32 len; -}; - -struct kvm_sev_attestation_report { - __u8 mnonce[16]; - __u64 uaddr; - __u32 len; -}; - -struct kvm_sev_send_start { - __u32 policy; - __u64 pdh_cert_uaddr; - __u32 pdh_cert_len; - __u64 plat_certs_uaddr; - __u32 plat_certs_len; - __u64 amd_certs_uaddr; - __u32 amd_certs_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_send_update_data { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -struct kvm_sev_receive_start { - __u32 handle; - __u32 policy; - __u64 pdh_uaddr; - __u32 pdh_len; - __u64 session_uaddr; - __u32 session_len; -}; - -struct kvm_sev_receive_update_data { - __u64 hdr_uaddr; - __u32 hdr_len; - __u64 guest_uaddr; - __u32 guest_len; - __u64 trans_uaddr; - __u32 trans_len; -}; - -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) - -struct kvm_assigned_pci_dev { - __u32 assigned_dev_id; - __u32 busnr; - __u32 devfn; - __u32 flags; - __u32 segnr; - union { - __u32 reserved[11]; - }; -}; - -#define KVM_DEV_IRQ_HOST_INTX (1 << 0) -#define KVM_DEV_IRQ_HOST_MSI (1 << 1) -#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) - -#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) -#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) -#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) - -#define KVM_DEV_IRQ_HOST_MASK 0x00ff -#define KVM_DEV_IRQ_GUEST_MASK 0xff00 - -struct kvm_assigned_irq { - __u32 assigned_dev_id; - __u32 host_irq; /* ignored (legacy field) */ - __u32 guest_irq; - __u32 flags; - union { - __u32 reserved[12]; - }; -}; - -struct kvm_assigned_msix_nr { - __u32 assigned_dev_id; - __u16 entry_nr; - __u16 padding; -}; - -#define KVM_MAX_MSIX_PER_DEV 256 -struct kvm_assigned_msix_entry { - __u32 assigned_dev_id; - __u32 gsi; - __u16 entry; /* The index of entry in the MSI-X table */ - __u16 padding[3]; -}; - -#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) - -/* Available with KVM_CAP_ARM_USER_IRQ */ - -/* Bits for run->s.regs.device_irq_level */ -#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) -#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) -#define KVM_ARM_DEV_PMU (1 << 2) - -struct kvm_hyperv_eventfd { - __u32 conn_id; - __s32 fd; - __u32 flags; - __u32 padding[3]; -}; - -#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff -#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) - #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) #define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) @@ -2180,33 +1528,6 @@ struct kvm_stats_desc { /* Available with KVM_CAP_S390_ZPCI_OP */ #define KVM_S390_ZPCI_OP _IOW(KVMIO, 0xd1, struct kvm_s390_zpci_op) -struct kvm_s390_zpci_op { - /* in */ - __u32 fh; /* target device */ - __u8 op; /* operation to perform */ - __u8 pad[3]; - union { - /* for KVM_S390_ZPCIOP_REG_AEN */ - struct { - __u64 ibv; /* Guest addr of interrupt bit vector */ - __u64 sb; /* Guest addr of summary bit */ - __u32 flags; - __u32 noi; /* Number of interrupts */ - __u8 isc; /* Guest interrupt subclass */ - __u8 sbo; /* Offset of guest summary bit vector */ - __u16 pad; - } reg_aen; - __u64 reserved[8]; - } u; -}; - -/* types for kvm_s390_zpci_op->op */ -#define KVM_S390_ZPCIOP_REG_AEN 0 -#define KVM_S390_ZPCIOP_DEREG_AEN 1 - -/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ -#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) - /* Available with KVM_CAP_MEMORY_ATTRIBUTES */ #define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes) -- cgit v1.2.3 From b7ce17f257da17a4163da82e0fb7726c2de85da7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:15 -0700 Subject: tools/include: Sync uapi/sound/asound.h with the kernel sources To pick up the changes from: 85df6b5a6658 ("ALSA: pcm: clarify and fix default msbits value for all formats") This should be used to beautify sound syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/sound/asound.h include/uapi/sound/asound.h Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: linux-sound@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-5-namhyung@kernel.org --- tools/include/uapi/sound/asound.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index d5b9cfbd9cea..628d46a0da92 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -142,7 +142,7 @@ struct snd_hwdep_dsp_image { * * *****************************************************************************/ -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 16) +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 17) typedef unsigned long snd_pcm_uframes_t; typedef signed long snd_pcm_sframes_t; @@ -416,7 +416,7 @@ struct snd_pcm_hw_params { unsigned int rmask; /* W: requested masks */ unsigned int cmask; /* R: changed masks */ unsigned int info; /* R: Info flags for returned setup */ - unsigned int msbits; /* R: used most significant bits */ + unsigned int msbits; /* R: used most significant bits (in sample bit-width) */ unsigned int rate_num; /* R: rate numerator */ unsigned int rate_den; /* R: rate denominator */ snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ -- cgit v1.2.3 From 58e1b92df491c35abad7ddd2e393b89244e16bd5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:16 -0700 Subject: tools/include: Sync x86 CPU feature headers with the kernel sources To pick up the changes from: 598c2fafc06f ("perf/x86/amd/lbr: Use freeze based on availability") 7f274e609f3d ("x86/cpufeatures: Add new word for scattered features") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/disabled-features.h arch/x86/include/asm/disabled-features.h diff -u tools/arch/x86/include/asm/required-features.h arch/x86/include/asm/required-features.h diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-6-namhyung@kernel.org --- tools/arch/x86/include/asm/cpufeatures.h | 17 ++++++++++++----- tools/arch/x86/include/asm/disabled-features.h | 11 +++++++++-- tools/arch/x86/include/asm/required-features.h | 3 ++- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 25160d26764b..a38f8f9ba657 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 21 /* N 32-bit words worth of info */ +#define NCAPINTS 22 /* N 32-bit words worth of info */ #define NBUGINTS 2 /* N 32-bit bug flags */ /* @@ -81,10 +81,8 @@ #define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ #define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ - -/* CPU types for specific tunings: */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ +#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ @@ -97,7 +95,7 @@ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ -/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ @@ -461,6 +459,14 @@ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ +/* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0x80000022, etc. + * + * Reuse free bits when adding new feature flags! + */ +#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ + /* * BUG word(s) */ @@ -508,4 +514,5 @@ /* BUG word 2 */ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ +#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 1f23960d2b06..c492bdc97b05 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -123,6 +123,12 @@ # define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) #endif +#ifdef CONFIG_KVM_AMD_SEV +#define DISABLE_SEV_SNP 0 +#else +#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -147,8 +153,9 @@ DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 (DISABLE_IBT) -#define DISABLED_MASK19 0 +#define DISABLED_MASK19 (DISABLE_SEV_SNP) #define DISABLED_MASK20 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define DISABLED_MASK21 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h index 7ba1726b71c7..e9187ddd3d1f 100644 --- a/tools/arch/x86/include/asm/required-features.h +++ b/tools/arch/x86/include/asm/required-features.h @@ -99,6 +99,7 @@ #define REQUIRED_MASK18 0 #define REQUIRED_MASK19 0 #define REQUIRED_MASK20 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define REQUIRED_MASK21 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ -- cgit v1.2.3 From 978f2a60dd5ca6c25dfd5e24e7191b16af0ec429 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:17 -0700 Subject: tools/include: Sync x86 asm/irq_vectors.h with the kernel sources To pick up the changes from: 0cbca1bf44a0 ("x86: irq: unconditionally define KVM interrupt vectors") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/irq_vectors.h arch/x86/include/asm/irq_vectors.h Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-7-namhyung@kernel.org --- tools/arch/x86/include/asm/irq_vectors.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/arch/x86/include/asm/irq_vectors.h b/tools/arch/x86/include/asm/irq_vectors.h index 3f73ac3ed3a0..d18bfb238f66 100644 --- a/tools/arch/x86/include/asm/irq_vectors.h +++ b/tools/arch/x86/include/asm/irq_vectors.h @@ -84,11 +84,9 @@ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 /* Vector for KVM to deliver posted interrupt IPI */ -#if IS_ENABLED(CONFIG_KVM) #define POSTED_INTR_VECTOR 0xf2 #define POSTED_INTR_WAKEUP_VECTOR 0xf1 #define POSTED_INTR_NESTED_VECTOR 0xf0 -#endif #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef -- cgit v1.2.3 From c781a72f9ddd09baddde9df1f59955c0d6ea4944 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:18 -0700 Subject: tools/include: Sync x86 asm/msr-index.h with the kernel sources To pick up the changes from: 8076fcde016c ("x86/rfds: Mitigate Register File Data Sampling (RFDS)") d7b69b590bc9 ("x86/sev: Dump SEV_STATUS") cd6df3f378f6 ("x86/cpu: Add MSR numbers for FRED configuration") 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-8-namhyung@kernel.org --- tools/arch/x86/include/asm/msr-index.h | 74 +++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 23 deletions(-) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 1f9dc9bd13eb..05956bd8bacf 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -176,6 +176,14 @@ * CPU is not vulnerable to Gather * Data Sampling (GDS). */ +#define ARCH_CAP_RFDS_NO BIT(27) /* + * Not susceptible to Register + * File Data Sampling. + */ +#define ARCH_CAP_RFDS_CLEAR BIT(28) /* + * VERW clears CPU Register + * File. + */ #define ARCH_CAP_XAPIC_DISABLE BIT(21) /* * IA32_XAPIC_DISABLE_STATUS MSR @@ -605,34 +613,47 @@ #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 -#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 -#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) - -/* SNP feature bits enabled by the hypervisor */ -#define MSR_AMD64_SNP_VTOM BIT_ULL(3) -#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4) -#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5) -#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6) -#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7) -#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8) -#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9) -#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10) -#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11) -#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12) -#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14) -#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16) -#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17) - -/* SNP feature bits reserved for future use. */ -#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) -#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) -#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18) +#define MSR_AMD64_SNP_VTOM_BIT 3 +#define MSR_AMD64_SNP_VTOM BIT_ULL(MSR_AMD64_SNP_VTOM_BIT) +#define MSR_AMD64_SNP_REFLECT_VC_BIT 4 +#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT) +#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5 +#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT) +#define MSR_AMD64_SNP_ALT_INJ_BIT 6 +#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT) +#define MSR_AMD64_SNP_DEBUG_SWAP_BIT 7 +#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT) +#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8 +#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT) +#define MSR_AMD64_SNP_BTB_ISOLATION_BIT 9 +#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT) +#define MSR_AMD64_SNP_VMPL_SSS_BIT 10 +#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT) +#define MSR_AMD64_SNP_SECURE_TSC_BIT 11 +#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT) +#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT 12 +#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) +#define MSR_AMD64_SNP_IBS_VIRT_BIT 14 +#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) +#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT 16 +#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) +#define MSR_AMD64_SNP_SMT_PROT_BIT 17 +#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) +#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +#define MSR_AMD64_RMP_BASE 0xc0010132 +#define MSR_AMD64_RMP_END 0xc0010133 + /* AMD Collaborative Processor Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 #define MSR_AMD_CPPC_ENABLE 0xc00102b1 @@ -719,8 +740,15 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_AMD64_SYSCFG 0xc0010010 -#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 #define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG_SNP_EN_BIT 24 +#define MSR_AMD64_SYSCFG_SNP_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT) +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25 +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT) +#define MSR_AMD64_SYSCFG_MFDM_BIT 19 +#define MSR_AMD64_SYSCFG_MFDM BIT_ULL(MSR_AMD64_SYSCFG_MFDM_BIT) + #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 -- cgit v1.2.3 From 99e4e1174acd7f5a942d37e1ac6c115f870d5975 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:19 -0700 Subject: tools/include: Sync asm-generic/bitops/fls.h with the kernel sources To pick up the changes from: cb4ede926134 ("riscv: Avoid code duplication with generic bitops implementation") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/asm-generic/bitops/__fls.h include/asm-generic/bitops/__fls.h diff -u tools/include/asm-generic/bitops/fls.h include/asm-generic/bitops/fls.h Cc: Arnd Bergmann Cc: Geert Uytterhoeven Cc: Palmer Dabbelt Cc: linux-arch@vger.kernel.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-9-namhyung@kernel.org --- tools/include/asm-generic/bitops/__fls.h | 8 ++++++-- tools/include/asm-generic/bitops/fls.h | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/include/asm-generic/bitops/__fls.h b/tools/include/asm-generic/bitops/__fls.h index 03f721a8a2b1..54ccccf96e21 100644 --- a/tools/include/asm-generic/bitops/__fls.h +++ b/tools/include/asm-generic/bitops/__fls.h @@ -5,12 +5,12 @@ #include /** - * __fls - find last (most-significant) set bit in a long word + * generic___fls - find last (most-significant) set bit in a long word * @word: the word to search * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long generic___fls(unsigned long word) { int num = BITS_PER_LONG - 1; @@ -41,4 +41,8 @@ static __always_inline unsigned long __fls(unsigned long word) return num; } +#ifndef __HAVE_ARCH___FLS +#define __fls(word) generic___fls(word) +#endif + #endif /* _ASM_GENERIC_BITOPS___FLS_H_ */ diff --git a/tools/include/asm-generic/bitops/fls.h b/tools/include/asm-generic/bitops/fls.h index b168bb10e1be..26f3ce1dd6e4 100644 --- a/tools/include/asm-generic/bitops/fls.h +++ b/tools/include/asm-generic/bitops/fls.h @@ -3,14 +3,14 @@ #define _ASM_GENERIC_BITOPS_FLS_H_ /** - * fls - find last (most-significant) bit set + * generic_fls - find last (most-significant) bit set * @x: the word to search * * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(unsigned int x) +static __always_inline int generic_fls(unsigned int x) { int r = 32; @@ -39,4 +39,8 @@ static __always_inline int fls(unsigned int x) return r; } +#ifndef __HAVE_ARCH_FLS +#define fls(x) generic_fls(x) +#endif + #endif /* _ASM_GENERIC_BITOPS_FLS_H_ */ -- cgit v1.2.3 From 1cebd7f74976455ccd89c1dfbcf00bca52d0a512 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 8 Apr 2024 11:55:20 -0700 Subject: tools/include: Sync arm64 asm/cputype.h with the kernel sources To pick up the changes from: fb091ff39479 ("arm64: Subscribe Microsoft Azure Cobalt 100 to ARM Neoverse N2 errata") This should address these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20240408185520.1550865-10-namhyung@kernel.org --- tools/arch/arm64/include/asm/cputype.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 7c7493cb571f..52f076afeb96 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -61,6 +61,7 @@ #define ARM_CPU_IMP_HISI 0x48 #define ARM_CPU_IMP_APPLE 0x61 #define ARM_CPU_IMP_AMPERE 0xC0 +#define ARM_CPU_IMP_MICROSOFT 0x6D #define ARM_CPU_PART_AEM_V8 0xD0F #define ARM_CPU_PART_FOUNDATION 0xD00 @@ -135,6 +136,8 @@ #define AMPERE_CPU_PART_AMPERE1 0xAC3 +#define MICROSOFT_CPU_PART_AZURE_COBALT_100 0xD49 /* Based on r0p0 of ARM Neoverse N2 */ + #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) @@ -193,6 +196,7 @@ #define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX) #define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) +#define MIDR_MICROSOFT_AZURE_COBALT_100 MIDR_CPU_MODEL(ARM_CPU_IMP_MICROSOFT, MICROSOFT_CPU_PART_AZURE_COBALT_100) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ #define MIDR_FUJITSU_ERRATUM_010001 MIDR_FUJITSU_A64FX -- cgit v1.2.3 From 447112d7edd77fa468938377418434233bcb3709 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Mar 2024 17:13:42 -0800 Subject: KVM: VMX: Snapshot LBR capabilities during module initialization Snapshot VMX's LBR capabilities once during module initialization instead of calling into perf every time a vCPU reconfigures its vPMU. This will allow massaging the LBR capabilities, e.g. if the CPU doesn't support callstacks, without having to remember to update multiple locations. Opportunistically tag vmx_get_perf_capabilities() with __init, as it's only called from vmx_set_cpu_caps(). Reviewed-by: Mingwei Zhang Link: https://lore.kernel.org/r/20240307011344.835640-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/pmu_intel.c | 2 +- arch/x86/kvm/vmx/vmx.c | 9 +++++---- arch/x86/kvm/vmx/vmx.h | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 12ade343a17e..be40474de6e4 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -535,7 +535,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) perf_capabilities = vcpu_get_perf_capabilities(vcpu); if (cpuid_model_is_consistent(vcpu) && (perf_capabilities & PMU_CAP_LBR_FMT)) - x86_perf_get_lbr(&lbr_desc->records); + memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); else lbr_desc->records.nr = 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e3315bda6237..aca41d3419f9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -218,6 +218,8 @@ module_param(ple_window_max, uint, 0444); int __read_mostly pt_mode = PT_MODE_SYSTEM; module_param(pt_mode, int, S_IRUGO); +struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; + static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); @@ -7862,10 +7864,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_update_exception_bitmap(vcpu); } -static u64 vmx_get_perf_capabilities(void) +static __init u64 vmx_get_perf_capabilities(void) { u64 perf_cap = PMU_CAP_FW_WRITES; - struct x86_pmu_lbr lbr; u64 host_perf_cap = 0; if (!enable_pmu) @@ -7875,8 +7876,8 @@ static u64 vmx_get_perf_capabilities(void) rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { - x86_perf_get_lbr(&lbr); - if (lbr.nr) + x86_perf_get_lbr(&vmx_lbr_caps); + if (vmx_lbr_caps.nr) perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 1742c88ba9c8..90f9e4434646 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -110,6 +110,8 @@ struct lbr_desc { bool msr_passthrough; }; +extern struct x86_pmu_lbr vmx_lbr_caps; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. -- cgit v1.2.3 From 0d0b60865071115f6da76090f0dbc1f0e8b9c647 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Mar 2024 17:13:43 -0800 Subject: perf/x86/intel: Expose existence of callback support to KVM Add a "has_callstack" field to the x86_pmu_lbr structure used to pass information to KVM, and set it accordingly in x86_perf_get_lbr(). KVM will use has_callstack to avoid trying to create perf LBR events with PERF_SAMPLE_BRANCH_CALL_STACK on CPUs that don't support callstacks. Reviewed-by: Mingwei Zhang Link: https://lore.kernel.org/r/20240307011344.835640-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/events/intel/lbr.c | 1 + arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 78cd5084104e..4367aa77cb8d 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1693,6 +1693,7 @@ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; lbr->info = x86_pmu.lbr_info; + lbr->has_callstack = x86_pmu_has_lbr_callstack(); } EXPORT_SYMBOL_GPL(x86_perf_get_lbr); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 3736b8a46c04..7f1e17250546 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -555,6 +555,7 @@ struct x86_pmu_lbr { unsigned int from; unsigned int to; unsigned int info; + bool has_callstack; }; extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); -- cgit v1.2.3 From bb9dc859086df369f1fd34578dd5ca82d6321d21 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Mar 2024 17:13:44 -0800 Subject: KVM: VMX: Disable LBR virtualization if the CPU doesn't support LBR callstacks Disable LBR virtualization if the CPU doesn't support callstacks, which were introduced in HSW (see commit e9d7f7cd97c4 ("perf/x86/intel: Add basic Haswell LBR call stack support"), as KVM unconditionally configures the perf LBR event with PERF_SAMPLE_BRANCH_CALL_STACK, i.e. LBR virtualization always fails on pre-HSW CPUs. Simply disable LBR support on such CPUs, as it has never worked, i.e. there is no risk of breaking an existing setup, and figuring out a way to performantly context switch LBRs on old CPUs is not worth the effort. Fixes: be635e34c284 ("KVM: vmx/pmu: Expose LBR_FMT in the MSR_IA32_PERF_CAPABILITIES") Cc: Mingwei Zhang Cc: Jim Mattson Tested-by: Mingwei Zhang Link: https://lore.kernel.org/r/20240307011344.835640-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aca41d3419f9..22411f4aff53 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7877,7 +7877,15 @@ static __init u64 vmx_get_perf_capabilities(void) if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { x86_perf_get_lbr(&vmx_lbr_caps); - if (vmx_lbr_caps.nr) + + /* + * KVM requires LBR callstack support, as the overhead due to + * context switching LBRs without said support is too high. + * See intel_pmu_create_guest_lbr_event() for more info. + */ + if (!vmx_lbr_caps.has_callstack) + memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); + else if (vmx_lbr_caps.nr) perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; } -- cgit v1.2.3 From 1bc26cb9090246190e8c540f5aa201cea2f895a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 8 Apr 2024 16:11:15 -0700 Subject: KVM: x86/mmu: Precisely invalidate MMU root_role during CPUID update Set kvm_mmu_page_role.invalid to mark the various MMU root_roles invalid during CPUID update in order to force a refresh, instead of zeroing out the entire role. This fixes a bug where kvm_mmu_free_roots() incorrectly thinks a root is indirect, i.e. not a TDP MMU, due to "direct" being zeroed, which in turn causes KVM to take mmu_lock for write instead of read. Note, paving over the entire role was largely unintentional, commit 7a458f0e1ba1 ("KVM: x86/mmu: remove extended bits from mmu_role, rename field") simply missed that "invalid" could be set. Fixes: 576a15de8d29 ("KVM: x86/mmu: Free TDP MMU roots while holding mmy_lock for read") Reported-by: syzbot+dc308fcfcd53f987de73@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/0000000000009b38080614c49bdb@google.com Cc: Phi Nguyen Link: https://lore.kernel.org/r/20240408231115.1387279-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 18a404b5923f..59c051845992 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5576,9 +5576,9 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) * that problem is swept under the rug; KVM's CPUID API is horrific and * it's all but impossible to solve it without introducing a new API. */ - vcpu->arch.root_mmu.root_role.word = 0; - vcpu->arch.guest_mmu.root_role.word = 0; - vcpu->arch.nested_mmu.root_role.word = 0; + vcpu->arch.root_mmu.root_role.invalid = 1; + vcpu->arch.guest_mmu.root_role.invalid = 1; + vcpu->arch.nested_mmu.root_role.invalid = 1; vcpu->arch.root_mmu.cpu_role.ext.valid = 0; vcpu->arch.guest_mmu.cpu_role.ext.valid = 0; vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; -- cgit v1.2.3 From 2673dfb591a359c75080dd5af3da484b89320d22 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 15 Mar 2024 16:05:38 -0700 Subject: KVM: x86/mmu: Write-protect L2 SPTEs in TDP MMU when clearing dirty status Check kvm_mmu_page_ad_need_write_protect() when deciding whether to write-protect or clear D-bits on TDP MMU SPTEs, so that the TDP MMU accounts for any role-specific reasons for disabling D-bit dirty logging. Specifically, TDP MMU SPTEs must be write-protected when the TDP MMU is being used to run an L2 (i.e. L1 has disabled EPT) and PML is enabled. KVM always disables PML when running L2, even when L1 and L2 GPAs are in the some domain, so failing to write-protect TDP MMU SPTEs will cause writes made by L2 to not be reflected in the dirty log. Reported-by: syzbot+900d58a45dcaab9e4821@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=900d58a45dcaab9e4821 Fixes: 5982a5392663 ("KVM: x86/mmu: Use kvm_ad_enabled() to determine if TDP MMU SPTEs need wrprot") Cc: stable@vger.kernel.org Cc: Vipin Sharma Cc: Sean Christopherson Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240315230541.1635322-2-dmatlack@google.com [sean: massage shortlog and changelog, tweak ternary op formatting] Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d078157e62aa..cf7d607a0125 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1548,6 +1548,16 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, } } +static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) +{ + /* + * All TDP MMU shadow pages share the same role as their root, aside + * from level, so it is valid to key off any shadow page to determine if + * write protection is needed for an entire tree. + */ + return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled(); +} + /* * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. @@ -1558,7 +1568,8 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { - u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK; + const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK : + shadow_dirty_mask; struct tdp_iter iter; bool spte_set = false; @@ -1573,7 +1584,7 @@ retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - KVM_MMU_WARN_ON(kvm_ad_enabled() && + KVM_MMU_WARN_ON(dbit == shadow_dirty_mask && spte_ad_need_write_protect(iter.old_spte)); if (!(iter.old_spte & dbit)) @@ -1620,8 +1631,8 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { - u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK : + shadow_dirty_mask; struct tdp_iter iter; lockdep_assert_held_write(&kvm->mmu_lock); @@ -1633,7 +1644,7 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, if (!mask) break; - KVM_MMU_WARN_ON(kvm_ad_enabled() && + KVM_MMU_WARN_ON(dbit == shadow_dirty_mask && spte_ad_need_write_protect(iter.old_spte)); if (iter.level > PG_LEVEL_4K || -- cgit v1.2.3 From feac19aa4c26dc028d358bf23fa8f228ab46699e Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 15 Mar 2024 16:05:39 -0700 Subject: KVM: x86/mmu: Remove function comments above clear_dirty_{gfn_range,pt_masked}() Drop the comments above clear_dirty_gfn_range() and clear_dirty_pt_masked(), since each is word-for-word identical to the comment above their parent function. Leave the comment on the parent functions since they are APIs called by the KVM/x86 MMU. No functional change intended. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240315230541.1635322-3-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index cf7d607a0125..23759b2bca5e 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1558,13 +1558,6 @@ static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled(); } -/* - * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If - * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. - * If AD bits are not enabled, this will require clearing the writable bit on - * each SPTE. Returns true if an SPTE has been changed and the TLBs need to - * be flushed. - */ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { @@ -1621,13 +1614,6 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, return spte_set; } -/* - * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is - * set in mask, starting at gfn. The given memslot is expected to contain all - * the GFNs represented by set bits in the mask. If AD bits are enabled, - * clearing the dirty status will involve clearing the dirty bit on each SPTE - * or, if AD bits are not enabled, clearing the writable bit on each SPTE. - */ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { -- cgit v1.2.3 From b1a8d2b02b69c7d7685f2e19f32034065310dbae Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 15 Mar 2024 16:05:40 -0700 Subject: KVM: x86/mmu: Fix and clarify comments about clearing D-bit vs. write-protecting Drop the "If AD bits are enabled/disabled" verbiage from the comments above kvm_tdp_mmu_clear_dirty_{slot,pt_masked}() since TDP MMU SPTEs may need to be write-protected even when A/D bits are enabled. i.e. These comments aren't technically correct. No functional change intended. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240315230541.1635322-4-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 23759b2bca5e..04c1f0957fea 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1594,11 +1594,9 @@ retry: } /* - * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If - * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. - * If AD bits are not enabled, this will require clearing the writable bit on - * each SPTE. Returns true if an SPTE has been changed and the TLBs need to - * be flushed. + * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the + * memslot. Returns true if an SPTE has been changed and the TLBs need to be + * flushed. */ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot) @@ -1656,11 +1654,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is - * set in mask, starting at gfn. The given memslot is expected to contain all - * the GFNs represented by set bits in the mask. If AD bits are enabled, - * clearing the dirty status will involve clearing the dirty bit on each SPTE - * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for + * which a bit is set in mask, starting at gfn. The given memslot is expected to + * contain all the GFNs represented by set bits in the mask. */ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, -- cgit v1.2.3 From 40e0ee6338f0c042c0dabe1f17eb76eac37b5425 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 15 Mar 2024 16:05:41 -0700 Subject: KVM: selftests: Add coverage of EPT-disabled to vmx_dirty_log_test Extend vmx_dirty_log_test to include accesses made by L2 when EPT is disabled. This commit adds explicit coverage of a bug caught by syzkaller, where the TDP MMU would clear D-bits instead of write-protecting SPTEs being used to map an L2, which only happens when L1 does not enable EPT, causing writes made by L2 to not be reflected in the dirty log when PML is enabled: $ ./vmx_dirty_log_test Nested EPT: disabled ==== Test Assertion Failure ==== x86_64/vmx_dirty_log_test.c:151: test_bit(0, bmap) pid=72052 tid=72052 errno=4 - Interrupted system call (stack trace empty) Page 0 incorrectly reported clean Opportunistically replace the volatile casts with {READ,WRITE}_ONCE(). Link: https://lore.kernel.org/kvm/000000000000c6526f06137f18cc@google.com/ Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240315230541.1635322-5-dmatlack@google.com Signed-off-by: Sean Christopherson --- .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 60 +++++++++++++++++----- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index 7f6f5f23fb9b..977948fd52e6 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -28,16 +28,16 @@ #define NESTED_TEST_MEM1 0xc0001000 #define NESTED_TEST_MEM2 0xc0002000 -static void l2_guest_code(void) +static void l2_guest_code(u64 *a, u64 *b) { - *(volatile uint64_t *)NESTED_TEST_MEM1; - *(volatile uint64_t *)NESTED_TEST_MEM1 = 1; + READ_ONCE(*a); + WRITE_ONCE(*a, 1); GUEST_SYNC(true); GUEST_SYNC(false); - *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + WRITE_ONCE(*b, 1); GUEST_SYNC(true); - *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + WRITE_ONCE(*b, 1); GUEST_SYNC(true); GUEST_SYNC(false); @@ -45,17 +45,33 @@ static void l2_guest_code(void) vmcall(); } +static void l2_guest_code_ept_enabled(void) +{ + l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); +} + +static void l2_guest_code_ept_disabled(void) +{ + /* Access the same L1 GPAs as l2_guest_code_ept_enabled() */ + l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); +} + void l1_guest_code(struct vmx_pages *vmx) { #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + void *l2_rip; GUEST_ASSERT(vmx->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx)); GUEST_ASSERT(load_vmcs(vmx)); - prepare_vmcs(vmx, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + if (vmx->eptp_gpa) + l2_rip = l2_guest_code_ept_enabled; + else + l2_rip = l2_guest_code_ept_disabled; + + prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); GUEST_SYNC(false); GUEST_ASSERT(!vmlaunch()); @@ -64,7 +80,7 @@ void l1_guest_code(struct vmx_pages *vmx) GUEST_DONE(); } -int main(int argc, char *argv[]) +static void test_vmx_dirty_log(bool enable_ept) { vm_vaddr_t vmx_pages_gva = 0; struct vmx_pages *vmx; @@ -76,8 +92,7 @@ int main(int argc, char *argv[]) struct ucall uc; bool done = false; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - TEST_REQUIRE(kvm_cpu_has_ept()); + pr_info("Nested EPT: %s\n", enable_ept ? "enabled" : "disabled"); /* Create VM */ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); @@ -103,11 +118,16 @@ int main(int argc, char *argv[]) * * Note that prepare_eptp should be called only L1's GPA map is done, * meaning after the last call to virt_map. + * + * When EPT is disabled, the L2 guest code will still access the same L1 + * GPAs as the EPT enabled case. */ - prepare_eptp(vmx, vm, 0); - nested_map_memslot(vmx, vm, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + if (enable_ept) { + prepare_eptp(vmx, vm, 0); + nested_map_memslot(vmx, vm, 0); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + } bmap = bitmap_zalloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); @@ -148,3 +168,15 @@ int main(int argc, char *argv[]) } } } + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + test_vmx_dirty_log(/*enable_ept=*/false); + + if (kvm_cpu_has_ept()) + test_vmx_dirty_log(/*enable_ept=*/true); + + return 0; +} -- cgit v1.2.3 From eefb85b3f0310c2f4149c50cb9b13094ed1dde25 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 4 Mar 2024 16:37:42 -0800 Subject: KVM: Drop unused @may_block param from gfn_to_pfn_cache_invalidate_start() Remove gfn_to_pfn_cache_invalidate_start()'s unused @may_block parameter, which was leftover from KVM's abandoned (for now) attempt to support guest usage of gfn_to_pfn caches. Fixes: a4bff3df5147 ("KVM: pfncache: remove KVM_GUEST_USES_PFN usage") Reported-by: Like Xu Cc: Paul Durrant Cc: David Woodhouse Reviewed-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240305003742.245767-1-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 3 +-- virt/kvm/kvm_mm.h | 6 ++---- virt/kvm/pfncache.c | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fb49c2a60200..ff0a20565f90 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -832,8 +832,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * mn_active_invalidate_count (see above) instead of * mmu_invalidate_in_progress. */ - gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end, - hva_range.may_block); + gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end); /* * If one or more memslots were found and thus zapped, notify arch code diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index ecefc7ec51af..715f19669d01 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -26,13 +26,11 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, #ifdef CONFIG_HAVE_KVM_PFNCACHE void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, - unsigned long end, - bool may_block); + unsigned long end); #else static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, - unsigned long end, - bool may_block) + unsigned long end) { } #endif /* HAVE_KVM_PFNCACHE */ diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index f618719644e0..e3453e869e92 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -23,7 +23,7 @@ * MMU notifier 'invalidate_range_start' hook. */ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, - unsigned long end, bool may_block) + unsigned long end) { struct gfn_to_pfn_cache *gpc; -- cgit v1.2.3 From 17f8dc2db52185460f212052f3a692c1fdc167ba Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 9 Apr 2024 08:56:03 +0800 Subject: ceph: switch to use cap_delay_lock for the unlink delay list The same list item will be used in both cap_delay_list and cap_unlink_delay_list, so it's buggy to use two different locks to protect them. Cc: stable@vger.kernel.org Fixes: dbc347ef7f0c ("ceph: add ceph_cap_unlink_work to fire check_caps() immediately") Link: https://lists.ceph.io/hyperkitty/list/ceph-users@ceph.io/thread/AODC76VXRAMXKLFDCTK4TKFDDPWUSCN5 Reported-by: Marc Ruhmann Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Tested-by: Marc Ruhmann Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 4 ++-- fs/ceph/mds_client.c | 9 ++++----- fs/ceph/mds_client.h | 3 +-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 55051ad09c19..c4941ba245ac 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4783,13 +4783,13 @@ int ceph_drop_caps_for_unlink(struct inode *inode) doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) list_del_init(&ci->i_cap_delay_list); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_unlink_delay_list); - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); /* * Fire the work immediately, because the MDS maybe diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3ab9c268a8bb..360b686c3c67 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2504,7 +2504,7 @@ static void ceph_cap_unlink_work(struct work_struct *work) struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "begin\n"); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); while (!list_empty(&mdsc->cap_unlink_delay_list)) { struct ceph_inode_info *ci; struct inode *inode; @@ -2516,15 +2516,15 @@ static void ceph_cap_unlink_work(struct work_struct *work) inode = igrab(&ci->netfs.inode); if (inode) { - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "on %p %llx.%llx\n", inode, ceph_vinop(inode)); ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); - spin_lock(&mdsc->cap_unlink_delay_lock); + spin_lock(&mdsc->cap_delay_lock); } } - spin_unlock(&mdsc->cap_unlink_delay_lock); + spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); } @@ -5404,7 +5404,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_LIST_HEAD(&mdsc->cap_wait_list); spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); - spin_lock_init(&mdsc->cap_unlink_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 03f8ff00874f..b88e80415224 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -461,9 +461,8 @@ struct ceph_mds_client { struct delayed_work delayed_work; /* delayed work */ unsigned long last_renew_caps; /* last time we renewed our caps */ struct list_head cap_delay_list; /* caps with delayed release */ - spinlock_t cap_delay_lock; /* protects cap_delay_list */ struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */ - spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */ + spinlock_t cap_delay_lock; /* protects cap_delay_list and cap_unlink_delay_list */ struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; -- cgit v1.2.3 From d3e0469306793972fd2b1ea016fa7ab0658c9849 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 9 Apr 2024 07:01:57 -0400 Subject: MAINTAINERS: remove myself as a Reviewer for Ceph It has been a couple of years since I stepped down as CephFS maintainer. I'm not involved in any meaningful way with the project these days, so while I'm happy to help review the occasional patch, I don't need to be cc'ed on all of them. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index aea47e04c3a5..c45e2c3f6ff9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4869,7 +4869,6 @@ F: drivers/power/supply/cw2015_battery.c CEPH COMMON CODE (LIBCEPH) M: Ilya Dryomov M: Xiubo Li -R: Jeff Layton L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ @@ -4881,7 +4880,6 @@ F: net/ceph/ CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH) M: Xiubo Li M: Ilya Dryomov -R: Jeff Layton L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ -- cgit v1.2.3 From 28e0947651ce6a2200b9a7eceb93282e97d7e51a Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 6 Apr 2024 23:16:08 -0500 Subject: smb3: fix Open files on server counter going negative We were decrementing the count of open files on server twice for the case where we were closing cached directories. Fixes: 8e843bf38f7b ("cifs: return a single-use cfid if we did not get a lease") Cc: stable@vger.kernel.org Acked-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cached_dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 13a9d7acf8f8..0ff2491c311d 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -433,8 +433,8 @@ smb2_close_cached_fid(struct kref *ref) if (cfid->is_open) { rc = SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, cfid->fid.volatile_fid); - if (rc != -EBUSY && rc != -EAGAIN) - atomic_dec(&cfid->tcon->num_remote_opens); + if (rc) /* should we retry on -EBUSY or -EAGAIN? */ + cifs_dbg(VFS, "close cached dir rc %d\n", rc); } free_cached_dir(cfid); -- cgit v1.2.3 From c6ff459037b2e35450af2351037eac4c8aca1d6b Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 9 Apr 2024 11:28:59 -0300 Subject: smb: client: instantiate when creating SFU files In cifs_sfu_make_node(), on success, instantiate rather than leave it with dentry unhashed negative to support callers that expect mknod(2) to always instantiate. This fixes the following test case: mount.cifs //srv/share /mnt -o ...,sfu mkfifo /mnt/fifo ./xfstests/ltp/growfiles -b -W test -e 1 -u -i 0 -L 30 /mnt/fifo ... BUG: unable to handle page fault for address: 000000034cec4e58 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 1 PREEMPT SMP PTI CPU: 0 PID: 138098 Comm: growfiles Kdump: loaded Not tainted 5.14.0-436.3987_1240945149.el9.x86_64 #1 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:_raw_callee_save__kvm_vcpu_is_preempted+0x0/0x20 Code: e8 15 d9 61 00 e9 63 ff ff ff 41 bd ea ff ff ff e9 58 ff ff ff e8 d0 71 c0 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 <48> 8b 04 fd 60 2b c1 99 80 b8 90 50 03 00 00 0f 95 c0 c3 cc cc cc RSP: 0018:ffffb6a143cf7cf8 EFLAGS: 00010206 RAX: ffff8a9bc30fb038 RBX: ffff8a9bc666a200 RCX: ffff8a9cc0260000 RDX: 00000000736f622e RSI: ffff8a9bc30fb038 RDI: 000000007665645f RBP: ffffb6a143cf7d70 R08: 0000000000001000 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000000 R12: ffff8a9bc666a200 R13: 0000559a302a12b0 R14: 0000000000001000 R15: 0000000000000000 FS: 00007fbed1dbb740(0000) GS:ffff8a9cf0000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000034cec4e58 CR3: 0000000128ec6006 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? show_trace_log_lvl+0x1c4/0x2df ? show_trace_log_lvl+0x1c4/0x2df ? __mutex_lock.constprop.0+0x5f7/0x6a0 ? __die_body.cold+0x8/0xd ? page_fault_oops+0x134/0x170 ? exc_page_fault+0x62/0x150 ? asm_exc_page_fault+0x22/0x30 ? _pfx_raw_callee_save__kvm_vcpu_is_preempted+0x10/0x10 __mutex_lock.constprop.0+0x5f7/0x6a0 ? __mod_memcg_lruvec_state+0x84/0xd0 pipe_write+0x47/0x650 ? do_anonymous_page+0x258/0x410 ? inode_security+0x22/0x60 ? selinux_file_permission+0x108/0x150 vfs_write+0x2cb/0x410 ksys_write+0x5f/0xe0 do_syscall_64+0x5c/0xf0 ? syscall_exit_to_user_mode+0x22/0x40 ? do_syscall_64+0x6b/0xf0 ? sched_clock_cpu+0x9/0xc0 ? exc_page_fault+0x62/0x150 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Cc: stable@vger.kernel.org Fixes: 72bc63f5e23a ("smb3: fix creating FIFOs when mounting with "sfu" mount option") Suggested-by: Al Viro Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 94 +++++++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index b156eefa75d7..78c94d0350fe 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4964,68 +4964,84 @@ static int smb2_next_header(struct TCP_Server_Info *server, char *buf, return 0; } -int cifs_sfu_make_node(unsigned int xid, struct inode *inode, - struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) +static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) { - struct cifs_open_info_data buf = {}; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {}; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_fid fid; unsigned int bytes_written; - struct win_dev *pdev; + struct win_dev pdev = {}; struct kvec iov[2]; __u32 oplock = server->oplocks ? REQ_OPLOCK : 0; int rc; - if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode)) + switch (mode & S_IFMT) { + case S_IFCHR: + strscpy(pdev.type, "IntxCHR"); + pdev.major = cpu_to_le64(MAJOR(dev)); + pdev.minor = cpu_to_le64(MINOR(dev)); + break; + case S_IFBLK: + strscpy(pdev.type, "IntxBLK"); + pdev.major = cpu_to_le64(MAJOR(dev)); + pdev.minor = cpu_to_le64(MINOR(dev)); + break; + case S_IFIFO: + strscpy(pdev.type, "LnxFIFO"); + break; + default: return -EPERM; + } - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .cifs_sb = cifs_sb, - .desired_access = GENERIC_WRITE, - .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL), - .disposition = FILE_CREATE, - .path = full_path, - .fid = &fid, - }; + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, GENERIC_WRITE, + FILE_CREATE, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL, ACL_NO_MODE); + oparms.fid = &fid; - rc = server->ops->open(xid, &oparms, &oplock, &buf); + rc = server->ops->open(xid, &oparms, &oplock, NULL); if (rc) return rc; - /* - * BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely. - */ - pdev = (struct win_dev *)&buf.fi; io_parms.pid = current->tgid; io_parms.tcon = tcon; - io_parms.length = sizeof(*pdev); - iov[1].iov_base = pdev; - iov[1].iov_len = sizeof(*pdev); - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - } else if (S_ISFIFO(mode)) { - memcpy(pdev->type, "LnxFIFO", 8); - } + io_parms.length = sizeof(pdev); + iov[1].iov_base = &pdev; + iov[1].iov_len = sizeof(pdev); rc = server->ops->sync_write(xid, &fid, &io_parms, &bytes_written, iov, 1); server->ops->close(xid, tcon, &fid); - d_drop(dentry); - /* FIXME: add code here to set EAs */ - cifs_free_open_info(&buf); + return rc; +} + +int cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) +{ + struct inode *new = NULL; + int rc; + + rc = __cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); + if (rc) + return rc; + + if (tcon->posix_extensions) { + rc = smb311_posix_get_inode_info(&new, full_path, NULL, + inode->i_sb, xid); + } else if (tcon->unix_ext) { + rc = cifs_get_inode_info_unix(&new, full_path, + inode->i_sb, xid); + } else { + rc = cifs_get_inode_info(&new, full_path, NULL, + inode->i_sb, xid, NULL); + } + if (!rc) + d_instantiate(dentry, new); return rc; } -- cgit v1.2.3 From 35f834265e0dc78b003aa0d1af65cafb89666b76 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 4 Apr 2024 18:06:56 -0500 Subject: smb3: fix broken reconnect when password changing on the server by allowing password rotation There are various use cases that are becoming more common in which password changes are scheduled on a server(s) periodically but the clients connected to this server need to stay connected (even in the face of brief network reconnects) due to mounts which can not be easily unmounted and mounted at will, and servers that do password rotation do not always have the ability to tell the clients exactly when to the new password will be effective, so add support for an alt password ("password2=") on mount (and also remount) so that we can anticipate the upcoming change to the server without risking breaking existing mounts. An alternative would have been to use the kernel keyring for this but the processes doing the reconnect do not have access to the keyring but do have access to the ses structure. Reviewed-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 + fs/smb/client/connect.c | 8 ++++++++ fs/smb/client/fs_context.c | 21 +++++++++++++++++++++ fs/smb/client/fs_context.h | 2 ++ fs/smb/client/misc.c | 1 + fs/smb/client/smb2pdu.c | 11 +++++++++++ 6 files changed, 44 insertions(+) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index f6a302205f89..d6669ce4ae87 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1077,6 +1077,7 @@ struct cifs_ses { and after mount option parsing we fill it */ char *domainName; char *password; + char *password2; /* When key rotation used, new password may be set before it expires */ char workstation_name[CIFS_MAX_WORKSTATION_LEN]; struct session_key auth_key; struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 85679ae106fd..4e35970681bf 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2183,6 +2183,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) } ++delim; + /* BB consider adding support for password2 (Key Rotation) for multiuser in future */ ctx->password = kstrndup(delim, len, GFP_KERNEL); if (!ctx->password) { cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n", @@ -2206,6 +2207,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) kfree(ctx->username); ctx->username = NULL; kfree_sensitive(ctx->password); + /* no need to free ctx->password2 since not allocated in this path */ ctx->password = NULL; goto out_key_put; } @@ -2317,6 +2319,12 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) if (!ses->password) goto get_ses_fail; } + /* ctx->password freed at unmount */ + if (ctx->password2) { + ses->password2 = kstrdup(ctx->password2, GFP_KERNEL); + if (!ses->password2) + goto get_ses_fail; + } if (ctx->domainname) { ses->domainName = kstrdup(ctx->domainname, GFP_KERNEL); if (!ses->domainName) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index b7bfe705b2c4..6c727d8c31e8 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -162,6 +162,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_string("username", Opt_user), fsparam_string("pass", Opt_pass), fsparam_string("password", Opt_pass), + fsparam_string("password2", Opt_pass2), fsparam_string("ip", Opt_ip), fsparam_string("addr", Opt_ip), fsparam_string("domain", Opt_domain), @@ -345,6 +346,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx new_ctx->nodename = NULL; new_ctx->username = NULL; new_ctx->password = NULL; + new_ctx->password2 = NULL; new_ctx->server_hostname = NULL; new_ctx->domainname = NULL; new_ctx->UNC = NULL; @@ -357,6 +359,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(prepath); DUP_CTX_STR(username); DUP_CTX_STR(password); + DUP_CTX_STR(password2); DUP_CTX_STR(server_hostname); DUP_CTX_STR(UNC); DUP_CTX_STR(source); @@ -905,6 +908,8 @@ static int smb3_reconfigure(struct fs_context *fc) else { kfree_sensitive(ses->password); ses->password = kstrdup(ctx->password, GFP_KERNEL); + kfree_sensitive(ses->password2); + ses->password2 = kstrdup(ctx->password2, GFP_KERNEL); } STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); @@ -1305,6 +1310,18 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } break; + case Opt_pass2: + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; + if (strlen(param->string) == 0) + break; + + ctx->password2 = kstrdup(param->string, GFP_KERNEL); + if (ctx->password2 == NULL) { + cifs_errorf(fc, "OOM when copying password2 string\n"); + goto cifs_parse_mount_err; + } + break; case Opt_ip: if (strlen(param->string) == 0) { ctx->got_ip = false; @@ -1608,6 +1625,8 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_parse_mount_err: kfree_sensitive(ctx->password); ctx->password = NULL; + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; return -EINVAL; } @@ -1713,6 +1732,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->username = NULL; kfree_sensitive(ctx->password); ctx->password = NULL; + kfree_sensitive(ctx->password2); + ctx->password2 = NULL; kfree(ctx->server_hostname); ctx->server_hostname = NULL; kfree(ctx->UNC); diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 8a35645e0b65..a947bddeba27 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -145,6 +145,7 @@ enum cifs_param { Opt_source, Opt_user, Opt_pass, + Opt_pass2, Opt_ip, Opt_domain, Opt_srcaddr, @@ -177,6 +178,7 @@ struct smb3_fs_context { char *username; char *password; + char *password2; char *domainname; char *source; char *server_hostname; diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 33ac4f8f5050..7d15a1969b81 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -98,6 +98,7 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree(buf_to_free->serverDomain); kfree(buf_to_free->serverNOS); kfree_sensitive(buf_to_free->password); + kfree_sensitive(buf_to_free->password2); kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); kfree_sensitive(buf_to_free->auth_key.response); diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index c0c4933af5fc..86c647a947cc 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -367,6 +367,17 @@ again: } rc = cifs_setup_session(0, ses, server, nls_codepage); + if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { + /* + * Try alternate password for next reconnect (key rotation + * could be enabled on the server e.g.) if an alternate + * password is available and the current password is expired, + * but do not swap on non pwd related errors like host down + */ + if (ses->password2) + swap(ses->password2, ses->password); + } + if ((rc == -EACCES) && !tcon->retry) { mutex_unlock(&ses->session_mutex); rc = -EHOSTDOWN; -- cgit v1.2.3 From a8fa658eebe8b17fc852482da52f8841be8931d6 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 22 Mar 2024 14:26:04 +0800 Subject: eventfs: Fix kernel-doc comments to functions This commit fix kernel-doc style comments with complete parameter descriptions for the lookup_file(),lookup_dir_entry() and lookup_file_dentry(). Link: https://lore.kernel.org/linux-trace-kernel/20240322062604.28862-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index dc067eeb6387..894c6ca1e500 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -336,6 +336,7 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode, /** * lookup_file - look up a file in the tracefs filesystem + * @parent_ei: Pointer to the eventfs_inode that represents parent of the file * @dentry: the dentry to look up * @mode: the permission that the file should have. * @attr: saved attributes changed by user @@ -389,6 +390,7 @@ static struct dentry *lookup_file(struct eventfs_inode *parent_ei, /** * lookup_dir_entry - look up a dir in the tracefs filesystem * @dentry: the directory to look up + * @pei: Pointer to the parent eventfs_inode if available * @ei: the eventfs_inode that represents the directory to create * * This function will look up a dentry for a directory represented by @@ -478,16 +480,20 @@ void eventfs_d_release(struct dentry *dentry) /** * lookup_file_dentry - create a dentry for a file of an eventfs_inode + * @dentry: The parent dentry under which the new file's dentry will be created * @ei: the eventfs_inode that the file will be created under * @idx: the index into the entry_attrs[] of the @ei - * @parent: The parent dentry of the created file. - * @name: The name of the file to create * @mode: The mode of the file. * @data: The data to use to set the inode of the file with on open() * @fops: The fops of the file to be created. * - * Create a dentry for a file of an eventfs_inode @ei and place it into the - * address located at @e_dentry. + * This function creates a dentry for a file associated with an + * eventfs_inode @ei. It uses the entry attributes specified by @idx, + * if available. The file will have the specified @mode and its inode will be + * set up with @data upon open. The file operations will be set to @fops. + * + * Return: Returns a pointer to the newly created file's dentry or an error + * pointer. */ static struct dentry * lookup_file_dentry(struct dentry *dentry, -- cgit v1.2.3 From d96c36004e31e2baaf8ea1b449b7d0b2c2bfb41a Mon Sep 17 00:00:00 2001 From: Prasad Pandit Date: Fri, 22 Mar 2024 17:48:01 +0530 Subject: tracing: Fix FTRACE_RECORD_RECURSION_SIZE Kconfig entry Fix FTRACE_RECORD_RECURSION_SIZE entry, replace tab with a space character. It helps Kconfig parsers to read file without error. Link: https://lore.kernel.org/linux-trace-kernel/20240322121801.1803948-1-ppandit@redhat.com Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Fixes: 773c16705058 ("ftrace: Add recording of functions that caused recursion") Signed-off-by: Prasad Pandit Reviewed-by: Randy Dunlap Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 61c541c36596..47345bf1d4a9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -965,7 +965,7 @@ config FTRACE_RECORD_RECURSION config FTRACE_RECORD_RECURSION_SIZE int "Max number of recursed functions to record" - default 128 + default 128 depends on FTRACE_RECORD_RECURSION help This defines the limit of number of functions that can be -- cgit v1.2.3 From 5281ec83454d70d98b71f1836fb16512566c01cd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 10:06:24 +0200 Subject: tracing: hide unused ftrace_event_id_fops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_PERF_EVENTS, a 'make W=1' build produces a warning about the unused ftrace_event_id_fops variable: kernel/trace/trace_events.c:2155:37: error: 'ftrace_event_id_fops' defined but not used [-Werror=unused-const-variable=] 2155 | static const struct file_operations ftrace_event_id_fops = { Hide this in the same #ifdef as the reference to it. Link: https://lore.kernel.org/linux-trace-kernel/20240403080702.3509288-7-arnd@kernel.org Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Mathieu Desnoyers Cc: Zheng Yejian Cc: Kees Cook Cc: Ajay Kaher Cc: Jinjie Ruan Cc: Clément Léger Cc: Dan Carpenter Cc: "Tzvetomir Stoyanov (VMware)" Fixes: 620a30e97feb ("tracing: Don't pass file_operations array to event_create_dir()") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7c364b87352e..52f75c36bbca 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1670,6 +1670,7 @@ static int trace_format_open(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_PERF_EVENTS static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -1684,6 +1685,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } +#endif static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, @@ -2152,10 +2154,12 @@ static const struct file_operations ftrace_event_format_fops = { .release = seq_release, }; +#ifdef CONFIG_PERF_EVENTS static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, .llseek = default_llseek, }; +#endif static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_file_tr, -- cgit v1.2.3 From ffe3986fece696cf65e0ef99e74c75f848be8e30 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 9 Apr 2024 15:13:09 -0400 Subject: ring-buffer: Only update pages_touched when a new page is touched The "buffer_percent" logic that is used by the ring buffer splice code to only wake up the tasks when there's no data after the buffer is filled to the percentage of the "buffer_percent" file is dependent on three variables that determine the amount of data that is in the ring buffer: 1) pages_read - incremented whenever a new sub-buffer is consumed 2) pages_lost - incremented every time a writer overwrites a sub-buffer 3) pages_touched - incremented when a write goes to a new sub-buffer The percentage is the calculation of: (pages_touched - (pages_lost + pages_read)) / nr_pages Basically, the amount of data is the total number of sub-bufs that have been touched, minus the number of sub-bufs lost and sub-bufs consumed. This is divided by the total count to give the buffer percentage. When the percentage is greater than the value in the "buffer_percent" file, it wakes up splice readers waiting for that amount. It was observed that over time, the amount read from the splice was constantly decreasing the longer the trace was running. That is, if one asked for 60%, it would read over 60% when it first starts tracing, but then it would be woken up at under 60% and would slowly decrease the amount of data read after being woken up, where the amount becomes much less than the buffer percent. This was due to an accounting of the pages_touched incrementation. This value is incremented whenever a writer transfers to a new sub-buffer. But the place where it was incremented was incorrect. If a writer overflowed the current sub-buffer it would go to the next one. If it gets preempted by an interrupt at that time, and the interrupt performs a trace, it too will end up going to the next sub-buffer. But only one should increment the counter. Unfortunately, that was not the case. Change the cmpxchg() that does the real switch of the tail-page into a try_cmpxchg(), and on success, perform the increment of pages_touched. This will only increment the counter once for when the writer moves to a new sub-buffer, and not when there's a race and is incremented for when a writer and its preempting writer both move to the same new sub-buffer. Link: https://lore.kernel.org/linux-trace-kernel/20240409151309.0d0e5056@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Fixes: 2c2b0a78b3739 ("ring-buffer: Add percentage of ring buffer full to wake up reader") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 25476ead681b..6511dc3a00da 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1393,7 +1393,6 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); - local_inc(&cpu_buffer->pages_touched); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. @@ -1430,8 +1429,9 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, */ local_set(&next_page->page->commit, 0); - /* Again, either we update tail_page or an interrupt does */ - (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); + /* Either we update tail_page or an interrupt does */ + if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page)) + local_inc(&cpu_buffer->pages_touched); } } -- cgit v1.2.3 From 824f06ff81464f823cd6259ff2ec8fbeceb2afa5 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Thu, 11 Apr 2024 23:36:33 +0000 Subject: fs/9p: Revert "fs/9p: fix dups even in uncached mode" This reverts commit be57855f505003c5cafff40338d5d0f23b00ba4d. It caused a regression involving duplicate inode numbers in some tester trees. The bad behavior seems to be dependent on inode reuse policy in underlying file system, so it did not trigger in my test setup. Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 4236058c7bbd..eeac06c2a84c 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -244,6 +244,21 @@ done: return res; } +static int v9fs_drop_inode(struct inode *inode) +{ + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) + return generic_drop_inode(inode); + /* + * in case of non cached mode always drop the + * inode because we want the inode attribute + * to always match that on the server. + */ + return 1; +} + static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { @@ -278,6 +293,7 @@ static const struct super_operations v9fs_super_ops_dotl = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = v9fs_statfs, + .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, -- cgit v1.2.3 From 7fd524b9bd1be210fe79035800f4bd78a41b349f Mon Sep 17 00:00:00 2001 From: Joakim Sindholt Date: Mon, 18 Mar 2024 12:22:32 +0100 Subject: fs/9p: drop inodes immediately on non-.L too Signed-off-by: Joakim Sindholt Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index eeac06c2a84c..55e67e36ae68 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -283,6 +283,7 @@ static const struct super_operations v9fs_super_ops = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = simple_statfs, + .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, -- cgit v1.2.3 From 3b0daecfeac0103aba8b293df07a0cbaf8b43f29 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 12 Apr 2024 06:11:25 +1000 Subject: amdkfd: use calloc instead of kzalloc to avoid integer overflow This uses calloc instead of doing the multiplication which might overflow. Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f9631f4b1a02..55aa74cbc532 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -779,8 +779,8 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp, * nodes, but not more than args->num_of_nodes as that is * the amount of memory allocated by user */ - pa = kzalloc((sizeof(struct kfd_process_device_apertures) * - args->num_of_nodes), GFP_KERNEL); + pa = kcalloc(args->num_of_nodes, sizeof(struct kfd_process_device_apertures), + GFP_KERNEL); if (!pa) return -ENOMEM; -- cgit v1.2.3 From 2b3e79fea66e166622a454715ce981432ac8c6e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 01:01:11 -0400 Subject: bcachefs: Don't use bch2_btree_node_lock_write_nofail() in btree split path It turns out - btree splits happen with the rest of the transaction still locked, to avoid unnecessary restarts, which means using nofail doesn't work here - we can deadlock. Fortunately, we now have the ability to return errors here. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 41 +++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c4a5e83a56a4..29a5bc7d8789 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1280,23 +1280,29 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) bch2_recalc_btree_reserve(c); } -static void bch2_btree_set_root(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +static int bch2_btree_set_root(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + bool nofail) { struct bch_fs *c = as->c; - struct btree *old; trace_and_count(c, btree_node_set_root, trans, b); - old = btree_node_root(c, b); + struct btree *old = btree_node_root(c, b); /* * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write_nofail(trans, path, &old->c); + if (nofail) { + bch2_btree_node_lock_write_nofail(trans, path, &old->c); + } else { + int ret = bch2_btree_node_lock_write(trans, path, &old->c); + if (ret) + return ret; + } bch2_btree_set_root_inmem(c, b); @@ -1310,6 +1316,7 @@ static void bch2_btree_set_root(struct btree_update *as, * depend on the new root would have to update the new root. */ bch2_btree_node_unlock_write(trans, path, old); + return 0; } /* Interior node updates: */ @@ -1652,15 +1659,16 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); - if (ret) - goto err; } else if (n3) { - bch2_btree_set_root(as, trans, trans->paths + path, n3); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, trans, trans->paths + path, n1); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false); } + if (ret) + goto err; + if (n3) { bch2_btree_update_get_open_buckets(as, n3); bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); @@ -1863,7 +1871,9 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * bch2_keylist_add(&as->parent_keys, &b->key); btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); - bch2_btree_set_root(as, trans, path, n); + int ret = bch2_btree_set_root(as, trans, path, n, true); + BUG_ON(ret); + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); bch2_trans_node_add(trans, path, n); @@ -2106,12 +2116,13 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); - if (ret) - goto err; } else { - bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n); + ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false); } + if (ret) + goto err; + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -- cgit v1.2.3 From 210cfef579260ed6c3b700e7baeae51a5e183f43 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 4 Apr 2024 17:02:09 -0500 Subject: selftests/powerpc/papr-vpd: Fix missing variable initialization The "close handle without consuming VPD" testcase has inconsistent results because it fails to initialize the location code object it passes to ioctl() to create a VPD handle. Initialize the location code to the empty string as intended. Signed-off-by: Nathan Lynch Fixes: 9118c5d32bdd ("powerpc/selftests: Add test for papr-vpd") Reported-by: Geetika Moolchandani Signed-off-by: Michael Ellerman Link: https://msgid.link/20240404-papr-vpd-test-uninit-lc-v2-1-37bff46c65a5@linux.ibm.com --- tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c index 505294da1b9f..d6f99eb9be65 100644 --- a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c +++ b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c @@ -154,7 +154,7 @@ static int dev_papr_vpd_null_handle(void) static int papr_vpd_close_handle_without_reading(void) { const int devfd = open(DEVPATH, O_RDONLY); - struct papr_location_code lc; + struct papr_location_code lc = { .str = "", }; int fd; SKIP_IF_MSG(devfd < 0 && errno == ENOENT, -- cgit v1.2.3 From 84b1cec4fac5889b6d138d4b5ff6f2d5ef126c5e Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 4 Apr 2024 10:27:17 +0000 Subject: iommu/amd: Fix possible irq lock inversion dependency issue LOCKDEP detector reported below warning: ---------------------------------------- [ 23.796949] ======================================================== [ 23.796950] WARNING: possible irq lock inversion dependency detected [ 23.796952] 6.8.0fix+ #811 Not tainted [ 23.796954] -------------------------------------------------------- [ 23.796954] kworker/0:1/8 just changed the state of lock: [ 23.796956] ff365325e084a9b8 (&domain->lock){..-.}-{3:3}, at: amd_iommu_flush_iotlb_all+0x1f/0x50 [ 23.796969] but this lock took another, SOFTIRQ-unsafe lock in the past: [ 23.796970] (pd_bitmap_lock){+.+.}-{3:3} [ 23.796972] and interrupts could create inverse lock ordering between them. [ 23.796973] other info that might help us debug this: [ 23.796974] Chain exists of: &domain->lock --> &dev_data->lock --> pd_bitmap_lock [ 23.796980] Possible interrupt unsafe locking scenario: [ 23.796981] CPU0 CPU1 [ 23.796982] ---- ---- [ 23.796983] lock(pd_bitmap_lock); [ 23.796985] local_irq_disable(); [ 23.796985] lock(&domain->lock); [ 23.796988] lock(&dev_data->lock); [ 23.796990] [ 23.796991] lock(&domain->lock); Fix this issue by disabling interrupt when acquiring pd_bitmap_lock. Note that this is temporary fix. We have a plan to replace custom bitmap allocator with IDA allocator. Fixes: 87a6f1f22c97 ("iommu/amd: Introduce per-device domain ID to fix potential TLB aliasing issue") Reviewed-by: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20240404102717.6705-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d35c1b8c8e65..e692217fcb28 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1692,26 +1692,29 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid, static u16 domain_id_alloc(void) { + unsigned long flags; int id; - spin_lock(&pd_bitmap_lock); + spin_lock_irqsave(&pd_bitmap_lock, flags); id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); BUG_ON(id == 0); if (id > 0 && id < MAX_DOMAIN_ID) __set_bit(id, amd_iommu_pd_alloc_bitmap); else id = 0; - spin_unlock(&pd_bitmap_lock); + spin_unlock_irqrestore(&pd_bitmap_lock, flags); return id; } static void domain_id_free(int id) { - spin_lock(&pd_bitmap_lock); + unsigned long flags; + + spin_lock_irqsave(&pd_bitmap_lock, flags); if (id > 0 && id < MAX_DOMAIN_ID) __clear_bit(id, amd_iommu_pd_alloc_bitmap); - spin_unlock(&pd_bitmap_lock); + spin_unlock_irqrestore(&pd_bitmap_lock, flags); } static void free_gcr3_tbl_level1(u64 *tbl) -- cgit v1.2.3 From b650b38b006053ef96e90b8e3f7e6edc7845cd57 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 10 Apr 2024 08:57:02 +0000 Subject: iommu/amd: Do not enable SNP when V2 page table is enabled DTE[Mode]=0 is not supported when SNP is enabled in the host. That means to support SNP, IOMMU must be configured with V1 page table (See IOMMU spec [1] for the details). If user passes kernel command line to configure IOMMU domains with v2 page table (amd_iommu=pgtbl_v2) then disable SNP as the user asked by not forcing the page table to v1. [1] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_IOMMU.pdf Cc: Ashish Kalra Cc: Michael Roth Cc: Tom Lendacky Signed-off-by: Vasant Hegde Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20240410085702.31869-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 33228c1c8980..a714eb08c08b 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3232,28 +3232,29 @@ static void iommu_snp_enable(void) return; /* * The SNP support requires that IOMMU must be enabled, and is - * not configured in the passthrough mode. + * configured with V1 page table (DTE[Mode] = 0 is not supported). */ if (no_iommu || iommu_default_passthrough()) { pr_err("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); - cc_platform_clear(CC_ATTR_HOST_SEV_SNP); - return; + goto disable_snp; + } + + if (amd_iommu_pgtable != AMD_IOMMU_V1) { + pr_warn("SNP: IOMMU is configured with V2 page table mode, SNP cannot be supported.\n"); + goto disable_snp; } amd_iommu_snp_en = check_feature(FEATURE_SNP); if (!amd_iommu_snp_en) { pr_err("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); - cc_platform_clear(CC_ATTR_HOST_SEV_SNP); - return; + goto disable_snp; } pr_info("IOMMU SNP support enabled.\n"); + return; - /* Enforce IOMMU v1 pagetable when SNP is enabled. */ - if (amd_iommu_pgtable != AMD_IOMMU_V1) { - pr_warn("Forcing use of AMD IOMMU v1 page table due to SNP.\n"); - amd_iommu_pgtable = AMD_IOMMU_V1; - } +disable_snp: + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); #endif } -- cgit v1.2.3 From 7537e31df80cb58c27f3b6fef702534ea87a5957 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 10 Apr 2024 18:41:09 +0200 Subject: iommu: mtk: fix module autoloading Add MODULE_DEVICE_TABLE(), so modules could be properly autoloaded based on the alias from of_device_id table. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240410164109.233308-1-krzk@kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c | 1 + drivers/iommu/mtk_iommu_v1.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index b8c47f18bc26..6a2707fe7a78 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -1790,6 +1790,7 @@ static const struct of_device_id mtk_iommu_of_ids[] = { { .compatible = "mediatek,mt8365-m4u", .data = &mt8365_data}, {} }; +MODULE_DEVICE_TABLE(of, mtk_iommu_of_ids); static struct platform_driver mtk_iommu_driver = { .probe = mtk_iommu_probe, diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index a9fa2a54dc9b..d6e4002200bd 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -600,6 +600,7 @@ static const struct of_device_id mtk_iommu_v1_of_ids[] = { { .compatible = "mediatek,mt2701-m4u", }, {} }; +MODULE_DEVICE_TABLE(of, mtk_iommu_v1_of_ids); static const struct component_master_ops mtk_iommu_v1_com_ops = { .bind = mtk_iommu_v1_bind, -- cgit v1.2.3 From 36d4fe147c870f6d3f6602befd7ef44393a1c87a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:50 -0700 Subject: x86/bugs: Remove CONFIG_BHI_MITIGATION_AUTO and spectre_bhi=auto Unlike most other mitigations' "auto" options, spectre_bhi=auto only mitigates newer systems, which is confusing and not particularly useful. Remove it. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Cc: Sean Christopherson Cc: Linus Torvalds Link: https://lore.kernel.org/r/412e9dc87971b622bbbaf64740ebc1f140bff343.1712813475.git.jpoimboe@kernel.org --- Documentation/admin-guide/hw-vuln/spectre.rst | 4 ---- Documentation/admin-guide/kernel-parameters.txt | 3 --- arch/x86/Kconfig | 5 ----- arch/x86/kernel/cpu/bugs.c | 10 +--------- 4 files changed, 1 insertion(+), 21 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 5a39acf82483..25a04cda4c2c 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -669,10 +669,6 @@ kernel command line. needed. off Disable the mitigation. - auto - Enable the HW mitigation if needed, but - *don't* enable the SW mitigation except for KVM. - The system may be vulnerable. For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a3874cc97892..902ecd92a29f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6072,9 +6072,6 @@ on - (default) Enable the HW or SW mitigation as needed. off - Disable the mitigation. - auto - Enable the HW mitigation if needed, but - *don't* enable the SW mitigation except - for KVM. The system may be vulnerable. spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 10a6251f58f3..b63b6767a63d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2651,11 +2651,6 @@ config SPECTRE_BHI_OFF bool "off" help Equivalent to setting spectre_bhi=off command line parameter. -config SPECTRE_BHI_AUTO - bool "auto" - depends on BROKEN - help - Equivalent to setting spectre_bhi=auto command line parameter. endchoice diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9eeb60f5fbb3..6af4780a18ed 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1625,13 +1625,10 @@ static bool __init spec_ctrl_bhi_dis(void) enum bhi_mitigations { BHI_MITIGATION_OFF, BHI_MITIGATION_ON, - BHI_MITIGATION_AUTO, }; static enum bhi_mitigations bhi_mitigation __ro_after_init = - IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : - IS_ENABLED(CONFIG_SPECTRE_BHI_OFF) ? BHI_MITIGATION_OFF : - BHI_MITIGATION_AUTO; + IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { @@ -1642,8 +1639,6 @@ static int __init spectre_bhi_parse_cmdline(char *str) bhi_mitigation = BHI_MITIGATION_OFF; else if (!strcmp(str, "on")) bhi_mitigation = BHI_MITIGATION_ON; - else if (!strcmp(str, "auto")) - bhi_mitigation = BHI_MITIGATION_AUTO; else pr_err("Ignoring unknown spectre_bhi option (%s)", str); @@ -1673,9 +1668,6 @@ static void __init bhi_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n"); - if (bhi_mitigation == BHI_MITIGATION_AUTO) - return; - /* Mitigate syscalls when the mitigation is forced =on */ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n"); -- cgit v1.2.3 From 4f511739c54b549061993b53fc0380f48dfca23b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 10 Apr 2024 22:40:51 -0700 Subject: x86/bugs: Replace CONFIG_SPECTRE_BHI_{ON,OFF} with CONFIG_MITIGATION_SPECTRE_BHI For consistency with the other CONFIG_MITIGATION_* options, replace the CONFIG_SPECTRE_BHI_{ON,OFF} options with a single CONFIG_MITIGATION_SPECTRE_BHI option. [ mingo: Fix ] Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Sean Christopherson Cc: Linus Torvalds Cc: Nikolay Borisov Link: https://lore.kernel.org/r/3833812ea63e7fdbe36bf8b932e63f70d18e2a2a.1712813475.git.jpoimboe@kernel.org --- arch/x86/Kconfig | 17 +++-------------- arch/x86/kernel/cpu/bugs.c | 2 +- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b63b6767a63d..4474bf32d0a4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2633,27 +2633,16 @@ config MITIGATION_RFDS stored in floating point, vector and integer registers. See also -choice - prompt "Clear branch history" +config MITIGATION_SPECTRE_BHI + bool "Mitigate Spectre-BHB (Branch History Injection)" depends on CPU_SUP_INTEL - default SPECTRE_BHI_ON + default y help Enable BHI mitigations. BHI attacks are a form of Spectre V2 attacks where the branch history buffer is poisoned to speculatively steer indirect branches. See -config SPECTRE_BHI_ON - bool "on" - help - Equivalent to setting spectre_bhi=on command line parameter. -config SPECTRE_BHI_OFF - bool "off" - help - Equivalent to setting spectre_bhi=off command line parameter. - -endchoice - endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6af4780a18ed..ca295b0c1eee 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1628,7 +1628,7 @@ enum bhi_mitigations { }; static enum bhi_mitigations bhi_mitigation __ro_after_init = - IS_ENABLED(CONFIG_SPECTRE_BHI_ON) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { -- cgit v1.2.3 From 1b3108f6898ef2e03973d65255182792e94e2240 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2024 21:45:27 +0200 Subject: x86/cpu/amd: Make the CPUID 0x80000008 parser correct CPUID 0x80000008 ECX.cpu_nthreads describes the number of threads in the package. The parser uses this value to initialize the SMT domain level. That's wrong because cpu_nthreads does not describe the number of threads per physical core. So this needs to set the CORE domain level and let the later parsers set the SMT shift if available. Preset the SMT domain level with the assumption of one thread per core, which is correct ifrt here are no other CPUID leafs to parse, and propagate cpu_nthreads and the core level APIC bitwidth into the CORE domain. Fixes: f7fb3b2dd92c ("x86/cpu: Provide an AMD/HYGON specific topology parser") Reported-by: "kernelci.org bot" Reported-by: Laura Nao Signed-off-by: Thomas Gleixner Tested-by: Laura Nao Link: https://lore.kernel.org/r/20240410194311.535206450@linutronix.de --- arch/x86/kernel/cpu/topology_amd.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 1a8b3ad493af..79a85a4814ca 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -29,11 +29,21 @@ static bool parse_8000_0008(struct topo_scan *tscan) if (!sft) sft = get_count_order(ecx.cpu_nthreads + 1); - topology_set_dom(tscan, TOPO_SMT_DOMAIN, sft, ecx.cpu_nthreads + 1); + /* + * cpu_nthreads describes the number of threads in the package + * sft is the number of APIC ID bits per package + * + * As the number of actual threads per core is not described in + * this leaf, just set the CORE domain shift and let the later + * parsers set SMT shift. Assume one thread per core by default + * which is correct if there are no other CPUID leafs to parse. + */ + topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1); return true; } -static void store_node(struct topo_scan *tscan, unsigned int nr_nodes, u16 node_id) +static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) { /* * Starting with Fam 17h the DIE domain could probably be used to @@ -73,12 +83,14 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) tscan->c->topo.initial_apicid = leaf.ext_apic_id; /* - * If leaf 0xb is available, then SMT shift is set already. If not - * take it from ecx.threads_per_core and use topo_update_dom() - - * topology_set_dom() would propagate and overwrite the already - * propagated CORE level. + * If leaf 0xb is available, then the domain shifts are set + * already and nothing to do here. */ if (!has_0xb) { + /* + * Leaf 0x80000008 set the CORE domain shift already. + * Update the SMT domain, but do not propagate it. + */ unsigned int nthreads = leaf.core_nthreads + 1; topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads); -- cgit v1.2.3 From c064b536a8f9ab7c8e204da8f5a22f7420d0b56c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2024 21:45:28 +0200 Subject: x86/cpu/amd: Make the NODEID_MSR union actually work A system with NODEID_MSR was reported to crash during early boot without any output. The reason is that the union which is used for accessing the bitfields in the MSR is written wrongly and the resulting executable code accesses the wrong part of the MSR data. As a consequence a later division by that value results in 0 and that result is used for another division as divisor, which obviously does not work well. The magic world of C, unions and bitfields: union { u64 bita : 3, bitb : 3; u64 all; } x; x.all = foo(); a = x.bita; b = x.bitb; results in the effective executable code of: a = b = x.bita; because bita and bitb are treated as union members and therefore both end up at bit offset 0. Wrapping the bitfield into an anonymous struct: union { struct { u64 bita : 3, bitb : 3; }; u64 all; } x; works like expected. Rework the NODEID_MSR union in exactly that way to cure the problem. Fixes: f7fb3b2dd92c ("x86/cpu: Provide an AMD/HYGON specific topology parser") Reported-by: "kernelci.org bot" Reported-by: Laura Nao Signed-off-by: Thomas Gleixner Tested-by: Laura Nao Link: https://lore.kernel.org/r/20240410194311.596282919@linutronix.de Closes: https://lore.kernel.org/all/20240322175210.124416-1-laura.nao@collabora.com/ --- arch/x86/kernel/cpu/topology_amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 79a85a4814ca..7f999aef1965 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -121,13 +121,13 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) static bool parse_fam10h_node_id(struct topo_scan *tscan) { - struct { - union { + union { + struct { u64 node_id : 3, nodes_per_pkg : 3, unused : 58; - u64 msr; }; + u64 msr; } nid; if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) -- cgit v1.2.3 From 7211274fe0ee352332255e41ab5e628b86e83994 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Apr 2024 18:55:38 +0200 Subject: x86/cpu/amd: Move TOPOEXT enablement into the topology parser The topology rework missed that early_init_amd() tries to re-enable the Topology Extensions when the BIOS disabled them. The new parser is invoked before early_init_amd() so the re-enable attempt happens too late. Move it into the AMD specific topology parser code where it belongs. Fixes: f7fb3b2dd92c ("x86/cpu: Provide an AMD/HYGON specific topology parser") Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/878r1j260l.ffs@tglx --- arch/x86/kernel/cpu/amd.c | 15 --------------- arch/x86/kernel/cpu/topology_amd.c | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9bf17c9c29da..cb9eece55904 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -535,7 +535,6 @@ clear_sev: static void early_init_amd(struct cpuinfo_x86 *c) { - u64 value; u32 dummy; if (c->x86 >= 0xf) @@ -603,20 +602,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_detect_mem_encrypt(c); - /* Re-enable TopologyExtensions if switched off by BIOS */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && - !cpu_has(c, X86_FEATURE_TOPOEXT)) { - - if (msr_set_bit(0xc0011005, 54) > 0) { - rdmsrl(0xc0011005, value); - if (value & BIT_64(54)) { - set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); - } - } - } - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 7f999aef1965..a7aa6eff4ae5 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -147,6 +147,26 @@ static void legacy_set_llc(struct topo_scan *tscan) tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN]; } +static void topoext_fixup(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + u64 msrval; + + /* Try to re-enable TopologyExtensions if switched off by BIOS */ + if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD || + c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) + return; + + if (msr_set_bit(0xc0011005, 54) <= 0) + return; + + rdmsrl(0xc0011005, msrval); + if (msrval & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } +} + static void parse_topology_amd(struct topo_scan *tscan) { bool has_0xb = false; @@ -176,6 +196,7 @@ static void parse_topology_amd(struct topo_scan *tscan) void cpu_parse_topology_amd(struct topo_scan *tscan) { tscan->amd_nodes_per_pkg = 1; + topoext_fixup(tscan); parse_topology_amd(tscan); if (tscan->amd_nodes_per_pkg > 1) -- cgit v1.2.3 From 5b3625a4f6422e8982f90f0c11b5546149c962b8 Mon Sep 17 00:00:00 2001 From: Xuchun Shang Date: Thu, 11 Apr 2024 11:07:42 +0800 Subject: iommu/vt-d: Fix wrong use of pasid config The commit "iommu/vt-d: Add IOMMU perfmon support" introduce IOMMU PMU feature, but use the wrong config when set pasid filter. Fixes: 7232ab8b89e9 ("iommu/vt-d: Add IOMMU perfmon support") Signed-off-by: Xuchun Shang Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20240401060753.3321318-1-xuchun.shang@linux.alibaba.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/perfmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c index cf43e798eca4..44083d01852d 100644 --- a/drivers/iommu/intel/perfmon.c +++ b/drivers/iommu/intel/perfmon.c @@ -438,7 +438,7 @@ static int iommu_pmu_assign_event(struct iommu_pmu *iommu_pmu, iommu_pmu_set_filter(domain, event->attr.config1, IOMMU_PMU_FILTER_DOMAIN, idx, event->attr.config1); - iommu_pmu_set_filter(pasid, event->attr.config1, + iommu_pmu_set_filter(pasid, event->attr.config2, IOMMU_PMU_FILTER_PASID, idx, event->attr.config1); iommu_pmu_set_filter(ats, event->attr.config2, -- cgit v1.2.3 From a34f3e20ddff02c4f12df2c0635367394e64c63d Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 11 Apr 2024 11:07:43 +0800 Subject: iommu/vt-d: Allocate local memory for page request queue The page request queue is per IOMMU, its allocation should be made NUMA-aware for performance reasons. Fixes: a222a7f0bb6c ("iommu/vt-d: Implement page request handling") Signed-off-by: Jacob Pan Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240403214007.985600-1-jacob.jun.pan@linux.intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index c1bed89b1026..ee3b469e2da1 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -66,7 +66,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) struct page *pages; int irq, ret; - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER); + pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO, PRQ_ORDER); if (!pages) { pr_warn("IOMMU: %s: Failed to allocate page request queue\n", iommu->name); -- cgit v1.2.3 From 89436f4f54125b1297aec1f466efd8acb4ec613d Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 11 Apr 2024 11:07:44 +0800 Subject: iommu/vt-d: Fix WARN_ON in iommu probe path Commit 1a75cc710b95 ("iommu/vt-d: Use rbtree to track iommu probed devices") adds all devices probed by the iommu driver in a rbtree indexed by the source ID of each device. It assumes that each device has a unique source ID. This assumption is incorrect and the VT-d spec doesn't state this requirement either. The reason for using a rbtree to track devices is to look up the device with PCI bus and devfunc in the paths of handling ATS invalidation time out error and the PRI I/O page faults. Both are PCI ATS feature related. Only track the devices that have PCI ATS capabilities in the rbtree to avoid unnecessary WARN_ON in the iommu probe path. Otherwise, on some platforms below kernel splat will be displayed and the iommu probe results in failure. WARNING: CPU: 3 PID: 166 at drivers/iommu/intel/iommu.c:158 intel_iommu_probe_device+0x319/0xd90 Call Trace: ? __warn+0x7e/0x180 ? intel_iommu_probe_device+0x319/0xd90 ? report_bug+0x1f8/0x200 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x18/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? intel_iommu_probe_device+0x319/0xd90 ? debug_mutex_init+0x37/0x50 __iommu_probe_device+0xf2/0x4f0 iommu_probe_device+0x22/0x70 iommu_bus_notifier+0x1e/0x40 notifier_call_chain+0x46/0x150 blocking_notifier_call_chain+0x42/0x60 bus_notify+0x2f/0x50 device_add+0x5ed/0x7e0 platform_device_add+0xf5/0x240 mfd_add_devices+0x3f9/0x500 ? preempt_count_add+0x4c/0xa0 ? up_write+0xa2/0x1b0 ? __debugfs_create_file+0xe3/0x150 intel_lpss_probe+0x49f/0x5b0 ? pci_conf1_write+0xa3/0xf0 intel_lpss_pci_probe+0xcf/0x110 [intel_lpss_pci] pci_device_probe+0x95/0x120 really_probe+0xd9/0x370 ? __pfx___driver_attach+0x10/0x10 __driver_probe_device+0x73/0x150 driver_probe_device+0x19/0xa0 __driver_attach+0xb6/0x180 ? __pfx___driver_attach+0x10/0x10 bus_for_each_dev+0x77/0xd0 bus_add_driver+0x114/0x210 driver_register+0x5b/0x110 ? __pfx_intel_lpss_pci_driver_init+0x10/0x10 [intel_lpss_pci] do_one_initcall+0x57/0x2b0 ? kmalloc_trace+0x21e/0x280 ? do_init_module+0x1e/0x210 do_init_module+0x5f/0x210 load_module+0x1d37/0x1fc0 ? init_module_from_file+0x86/0xd0 init_module_from_file+0x86/0xd0 idempotent_init_module+0x17c/0x230 __x64_sys_finit_module+0x56/0xb0 do_syscall_64+0x6e/0x140 entry_SYSCALL_64_after_hwframe+0x71/0x79 Fixes: 1a75cc710b95 ("iommu/vt-d: Use rbtree to track iommu probed devices") Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10689 Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20240407011429.136282-1-baolu.lu@linux.intel.com Reviewed-by: Kevin Tian Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 50eb9aed47cc..a7ecd90303dc 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4299,9 +4299,11 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) } dev_iommu_priv_set(dev, info); - ret = device_rbtree_insert(iommu, info); - if (ret) - goto free; + if (pdev && pci_ats_supported(pdev)) { + ret = device_rbtree_insert(iommu, info); + if (ret) + goto free; + } if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { ret = intel_pasid_alloc_table(dev); @@ -4336,7 +4338,8 @@ static void intel_iommu_release_device(struct device *dev) struct intel_iommu *iommu = info->iommu; mutex_lock(&iommu->iopf_lock); - device_rbtree_remove(info); + if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) + device_rbtree_remove(info); mutex_unlock(&iommu->iopf_lock); if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && -- cgit v1.2.3 From b8246a2ad80a810cafbeddb30525278f9d64bca3 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 10 Apr 2024 10:16:43 +0000 Subject: iommu/amd: Change log message severity Use consistent log severity (pr_warn) to log all messages in SNP enable path. Suggested-by: Tom Lendacky Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20240410101643.32309-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index a714eb08c08b..ac6754a85f35 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3235,7 +3235,7 @@ static void iommu_snp_enable(void) * configured with V1 page table (DTE[Mode] = 0 is not supported). */ if (no_iommu || iommu_default_passthrough()) { - pr_err("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); + pr_warn("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n"); goto disable_snp; } @@ -3246,7 +3246,7 @@ static void iommu_snp_enable(void) amd_iommu_snp_en = check_feature(FEATURE_SNP); if (!amd_iommu_snp_en) { - pr_err("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); + pr_warn("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n"); goto disable_snp; } -- cgit v1.2.3 From e4a6bceac98eba3c00e874892736b34ea5fdaca3 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Apr 2024 16:26:28 -0700 Subject: selftests: timers: Fix posix_timers ksft_print_msg() warning After commit 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") the following warning occurs when building with an older gcc: posix_timers.c:250:2: warning: format not a string literal and no format arguments [-Wformat-security] 250 | ksft_print_msg(errmsg); | ^~~~~~~~~~~~~~ Fix this up by changing it to ksft_print_msg("%s", errmsg) Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Acked-by: Justin Stitt Acked-by: Shuah Khan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240410232637.4135564-1-jstultz@google.com --- tools/testing/selftests/timers/posix_timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index d86a0e00711e..348f47176e0a 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -247,7 +247,7 @@ static int check_timer_distribution(void) ksft_test_result_skip("check signal distribution (old kernel)\n"); return 0; err: - ksft_print_msg(errmsg); + ksft_print_msg("%s", errmsg); return -1; } -- cgit v1.2.3 From f7d5bcd35d427daac7e206b1073ca14f5db85c27 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 11 Apr 2024 11:45:40 -0700 Subject: selftests: kselftest: Mark functions that unconditionally call exit() as __noreturn After commit 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()"), clang warns: tools/testing/selftests/timers/../kselftest.h:398:6: warning: variable 'major' is used uninitialized whenever '||' condition is true [-Wsometimes-uninitialized] 398 | if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) | ^~~~~~~~~~~~ tools/testing/selftests/timers/../kselftest.h:401:9: note: uninitialized use occurs here 401 | return major > min_major || (major == min_major && minor >= min_minor); | ^~~~~ tools/testing/selftests/timers/../kselftest.h:398:6: note: remove the '||' if its condition is always false 398 | if (uname(&info) || sscanf(info.release, "%u.%u.", &major, &minor) != 2) | ^~~~~~~~~~~~~~~ tools/testing/selftests/timers/../kselftest.h:395:20: note: initialize the variable 'major' to silence this warning 395 | unsigned int major, minor; | ^ | = 0 This is a false positive because if uname() fails, ksft_exit_fail_msg() will be called, which unconditionally calls exit(), a noreturn function. However, clang does not know that ksft_exit_fail_msg() will call exit() at the point in the pipeline that the warning is emitted because inlining has not occurred, so it assumes control flow will resume normally after ksft_exit_fail_msg() is called. Make it clear to clang that all of the functions that call exit() unconditionally in kselftest.h are noreturn transitively by marking them explicitly with '__attribute__((__noreturn__))', which clears up the warning above and any future warnings that may appear for the same reason. Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Reported-by: John Stultz Signed-off-by: Nathan Chancellor Signed-off-by: Thomas Gleixner Acked-by: Shuah Khan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240411-mark-kselftest-exit-funcs-noreturn-v1-1-b027c948f586@kernel.org Closes: https://lore.kernel.org/all/20240410232637.4135564-2-jstultz@google.com/ --- tools/testing/selftests/kselftest.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 973b18e156b2..0591974b57e0 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -80,6 +80,9 @@ #define KSFT_XPASS 3 #define KSFT_SKIP 4 +#ifndef __noreturn +#define __noreturn __attribute__((__noreturn__)) +#endif #define __printf(a, b) __attribute__((format(printf, a, b))) /* counters */ @@ -300,13 +303,13 @@ void ksft_test_result_code(int exit_code, const char *test_name, va_end(args); } -static inline int ksft_exit_pass(void) +static inline __noreturn int ksft_exit_pass(void) { ksft_print_cnts(); exit(KSFT_PASS); } -static inline int ksft_exit_fail(void) +static inline __noreturn int ksft_exit_fail(void) { ksft_print_cnts(); exit(KSFT_FAIL); @@ -333,7 +336,7 @@ static inline int ksft_exit_fail(void) ksft_cnt.ksft_xfail + \ ksft_cnt.ksft_xskip) -static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) +static inline __noreturn __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) { int saved_errno = errno; va_list args; @@ -348,19 +351,19 @@ static inline __printf(1, 2) int ksft_exit_fail_msg(const char *msg, ...) exit(KSFT_FAIL); } -static inline int ksft_exit_xfail(void) +static inline __noreturn int ksft_exit_xfail(void) { ksft_print_cnts(); exit(KSFT_XFAIL); } -static inline int ksft_exit_xpass(void) +static inline __noreturn int ksft_exit_xpass(void) { ksft_print_cnts(); exit(KSFT_XPASS); } -static inline __printf(1, 2) int ksft_exit_skip(const char *msg, ...) +static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) { int saved_errno = errno; va_list args; -- cgit v1.2.3 From ed366de8ec89d4f960d66c85fc37d9de22f7bf6d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Apr 2024 16:26:30 -0700 Subject: selftests: timers: Fix abs() warning in posix_timers test Building with clang results in the following warning: posix_timers.c:69:6: warning: absolute value function 'abs' given an argument of type 'long long' but has parameter of type 'int' which may cause truncation of value [-Wabsolute-value] if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { ^ So switch to using llabs() instead. Fixes: 0bc4b0cf1570 ("selftests: add basic posix timers selftests") Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240410232637.4135564-3-jstultz@google.com --- tools/testing/selftests/timers/posix_timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 348f47176e0a..c001dd79179d 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -66,7 +66,7 @@ static int check_diff(struct timeval start, struct timeval end) diff = end.tv_usec - start.tv_usec; diff += (end.tv_sec - start.tv_sec) * USECS_PER_SEC; - if (abs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { + if (llabs(diff - DELAY * USECS_PER_SEC) > USECS_PER_SEC / 2) { printf("Diff too high: %lld..", diff); return -1; } -- cgit v1.2.3 From 607638faf2ff1cede37458111496e7cc6c977f6f Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Wed, 10 Apr 2024 11:46:18 +0200 Subject: s390/qdio: handle deferred cc1 A deferred condition code 1 response indicates that I/O was not started and should be retried. The current QDIO implementation handles a cc1 response as I/O error, resulting in a failed QDIO setup. This can happen for example when a path verification request arrives at the same time as QDIO setup I/O is started. Fix this by retrying the QDIO setup I/O when a cc1 response is received. Note that since commit 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") commit 5ef1dc40ffa6 ("s390/cio: fix invalid -EBUSY on ccw_device_start") deferred cc1 responses are much more likely to occur. See the commit message of the latter for more background information. Fixes: 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") Reviewed-by: Alexandra Winter Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/qdio_main.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 3d9f0834c78b..a1cb39f4b7a2 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -722,8 +722,8 @@ static void qdio_handle_activate_check(struct qdio_irq *irq_ptr, lgr_info_log(); } -static void qdio_establish_handle_irq(struct qdio_irq *irq_ptr, int cstat, - int dstat) +static int qdio_establish_handle_irq(struct qdio_irq *irq_ptr, int cstat, + int dstat, int dcc) { DBF_DEV_EVENT(DBF_INFO, irq_ptr, "qest irq"); @@ -731,15 +731,18 @@ static void qdio_establish_handle_irq(struct qdio_irq *irq_ptr, int cstat, goto error; if (dstat & ~(DEV_STAT_DEV_END | DEV_STAT_CHN_END)) goto error; + if (dcc == 1) + return -EAGAIN; if (!(dstat & DEV_STAT_DEV_END)) goto error; qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ESTABLISHED); - return; + return 0; error: DBF_ERROR("%4x EQ:error", irq_ptr->schid.sch_no); DBF_ERROR("ds: %2x cs:%2x", dstat, cstat); qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR); + return -EIO; } /* qdio interrupt handler */ @@ -748,7 +751,7 @@ void qdio_int_handler(struct ccw_device *cdev, unsigned long intparm, { struct qdio_irq *irq_ptr = cdev->private->qdio_data; struct subchannel_id schid; - int cstat, dstat; + int cstat, dstat, rc, dcc; if (!intparm || !irq_ptr) { ccw_device_get_schid(cdev, &schid); @@ -768,10 +771,12 @@ void qdio_int_handler(struct ccw_device *cdev, unsigned long intparm, qdio_irq_check_sense(irq_ptr, irb); cstat = irb->scsw.cmd.cstat; dstat = irb->scsw.cmd.dstat; + dcc = scsw_cmd_is_valid_cc(&irb->scsw) ? irb->scsw.cmd.cc : 0; + rc = 0; switch (irq_ptr->state) { case QDIO_IRQ_STATE_INACTIVE: - qdio_establish_handle_irq(irq_ptr, cstat, dstat); + rc = qdio_establish_handle_irq(irq_ptr, cstat, dstat, dcc); break; case QDIO_IRQ_STATE_CLEANUP: qdio_set_state(irq_ptr, QDIO_IRQ_STATE_INACTIVE); @@ -785,12 +790,25 @@ void qdio_int_handler(struct ccw_device *cdev, unsigned long intparm, if (cstat || dstat) qdio_handle_activate_check(irq_ptr, intparm, cstat, dstat); + else if (dcc == 1) + rc = -EAGAIN; break; case QDIO_IRQ_STATE_STOPPED: break; default: WARN_ON_ONCE(1); } + + if (rc == -EAGAIN) { + DBF_DEV_EVENT(DBF_INFO, irq_ptr, "qint retry"); + rc = ccw_device_start(cdev, irq_ptr->ccw, intparm, 0, 0); + if (!rc) + return; + DBF_ERROR("%4x RETRY ERR", irq_ptr->schid.sch_no); + DBF_ERROR("rc:%4x", rc); + qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR); + } + wake_up(&cdev->private->wait_q); } -- cgit v1.2.3 From 2d8527f2f911fab84aec04df4788c0c23af3df48 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Wed, 10 Apr 2024 11:46:19 +0200 Subject: s390/cio: fix race condition during online processing A race condition exists in ccw_device_set_online() that can cause the online process to fail, leaving the affected device in an inconsistent state. As a result, subsequent attempts to set that device online fail with return code ENODEV. The problem occurs when a path verification request arrives after a wait for final device state completed, but before the result state is evaluated. Fix this by ensuring that the CCW-device lock is held between determining final state and checking result state. Note that since: commit 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") path verification requests are much more likely to occur during boot, resulting in an increased chance of this race condition occurring. Fixes: 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") Reviewed-by: Alexandra Winter Reviewed-by: Vineeth Vijayan Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/device.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index f95d12345d98..920f550bc313 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -363,10 +363,8 @@ int ccw_device_set_online(struct ccw_device *cdev) spin_lock_irq(cdev->ccwlock); ret = ccw_device_online(cdev); - spin_unlock_irq(cdev->ccwlock); - if (ret == 0) - wait_event(cdev->private->wait_q, dev_fsm_final_state(cdev)); - else { + if (ret) { + spin_unlock_irq(cdev->ccwlock); CIO_MSG_EVENT(0, "ccw_device_online returned %d, " "device 0.%x.%04x\n", ret, cdev->private->dev_id.ssid, @@ -375,7 +373,12 @@ int ccw_device_set_online(struct ccw_device *cdev) put_device(&cdev->dev); return ret; } - spin_lock_irq(cdev->ccwlock); + /* Wait until a final state is reached */ + while (!dev_fsm_final_state(cdev)) { + spin_unlock_irq(cdev->ccwlock); + wait_event(cdev->private->wait_q, dev_fsm_final_state(cdev)); + spin_lock_irq(cdev->ccwlock); + } /* Check if online processing was successful */ if ((cdev->private->state != DEV_STATE_ONLINE) && (cdev->private->state != DEV_STATE_W4SENSE)) { -- cgit v1.2.3 From 6f76592ef63a1ffd8949f0828d24da7913ddb6d8 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Wed, 10 Apr 2024 11:46:20 +0200 Subject: s390/cio: log fake IRB events Add traces when queueing and delivering fake IRBs. These are significant events that might have an impact on device driver processing and are therefore relevant for problem analysis. Reviewed-by: Vineeth Vijayan Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/device_fsm.c | 5 +++++ drivers/s390/cio/device_ops.c | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 65d8b2cfd626..42791fa0b80e 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -504,6 +504,11 @@ callback: ccw_device_done(cdev, DEV_STATE_ONLINE); /* Deliver fake irb to device driver, if needed. */ if (cdev->private->flags.fake_irb) { + CIO_MSG_EVENT(2, "fakeirb: deliver device 0.%x.%04x intparm %lx type=%d\n", + cdev->private->dev_id.ssid, + cdev->private->dev_id.devno, + cdev->private->intparm, + cdev->private->flags.fake_irb); create_fake_irb(&cdev->private->dma_area->irb, cdev->private->flags.fake_irb); cdev->private->flags.fake_irb = 0; diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index 40c97f873075..acd6790dba4d 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -208,6 +208,10 @@ int ccw_device_start_timeout_key(struct ccw_device *cdev, struct ccw1 *cpa, if (!cdev->private->flags.fake_irb) { cdev->private->flags.fake_irb = FAKE_CMD_IRB; cdev->private->intparm = intparm; + CIO_MSG_EVENT(2, "fakeirb: queue device 0.%x.%04x intparm %lx type=%d\n", + cdev->private->dev_id.ssid, + cdev->private->dev_id.devno, intparm, + cdev->private->flags.fake_irb); return 0; } else /* There's already a fake I/O around. */ @@ -551,6 +555,10 @@ int ccw_device_tm_start_timeout_key(struct ccw_device *cdev, struct tcw *tcw, if (!cdev->private->flags.fake_irb) { cdev->private->flags.fake_irb = FAKE_TM_IRB; cdev->private->intparm = intparm; + CIO_MSG_EVENT(2, "fakeirb: queue device 0.%x.%04x intparm %lx type=%d\n", + cdev->private->dev_id.ssid, + cdev->private->dev_id.devno, intparm, + cdev->private->flags.fake_irb); return 0; } else /* There's already a fake I/O around. */ -- cgit v1.2.3 From 3ec4848913d695245716ea45ca4872d9dff097a5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 11 Apr 2024 11:23:48 +0800 Subject: block: fix that blk_time_get_ns() doesn't update time after schedule While monitoring the throttle time of IO from iocost, it's found that such time is always zero after the io_schedule() from ioc_rqos_throttle, for example, with the following debug patch: + printk("%s-%d: %s enter %llu\n", current->comm, current->pid, __func__, blk_time_get_ns()); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); if (wait.committed) break; io_schedule(); } + printk("%s-%d: %s exit %llu\n", current->comm, current->pid, __func__, blk_time_get_ns()); It can be observerd that blk_time_get_ns() always return the same time: [ 1068.096579] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.272587] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.274389] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.472690] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.474485] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.672656] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.674451] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.872655] fio-1268: ioc_rqos_throttle exit 1067901962288 And I think the root cause is that 'PF_BLOCK_TS' is always cleared by blk_flush_plug() before scheduel(), hence blk_plug_invalidate_ts() will never be called: blk_time_get_ns plug->cur_ktime = ktime_get_ns(); current->flags |= PF_BLOCK_TS; io_schedule: io_schedule_prepare blk_flush_plug __blk_flush_plug /* the flag is cleared, while time is not */ current->flags &= ~PF_BLOCK_TS; schedule sched_update_worker /* the flag is not set, hence plug->cur_ktime is not cleared */ if (tsk->flags & PF_BLOCK_TS) blk_plug_invalidate_ts() blk_time_get_ns /* got the time stashed before schedule */ return plug->cur_ktime; Fix the problem by clearing cached time in __blk_flush_plug(). Fixes: 06b23f92af87 ("block: update cached timestamp post schedule/preemption") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240411032349.3051233-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-core.c b/block/blk-core.c index 3a6f5603fb44..b795ac177281 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1197,6 +1197,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); + plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; } -- cgit v1.2.3 From 16767502aa990cca2cb7d1372b31d328c4c85b40 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 12 Apr 2024 14:35:36 +0200 Subject: selftests: kselftest: Fix build failure with NOLIBC As Mark explains ksft_min_kernel_version() can't be compiled with nolibc, it doesn't implement uname(). Fixes: 6d029c25b71f ("selftests/timers/posix_timers: Reimplement check_timer_distribution()") Reported-by: Mark Brown Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240412123536.GA32444@redhat.com Closes: https://lore.kernel.org/all/f0523b3a-ea08-4615-b0fb-5b504a2d39df@sirena.org.uk/ --- tools/testing/selftests/kselftest.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 0591974b57e0..7b89362da4bf 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -395,6 +395,10 @@ static inline __noreturn __printf(1, 2) int ksft_exit_skip(const char *msg, ...) static inline int ksft_min_kernel_version(unsigned int min_major, unsigned int min_minor) { +#ifdef NOLIBC + ksft_print_msg("NOLIBC: Can't check kernel version: Function not implemented\n"); + return 0; +#else unsigned int major, minor; struct utsname info; @@ -402,6 +406,7 @@ static inline int ksft_min_kernel_version(unsigned int min_major, ksft_exit_fail_msg("Can't parse kernel version\n"); return major > min_major || (major == min_major && minor >= min_minor); +#endif } #endif /* __KSELFTEST_H */ -- cgit v1.2.3 From 46dad3c1e57897ab9228332f03e1c14798d2d3b9 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Fri, 12 Apr 2024 16:17:32 +0800 Subject: init/main.c: Fix potential static_command_line memory overflow We allocate memory of size 'xlen + strlen(boot_command_line) + 1' for static_command_line, but the strings copied into static_command_line are extra_command_line and command_line, rather than extra_command_line and boot_command_line. When strlen(command_line) > strlen(boot_command_line), static_command_line will overflow. This patch just recovers strlen(command_line) which was miss-consolidated with strlen(boot_command_line) in the commit f5c7310ac73e ("init/main: add checks for the return value of memblock_alloc*()") Link: https://lore.kernel.org/all/20240412081733.35925-2-ytcoode@gmail.com/ Fixes: f5c7310ac73e ("init/main: add checks for the return value of memblock_alloc*()") Cc: stable@vger.kernel.org Signed-off-by: Yuntao Wang Signed-off-by: Masami Hiramatsu (Google) --- init/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/init/main.c b/init/main.c index 881f6230ee59..5dcf5274c09c 100644 --- a/init/main.c +++ b/init/main.c @@ -636,6 +636,8 @@ static void __init setup_command_line(char *command_line) if (!saved_command_line) panic("%s: Failed to allocate %zu bytes\n", __func__, len + ilen); + len = xlen + strlen(command_line) + 1; + static_command_line = memblock_alloc(len, SMP_CACHE_BYTES); if (!static_command_line) panic("%s: Failed to allocate %zu bytes\n", __func__, len); -- cgit v1.2.3 From d5cf50dafc9dd5faa1e61e7021e3496ddf7fd61e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 12 Apr 2024 10:05:10 -0700 Subject: Kconfig: add some hidden tabs on purpose Commit d96c36004e31 ("tracing: Fix FTRACE_RECORD_RECURSION_SIZE Kconfig entry") removed a hidden tab because it apparently showed breakage in some third-party kernel config parsing tool. It wasn't clear what tool it was, but let's make sure it gets fixed. Because if you can't parse tabs as whitespace, you should not be parsing the kernel Kconfig files. In fact, let's make such breakage more obvious than some esoteric ftrace record size option. If you can't parse tabs, you can't have page sizes. Yes, tab-vs-space confusion is sadly a traditional Unix thing, and 'make' is famous for being broken in this regard. But no, that does not mean that it's ok. I'd add more random tabs to our Kconfig files, but I don't want to make things uglier than necessary. But it *might* bbe necessary if it turns out we see more of this kind of silly tooling. Fixes: d96c36004e31 ("tracing: Fix FTRACE_RECORD_RECURSION_SIZE Kconfig entry") Link: https://lore.kernel.org/lkml/CAHk-=wj-hLLN_t_m5OL4dXLaxvXKy_axuoJYXif7iczbfgAevQ@mail.gmail.com/ Signed-off-by: Linus Torvalds --- arch/Kconfig | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 9f066785bb71..65afb1de48b3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1172,12 +1172,12 @@ config PAGE_SIZE_LESS_THAN_256KB config PAGE_SHIFT int - default 12 if PAGE_SIZE_4KB - default 13 if PAGE_SIZE_8KB - default 14 if PAGE_SIZE_16KB - default 15 if PAGE_SIZE_32KB - default 16 if PAGE_SIZE_64KB - default 18 if PAGE_SIZE_256KB + default 12 if PAGE_SIZE_4KB + default 13 if PAGE_SIZE_8KB + default 14 if PAGE_SIZE_16KB + default 15 if PAGE_SIZE_32KB + default 16 if PAGE_SIZE_64KB + default 18 if PAGE_SIZE_256KB # This allows to use a set of generic functions to determine mmap base # address by giving priority to top-down scheme only if the process -- cgit v1.2.3 From 11baa36d317321f5d54059f07d243c5a1dbbfbb2 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 10 Apr 2024 19:03:05 +0200 Subject: gpio: lpc32xx: fix module autoloading Add MODULE_DEVICE_TABLE(), so the module could be properly autoloaded based on the alias from of_device_id table. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-lpc32xx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-lpc32xx.c b/drivers/gpio/gpio-lpc32xx.c index 5ef8af824980..c097e310c9e8 100644 --- a/drivers/gpio/gpio-lpc32xx.c +++ b/drivers/gpio/gpio-lpc32xx.c @@ -529,6 +529,7 @@ static const struct of_device_id lpc32xx_gpio_of_match[] = { { .compatible = "nxp,lpc3220-gpio", }, { }, }; +MODULE_DEVICE_TABLE(of, lpc32xx_gpio_of_match); static struct platform_driver lpc32xx_gpio_driver = { .driver = { -- cgit v1.2.3 From 79336504781e7fee5ddaf046dcc186c8dfdf60b1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 12 Apr 2024 08:41:15 +0900 Subject: ata: libata-scsi: Fix ata_scsi_dev_rescan() error path Commit 0c76106cb975 ("scsi: sd: Fix TCG OPAL unlock on system resume") incorrectly handles failures of scsi_resume_device() in ata_scsi_dev_rescan(), leading to a double call to spin_unlock_irqrestore() to unlock a device port. Fix this by redefining the goto labels used in case of errors and only unlock the port scsi_scan_mutex when scsi_resume_device() fails. Bug found with the Smatch static checker warning: drivers/ata/libata-scsi.c:4774 ata_scsi_dev_rescan() error: double unlocked 'ap->lock' (orig line 4757) Reported-by: Dan Carpenter Fixes: 0c76106cb975 ("scsi: sd: Fix TCG OPAL unlock on system resume") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel --- drivers/ata/libata-scsi.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 2f4c58837641..e954976891a9 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -4745,7 +4745,7 @@ void ata_scsi_dev_rescan(struct work_struct *work) * bail out. */ if (ap->pflags & ATA_PFLAG_SUSPENDED) - goto unlock; + goto unlock_ap; if (!sdev) continue; @@ -4758,7 +4758,7 @@ void ata_scsi_dev_rescan(struct work_struct *work) if (do_resume) { ret = scsi_resume_device(sdev); if (ret == -EWOULDBLOCK) - goto unlock; + goto unlock_scan; dev->flags &= ~ATA_DFLAG_RESUMING; } ret = scsi_rescan_device(sdev); @@ -4766,12 +4766,13 @@ void ata_scsi_dev_rescan(struct work_struct *work) spin_lock_irqsave(ap->lock, flags); if (ret) - goto unlock; + goto unlock_ap; } } -unlock: +unlock_ap: spin_unlock_irqrestore(ap->lock, flags); +unlock_scan: mutex_unlock(&ap->scsi_scan_mutex); /* Reschedule with a delay if scsi_rescan_device() returned an error */ -- cgit v1.2.3 From c0297e7dd50795d559f3534887a6de1756b35d0f Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Thu, 11 Apr 2024 20:12:24 +0000 Subject: ata: libata-core: Allow command duration limits detection for ACS-4 drives Even though the command duration limits (CDL) feature was first added in ACS-5 (major version 12), there are some ACS-4 (major version 11) drives that implement CDL as well. IDENTIFY_DEVICE, SUPPORTED_CAPABILITIES, and CURRENT_SETTINGS log pages are mandatory in the ACS-4 standard so it should be safe to read these log pages on older drives implementing the ACS-4 standard. Fixes: 62e4a60e0cdb ("scsi: ata: libata: Detect support for command duration limits") Cc: stable@vger.kernel.org Signed-off-by: Igor Pylypiv Signed-off-by: Damien Le Moal --- drivers/ata/libata-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index be3412cdb22e..c449d60d9bb9 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2539,7 +2539,7 @@ static void ata_dev_config_cdl(struct ata_device *dev) bool cdl_enabled; u64 val; - if (ata_id_major_version(dev->id) < 12) + if (ata_id_major_version(dev->id) < 11) goto not_supported; if (!ata_log_supported(dev, ATA_LOG_IDENTIFY_DEVICE) || -- cgit v1.2.3 From 283454c8a123072e5c386a5a2b5fc576aa455b6f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Apr 2024 10:10:15 -0700 Subject: af_unix: Call manage_oob() for every skb in unix_stream_read_generic(). When we call recv() for AF_UNIX socket, we first peek one skb and calls manage_oob() to check if the skb is sent with MSG_OOB. However, when we fetch the next (and the following) skb, manage_oob() is not called now, leading a wrong behaviour. Let's say a socket send()s "hello" with MSG_OOB and the peer tries to recv() 5 bytes with MSG_PEEK. Here, we should get only "hell" without 'o', but actually not: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'hello', MSG_OOB) 5 >>> c2.recv(5, MSG_PEEK) b'hello' The first skb fills 4 bytes, and the next skb is peeked but not properly checked by manage_oob(). Let's move up the again label to call manage_oob() for evry skb. With this patch: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'hello', MSG_OOB) 5 >>> c2.recv(5, MSG_PEEK) b'hell' Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240410171016.7621-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d032eb5fa6df..f297320438bf 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2741,6 +2741,7 @@ redo: last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; +again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); @@ -2752,7 +2753,6 @@ redo: } } #endif -again: if (skb == NULL) { if (copied >= target) goto unlock; -- cgit v1.2.3 From 22dd70eb2c3d754862964377a75abafd3167346b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Apr 2024 10:10:16 -0700 Subject: af_unix: Don't peek OOB data without MSG_OOB. Currently, we can read OOB data without MSG_OOB by using MSG_PEEK when OOB data is sitting on the front row, which is apparently wrong. >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'a', MSG_OOB) 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) b'a' If manage_oob() is called when no data has been copied, we only check if the socket enables SO_OOBINLINE or MSG_PEEK is not used. Otherwise, the skb is returned as is. However, here we should return NULL if MSG_PEEK is set and no data has been copied. Also, in such a case, we should not jump to the redo label because we will be caught in the loop and hog the CPU until normal data comes in. Then, we need to handle skb == NULL case with the if-clause below the manage_oob() block. With this patch: >>> from socket import * >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM) >>> c1.send(b'a', MSG_OOB) 1 >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT) Traceback (most recent call last): File "", line 1, in BlockingIOError: [Errno 11] Resource temporarily unavailable Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240410171016.7621-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f297320438bf..9a6ad5974dff 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2663,7 +2663,9 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); } - } else if (!(flags & MSG_PEEK)) { + } else if (flags & MSG_PEEK) { + skb = NULL; + } else { skb_unlink(skb, &sk->sk_receive_queue); WRITE_ONCE(u->oob_skb, NULL); if (!WARN_ON_ONCE(skb_unref(skb))) @@ -2745,11 +2747,9 @@ again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); - if (!skb) { + if (!skb && copied) { unix_state_unlock(sk); - if (copied) - break; - goto redo; + break; } } #endif -- cgit v1.2.3 From 68aba00483c7c4102429bcdfdece7289a8ab5c8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Thu, 11 Apr 2024 11:13:18 +0000 Subject: net: sparx5: flower: fix fragment flags handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I noticed that only 3 out of the 4 input bits were used, mt.key->flags & FLOW_DIS_IS_FRAGMENT was never checked. In order to avoid a complicated maze, I converted it to use a 16 byte mapping table. As shown in the table below the old heuristics doesn't always do the right thing, ie. when FLOW_DIS_IS_FRAGMENT=1/1 then it used to only match follow-up fragment packets. Here are all the combinations, and their resulting new/old VCAP key/mask filter: /- FLOW_DIS_IS_FRAGMENT (key/mask) | /- FLOW_DIS_FIRST_FRAG (key/mask) | | /-- new VCAP fragment (key/mask) v v v v- old VCAP fragment (key/mask) 0/0 0/0 -/- -/- impossible (due to entry cond. on mask) 0/0 0/1 -/- 0/3 !! invalid (can't match non-fragment + follow-up frag) 0/0 1/0 -/- -/- impossible (key > mask) 0/0 1/1 1/3 1/3 first fragment 0/1 0/0 0/3 3/3 !! not fragmented 0/1 0/1 0/3 3/3 !! not fragmented (+ not first fragment) 0/1 1/0 -/- -/- impossible (key > mask) 0/1 1/1 -/- 1/3 !! invalid (non-fragment and first frag) 1/0 0/0 -/- -/- impossible (key > mask) 1/0 0/1 -/- -/- impossible (key > mask) 1/0 1/0 -/- -/- impossible (key > mask) 1/0 1/1 -/- -/- impossible (key > mask) 1/1 0/0 1/1 3/3 !! some fragment 1/1 0/1 3/3 3/3 follow-up fragment 1/1 1/0 -/- -/- impossible (key > mask) 1/1 1/1 1/3 1/3 first fragment In the datasheet the VCAP fragment values are documented as: 0 = no fragment 1 = initial fragment 2 = suspicious fragment 3 = valid follow-up fragment Result: 3 combinations match the old behavior, 3 combinations have been corrected, 2 combinations are now invalid, and fail, 8 combinations are impossible. It should now be aligned with how FLOW_DIS_IS_FRAGMENT and FLOW_DIS_FIRST_FRAG is set in __skb_flow_dissect() in net/core/flow_dissector.c Since the VCAP fragment values are not a bitfield, we have to ignore the suspicious fragment value, eg. when matching on any kind of fragment with FLOW_DIS_IS_FRAGMENT=1/1. Only compile tested, and logic tested in userspace, as I unfortunately don't have access to this switch chip (yet). Fixes: d6c2964db3fe ("net: microchip: sparx5: Adding more tc flower keys for the IS2 VCAP") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Steen Hegelund Tested-by: Daniel Machon Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20240411111321.114095-1-ast@fiberby.net Signed-off-by: Jakub Kicinski --- .../ethernet/microchip/sparx5/sparx5_tc_flower.c | 61 ++++++++++++++-------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c b/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c index 523e0c470894..55f255a3c9db 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c @@ -36,6 +36,27 @@ struct sparx5_tc_flower_template { u16 l3_proto; /* protocol specified in the template */ }; +/* SparX-5 VCAP fragment types: + * 0 = no fragment, 1 = initial fragment, + * 2 = suspicious fragment, 3 = valid follow-up fragment + */ +enum { /* key / mask */ + FRAG_NOT = 0x03, /* 0 / 3 */ + FRAG_SOME = 0x11, /* 1 / 1 */ + FRAG_FIRST = 0x13, /* 1 / 3 */ + FRAG_LATER = 0x33, /* 3 / 3 */ + FRAG_INVAL = 0xff, /* invalid */ +}; + +/* Flower fragment flag to VCAP fragment type mapping */ +static const u8 sparx5_vcap_frag_map[4][4] = { /* is_frag */ + { FRAG_INVAL, FRAG_INVAL, FRAG_INVAL, FRAG_FIRST }, /* 0/0 */ + { FRAG_NOT, FRAG_NOT, FRAG_INVAL, FRAG_INVAL }, /* 0/1 */ + { FRAG_INVAL, FRAG_INVAL, FRAG_INVAL, FRAG_INVAL }, /* 1/0 */ + { FRAG_SOME, FRAG_LATER, FRAG_INVAL, FRAG_FIRST } /* 1/1 */ + /* 0/0 0/1 1/0 1/1 <-- first_frag */ +}; + static int sparx5_tc_flower_es0_tpid(struct vcap_tc_flower_parse_usage *st) { @@ -145,29 +166,27 @@ sparx5_tc_flower_handler_control_usage(struct vcap_tc_flower_parse_usage *st) flow_rule_match_control(st->frule, &mt); if (mt.mask->flags) { - if (mt.mask->flags & FLOW_DIS_FIRST_FRAG) { - if (mt.key->flags & FLOW_DIS_FIRST_FRAG) { - value = 1; /* initial fragment */ - mask = 0x3; - } else { - if (mt.mask->flags & FLOW_DIS_IS_FRAGMENT) { - value = 3; /* follow up fragment */ - mask = 0x3; - } else { - value = 0; /* no fragment */ - mask = 0x3; - } - } - } else { - if (mt.mask->flags & FLOW_DIS_IS_FRAGMENT) { - value = 3; /* follow up fragment */ - mask = 0x3; - } else { - value = 0; /* no fragment */ - mask = 0x3; - } + u8 is_frag_key = !!(mt.key->flags & FLOW_DIS_IS_FRAGMENT); + u8 is_frag_mask = !!(mt.mask->flags & FLOW_DIS_IS_FRAGMENT); + u8 is_frag_idx = (is_frag_key << 1) | is_frag_mask; + + u8 first_frag_key = !!(mt.key->flags & FLOW_DIS_FIRST_FRAG); + u8 first_frag_mask = !!(mt.mask->flags & FLOW_DIS_FIRST_FRAG); + u8 first_frag_idx = (first_frag_key << 1) | first_frag_mask; + + /* Lookup verdict based on the 2 + 2 input bits */ + u8 vdt = sparx5_vcap_frag_map[is_frag_idx][first_frag_idx]; + + if (vdt == FRAG_INVAL) { + NL_SET_ERR_MSG_MOD(st->fco->common.extack, + "Match on invalid fragment flag combination"); + return -EINVAL; } + /* Extract VCAP fragment key and mask from verdict */ + value = (vdt >> 4) & 0x3; + mask = vdt & 0x3; + err = vcap_rule_add_key_u32(st->vrule, VCAP_KF_L3_FRAGMENT_TYPE, value, mask); -- cgit v1.2.3 From 37cc10da3a50e6d0cb9808a90b7da9b4868794dd Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 11 Apr 2024 14:54:39 +0300 Subject: net/mlx5: Lag, restore buckets number to default after hash LAG deactivation The cited patch introduces the concept of buckets in LAG in hash mode. However, the patch doesn't clear the number of buckets in the LAG deactivation. This results in using the wrong number of buckets in case user create a hash mode LAG and afterwards create a non-hash mode LAG. Hence, restore buckets number to default after hash mode LAG deactivation. Fixes: 352899f384d4 ("net/mlx5: Lag, use buckets in hash mode") Signed-off-by: Shay Drory Reviewed-by: Maor Gottlieb Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index d14459e5c04f..69d482f7c5a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -703,8 +703,10 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev) return err; } - if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) + if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) { mlx5_lag_port_sel_destroy(ldev); + ldev->buckets = 1; + } if (mlx5_lag_has_drop_rule(ldev)) mlx5_lag_drop_rule_cleanup(ldev); -- cgit v1.2.3 From aa4ac90d04f4371466000825adb44935ecb5c974 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 11 Apr 2024 14:54:40 +0300 Subject: net/mlx5: SD, Handle possible devcom ERR_PTR Check if devcom holds an error pointer and return immediately. This fixes Smatch static checker warning: drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c:221 sd_register() error: 'devcom' dereferencing possible ERR_PTR() Enhance mlx5_devcom_register_component() so it stops returning NULL, making it easier for its callers. Fixes: d3d057666090 ("net/mlx5: SD, Implement devcom communication and primary election") Reported-by: Dan Carpenter Link: https://lore.kernel.org/all/f09666c8-e604-41f6-958b-4cc55c73faf9@gmail.com/T/ Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Link: https://lore.kernel.org/r/20240411115444.374475-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b375ef268671..319930c04093 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -209,8 +209,8 @@ static int mlx5e_devcom_init_mpv(struct mlx5e_priv *priv, u64 *data) *data, mlx5e_devcom_event_mpv, priv); - if (IS_ERR_OR_NULL(priv->devcom)) - return -EOPNOTSUPP; + if (IS_ERR(priv->devcom)) + return PTR_ERR(priv->devcom); if (mlx5_core_is_mp_master(priv->mdev)) { mlx5_devcom_send_event(priv->devcom, MPV_DEVCOM_MASTER_UP, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 1f60954c12f7..844d3e3a65dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3060,7 +3060,7 @@ void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw, u64 key) key, mlx5_esw_offloads_devcom_event, esw); - if (IS_ERR_OR_NULL(esw->devcom)) + if (IS_ERR(esw->devcom)) return; mlx5_devcom_send_event(esw->devcom, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c index e7d59cfa8708..7b0766c89f4c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c @@ -220,7 +220,7 @@ mlx5_devcom_register_component(struct mlx5_devcom_dev *devc, struct mlx5_devcom_comp *comp; if (IS_ERR_OR_NULL(devc)) - return NULL; + return ERR_PTR(-EINVAL); mutex_lock(&comp_list_lock); comp = devcom_component_get(devc, id, key, handler); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 5b28084e8a03..dd5d186dc614 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -213,8 +213,8 @@ static int sd_register(struct mlx5_core_dev *dev) sd = mlx5_get_sd(dev); devcom = mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_SD_GROUP, sd->group_id, NULL, dev); - if (!devcom) - return -ENOMEM; + if (IS_ERR(devcom)) + return PTR_ERR(devcom); sd->devcom = devcom; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 59806553889e..a39c4b25ba28 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -956,7 +956,7 @@ static void mlx5_register_hca_devcom_comp(struct mlx5_core_dev *dev) mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_HCA_PORTS, mlx5_query_nic_system_image_guid(dev), NULL, dev); - if (IS_ERR_OR_NULL(dev->priv.hca_devcom_comp)) + if (IS_ERR(dev->priv.hca_devcom_comp)) mlx5_core_err(dev, "Failed to register devcom HCA component\n"); } -- cgit v1.2.3 From bf729988303a27833a86acb561f42b9a3cc12728 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 11 Apr 2024 14:54:41 +0300 Subject: net/mlx5: Restore mistakenly dropped parts in register devlink flow Code parts from cited commit were mistakenly dropped while rebasing before submission. Add them here. Fixes: c6e77aa9dd82 ("net/mlx5: Register devlink first under devlink lock") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 5 ++++- drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a39c4b25ba28..331ce47f51a1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1699,12 +1699,15 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev) err = mlx5_devlink_params_register(priv_to_devlink(dev)); if (err) { mlx5_core_warn(dev, "mlx5_devlink_param_reg err = %d\n", err); - goto query_hca_caps_err; + goto params_reg_err; } devl_unlock(devlink); return 0; +params_reg_err: + devl_unregister(devlink); + devl_unlock(devlink); query_hca_caps_err: devl_unregister(devlink); devl_unlock(devlink); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index e3bf8c7e4baa..7ebe71280827 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -75,7 +75,6 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia goto peer_devlink_set_err; } - devlink_register(devlink); return 0; peer_devlink_set_err: -- cgit v1.2.3 From 6c685bdb9e1af966ec0278dbd4068ec39ae88c2d Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Thu, 11 Apr 2024 14:54:42 +0300 Subject: net/mlx5e: Use channel mdev reference instead of global mdev instance for coalescing Channels can potentially have independent mdev instances. Do not refer to the global mdev instance in the mlx5e_priv instance for channel FW operations related to coalescing. CQ numbers that would be valid on the channel's mdev instance may not be correctly referenced if using the mlx5e_priv instance. Fixes: 67936e138586 ("net/mlx5e: Let channels be SD-aware") Signed-off-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 8f101181648c..67a29826bb57 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -589,12 +589,12 @@ static int mlx5e_get_coalesce(struct net_device *netdev, static void mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { - struct mlx5_core_dev *mdev = priv->mdev; int tc; int i; for (i = 0; i < priv->channels.num; ++i) { struct mlx5e_channel *c = priv->channels.c[i]; + struct mlx5_core_dev *mdev = c->mdev; for (tc = 0; tc < c->num_tc; tc++) { mlx5_core_modify_cq_moderation(mdev, @@ -608,11 +608,11 @@ mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coal static void mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { - struct mlx5_core_dev *mdev = priv->mdev; int i; for (i = 0; i < priv->channels.num; ++i) { struct mlx5e_channel *c = priv->channels.c[i]; + struct mlx5_core_dev *mdev = c->mdev; mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq, coal->rx_coalesce_usecs, -- cgit v1.2.3 From fdce06bda7e56b2a34c53ea58bad7af2fae96da1 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Apr 2024 14:54:43 +0300 Subject: net/mlx5e: Acquire RTNL lock before RQs/SQs activation/deactivation netif_queue_set_napi asserts whether RTNL lock is held if the netdev is initialized. Acquire the RTNL lock before activating or deactivating RQs/SQs if the lock has not been held before in the flow. Fixes: f25e7b82635f ("net/mlx5e: link NAPI instances to queues and IRQs") Cc: Joe Damato Signed-off-by: Carolina Jubran Reviewed-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 0ab9db319530..22918b2ef7f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -108,7 +108,10 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) mlx5e_reset_txqsq_cc_pc(sq); sq->stats->recover++; clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); + rtnl_lock(); mlx5e_activate_txqsq(sq); + rtnl_unlock(); + if (sq->channel) mlx5e_trigger_napi_icosq(sq->channel); else @@ -179,12 +182,16 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) carrier_ok = netif_carrier_ok(netdev); netif_carrier_off(netdev); + rtnl_lock(); mlx5e_deactivate_priv_channels(priv); + rtnl_unlock(); mlx5e_ptp_close(chs->ptp); err = mlx5e_ptp_open(priv, &chs->params, chs->c[0]->lag_port, &chs->ptp); + rtnl_lock(); mlx5e_activate_priv_channels(priv); + rtnl_unlock(); /* return carrier back if needed */ if (carrier_ok) -- cgit v1.2.3 From fef965764cf562f28afb997b626fc7c3cec99693 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Apr 2024 14:54:44 +0300 Subject: net/mlx5e: Prevent deadlock while disabling aRFS When disabling aRFS under the `priv->state_lock`, any scheduled aRFS works are canceled using the `cancel_work_sync` function, which waits for the work to end if it has already started. However, while waiting for the work handler, the handler will try to acquire the `state_lock` which is already acquired. The worker acquires the lock to delete the rules if the state is down, which is not the worker's responsibility since disabling aRFS deletes the rules. Add an aRFS state variable, which indicates whether the aRFS is enabled and prevent adding rules when the aRFS is disabled. Kernel log: ====================================================== WARNING: possible circular locking dependency detected 6.7.0-rc4_net_next_mlx5_5483eb2 #1 Tainted: G I ------------------------------------------------------ ethtool/386089 is trying to acquire lock: ffff88810f21ce68 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}, at: __flush_work+0x74/0x4e0 but task is already holding lock: ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&priv->state_lock){+.+.}-{3:3}: __mutex_lock+0x80/0xc90 arfs_handle_work+0x4b/0x3b0 [mlx5_core] process_one_work+0x1dc/0x4a0 worker_thread+0x1bf/0x3c0 kthread+0xd7/0x100 ret_from_fork+0x2d/0x50 ret_from_fork_asm+0x11/0x20 -> #0 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}: __lock_acquire+0x17b4/0x2c80 lock_acquire+0xd0/0x2b0 __flush_work+0x7a/0x4e0 __cancel_work_timer+0x131/0x1c0 arfs_del_rules+0x143/0x1e0 [mlx5_core] mlx5e_arfs_disable+0x1b/0x30 [mlx5_core] mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core] ethnl_set_channels+0x28f/0x3b0 ethnl_default_set_doit+0xec/0x240 genl_family_rcv_msg_doit+0xd0/0x120 genl_rcv_msg+0x188/0x2c0 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1a1/0x270 netlink_sendmsg+0x214/0x460 __sock_sendmsg+0x38/0x60 __sys_sendto+0x113/0x170 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&priv->state_lock); lock((work_completion)(&rule->arfs_work)); lock(&priv->state_lock); lock((work_completion)(&rule->arfs_work)); *** DEADLOCK *** 3 locks held by ethtool/386089: #0: ffffffff82ea7210 (cb_lock){++++}-{3:3}, at: genl_rcv+0x15/0x40 #1: ffffffff82e94c88 (rtnl_mutex){+.+.}-{3:3}, at: ethnl_default_set_doit+0xd3/0x240 #2: ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core] stack backtrace: CPU: 15 PID: 386089 Comm: ethtool Tainted: G I 6.7.0-rc4_net_next_mlx5_5483eb2 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x60/0xa0 check_noncircular+0x144/0x160 __lock_acquire+0x17b4/0x2c80 lock_acquire+0xd0/0x2b0 ? __flush_work+0x74/0x4e0 ? save_trace+0x3e/0x360 ? __flush_work+0x74/0x4e0 __flush_work+0x7a/0x4e0 ? __flush_work+0x74/0x4e0 ? __lock_acquire+0xa78/0x2c80 ? lock_acquire+0xd0/0x2b0 ? mark_held_locks+0x49/0x70 __cancel_work_timer+0x131/0x1c0 ? mark_held_locks+0x49/0x70 arfs_del_rules+0x143/0x1e0 [mlx5_core] mlx5e_arfs_disable+0x1b/0x30 [mlx5_core] mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core] ethnl_set_channels+0x28f/0x3b0 ethnl_default_set_doit+0xec/0x240 genl_family_rcv_msg_doit+0xd0/0x120 genl_rcv_msg+0x188/0x2c0 ? ethnl_ops_begin+0xb0/0xb0 ? genl_family_rcv_msg_dumpit+0xf0/0xf0 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1a1/0x270 netlink_sendmsg+0x214/0x460 __sock_sendmsg+0x38/0x60 __sys_sendto+0x113/0x170 ? do_user_addr_fault+0x53f/0x8f0 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e Fixes: 45bf454ae884 ("net/mlx5e: Enabling aRFS mechanism") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20240411115444.374475-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 27 ++++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index c7f542d0b8f0..93cf23278d93 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -46,6 +46,10 @@ struct arfs_table { struct hlist_head rules_hash[ARFS_HASH_SIZE]; }; +enum { + MLX5E_ARFS_STATE_ENABLED, +}; + enum arfs_type { ARFS_IPV4_TCP, ARFS_IPV6_TCP, @@ -60,6 +64,7 @@ struct mlx5e_arfs_tables { spinlock_t arfs_lock; int last_filter_id; struct workqueue_struct *wq; + unsigned long state; }; struct arfs_tuple { @@ -170,6 +175,8 @@ int mlx5e_arfs_enable(struct mlx5e_flow_steering *fs) return err; } } + set_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state); + return 0; } @@ -455,6 +462,8 @@ static void arfs_del_rules(struct mlx5e_flow_steering *fs) int i; int j; + clear_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state); + spin_lock_bh(&arfs->arfs_lock); mlx5e_for_each_arfs_rule(rule, htmp, arfs->arfs_tables, i, j) { hlist_del_init(&rule->hlist); @@ -627,17 +636,8 @@ static void arfs_handle_work(struct work_struct *work) struct mlx5_flow_handle *rule; arfs = mlx5e_fs_get_arfs(priv->fs); - mutex_lock(&priv->state_lock); - if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { - spin_lock_bh(&arfs->arfs_lock); - hlist_del(&arfs_rule->hlist); - spin_unlock_bh(&arfs->arfs_lock); - - mutex_unlock(&priv->state_lock); - kfree(arfs_rule); - goto out; - } - mutex_unlock(&priv->state_lock); + if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state)) + return; if (!arfs_rule->rule) { rule = arfs_add_rule(priv, arfs_rule); @@ -753,6 +753,11 @@ int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, return -EPROTONOSUPPORT; spin_lock_bh(&arfs->arfs_lock); + if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state)) { + spin_unlock_bh(&arfs->arfs_lock); + return -EPERM; + } + arfs_rule = arfs_find_rule(arfs_t, &fk); if (arfs_rule) { if (arfs_rule->rxq == rxq_index || work_busy(&arfs_rule->arfs_work)) { -- cgit v1.2.3 From 58caa786f1c02fd84919fb6db9eaecb22e8f7983 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 17:47:42 -0400 Subject: bcachefs: Fix UAFs of btree_insert_entry array The btree paths array is now dynamically resizable - and as well the btree_insert_entries array, as it needs to be the same size. The merge path (and interior update path) allocates new btree paths, thus can trigger a resize; thus we need to not retain direct pointers after invoking merge; similarly when running btree node triggers. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index aa9da4970740..9609ad370d71 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -499,9 +499,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - struct btree_insert_entry *btree_id_start) + unsigned btree_id_start) { - struct btree_insert_entry *i; bool trans_trigger_run; int ret, overwrite; @@ -514,13 +513,13 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, do { trans_trigger_run = false; - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + for (unsigned i = btree_id_start; + i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; i++) { - if (i->btree_id != btree_id) + if (trans->updates[i].btree_id != btree_id) continue; - ret = run_one_trans_trigger(trans, i, overwrite); + ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); if (ret < 0) return ret; if (ret) @@ -534,8 +533,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - struct btree_insert_entry *btree_id_start = trans->updates; - unsigned btree_id = 0; + unsigned btree_id = 0, btree_id_start = 0; int ret = 0; /* @@ -549,8 +547,8 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) if (btree_id == BTREE_ID_alloc) continue; - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) + while (btree_id_start < trans->nr_updates && + trans->updates[btree_id_start].btree_id < btree_id) btree_id_start++; ret = run_btree_triggers(trans, btree_id, btree_id_start); @@ -558,11 +556,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) return ret; } - trans_for_each_update(trans, i) { + for (unsigned idx = 0; idx < trans->nr_updates; idx++) { + struct btree_insert_entry *i = trans->updates + idx; + if (i->btree_id > BTREE_ID_alloc) break; if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + ret = run_btree_triggers(trans, BTREE_ID_alloc, idx); if (ret) return ret; break; @@ -826,7 +826,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags struct bch_fs *c = trans->c; int ret = 0, u64s_delta = 0; - trans_for_each_update(trans, i) { + for (unsigned idx = 0; idx < trans->nr_updates; idx++) { + struct btree_insert_entry *i = trans->updates + idx; if (i->cached) continue; -- cgit v1.2.3 From 031ad9e7dbd18c63e671fc4d98be3082189b8a63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 21:30:43 -0400 Subject: bcachefs: Check for packed bkeys that are too big add missing validation; fixes assertion pop in bkey unpack Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.h | 6 ++++++ fs/bcachefs/btree_io.c | 15 ++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index cf23ff47bed8..3a45d128f608 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -314,6 +314,12 @@ static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, return bkey_packed(k) ? format->key_u64s : BKEY_U64s; } +static inline bool bkeyp_u64s_valid(const struct bkey_format *f, + const struct bkey_packed *k) +{ + return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s); +} + static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, const struct bkey_packed *k) { diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index d7de82ac3893..b1aaf0550644 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -831,7 +831,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); } -static bool __bkey_valid(struct bch_fs *c, struct btree *b, +static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, struct bset *i, struct bkey_packed *k) { if (bkey_p_next(k) > vstruct_last(i)) @@ -840,7 +840,7 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b, if (k->format > KEY_FORMAT_CURRENT) return false; - if (k->u64s < bkeyp_key_u64s(&b->format, k)) + if (!bkeyp_u64s_valid(&b->format, k)) return false; struct printbuf buf = PRINTBUF; @@ -884,11 +884,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, "invalid bkey format %u", k->format)) goto drop_this_key; - if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k), + if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, btree_node_bkey_bad_u64s, - "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k))) + "bad k->u64s %u (min %u max %lu)", k->u64s, + bkeyp_key_u64s(&b->format, k), + U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k))) goto drop_this_key; if (!write) @@ -947,13 +949,12 @@ drop_this_key: * do */ - if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { + if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { for (next_good_key = 1; next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; next_good_key++) - if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) + if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) goto got_good_key; - } /* -- cgit v1.2.3 From 87cb0239c87f608fd48fb50f9f53f129dcfd73f4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 23:38:07 -0400 Subject: bcachefs: btree node scan: handle encrypted nodes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_node_scan.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 556f76f5c84e..20f2b37c4474 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -133,9 +133,19 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (le64_to_cpu(bn->magic) != bset_magic(c)) return; + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { + struct nonce nonce = btree_nonce(&bn->keys, 0); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + + bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); + } + if (btree_id_is_alloc(BTREE_NODE_ID(bn))) return; + if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) + return; + rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), -- cgit v1.2.3 From dc32c118ec6b1032693c489a0aa9e011f0acdb1a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 21:20:27 -0400 Subject: bcachefs: fix unsafety in bch2_extent_ptr_to_text() Need to check if we have a valid bucket before checking if ptr is stale Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 0e3ca99fbd2d..36d12d2adb81 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -998,7 +998,9 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_str(out, " cached"); if (ptr->unwritten) prt_str(out, " unwritten"); - if (ca && ptr_stale(ca, ptr)) + if (b >= ca->mi.first_bucket && + b < ca->mi.nbuckets && + ptr_stale(ca, ptr)) prt_printf(out, " stale"); } } -- cgit v1.2.3 From 2aeed876d7c2c2fb4c6b92f59bdf7e73cfe5e098 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 23:37:24 -0400 Subject: bcachefs: fix unsafety in bch2_stripe_to_text() .to_text() functions need to work on key values that didn't pass .valid Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 46 +++++++++++++++++++++++++--------------------- fs/bcachefs/ec.h | 2 ++ 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 082075244e16..c0b2b2f180e0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -131,29 +131,33 @@ fsck_err: void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned i, nr_data = s->nr_blocks - s->nr_redundant; + const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; + struct bch_stripe s = {}; + + memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); + + unsigned nr_data = s.nr_blocks - s.nr_redundant; prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", - s->algorithm, - le16_to_cpu(s->sectors), - nr_data, - s->nr_redundant, - s->csum_type, - 1U << s->csum_granularity_bits); - - for (i = 0; i < s->nr_blocks; i++) { - const struct bch_extent_ptr *ptr = s->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - - prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); - if (i < nr_data) - prt_printf(out, "#%u", stripe_blockcount_get(s, i)); - prt_printf(out, " gen %u", ptr->gen); - if (ptr_stale(ca, ptr)) - prt_printf(out, " stale"); + s.algorithm, + le16_to_cpu(s.sectors), + nr_data, + s.nr_redundant, + s.csum_type, + 1U << s.csum_granularity_bits); + + for (unsigned i = 0; i < s.nr_blocks; i++) { + const struct bch_extent_ptr *ptr = sp->ptrs + i; + + if ((void *) ptr >= bkey_val_end(k)) + break; + + bch2_extent_ptr_to_text(out, c, ptr); + + if (s.csum_type < BCH_CSUM_NR && + i < nr_data && + stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k)) + prt_printf(out, "#%u", stripe_blockcount_get(sp, i)); } } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index f4369b02e805..f042616888b0 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -32,6 +32,8 @@ static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) static inline unsigned stripe_csum_offset(const struct bch_stripe *s, unsigned dev, unsigned csum_idx) { + EBUG_ON(s->csum_type >= BCH_CSUM_NR); + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; return sizeof(struct bch_stripe) + -- cgit v1.2.3 From 7b4c4ccf848b359762c1525bab5a12f008f14a65 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Apr 2024 23:58:36 -0400 Subject: bcachefs: fix race in bch2_btree_node_evict() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 84474324dba9..c7f156320a35 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1148,6 +1148,8 @@ wait_on_io: btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + if (unlikely(b->hash_val != btree_ptr_hash_val(k))) + goto out; if (btree_node_dirty(b)) { __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); @@ -1162,7 +1164,7 @@ wait_on_io: btree_node_data_free(c, b); bch2_btree_node_hash_remove(bc, b); mutex_unlock(&bc->lock); - +out: six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } -- cgit v1.2.3 From ba8ed36e72033dd6b89d44145f323e6c4508bfc9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 00:09:08 -0400 Subject: bcachefs: don't queue btree nodes for rewrites during scan many nodes found during scan will be old nodes, overwritten by newer nodes Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index b1aaf0550644..9678b2375bed 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1340,7 +1340,9 @@ start: rb->start_time); bio_put(&rb->bio); - if (saw_error && !btree_node_read_error(b)) { + if (saw_error && + !btree_node_read_error(b) && + c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->key.k.p); bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", -- cgit v1.2.3 From 9abb6dd7ce5a261f7aebf6f396b50a63db71133f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 15:17:00 -0400 Subject: bcachefs: Standardize helpers for printing enum strs with bounds checks Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 4 ++-- fs/bcachefs/buckets.h | 8 -------- fs/bcachefs/checksum.c | 23 ++++++++++++++--------- fs/bcachefs/checksum.h | 5 +++-- fs/bcachefs/compress.h | 8 -------- fs/bcachefs/ec.c | 14 ++++++-------- fs/bcachefs/extents.c | 7 ++++--- fs/bcachefs/journal_io.c | 17 +++++++++-------- fs/bcachefs/opts.c | 29 +++++++++++++++++++++++++---- fs/bcachefs/opts.h | 10 ++++++---- 10 files changed, 69 insertions(+), 56 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 364ae42022af..9480f1b44f07 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1314,7 +1314,7 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(write_buffer_keys, 11) \ x(datetime, 12) -enum { +enum bch_jset_entry_type { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, BCH_JSET_ENTRY_TYPES() #undef x @@ -1360,7 +1360,7 @@ struct jset_entry_blacklist_v2 { x(inodes, 1) \ x(key_version, 2) -enum { +enum bch_fs_usage_type { #define x(f, nr) BCH_FS_USAGE_##f = nr, BCH_FS_USAGE_TYPES() #undef x diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 00aaf4bb5139..f9af5adabe83 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -395,14 +395,6 @@ static inline const char *bch2_data_type_str(enum bch_data_type type) : "(invalid data type)"; } -static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type) -{ - if (type < BCH_DATA_NR) - prt_str(out, __bch2_data_types[type]); - else - prt_printf(out, "(invalid data type %u)", type); -} - /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 4701457f6381..7ed779b411f6 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -429,15 +429,20 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, extent_nonce(version, crc_old), bio); if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { - bch_err(c, "checksum error in %s() (memory corruption or bug?)\n" - "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", - __func__, - crc_old.csum.hi, - crc_old.csum.lo, - merged.hi, - merged.lo, - bch2_csum_types[crc_old.csum_type], - bch2_csum_types[new_csum_type]); + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" + "expected %0llx:%0llx got %0llx:%0llx (old type ", + __func__, + crc_old.csum.hi, + crc_old.csum.lo, + merged.hi, + merged.lo); + bch2_prt_csum_type(&buf, crc_old.csum_type); + prt_str(&buf, " new type "); + bch2_prt_csum_type(&buf, new_csum_type); + prt_str(&buf, ")"); + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); return -EIO; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 1b8c2c1016dc..e40499fde9a4 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -61,11 +61,12 @@ static inline void bch2_csum_err_msg(struct printbuf *out, struct bch_csum expected, struct bch_csum got) { - prt_printf(out, "checksum error: got "); + prt_str(out, "checksum error, type "); + bch2_prt_csum_type(out, type); + prt_str(out, ": got "); bch2_csum_to_text(out, type, got); prt_str(out, " should be "); bch2_csum_to_text(out, type, expected); - prt_printf(out, " type %s", bch2_csum_types[type]); } int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 58c2eb45570f..607fd5e232c9 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,14 +47,6 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } -static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type) -{ - if (type < BCH_COMPRESSION_TYPE_NR) - prt_str(out, __bch2_compression_types[type]); - else - prt_printf(out, "(invalid compression type %u)", type); -} - int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c0b2b2f180e0..556a217108d3 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -138,13 +138,13 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, unsigned nr_data = s.nr_blocks - s.nr_redundant; - prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + prt_printf(out, "algo %u sectors %u blocks %u:%u csum ", s.algorithm, le16_to_cpu(s.sectors), nr_data, - s.nr_redundant, - s.csum_type, - 1U << s.csum_granularity_bits); + s.nr_redundant); + bch2_prt_csum_type(out, s.csum_type); + prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); for (unsigned i = 0; i < s.nr_blocks; i++) { const struct bch_extent_ptr *ptr = sp->ptrs + i; @@ -611,10 +611,8 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct printbuf err = PRINTBUF; struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); - prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n", - want.hi, want.lo, - got.hi, got.lo, - bch2_csum_types[v->csum_type]); + prt_str(&err, "stripe "); + bch2_csum_err_msg(&err, v->csum_type, want, got); prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); bch_err_ratelimited(ca, "%s", err.buf); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 36d12d2adb81..1a331e539204 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1030,11 +1030,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", crc.compressed_size, crc.uncompressed_size, - crc.offset, crc.nonce, - bch2_csum_types[crc.csum_type]); + crc.offset, crc.nonce); + bch2_prt_csum_type(out, crc.csum_type); + prt_str(out, " compress "); bch2_prt_compression_type(out, crc.compression_type); break; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 725fcf46f631..9aa28b52ab92 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -247,7 +247,7 @@ static void journal_entry_err_msg(struct printbuf *out, if (entry) { prt_str(out, " type="); - prt_str(out, bch2_jset_entry_types[entry->type]); + bch2_prt_jset_entry_type(out, entry->type); } if (!jset) { @@ -403,7 +403,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs jset_entry_for_each_key(entry, k) { if (!first) { prt_newline(out); - prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + bch2_prt_jset_entry_type(out, entry->type); + prt_str(out, ": "); } prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); @@ -563,9 +564,9 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - prt_printf(out, "type=%s v=%llu", - bch2_fs_usage_types[u->entry.btree_id], - le64_to_cpu(u->v)); + prt_str(out, "type="); + bch2_prt_fs_usage_type(out, u->entry.btree_id); + prt_printf(out, " v=%llu", le64_to_cpu(u->v)); } static int journal_entry_data_usage_validate(struct bch_fs *c, @@ -827,11 +828,11 @@ int bch2_journal_entry_validate(struct bch_fs *c, void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { + bch2_prt_jset_entry_type(out, entry->type); + if (entry->type < BCH_JSET_ENTRY_NR) { - prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + prt_str(out, ": "); bch2_jset_entry_ops[entry->type].to_text(out, c, entry); - } else { - prt_printf(out, "(unknown type %u)", entry->type); } } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index e1800c4119b5..bb068fd72465 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -43,7 +43,7 @@ const char * const __bch2_btree_ids[] = { NULL }; -const char * const bch2_csum_types[] = { +static const char * const __bch2_csum_types[] = { BCH_CSUM_TYPES() NULL }; @@ -53,7 +53,7 @@ const char * const bch2_csum_opts[] = { NULL }; -const char * const __bch2_compression_types[] = { +static const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; @@ -83,18 +83,39 @@ const char * const bch2_member_states[] = { NULL }; -const char * const bch2_jset_entry_types[] = { +static const char * const __bch2_jset_entry_types[] = { BCH_JSET_ENTRY_TYPES() NULL }; -const char * const bch2_fs_usage_types[] = { +static const char * const __bch2_fs_usage_types[] = { BCH_FS_USAGE_TYPES() NULL }; #undef x +static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], + unsigned nr, const char *type, unsigned idx) +{ + if (idx < nr) + prt_str(out, opts[idx]); + else + prt_printf(out, "(unknown %s %u)", type, idx); +} + +#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \ +void bch2_prt_##name(struct printbuf *out, type t) \ +{ \ + prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\ +} + +PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); +PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); +PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); +PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); +PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); + static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 1ac4135cca1c..84e452835a17 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -16,18 +16,20 @@ extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; -extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; -extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; -extern const char * const bch2_jset_entry_types[]; -extern const char * const bch2_fs_usage_types[]; extern const char * const bch2_d_types[]; +void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); +void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); +void bch2_prt_data_type(struct printbuf *, enum bch_data_type); +void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); +void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); + static inline const char *bch2_d_type_str(unsigned d_type) { return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; -- cgit v1.2.3 From 4518e80adfdbfdec1cc79c98bc73677ff44d96bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 14:05:36 -0400 Subject: bcachefs: Go rw if running any explicit recovery passes This fixes a bug where we fail to start when upgrading/downgrading because we forgot we needed to go rw. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index cb501460d615..0cec0f7d9703 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -44,7 +44,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || c->opts.fsck || !c->sb.clean) + if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit) return bch2_fs_read_write_early(c); return 0; } -- cgit v1.2.3 From 82cf18f23e1ae17053983e325e550194f947e1f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 21:07:05 -0400 Subject: bcachefs: Fix deadlock in journal replay btree_key_can_insert_cached() should be checking the watermark - BCH_TRANS_COMMIT_journal_replay really means nonblocking mode when watermark < reclaim, it was being used incorrectly. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 9609ad370d71..bbec91e8e650 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -397,12 +397,13 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags struct bkey_cached *ck = (void *) path->l[0].b; unsigned new_u64s; struct bkey_i *new_k; + unsigned watermark = flags & BCH_WATERMARK_MASK; EBUG_ON(path->level); - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(c) && - !(flags & BCH_TRANS_COMMIT_journal_reclaim)) + if (watermark < BCH_WATERMARK_reclaim && + !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(c)) return -BCH_ERR_btree_insert_need_journal_reclaim; /* -- cgit v1.2.3 From 9e203c43dc1cbaefb3888ee0ba885b2d20d47526 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 00:26:01 -0400 Subject: bcachefs: Fix missing write refs in fs fio paths bch2_journal_flush_seq requires us to have a write ref Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/fs-io-direct.c | 19 +++++++++++++------ fs/bcachefs/fs-io.c | 16 ++++++++-------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a31a5f706929..91c3c1fef233 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -709,6 +709,8 @@ struct btree_trans_buf { x(stripe_delete) \ x(reflink) \ x(fallocate) \ + x(fsync) \ + x(dio_write) \ x(discard) \ x(discard_fast) \ x(invalidate) \ diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index f49e6c0f0f68..b889370a5088 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -387,6 +387,8 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); + bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write); + /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); @@ -590,22 +592,25 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) prefetch(&inode->ei_inode); prefetch((void *) &inode->ei_inode + 64); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write)) + return -EROFS; + inode_lock(&inode->v); ret = generic_write_checks(req, iter); if (unlikely(ret <= 0)) - goto err; + goto err_put_write_ref; ret = file_remove_privs(file); if (unlikely(ret)) - goto err; + goto err_put_write_ref; ret = file_update_time(file); if (unlikely(ret)) - goto err; + goto err_put_write_ref; if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) - goto err; + goto err_put_write_ref; inode_dio_begin(&inode->v); bch2_pagecache_block_get(inode); @@ -645,7 +650,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) } ret = bch2_dio_write_loop(dio); -err: +out: if (locked) inode_unlock(&inode->v); return ret; @@ -653,7 +658,9 @@ err_put_bio: bch2_pagecache_block_put(inode); bio_put(bio); inode_dio_end(&inode->v); - goto err; +err_put_write_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + goto out; } void bch2_fs_fs_io_direct_exit(struct bch_fs *c) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 8c70123b6a0c..20b40477425f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -174,18 +174,18 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, static int bch2_flush_inode(struct bch_fs *c, struct bch_inode_info *inode) { - struct bch_inode_unpacked u; - int ret; - if (c->opts.journal_flush_disabled) return 0; - ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); - if (ret) - return ret; + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) + return -EROFS; - return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: - bch2_inode_flush_nocow_writes(c, inode); + struct bch_inode_unpacked u; + int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: + bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: + bch2_inode_flush_nocow_writes(c, inode); + bch2_write_ref_put(c, BCH_WRITE_REF_fsync); + return ret; } int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) -- cgit v1.2.3 From 9054ef2ea944528b7935d0ce3f540d4dc0bc37ba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 16:13:13 -0400 Subject: bcachefs: Run merges at BCH_WATERMARK_btree This fixes a deadlock where the interior update path during journal replay ends up doing a ton of merges on the backpointers btree, and deadlocking. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 29a5bc7d8789..1a7537ee4329 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1926,6 +1926,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); + flags &= ~BCH_WATERMARK_MASK; + b = trans->paths[path].l[level].b; if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || @@ -2071,6 +2073,10 @@ err: bch2_path_put(trans, new_path, true); bch2_path_put(trans, sib_path, true); bch2_trans_verify_locks(trans); + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) + ret = 0; + if (!ret) + ret = bch2_trans_relock(trans); return ret; err_free_update: bch2_btree_node_free_never_used(as, trans, n); -- cgit v1.2.3 From 3f10048973c8366e27147d72bf3fd8d0e1d929f2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 18:39:03 -0400 Subject: bcachefs: Disable merges from interior update path There's been a bug in the btree write buffer where it wasn't triggering btree node merges - and leaving behind a bunch of nearly empty btree nodes. Then during journal replay, when updates to the backpointers btree aren't using the btree write buffer (because we require synchronization with journal replay), we end up doing those merges all at once. Then if it's the interior update path running them, we deadlock because those run with the highest watermark. There's no real need for the interior update path to be doing btree node merges; other code paths can handle that at lower watermarks. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1a7537ee4329..82ac00aa0f7b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1926,6 +1926,16 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); + /* + * Work around a deadlock caused by the btree write buffer not doing + * merges and leaving tons of merges for us to do - we really don't need + * to be doing merges at all from the interior update path, and if the + * interior update path is generating too many new interior updates we + * deadlock: + */ + if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates) + return 0; + flags &= ~BCH_WATERMARK_MASK; b = trans->paths[path].l[level].b; -- cgit v1.2.3 From 86dbf8c566417afd62087ecd3bedcf84f91ee4e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Dec 2023 22:42:34 -0500 Subject: bcachefs: Fix btree node merging on write buffer btrees The btree write buffer flush fastpath that avoids the main transaction commit path had the unfortunate side effect of not doing btree node merging. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index baf63e2fddb6..36a6f42aba5e 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -316,6 +316,16 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { bch2_btree_node_unlock_write(trans, path, path->l[0].b); write_locked = false; + + ret = lockrestart_do(trans, + bch2_btree_iter_traverse(&iter) ?: + bch2_foreground_maybe_merge(trans, iter.path, 0, + BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc)); + if (ret) + goto err; } } @@ -382,10 +392,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) ret = commit_do(trans, NULL, NULL, BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_journal_res| - BCH_TRANS_COMMIT_journal_reclaim, + BCH_TRANS_COMMIT_no_journal_res , btree_write_buffered_insert(trans, i)); if (ret) goto err; -- cgit v1.2.3 From 69129794d94c544810e68b2b4eaa7e44063f9bf2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 12 Apr 2024 11:10:33 -0700 Subject: x86/bugs: Fix BHI retpoline check Confusingly, X86_FEATURE_RETPOLINE doesn't mean retpolines are enabled, as it also includes the original "AMD retpoline" which isn't a retpoline at all. Also replace cpu_feature_enabled() with boot_cpu_has() because this is before alternatives are patched and cpu_feature_enabled()'s fallback path is slower than plain old boot_cpu_has(). Fixes: ec9404e40e8f ("x86/bhi: Add BHI mitigation knob") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Pawan Gupta Cc: Borislav Petkov Cc: Linus Torvalds Link: https://lore.kernel.org/r/ad3807424a3953f0323c011a643405619f2a4927.1712944776.git.jpoimboe@kernel.org --- arch/x86/kernel/cpu/bugs.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ca295b0c1eee..ab18185894df 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1652,7 +1652,8 @@ static void __init bhi_select_mitigation(void) return; /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { + if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) { spec_ctrl_disable_kernel_rrsba(); if (rrsba_disabled) return; @@ -2804,11 +2805,13 @@ static const char *spectre_bhi_state(void) { if (!boot_cpu_has_bug(X86_BUG_BHI)) return "; BHI: Not affected"; - else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) return "; BHI: BHI_DIS_S"; - else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) return "; BHI: SW loop, KVM: SW loop"; - else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && rrsba_disabled) + else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) && + rrsba_disabled) return "; BHI: Retpoline"; else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) return "; BHI: Vulnerable, KVM: SW loop"; -- cgit v1.2.3 From 16b52bbee4823b01ab7fe3919373c981a38f3797 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 5 Apr 2024 17:56:35 +0300 Subject: kernfs: annotate different lockdep class for of->mutex of writable files The writable file /sys/power/resume may call vfs lookup helpers for arbitrary paths and readonly files can be read by overlayfs from vfs helpers when sysfs is a lower layer of overalyfs. To avoid a lockdep warning of circular dependency between overlayfs inode lock and kernfs of->mutex, use a different lockdep class for writable and readonly kernfs files. Reported-by: syzbot+9a5b0ced8b1bfb238b56@syzkaller.appspotmail.com Fixes: 0fedefd4c4e3 ("kernfs: sysfs: support custom llseek method for sysfs entries") Suggested-by: Al Viro Signed-off-by: Amir Goldstein Signed-off-by: Al Viro --- fs/kernfs/file.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e9df2f87072c..8502ef68459b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -636,11 +636,18 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * - * Both paths of the branch look the same. They're supposed to + * For similar reasons, writable and readonly files are given different + * lockdep key, because the writable file /sys/power/resume may call vfs + * lookup helpers for arbitrary paths and readonly files can be read by + * overlayfs from vfs helpers when sysfs is a lower layer of overalyfs. + * + * All three cases look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); + else if (file->f_mode & FMODE_WRITE) + mutex_init(&of->mutex); else mutex_init(&of->mutex); -- cgit v1.2.3 From 89f9a1e876b5a7ad884918c03a46831af202c8a0 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Sun, 14 Apr 2024 19:49:45 +0800 Subject: bootconfig: use memblock_free_late to free xbc memory to buddy On the time to free xbc memory in xbc_exit(), memblock may has handed over memory to buddy allocator. So it doesn't make sense to free memory back to memblock. memblock_free() called by xbc_exit() even causes UAF bugs on architectures with CONFIG_ARCH_KEEP_MEMBLOCK disabled like x86. Following KASAN logs shows this case. This patch fixes the xbc memory free problem by calling memblock_free() in early xbc init error rewind path and calling memblock_free_late() in xbc exit path to free memory to buddy allocator. [ 9.410890] ================================================================== [ 9.418962] BUG: KASAN: use-after-free in memblock_isolate_range+0x12d/0x260 [ 9.426850] Read of size 8 at addr ffff88845dd30000 by task swapper/0/1 [ 9.435901] CPU: 9 PID: 1 Comm: swapper/0 Tainted: G U 6.9.0-rc3-00208-g586b5dfb51b9 #5 [ 9.446403] Hardware name: Intel Corporation RPLP LP5 (CPU:RaptorLake)/RPLP LP5 (ID:13), BIOS IRPPN02.01.01.00.00.19.015.D-00000000 Dec 28 2023 [ 9.460789] Call Trace: [ 9.463518] [ 9.465859] dump_stack_lvl+0x53/0x70 [ 9.469949] print_report+0xce/0x610 [ 9.473944] ? __virt_addr_valid+0xf5/0x1b0 [ 9.478619] ? memblock_isolate_range+0x12d/0x260 [ 9.483877] kasan_report+0xc6/0x100 [ 9.487870] ? memblock_isolate_range+0x12d/0x260 [ 9.493125] memblock_isolate_range+0x12d/0x260 [ 9.498187] memblock_phys_free+0xb4/0x160 [ 9.502762] ? __pfx_memblock_phys_free+0x10/0x10 [ 9.508021] ? mutex_unlock+0x7e/0xd0 [ 9.512111] ? __pfx_mutex_unlock+0x10/0x10 [ 9.516786] ? kernel_init_freeable+0x2d4/0x430 [ 9.521850] ? __pfx_kernel_init+0x10/0x10 [ 9.526426] xbc_exit+0x17/0x70 [ 9.529935] kernel_init+0x38/0x1e0 [ 9.533829] ? _raw_spin_unlock_irq+0xd/0x30 [ 9.538601] ret_from_fork+0x2c/0x50 [ 9.542596] ? __pfx_kernel_init+0x10/0x10 [ 9.547170] ret_from_fork_asm+0x1a/0x30 [ 9.551552] [ 9.555649] The buggy address belongs to the physical page: [ 9.561875] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x45dd30 [ 9.570821] flags: 0x200000000000000(node=0|zone=2) [ 9.576271] page_type: 0xffffffff() [ 9.580167] raw: 0200000000000000 ffffea0011774c48 ffffea0012ba1848 0000000000000000 [ 9.588823] raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 [ 9.597476] page dumped because: kasan: bad access detected [ 9.605362] Memory state around the buggy address: [ 9.610714] ffff88845dd2ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 9.618786] ffff88845dd2ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 9.626857] >ffff88845dd30000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.634930] ^ [ 9.638534] ffff88845dd30080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.646605] ffff88845dd30100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.654675] ================================================================== Link: https://lore.kernel.org/all/20240414114944.1012359-1-qiang4.zhang@linux.intel.com/ Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed") Cc: Stable@vger.kernel.org Signed-off-by: Qiang Zhang Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- include/linux/bootconfig.h | 7 ++++++- lib/bootconfig.c | 19 +++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index e5ee2c694401..3f4b4ac527ca 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -288,7 +288,12 @@ int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos); int __init xbc_get_info(int *node_size, size_t *data_size); /* XBC cleanup data structures */ -void __init xbc_exit(void); +void __init _xbc_exit(bool early); + +static inline void xbc_exit(void) +{ + _xbc_exit(false); +} /* XBC embedded bootconfig data in kernel */ #ifdef CONFIG_BOOT_CONFIG_EMBED diff --git a/lib/bootconfig.c b/lib/bootconfig.c index c59d26068a64..8841554432d5 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -61,9 +61,12 @@ static inline void * __init xbc_alloc_mem(size_t size) return memblock_alloc(size, SMP_CACHE_BYTES); } -static inline void __init xbc_free_mem(void *addr, size_t size) +static inline void __init xbc_free_mem(void *addr, size_t size, bool early) { - memblock_free(addr, size); + if (early) + memblock_free(addr, size); + else if (addr) + memblock_free_late(__pa(addr), size); } #else /* !__KERNEL__ */ @@ -73,7 +76,7 @@ static inline void *xbc_alloc_mem(size_t size) return malloc(size); } -static inline void xbc_free_mem(void *addr, size_t size) +static inline void xbc_free_mem(void *addr, size_t size, bool early) { free(addr); } @@ -904,13 +907,13 @@ static int __init xbc_parse_tree(void) * If you need to reuse xbc_init() with new boot config, you can * use this. */ -void __init xbc_exit(void) +void __init _xbc_exit(bool early) { - xbc_free_mem(xbc_data, xbc_data_size); + xbc_free_mem(xbc_data, xbc_data_size, early); xbc_data = NULL; xbc_data_size = 0; xbc_node_num = 0; - xbc_free_mem(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX); + xbc_free_mem(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX, early); xbc_nodes = NULL; brace_index = 0; } @@ -963,7 +966,7 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) if (!xbc_nodes) { if (emsg) *emsg = "Failed to allocate bootconfig nodes"; - xbc_exit(); + _xbc_exit(true); return -ENOMEM; } memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX); @@ -977,7 +980,7 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) *epos = xbc_err_pos; if (emsg) *emsg = xbc_err_msg; - xbc_exit(); + _xbc_exit(true); } else ret = xbc_node_num; -- cgit v1.2.3 From 1382e3b6a3500c245e5278c66d210c02926f804f Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Thu, 11 Apr 2024 08:11:24 +0300 Subject: net: change maximum number of UDP segments to 128 The commit fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") adds check of potential number of UDP segments vs UDP_MAX_SEGMENTS in linux/virtio_net.h. After this change certification test of USO guest-to-guest transmit on Windows driver for virtio-net device fails, for example with packet size of ~64K and mss of 536 bytes. In general the USO should not be more restrictive than TSO. Indeed, in case of unreasonably small mss a lot of segments can cause queue overflow and packet loss on the destination. Limit of 128 segments is good for any practical purpose, with minimal meaningful mss of 536 the maximal UDP packet will be divided to ~120 segments. The number of segments for UDP packets is validated vs UDP_MAX_SEGMENTS also in udp.c (v4,v6), this does not affect quest-to-guest path but does affect packets sent to host, for example. It is important to mention that UDP_MAX_SEGMENTS is kernel-only define and not available to user mode socket applications. In order to request MSS smaller than MTU the applications just uses setsockopt with SOL_UDP and UDP_SEGMENT and there is no limitations on socket API level. Fixes: fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") Signed-off-by: Yuri Benditovich Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 2 +- tools/testing/selftests/net/udpgso.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 17539d089666..e398e1dbd2d3 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,7 +108,7 @@ struct udp_sock { #define udp_assign_bit(nr, sk, val) \ assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val) -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk) diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 1d975bf52af3..85b3baa3f7f3 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -34,7 +34,7 @@ #endif #ifndef UDP_MAX_SEGMENTS -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #endif #define CONST_MTU_TEST 1500 -- cgit v1.2.3 From 8541323285994528ad5be2c1bdc759e6c83b936e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 4 Apr 2024 21:05:14 -0300 Subject: iommufd: Add missing IOMMUFD_DRIVER kconfig for the selftest Some kconfigs don't automatically include this symbol which results in sub functions for some of the dirty tracking related things that are non-functional. Thus the test suite will fail. select IOMMUFD_DRIVER in the IOMMUFD_TEST kconfig to fix it. Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Link: https://lore.kernel.org/r/20240327182050.GA1363414@ziepe.ca Tested-by: Muhammad Usama Anjum Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 99d4b075df49..76656fe0470d 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -37,6 +37,7 @@ config IOMMUFD_TEST depends on DEBUG_KERNEL depends on FAULT_INJECTION depends on RUNTIME_TESTING_MENU + select IOMMUFD_DRIVER default n help This is dangerous, do not enable unless running -- cgit v1.2.3 From 2760c51b8040d7cffedc337939e7475a17cc4b19 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 25 Mar 2024 14:00:48 +0500 Subject: iommufd: Add config needed for iommufd_fail_nth Add FAULT_INJECTION_DEBUG_FS and FAILSLAB configurations to the kconfig fragment for the iommfd selftests. These kconfigs are needed by the iommufd_fail_nth test. Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Link: https://lore.kernel.org/r/20240325090048.1423908-1-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/iommu/config b/tools/testing/selftests/iommu/config index 110d73917615..02a2a1b267c1 100644 --- a/tools/testing/selftests/iommu/config +++ b/tools/testing/selftests/iommu/config @@ -1,3 +1,5 @@ CONFIG_IOMMUFD=y +CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION=y CONFIG_IOMMUFD_TEST=y +CONFIG_FAILSLAB=y -- cgit v1.2.3 From 0bbac3facb5d6cc0171c45c9873a2dc96bea9680 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Apr 2024 13:38:39 -0700 Subject: Linux 6.9-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e1bf12891cb0..59d8a7f95d0a 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- cgit v1.2.3 From bceb86be9e970bc6d77bfd1f8e5272d9a2007c16 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 23:59:06 -0400 Subject: bcachefs: add missing bounds check in __bch2_bkey_val_invalid() Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5e52684764eb..e4481e4d5f38 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -175,7 +175,10 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", - bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]); + bch2_btree_node_type_str(type), + k.k->type < KEY_TYPE_MAX + ? bch2_bkey_types[k.k->type] + : "(unknown)"); if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { bkey_fsck_err_on(k.k->size == 0, c, err, -- cgit v1.2.3 From d789e9a7d5e2799f4d5425b0b620210d2fcad529 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 23:59:28 -0400 Subject: bcachefs: Interior known are required to have known key types For forwards compatibilyt, we allow bkeys of unknown type in leaf nodes; we can simply ignore metadata we don't understand. Pointers to btree nodes must always be of known types, howwever. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index e4481e4d5f38..db336a43fc08 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -171,7 +171,8 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (type >= BKEY_TYPE_NR) return 0; - bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && + bkey_fsck_err_on((type == BKEY_TYPE_btree || + (flags & BKEY_INVALID_COMMIT)) && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", -- cgit v1.2.3 From 8cf2036e7b557282667a437d409e6307d55366ab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 15:34:14 -0400 Subject: bcachefs: add safety checks in bch2_btree_node_fill() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index c7f156320a35..c347d08d80bc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -711,7 +711,30 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct btree *b; u32 seq; - BUG_ON(level + 1 >= BTREE_MAX_DEPTH); + if (unlikely(level >= BTREE_MAX_DEPTH)) { + int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", + level, BTREE_MAX_DEPTH); + return ERR_PTR(ret); + } + + if (unlikely(!bkey_is_btree_ptr(&k->k))) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + + if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf); + printbuf_exit(&buf); + return ERR_PTR(ret); + } + /* * Parent node must be locked, else we could read in a btree node that's * been freed: -- cgit v1.2.3 From e879389f5777de5d94f27f8d88d7e92341afa3ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 15:54:33 -0400 Subject: bcachefs: Fix bch2_btree_node_fill() for !path We shouldn't be doing the unlock/relock dance when we're not using a path - this fixes an assertion pop when called from btree node scan. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index c347d08d80bc..02c70e813fac 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -709,7 +709,6 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - u32 seq; if (unlikely(level >= BTREE_MAX_DEPTH)) { int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u", @@ -775,34 +774,26 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } set_btree_node_read_in_flight(b); - six_unlock_write(&b->c.lock); - seq = six_lock_seq(&b->c.lock); - six_unlock_intent(&b->c.lock); - /* Unlock before doing IO: */ - if (path && sync) - bch2_trans_unlock_noassert(trans); - - bch2_btree_node_read(trans, b, sync); + if (path) { + u32 seq = six_lock_seq(&b->c.lock); - if (!sync) - return NULL; + /* Unlock before doing IO: */ + six_unlock_intent(&b->c.lock); + bch2_trans_unlock_noassert(trans); - if (path) { - int ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - BUG_ON(!trans->restarted); - return ERR_PTR(ret); - } - } + bch2_btree_node_read(trans, b, sync); - if (!six_relock_type(&b->c.lock, lock_type, seq)) { - BUG_ON(!path); + if (!sync) + return NULL; - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); + if (!six_relock_type(&b->c.lock, lock_type, seq)) + b = NULL; + } else { + bch2_btree_node_read(trans, b, sync); + if (lock_type == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); } return b; @@ -1135,18 +1126,19 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; - struct btree *b; BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); - b = btree_cache_find(bc, k); + struct btree *b = btree_cache_find(bc, k); if (b) return 0; b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); - return PTR_ERR_OR_ZERO(b); + if (!IS_ERR_OR_NULL(b)) + six_unlock_read(&b->c.lock); + return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b); } void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) -- cgit v1.2.3 From bdae2a7e6020e1cf864a30dab2af323575569dc7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 22:43:11 -0400 Subject: bcachefs: sysfs internal/trigger_journal_flush Add a sysfs knob for immediately flushing the entire journal. Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index b18b0cc81b59..5be92fe3f4ea 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -25,6 +25,7 @@ #include "ec.h" #include "inode.h" #include "journal.h" +#include "journal_reclaim.h" #include "keylist.h" #include "move.h" #include "movinggc.h" @@ -138,6 +139,7 @@ do { \ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); +write_attribute(trigger_journal_flush); write_attribute(prune_cache); write_attribute(btree_wakeup); rw_attribute(btree_gc_periodic); @@ -500,7 +502,7 @@ STORE(bch2_fs) /* Debugging: */ - if (!test_bit(BCH_FS_rw, &c->flags)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) return -EROFS; if (attr == &sysfs_prune_cache) { @@ -533,6 +535,11 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_invalidates) bch2_do_invalidates(c); + if (attr == &sysfs_trigger_journal_flush) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); + } + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -553,6 +560,7 @@ STORE(bch2_fs) size = ret; } #endif + bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return size; } SYSFS_OPS(bch2_fs); @@ -651,6 +659,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_gc, &sysfs_trigger_discards, &sysfs_trigger_invalidates, + &sysfs_trigger_journal_flush, &sysfs_prune_cache, &sysfs_btree_wakeup, -- cgit v1.2.3 From 27c15ed297cb71c2e7a839439b5a097081a32605 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2024 18:45:47 -0400 Subject: bcachefs: bch_member.btree_allocated_bitmap This adds a small (64 bit) per-device bitmap that tracks ranges that have btree nodes, for accelerating btree node scan if it is ever needed. - New helpers, bch2_dev_btree_bitmap_marked() and bch2_dev_bitmap_mark(), for checking and updating the bitmap - Interior btree update path updates the bitmaps when required - The check_allocations pass has a new fsck_err check, btree_bitmap_not_marked - New on disk format version, mi_btree_mitmap, which indicates the new bitmap is present - Upgrade table lists the required recovery pass and expected fsck error - Btree node scan uses the bitmap to skip ranges if we're on the new version Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 7 +++-- fs/bcachefs/btree_gc.c | 13 +++++++++ fs/bcachefs/btree_node_scan.c | 9 +++++-- fs/bcachefs/btree_update_interior.c | 24 +++++++++++++++++ fs/bcachefs/sb-downgrade.c | 5 +++- fs/bcachefs/sb-errors_types.h | 3 ++- fs/bcachefs/sb-members.c | 53 +++++++++++++++++++++++++++++++++++++ fs/bcachefs/sb-members.h | 21 +++++++++++++++ fs/bcachefs/super_types.h | 2 ++ 9 files changed, 131 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 9480f1b44f07..085987435a5e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -578,7 +578,8 @@ struct bch_member { __le64 nbuckets; /* device size */ __le16 first_bucket; /* index of first bucket used */ __le16 bucket_size; /* sectors */ - __le32 pad; + __u8 btree_bitmap_shift; + __u8 pad[3]; __le64 last_mount; /* time_t */ __le64 flags; @@ -587,6 +588,7 @@ struct bch_member { __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; __le64 errors_reset_time; __le64 seq; + __le64 btree_allocated_bitmap; }; #define BCH_MEMBER_V1_BYTES 56 @@ -876,7 +878,8 @@ struct bch_sb_field_downgrade { x(rebalance_work, BCH_VERSION(1, 3)) \ x(member_seq, BCH_VERSION(1, 4)) \ x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ - x(btree_subvolume_children, BCH_VERSION(1, 6)) + x(btree_subvolume_children, BCH_VERSION(1, 6)) \ + x(mi_btree_bitmap, BCH_VERSION(1, 7)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index d2555da55c6d..ecbd9598f69f 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -828,6 +828,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + struct printbuf buf = PRINTBUF; int ret = 0; deleted.p = k->k->p; @@ -848,11 +849,23 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (ret) goto err; + if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k), + c, btree_bitmap_not_marked, + "btree ptr not marked in member info btree allocated bitmap\n %s", + (bch2_bkey_val_to_text(&buf, c, *k), + buf.buf))) { + mutex_lock(&c->sb_lock); + bch2_dev_btree_bitmap_mark(c, *k); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + ret = commit_do(trans, NULL, NULL, 0, bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); fsck_err: err: + printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 20f2b37c4474..866bd278439f 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -205,8 +205,13 @@ static int read_btree_nodes_worker(void *p) last_print = jiffies; } - try_read_btree_node(w->f, ca, bio, buf, - bucket * ca->mi.bucket_size + bucket_offset); + u64 sector = bucket * ca->mi.bucket_size + bucket_offset; + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && + !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) + continue; + + try_read_btree_node(w->f, ca, bio, buf, sector); } err: bio_put(bio); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 82ac00aa0f7b..6030c396754f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -21,6 +21,7 @@ #include "keylist.h" #include "recovery_passes.h" #include "replicas.h" +#include "sb-members.h" #include "super-io.h" #include "trace.h" @@ -605,6 +606,26 @@ static void btree_update_add_key(struct btree_update *as, bch2_keylist_push(keys); } +static bool btree_update_new_nodes_marked_sb(struct btree_update *as) +{ + for_each_keylist_key(&as->new_keys, k) + if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k))) + return false; + return true; +} + +static void btree_update_new_nodes_mark_sb(struct btree_update *as) +{ + struct bch_fs *c = as->c; + + mutex_lock(&c->sb_lock); + for_each_keylist_key(&as->new_keys, k) + bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k)); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + /* * The transactional part of an interior btree node update, where we journal the * update we did to the interior node and update alloc info: @@ -662,6 +683,9 @@ static void btree_update_nodes_written(struct btree_update *as) if (ret) goto err; + if (!btree_update_new_nodes_marked_sb(as)) + btree_update_new_nodes_mark_sb(as); + /* * Wait for any in flight writes to finish before we free the old nodes * on disk: diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index d6f81179c3a2..a98ef940b7a3 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -51,7 +51,10 @@ BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ x(btree_subvolume_children, \ BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ - BCH_FSCK_ERR_subvol_children_not_set) + BCH_FSCK_ERR_subvol_children_not_set) \ + x(mi_btree_bitmap, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_btree_bitmap_not_marked) #define DOWNGRADE_TABLE() diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index d7d609131030..5b600c6d7aca 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -270,7 +270,8 @@ x(btree_ptr_v2_min_key_bad, 262) \ x(btree_root_unreadable_and_scan_found_nothing, 263) \ x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) + x(dup_backpointer_to_bad_csum_extent, 265) \ + x(btree_bitmap_not_marked, 266) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index eff5ce18c69c..522a969345e5 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "disk_groups.h" #include "opts.h" #include "replicas.h" @@ -426,3 +427,55 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bch2_write_super(c); mutex_unlock(&c->sb_lock); } + +/* + * Per member "range has btree nodes" bitmap: + * + * This is so that if we ever have to run the btree node scan to repair we don't + * have to scan full devices: + */ + +bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) +{ + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) + if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev), + ptr->offset, btree_sectors(c))) + return false; + return true; +} + +static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, + u64 start, unsigned sectors) +{ + struct bch_member *m = __bch2_members_v2_get_mut(mi, dev); + u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap); + + u64 end = start + sectors; + + int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6); + if (resize > 0) { + u64 new_bitmap = 0; + + for (unsigned i = 0; i < 64; i++) + if (bitmap & BIT_ULL(i)) + new_bitmap |= BIT_ULL(i >> resize); + bitmap = new_bitmap; + m->btree_bitmap_shift += resize; + } + + for (unsigned bit = sectors >> m->btree_bitmap_shift; + bit << m->btree_bitmap_shift < end; + bit++) + bitmap |= BIT_ULL(bit); + + m->btree_allocated_bitmap = cpu_to_le64(bitmap); +} + +void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) + __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index be0a94183271..b27c3e4467cf 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -3,6 +3,7 @@ #define _BCACHEFS_SB_MEMBERS_H #include "darray.h" +#include "bkey_types.h" extern char * const bch2_member_error_strs[]; @@ -220,6 +221,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), .valid = bch2_member_exists(mi), + .btree_bitmap_shift = mi->btree_bitmap_shift, + .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), }; } @@ -228,4 +231,22 @@ void bch2_sb_members_from_cpu(struct bch_fs *); void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); void bch2_dev_errors_reset(struct bch_dev *); +static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors) +{ + u64 end = start + sectors; + + if (end > 64 << ca->mi.btree_bitmap_shift) + return false; + + for (unsigned bit = sectors >> ca->mi.btree_bitmap_shift; + bit << ca->mi.btree_bitmap_shift < end; + bit++) + if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit))) + return false; + return true; +} + +bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); +void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); + #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index ec784d975f66..11bcef170c2c 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -37,6 +37,8 @@ struct bch_member_cpu { u8 durability; u8 freespace_initialized; u8 valid; + u8 btree_bitmap_shift; + u64 btree_allocated_bitmap; }; #endif /* _BCACHEFS_SUPER_TYPES_H */ -- cgit v1.2.3 From f0a73d4fde5b285d94a702026216d9fd1fd2733d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 14 Apr 2024 00:51:48 -0400 Subject: bcachefs: Check for backpointer bucket_offset >= bucket size Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 8 +++++--- fs/bcachefs/backpointers.h | 9 +++------ fs/bcachefs/sb-errors_types.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 114328acde72..fadb1078903d 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -49,13 +49,15 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, if (!bch2_dev_exists2(c, bp.k->p.inode)) return 0; + struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode); struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); int ret = 0; - bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), + bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || + !bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), c, err, - backpointer_pos_wrong, - "backpointer at wrong pos"); + backpointer_bucket_offset_wrong, + "backpointer bucket_offset wrong"); fsck_err: return ret; } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index da012ca7daee..85949b9fd880 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -53,14 +53,11 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, u64 bucket_offset) { struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); - struct bpos ret; - - ret = POS(bucket.inode, - (bucket_to_sector(ca, bucket.offset) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); + struct bpos ret = POS(bucket.inode, + (bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); - return ret; } diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 5b600c6d7aca..4ca6e7b0d8aa 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -130,7 +130,7 @@ x(bucket_gens_nonzero_for_invalid_buckets, 122) \ x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ x(need_discard_freespace_key_bad, 124) \ - x(backpointer_pos_wrong, 125) \ + x(backpointer_bucket_offset_wrong, 125) \ x(backpointer_to_missing_device, 126) \ x(backpointer_to_missing_alloc, 127) \ x(backpointer_to_missing_ptr, 128) \ -- cgit v1.2.3 From 356952b13af5b2c338df1e06889fd1b5e12cbbf4 Mon Sep 17 00:00:00 2001 From: bolan wang Date: Wed, 6 Mar 2024 19:03:39 +0800 Subject: USB: serial: option: add Fibocom FM135-GL variants Update the USB serial option driver support for the Fibocom FM135-GL LTE modules. - VID:PID 2cb7:0115, FM135-GL for laptop debug M.2 cards(with MBIM interface for /Linux/Chrome OS) 0x0115: mbim, diag, at, pipe Here are the outputs of usb-devices: T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 16 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2cb7 ProdID=0115 Rev=05.15 S: Manufacturer=Fibocom Wireless Inc. S: Product=Fibocom Module S: SerialNumber=12345678 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: bolan wang Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 55a65d941ccb..cedf93a3eaea 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2272,6 +2272,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x010b, 0xff, 0xff, 0x30) }, /* Fibocom FG150 Diag */ { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x010b, 0xff, 0, 0) }, /* Fibocom FG150 AT */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0111, 0xff) }, /* Fibocom FM160 (MBIM mode) */ + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0115, 0xff), /* Fibocom FM135 (laptop MBIM) */ + .driver_info = RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a0, 0xff) }, /* Fibocom NL668-AM/NL652-EU (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a2, 0xff) }, /* Fibocom FM101-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a3, 0xff) }, /* Fibocom FM101-GL (laptop MBIM) */ -- cgit v1.2.3 From c840244aba7ad2b83ed904378b36bd6aef25511c Mon Sep 17 00:00:00 2001 From: Jerry Meng Date: Mon, 15 Apr 2024 15:04:29 +0800 Subject: USB: serial: option: support Quectel EM060K sub-models EM060K_129, EM060K_12a, EM060K_12b and EM0060K_12c are EM060K's sub-models, having the same name "Quectel EM060K-GL" and the same interface layout. MBIM + GNSS + DIAG + NMEA + AT + QDSS + DPL T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=02 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=0129 Rev= 5.04 S: Manufacturer=Quectel S: Product=Quectel EM060K-GL S: SerialNumber=f6fa08b6 C:* #Ifs= 8 Cfg#= 1 Atr=a0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8f(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Jerry Meng Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index cedf93a3eaea..16cf0ddaae9b 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -255,6 +255,10 @@ static void option_instat_callback(struct urb *urb); #define QUECTEL_PRODUCT_EM061K_LMS 0x0124 #define QUECTEL_PRODUCT_EC25 0x0125 #define QUECTEL_PRODUCT_EM060K_128 0x0128 +#define QUECTEL_PRODUCT_EM060K_129 0x0129 +#define QUECTEL_PRODUCT_EM060K_12a 0x012a +#define QUECTEL_PRODUCT_EM060K_12b 0x012b +#define QUECTEL_PRODUCT_EM060K_12c 0x012c #define QUECTEL_PRODUCT_EG91 0x0191 #define QUECTEL_PRODUCT_EG95 0x0195 #define QUECTEL_PRODUCT_BG96 0x0296 @@ -1218,6 +1222,18 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_128, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_128, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_128, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_129, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_129, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_129, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12a, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12a, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12a, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12b, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12b, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12b, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12c, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12c, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K_12c, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0xff, 0x40) }, -- cgit v1.2.3 From 4864a6dd8320ad856698f93009c89f66ccb1653f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 7 Apr 2024 18:57:56 +0300 Subject: fuse: fix wrong ff->iomode state changes from parallel dio write There is a confusion with fuse_file_uncached_io_{start,end} interface. These helpers do two things when called from passthrough open()/release(): 1. Take/drop negative refcount of fi->iocachectr (inode uncached io mode) 2. State change ff->iomode IOM_NONE <-> IOM_UNCACHED (file uncached open) The calls from parallel dio write path need to take a reference on fi->iocachectr, but they should not be changing ff->iomode state, because in this case, the fi->iocachectr reference does not stick around until file release(). Factor out helpers fuse_inode_uncached_io_{start,end}, to be used from parallel dio write path and rename fuse_file_*cached_io_{start,end} helpers to fuse_file_*cached_io_{open,release} to clarify the difference. Fixes: 205c1d802683 ("fuse: allow parallel dio writes with FUSE_DIRECT_IO_ALLOW_MMAP") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 12 +++++++----- fs/fuse/fuse_i.h | 7 ++++--- fs/fuse/inode.c | 1 + fs/fuse/iomode.c | 56 +++++++++++++++++++++++++++++++++++++++----------------- 4 files changed, 51 insertions(+), 25 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a56e7bffd000..b57ce4157640 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1362,7 +1362,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, bool *exclusive) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_file *ff = iocb->ki_filp->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); if (*exclusive) { @@ -1377,7 +1377,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, * have raced, so check it again. */ if (fuse_io_past_eof(iocb, from) || - fuse_file_uncached_io_start(inode, ff, NULL) != 0) { + fuse_inode_uncached_io_start(fi, NULL) != 0) { inode_unlock_shared(inode); inode_lock(inode); *exclusive = true; @@ -1388,13 +1388,13 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_file *ff = iocb->ki_filp->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); if (exclusive) { inode_unlock(inode); } else { /* Allow opens in caching mode after last parallel dio end */ - fuse_file_uncached_io_end(inode, ff); + fuse_inode_uncached_io_end(fi); inode_unlock_shared(inode); } } @@ -2574,8 +2574,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) * First mmap of direct_io file enters caching inode io mode. * Also waits for parallel dio writers to go into serial mode * (exclusive instead of shared lock). + * After first mmap, the inode stays in caching io mode until + * the direct_io file release. */ - rc = fuse_file_cached_io_start(inode, ff); + rc = fuse_file_cached_io_open(inode, ff); if (rc) return rc; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b24084b60864..f23919610313 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1394,9 +1394,10 @@ int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); /* iomode.c */ -int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff); -int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb); -void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff); +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff); +int fuse_inode_uncached_io_start(struct fuse_inode *fi, + struct fuse_backing *fb); +void fuse_inode_uncached_io_end(struct fuse_inode *fi); int fuse_file_io_open(struct file *file, struct inode *inode); void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3a5d88878335..99e44ea7d875 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -175,6 +175,7 @@ static void fuse_evict_inode(struct inode *inode) } } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { + WARN_ON(fi->iocachectr != 0); WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); } diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index c653ddcf0578..98f1fd523dae 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -21,12 +21,13 @@ static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) } /* - * Start cached io mode. + * Called on cached file open() and on first mmap() of direct_io file. + * Takes cached_io inode mode reference to be dropped on file release. * * Blocks new parallel dio writes and waits for the in-progress parallel dio * writes to complete. */ -int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -67,10 +68,9 @@ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) return 0; } -static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) +static void fuse_file_cached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) { - struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fi->lock); WARN_ON(fi->iocachectr <= 0); WARN_ON(ff->iomode != IOM_CACHED); @@ -82,9 +82,8 @@ static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) } /* Start strictly uncached io mode where cache access is not allowed */ -int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb) +int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) { - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_backing *oldfb; int err = 0; @@ -99,9 +98,7 @@ int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struc err = -ETXTBSY; goto unlock; } - WARN_ON(ff->iomode != IOM_NONE); fi->iocachectr--; - ff->iomode = IOM_UNCACHED; /* fuse inode holds a single refcount of backing file */ if (!oldfb) { @@ -115,15 +112,29 @@ unlock: return err; } -void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) +/* Takes uncached_io inode mode reference to be dropped on file release */ +static int fuse_file_uncached_io_open(struct inode *inode, + struct fuse_file *ff, + struct fuse_backing *fb) { struct fuse_inode *fi = get_fuse_inode(inode); + int err; + + err = fuse_inode_uncached_io_start(fi, fb); + if (err) + return err; + + WARN_ON(ff->iomode != IOM_NONE); + ff->iomode = IOM_UNCACHED; + return 0; +} + +void fuse_inode_uncached_io_end(struct fuse_inode *fi) +{ struct fuse_backing *oldfb = NULL; spin_lock(&fi->lock); WARN_ON(fi->iocachectr >= 0); - WARN_ON(ff->iomode != IOM_UNCACHED); - ff->iomode = IOM_NONE; fi->iocachectr++; if (!fi->iocachectr) { wake_up(&fi->direct_io_waitq); @@ -134,6 +145,15 @@ void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) fuse_backing_put(oldfb); } +/* Drop uncached_io reference from passthrough open */ +static void fuse_file_uncached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) +{ + WARN_ON(ff->iomode != IOM_UNCACHED); + ff->iomode = IOM_NONE; + fuse_inode_uncached_io_end(fi); +} + /* * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write @@ -163,7 +183,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file) return PTR_ERR(fb); /* First passthrough file open denies caching inode io mode */ - err = fuse_file_uncached_io_start(inode, ff, fb); + err = fuse_file_uncached_io_open(inode, ff, fb); if (!err) return 0; @@ -216,7 +236,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode) if (ff->open_flags & FOPEN_PASSTHROUGH) err = fuse_file_passthrough_open(inode, file); else - err = fuse_file_cached_io_start(inode, ff); + err = fuse_file_cached_io_open(inode, ff); if (err) goto fail; @@ -236,8 +256,10 @@ fail: /* No more pending io and no new io possible to inode via open/mmapped file */ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + /* - * Last parallel dio close allows caching inode io mode. + * Last passthrough file close allows caching inode io mode. * Last caching file close exits caching inode io mode. */ switch (ff->iomode) { @@ -245,10 +267,10 @@ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) /* Nothing to do */ break; case IOM_UNCACHED: - fuse_file_uncached_io_end(inode, ff); + fuse_file_uncached_io_release(ff, fi); break; case IOM_CACHED: - fuse_file_cached_io_end(inode, ff); + fuse_file_cached_io_release(ff, fi); break; } } -- cgit v1.2.3 From 7cc911262835419fe469ebfae89891c0e97c62ef Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 7 Apr 2024 18:57:57 +0300 Subject: fuse: fix parallel dio write on file open in passthrough mode Parallel dio write takes a negative refcount of fi->iocachectr and so does open of file in passthrough mode. The refcount of passthrough mode is associated with attach/detach of a fuse_backing object to fuse inode. For parallel dio write, the backing file is irrelevant, so the call to fuse_inode_uncached_io_start() passes a NULL fuse_backing object. Passing a NULL fuse_backing will result in false -EBUSY error if the file is already open in passthrough mode. Allow taking negative fi->iocachectr refcount with NULL fuse_backing, because it does not conflict with an already attached fuse_backing object. Fixes: 4a90451bbc7f ("fuse: implement open in passthrough mode") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/iomode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index 98f1fd523dae..c99e285f3183 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -90,7 +90,7 @@ int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) spin_lock(&fi->lock); /* deny conflicting backing files on same fuse inode */ oldfb = fuse_inode_backing(fi); - if (oldfb && oldfb != fb) { + if (fb && oldfb && oldfb != fb) { err = -EBUSY; goto unlock; } @@ -101,7 +101,7 @@ int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) fi->iocachectr--; /* fuse inode holds a single refcount of backing file */ - if (!oldfb) { + if (fb && !oldfb) { oldfb = fuse_inode_backing_set(fi, fb); WARN_ON_ONCE(oldfb != NULL); } else { -- cgit v1.2.3 From eb4b691b9115fae4c844f5941418335575cf667f Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 13 Apr 2024 17:34:31 -0700 Subject: fuse: fix leaked ENOSYS error on first statx call FUSE attempts to detect server support for statx by trying it once and setting no_statx=1 if it fails with ENOSYS, but consider the following scenario: - Userspace (e.g. sh) calls stat() on a file * succeeds - Userspace (e.g. lsd) calls statx(BTIME) on the same file - request_mask = STATX_BASIC_STATS | STATX_BTIME - first pass: sync=true due to differing cache_mask - statx fails and returns ENOSYS - set no_statx and retry - retry sets mask = STATX_BASIC_STATS - now mask == cache_mask; sync=false (time_before: still valid) - so we take the "else if (stat)" path - "err" is still ENOSYS from the failed statx call Fix this by zeroing "err" before retrying the failed call. Fixes: d3045530bdd2 ("fuse: implement statx") Cc: stable@vger.kernel.org # v6.6 Signed-off-by: Danny Lin Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 4a6df591add6..2b0d4781f394 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1321,6 +1321,7 @@ retry: err = fuse_do_statx(inode, file, stat); if (err == -ENOSYS) { fc->no_statx = 1; + err = 0; goto retry; } } else { -- cgit v1.2.3 From fff1386cc889d8fb4089d285f883f8cba62d82ce Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 11 Apr 2024 11:15:09 +1000 Subject: nouveau: fix instmem race condition around ptr stores Running a lot of VK CTS in parallel against nouveau, once every few hours you might see something like this crash. BUG: kernel NULL pointer dereference, address: 0000000000000008 PGD 8000000114e6e067 P4D 8000000114e6e067 PUD 109046067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 7 PID: 53891 Comm: deqp-vk Not tainted 6.8.0-rc6+ #27 Hardware name: Gigabyte Technology Co., Ltd. Z390 I AORUS PRO WIFI/Z390 I AORUS PRO WIFI-CF, BIOS F8 11/05/2021 RIP: 0010:gp100_vmm_pgt_mem+0xe3/0x180 [nouveau] Code: c7 48 01 c8 49 89 45 58 85 d2 0f 84 95 00 00 00 41 0f b7 46 12 49 8b 7e 08 89 da 42 8d 2c f8 48 8b 47 08 41 83 c7 01 48 89 ee <48> 8b 40 08 ff d0 0f 1f 00 49 8b 7e 08 48 89 d9 48 8d 75 04 48 c1 RSP: 0000:ffffac20c5857838 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 00000000004d8001 RCX: 0000000000000001 RDX: 00000000004d8001 RSI: 00000000000006d8 RDI: ffffa07afe332180 RBP: 00000000000006d8 R08: ffffac20c5857ad0 R09: 0000000000ffff10 R10: 0000000000000001 R11: ffffa07af27e2de0 R12: 000000000000001c R13: ffffac20c5857ad0 R14: ffffa07a96fe9040 R15: 000000000000001c FS: 00007fe395eed7c0(0000) GS:ffffa07e2c980000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 000000011febe001 CR4: 00000000003706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ... ? gp100_vmm_pgt_mem+0xe3/0x180 [nouveau] ? gp100_vmm_pgt_mem+0x37/0x180 [nouveau] nvkm_vmm_iter+0x351/0xa20 [nouveau] ? __pfx_nvkm_vmm_ref_ptes+0x10/0x10 [nouveau] ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] ? __lock_acquire+0x3ed/0x2170 ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] nvkm_vmm_ptes_get_map+0xc2/0x100 [nouveau] ? __pfx_nvkm_vmm_ref_ptes+0x10/0x10 [nouveau] ? __pfx_gp100_vmm_pgt_mem+0x10/0x10 [nouveau] nvkm_vmm_map_locked+0x224/0x3a0 [nouveau] Adding any sort of useful debug usually makes it go away, so I hand wrote the function in a line, and debugged the asm. Every so often pt->memory->ptrs is NULL. This ptrs ptr is set in the nv50_instobj_acquire called from nvkm_kmap. If Thread A and Thread B both get to nv50_instobj_acquire around the same time, and Thread A hits the refcount_set line, and in lockstep thread B succeeds at refcount_inc_not_zero, there is a chance the ptrs value won't have been stored since refcount_set is unordered. Force a memory barrier here, I picked smp_mb, since we want it on all CPUs and it's write followed by a read. v2: use paired smp_rmb/smp_wmb. Cc: Fixes: be55287aa5ba ("drm/nouveau/imem/nv50: embed nvkm_instobj directly into nv04_instobj") Signed-off-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240411011510.2546857-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c index a7f3fc342d87..dd5b5a17ece0 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c @@ -222,8 +222,11 @@ nv50_instobj_acquire(struct nvkm_memory *memory) void __iomem *map = NULL; /* Already mapped? */ - if (refcount_inc_not_zero(&iobj->maps)) + if (refcount_inc_not_zero(&iobj->maps)) { + /* read barrier match the wmb on refcount set */ + smp_rmb(); return iobj->map; + } /* Take the lock, and re-check that another thread hasn't * already mapped the object in the meantime. @@ -250,6 +253,8 @@ nv50_instobj_acquire(struct nvkm_memory *memory) iobj->base.memory.ptrs = &nv50_instobj_fast; else iobj->base.memory.ptrs = &nv50_instobj_slow; + /* barrier to ensure the ptrs are written before refcount is set */ + smp_wmb(); refcount_set(&iobj->maps, 1); } -- cgit v1.2.3 From cf92bb778eda7830e79452c6917efa8474a30c1e Mon Sep 17 00:00:00 2001 From: Mikhail Kobuk Date: Thu, 11 Apr 2024 14:08:52 +0300 Subject: drm: nv04: Fix out of bounds access When Output Resource (dcb->or) value is assigned in fabricate_dcb_output(), there may be out of bounds access to dac_users array in case dcb->or is zero because ffs(dcb->or) is used as index there. The 'or' argument of fabricate_dcb_output() must be interpreted as a number of bit to set, not value. Utilize macros from 'enum nouveau_or' in calls instead of hardcoding. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 2e5702aff395 ("drm/nouveau: fabricate DCB encoder table for iMac G4") Fixes: 670820c0e6a9 ("drm/nouveau: Workaround incorrect DCB entry on a GeForce3 Ti 200.") Signed-off-by: Mikhail Kobuk Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240411110854.16701-1-m.kobuk@ispras.ru --- drivers/gpu/drm/nouveau/nouveau_bios.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.c b/drivers/gpu/drm/nouveau/nouveau_bios.c index 479effcf607e..79cfab53f80e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bios.c +++ b/drivers/gpu/drm/nouveau/nouveau_bios.c @@ -23,6 +23,7 @@ */ #include "nouveau_drv.h" +#include "nouveau_bios.h" #include "nouveau_reg.h" #include "dispnv04/hw.h" #include "nouveau_encoder.h" @@ -1677,7 +1678,7 @@ apply_dcb_encoder_quirks(struct drm_device *dev, int idx, u32 *conn, u32 *conf) */ if (nv_match_device(dev, 0x0201, 0x1462, 0x8851)) { if (*conn == 0xf2005014 && *conf == 0xffffffff) { - fabricate_dcb_output(dcb, DCB_OUTPUT_TMDS, 1, 1, 1); + fabricate_dcb_output(dcb, DCB_OUTPUT_TMDS, 1, 1, DCB_OUTPUT_B); return false; } } @@ -1763,26 +1764,26 @@ fabricate_dcb_encoder_table(struct drm_device *dev, struct nvbios *bios) #ifdef __powerpc__ /* Apple iMac G4 NV17 */ if (of_machine_is_compatible("PowerMac4,5")) { - fabricate_dcb_output(dcb, DCB_OUTPUT_TMDS, 0, all_heads, 1); - fabricate_dcb_output(dcb, DCB_OUTPUT_ANALOG, 1, all_heads, 2); + fabricate_dcb_output(dcb, DCB_OUTPUT_TMDS, 0, all_heads, DCB_OUTPUT_B); + fabricate_dcb_output(dcb, DCB_OUTPUT_ANALOG, 1, all_heads, DCB_OUTPUT_C); return; } #endif /* Make up some sane defaults */ fabricate_dcb_output(dcb, DCB_OUTPUT_ANALOG, - bios->legacy.i2c_indices.crt, 1, 1); + bios->legacy.i2c_indices.crt, 1, DCB_OUTPUT_B); if (nv04_tv_identify(dev, bios->legacy.i2c_indices.tv) >= 0) fabricate_dcb_output(dcb, DCB_OUTPUT_TV, bios->legacy.i2c_indices.tv, - all_heads, 0); + all_heads, DCB_OUTPUT_A); else if (bios->tmds.output0_script_ptr || bios->tmds.output1_script_ptr) fabricate_dcb_output(dcb, DCB_OUTPUT_TMDS, bios->legacy.i2c_indices.panel, - all_heads, 1); + all_heads, DCB_OUTPUT_B); } static int -- cgit v1.2.3 From 09492cb45100cab909cabe164deb7cdc14e38634 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 15 Mar 2024 16:02:53 +0800 Subject: cuse: add kernel-doc comments to cuse_process_init_reply() This commit adds kernel-doc style comments with complete parameter descriptions for the function cuse_process_init_reply. Signed-off-by: Yang Li Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b6cad106c37e..0b2da7b7e2ad 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -310,6 +310,10 @@ struct cuse_init_args { /** * cuse_process_init_reply - finish initializing CUSE channel * + * @fm: The fuse mount information containing the CUSE connection. + * @args: The arguments passed to the init reply. + * @error: The error code signifying if any error occurred during the process. + * * This function creates the character device and sets up all the * required data structures for it. Please read the comment at the * top of this file for high level overview. -- cgit v1.2.3 From 460b0d33cf10eee33de651381d3170ef13241650 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Apr 2024 11:02:02 -0700 Subject: inet: bring NLM_DONE out to a separate recv() again Commit under Fixes optimized the number of recv() calls needed during RTM_GETROUTE dumps, but we got multiple reports of applications hanging on recv() calls. Applications expect that a route dump will be terminated with a recv() reading an individual NLM_DONE message. Coalescing NLM_DONE is perfectly legal in netlink, but even tho reporters fixed the code in respective projects, chances are it will take time for those applications to get updated. So revert to old behavior (for now)? Old kernel (5.19): $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 692 bytes, 11 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 Recv: read 20 bytes, 1 messages nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 Before (6.9-rc2): $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 712 bytes, 12 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 After: $ ./cli.py --dbg-small-recv 4096 --spec netlink/specs/rt_route.yaml \ --dump getroute --json '{"rtm-family": 2}' Recv: read 692 bytes, 11 messages nl_len = 68 (52) nl_flags = 0x22 nl_type = 24 ... nl_len = 60 (44) nl_flags = 0x22 nl_type = 24 Recv: read 20 bytes, 1 messages nl_len = 20 (4) nl_flags = 0x2 nl_type = 3 Reported-by: Stefano Brivio Link: https://lore.kernel.org/all/20240315124808.033ff58d@elisabeth Reported-by: Ilya Maximets Link: https://lore.kernel.org/all/02b50aae-f0e9-47a4-8365-a977a85975d3@ovn.org Fixes: 4ce5dc9316de ("inet: switch inet_dump_fib() to RCU protection") Signed-off-by: Jakub Kicinski Tested-by: Ilya Maximets Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 48741352a88a..c484b1c0fc00 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1050,6 +1050,11 @@ next: e++; } } + + /* Don't let NLM_DONE coalesce into a message, even if it could. + * Some user space expects NLM_DONE in a separate recv(). + */ + err = skb->len; out: cb->args[1] = e; -- cgit v1.2.3 From 75ce9506ee3dc66648a7d74ab3b0acfa364d6d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Fri, 12 Apr 2024 12:02:56 +0000 Subject: octeontx2-pf: fix FLOW_DIS_IS_FRAGMENT implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upon reviewing the flower control flags handling in this driver, I notice that the key wasn't being used, only the mask. Ie. `tc flower ... ip_flags nofrag` was hardware offloaded as `... ip_flags frag`. Only compile tested, no access to HW. Fixes: c672e3727989 ("octeontx2-pf: Add support to filter packet based on IP fragment") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c index 87bdb93cb066..f4655a8c0705 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c @@ -689,6 +689,7 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node, if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) { struct flow_match_control match; + u32 val; flow_rule_match_control(rule, &match); if (match.mask->flags & FLOW_DIS_FIRST_FRAG) { @@ -697,12 +698,14 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node, } if (match.mask->flags & FLOW_DIS_IS_FRAGMENT) { + val = match.key->flags & FLOW_DIS_IS_FRAGMENT; if (ntohs(flow_spec->etype) == ETH_P_IP) { - flow_spec->ip_flag = IPV4_FLAG_MORE; + flow_spec->ip_flag = val ? IPV4_FLAG_MORE : 0; flow_mask->ip_flag = IPV4_FLAG_MORE; req->features |= BIT_ULL(NPC_IPFRAG_IPV4); } else if (ntohs(flow_spec->etype) == ETH_P_IPV6) { - flow_spec->next_header = IPPROTO_FRAGMENT; + flow_spec->next_header = val ? + IPPROTO_FRAGMENT : 0; flow_mask->next_header = 0xff; req->features |= BIT_ULL(NPC_IPFRAG_IPV6); } else { -- cgit v1.2.3 From b6976f323a8687cc0d55bc92c2086fd934324ed5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 15 Apr 2024 15:48:21 +0200 Subject: drm/ttm: stop pooling cached NUMA pages v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We only pool write combined and uncached allocations because they require extra overhead on allocation and release. If we also pool cached NUMA it not only means some extra unnecessary overhead, but also that under memory pressure it can happen that pages from the wrong NUMA node enters the pool and are re-used over and over again. This can lead to performance reduction after running into memory pressure. v2: restructure and cleanup the code a bit from the internal hack to test this. Signed-off-by: Christian König Fixes: 4482d3c94d7f ("drm/ttm: add NUMA node id to the pool") CC: stable@vger.kernel.org Reviewed-by: Felix Kuehling Link: https://patchwork.freedesktop.org/patch/msgid/20240415134821.1919-1-christian.koenig@amd.com --- drivers/gpu/drm/ttm/ttm_pool.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 112438d965ff..6e1fd6985ffc 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -288,17 +288,23 @@ static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool, enum ttm_caching caching, unsigned int order) { - if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) + if (pool->use_dma_alloc) return &pool->caching[caching].orders[order]; #ifdef CONFIG_X86 switch (caching) { case ttm_write_combined: + if (pool->nid != NUMA_NO_NODE) + return &pool->caching[caching].orders[order]; + if (pool->use_dma32) return &global_dma32_write_combined[order]; return &global_write_combined[order]; case ttm_uncached: + if (pool->nid != NUMA_NO_NODE) + return &pool->caching[caching].orders[order]; + if (pool->use_dma32) return &global_dma32_uncached[order]; @@ -566,11 +572,17 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev, pool->use_dma_alloc = use_dma_alloc; pool->use_dma32 = use_dma32; - if (use_dma_alloc || nid != NUMA_NO_NODE) { - for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j < NR_PAGE_ORDERS; ++j) - ttm_pool_type_init(&pool->caching[i].orders[j], - pool, i, j); + for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) { + for (j = 0; j < NR_PAGE_ORDERS; ++j) { + struct ttm_pool_type *pt; + + /* Initialize only pool types which are actually used */ + pt = ttm_pool_select_type(pool, i, j); + if (pt != &pool->caching[i].orders[j]) + continue; + + ttm_pool_type_init(pt, pool, i, j); + } } } EXPORT_SYMBOL(ttm_pool_init); @@ -599,10 +611,16 @@ void ttm_pool_fini(struct ttm_pool *pool) { unsigned int i, j; - if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) { - for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j < NR_PAGE_ORDERS; ++j) - ttm_pool_type_fini(&pool->caching[i].orders[j]); + for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) { + for (j = 0; j < NR_PAGE_ORDERS; ++j) { + struct ttm_pool_type *pt; + + pt = ttm_pool_select_type(pool, i, j); + if (pt != &pool->caching[i].orders[j]) + continue; + + ttm_pool_type_fini(pt); + } } /* We removed the pool types from the LRU, but we need to also make sure -- cgit v1.2.3 From a2ac1cbc5397eb4e400efa66c3337886d9a63026 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Mon, 15 Apr 2024 13:10:51 +0530 Subject: pwm: dwc: allow suspend/resume for 16 channels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With 16 channel pwm support, we're registering two instances of pwm_chip with 8 channels each. We need to update PM functions to use both instances of pwm_chip during power state transitions. Introduce struct dwc_pwm_drvdata and use it as driver_data, which will maintain both instances of pwm_chip along with dwc_pwm_info and allow us to use them inside suspend/resume handles. Fixes: ebf2c89eb95e ("pwm: dwc: Add 16 channel support for Intel Elkhart Lake") Signed-off-by: Raag Jadav Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240415074051.14681-1-raag.jadav@intel.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-dwc-core.c | 1 - drivers/pwm/pwm-dwc.c | 88 ++++++++++++++++++++++++++++++---------------- drivers/pwm/pwm-dwc.h | 6 ++++ 3 files changed, 63 insertions(+), 32 deletions(-) diff --git a/drivers/pwm/pwm-dwc-core.c b/drivers/pwm/pwm-dwc-core.c index 043736972cb9..c8425493b95d 100644 --- a/drivers/pwm/pwm-dwc-core.c +++ b/drivers/pwm/pwm-dwc-core.c @@ -172,7 +172,6 @@ struct pwm_chip *dwc_pwm_alloc(struct device *dev) dwc->clk_ns = 10; chip->ops = &dwc_pwm_ops; - dev_set_drvdata(dev, chip); return chip; } EXPORT_SYMBOL_GPL(dwc_pwm_alloc); diff --git a/drivers/pwm/pwm-dwc.c b/drivers/pwm/pwm-dwc.c index 676eaf8d7a53..fb3eadf6fbc4 100644 --- a/drivers/pwm/pwm-dwc.c +++ b/drivers/pwm/pwm-dwc.c @@ -31,26 +31,34 @@ static const struct dwc_pwm_info ehl_pwm_info = { .size = 0x1000, }; -static int dwc_pwm_init_one(struct device *dev, void __iomem *base, unsigned int offset) +static int dwc_pwm_init_one(struct device *dev, struct dwc_pwm_drvdata *ddata, unsigned int idx) { struct pwm_chip *chip; struct dwc_pwm *dwc; + int ret; chip = dwc_pwm_alloc(dev); if (IS_ERR(chip)) return PTR_ERR(chip); dwc = to_dwc_pwm(chip); - dwc->base = base + offset; + dwc->base = ddata->io_base + (ddata->info->size * idx); - return devm_pwmchip_add(dev, chip); + ret = devm_pwmchip_add(dev, chip); + if (ret) + return ret; + + ddata->chips[idx] = chip; + return 0; } static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) { const struct dwc_pwm_info *info; struct device *dev = &pci->dev; - int i, ret; + struct dwc_pwm_drvdata *ddata; + unsigned int idx; + int ret; ret = pcim_enable_device(pci); if (ret) @@ -63,17 +71,25 @@ static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) return dev_err_probe(dev, ret, "Failed to iomap PCI BAR\n"); info = (const struct dwc_pwm_info *)id->driver_data; - - for (i = 0; i < info->nr; i++) { - /* - * No need to check for pcim_iomap_table() failure, - * pcim_iomap_regions() already does it for us. - */ - ret = dwc_pwm_init_one(dev, pcim_iomap_table(pci)[0], i * info->size); + ddata = devm_kzalloc(dev, struct_size(ddata, chips, info->nr), GFP_KERNEL); + if (!ddata) + return -ENOMEM; + + /* + * No need to check for pcim_iomap_table() failure, + * pcim_iomap_regions() already does it for us. + */ + ddata->io_base = pcim_iomap_table(pci)[0]; + ddata->info = info; + + for (idx = 0; idx < ddata->info->nr; idx++) { + ret = dwc_pwm_init_one(dev, ddata, idx); if (ret) return ret; } + dev_set_drvdata(dev, ddata); + pm_runtime_put(dev); pm_runtime_allow(dev); @@ -88,19 +104,24 @@ static void dwc_pwm_remove(struct pci_dev *pci) static int dwc_pwm_suspend(struct device *dev) { - struct pwm_chip *chip = dev_get_drvdata(dev); - struct dwc_pwm *dwc = to_dwc_pwm(chip); - int i; - - for (i = 0; i < DWC_TIMERS_TOTAL; i++) { - if (chip->pwms[i].state.enabled) { - dev_err(dev, "PWM %u in use by consumer (%s)\n", - i, chip->pwms[i].label); - return -EBUSY; + struct dwc_pwm_drvdata *ddata = dev_get_drvdata(dev); + unsigned int idx; + + for (idx = 0; idx < ddata->info->nr; idx++) { + struct pwm_chip *chip = ddata->chips[idx]; + struct dwc_pwm *dwc = to_dwc_pwm(chip); + unsigned int i; + + for (i = 0; i < DWC_TIMERS_TOTAL; i++) { + if (chip->pwms[i].state.enabled) { + dev_err(dev, "PWM %u in use by consumer (%s)\n", + i, chip->pwms[i].label); + return -EBUSY; + } + dwc->ctx[i].cnt = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT(i)); + dwc->ctx[i].cnt2 = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT2(i)); + dwc->ctx[i].ctrl = dwc_pwm_readl(dwc, DWC_TIM_CTRL(i)); } - dwc->ctx[i].cnt = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT(i)); - dwc->ctx[i].cnt2 = dwc_pwm_readl(dwc, DWC_TIM_LD_CNT2(i)); - dwc->ctx[i].ctrl = dwc_pwm_readl(dwc, DWC_TIM_CTRL(i)); } return 0; @@ -108,14 +129,19 @@ static int dwc_pwm_suspend(struct device *dev) static int dwc_pwm_resume(struct device *dev) { - struct pwm_chip *chip = dev_get_drvdata(dev); - struct dwc_pwm *dwc = to_dwc_pwm(chip); - int i; - - for (i = 0; i < DWC_TIMERS_TOTAL; i++) { - dwc_pwm_writel(dwc, dwc->ctx[i].cnt, DWC_TIM_LD_CNT(i)); - dwc_pwm_writel(dwc, dwc->ctx[i].cnt2, DWC_TIM_LD_CNT2(i)); - dwc_pwm_writel(dwc, dwc->ctx[i].ctrl, DWC_TIM_CTRL(i)); + struct dwc_pwm_drvdata *ddata = dev_get_drvdata(dev); + unsigned int idx; + + for (idx = 0; idx < ddata->info->nr; idx++) { + struct pwm_chip *chip = ddata->chips[idx]; + struct dwc_pwm *dwc = to_dwc_pwm(chip); + unsigned int i; + + for (i = 0; i < DWC_TIMERS_TOTAL; i++) { + dwc_pwm_writel(dwc, dwc->ctx[i].cnt, DWC_TIM_LD_CNT(i)); + dwc_pwm_writel(dwc, dwc->ctx[i].cnt2, DWC_TIM_LD_CNT2(i)); + dwc_pwm_writel(dwc, dwc->ctx[i].ctrl, DWC_TIM_CTRL(i)); + } } return 0; diff --git a/drivers/pwm/pwm-dwc.h b/drivers/pwm/pwm-dwc.h index a8b074841ae8..c6e2df5a6122 100644 --- a/drivers/pwm/pwm-dwc.h +++ b/drivers/pwm/pwm-dwc.h @@ -38,6 +38,12 @@ struct dwc_pwm_info { unsigned int size; }; +struct dwc_pwm_drvdata { + const struct dwc_pwm_info *info; + void __iomem *io_base; + struct pwm_chip *chips[]; +}; + struct dwc_pwm_ctx { u32 cnt; u32 cnt2; -- cgit v1.2.3 From fb7c3d8ba039df877886fd457538d8b24ca9c84b Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Thu, 4 Apr 2024 10:18:08 +0200 Subject: dt-bindings: pwm: mediatek,pwm-disp: Document power-domains property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow the power-domains property to the PWM_DISP block as on some SoCs this does need at most one power domain. Fixes: b09b179bac0a ("dt-bindings: pwm: Convert pwm-mtk-disp.txt to mediatek,pwm-disp.yaml format") Signed-off-by: AngeloGioacchino Del Regno Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240404081808.92199-1-angelogioacchino.delregno@collabora.com Signed-off-by: Uwe Kleine-König --- Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml index afcdeed4e88a..bc813fe74fab 100644 --- a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml +++ b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml @@ -52,6 +52,9 @@ properties: - const: main - const: mm + power-domains: + maxItems: 1 + required: - compatible - reg -- cgit v1.2.3 From b32233accefff1338806f064fb9b62cf5bc0609f Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Thu, 11 Apr 2024 22:55:09 -0400 Subject: drm/vmwgfx: Fix prime import/export vmwgfx never supported prime import of external buffers. Furthermore the driver exposes two different objects to userspace: vmw_surface's and gem buffers but prime import/export only worked with vmw_surfaces. Because gem buffers are used through the dumb_buffer interface this meant that the driver created buffers couldn't have been prime exported or imported. Fix prime import/export. Makes IGT's kms_prime pass. Signed-off-by: Zack Rusin Fixes: 8afa13a0583f ("drm/vmwgfx: Implement DRIVER_GEM") Cc: # v6.6+ Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20240412025511.78553-4-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_blit.c | 35 ++++++++++++++++++++++-- drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 7 +++-- drivers/gpu/drm/vmwgfx/vmwgfx_bo.h | 2 ++ drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 1 + drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 3 ++ drivers/gpu/drm/vmwgfx/vmwgfx_gem.c | 32 ++++++++++++++++++++++ drivers/gpu/drm/vmwgfx/vmwgfx_prime.c | 15 ++++++++-- drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 44 ++++++++++++++++++++---------- 8 files changed, 117 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c index c52c7bf1485b..717d624e9a05 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c @@ -456,8 +456,10 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, .no_wait_gpu = false }; u32 j, initial_line = dst_offset / dst_stride; - struct vmw_bo_blit_line_data d; + struct vmw_bo_blit_line_data d = {0}; int ret = 0; + struct page **dst_pages = NULL; + struct page **src_pages = NULL; /* Buffer objects need to be either pinned or reserved: */ if (!(dst->pin_count)) @@ -477,12 +479,35 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, return ret; } + if (!src->ttm->pages && src->ttm->sg) { + src_pages = kvmalloc_array(src->ttm->num_pages, + sizeof(struct page *), GFP_KERNEL); + if (!src_pages) + return -ENOMEM; + ret = drm_prime_sg_to_page_array(src->ttm->sg, src_pages, + src->ttm->num_pages); + if (ret) + goto out; + } + if (!dst->ttm->pages && dst->ttm->sg) { + dst_pages = kvmalloc_array(dst->ttm->num_pages, + sizeof(struct page *), GFP_KERNEL); + if (!dst_pages) { + ret = -ENOMEM; + goto out; + } + ret = drm_prime_sg_to_page_array(dst->ttm->sg, dst_pages, + dst->ttm->num_pages); + if (ret) + goto out; + } + d.mapped_dst = 0; d.mapped_src = 0; d.dst_addr = NULL; d.src_addr = NULL; - d.dst_pages = dst->ttm->pages; - d.src_pages = src->ttm->pages; + d.dst_pages = dst->ttm->pages ? dst->ttm->pages : dst_pages; + d.src_pages = src->ttm->pages ? src->ttm->pages : src_pages; d.dst_num_pages = PFN_UP(dst->resource->size); d.src_num_pages = PFN_UP(src->resource->size); d.dst_prot = ttm_io_prot(dst, dst->resource, PAGE_KERNEL); @@ -504,6 +529,10 @@ out: kunmap_atomic(d.src_addr); if (d.dst_addr) kunmap_atomic(d.dst_addr); + if (src_pages) + kvfree(src_pages); + if (dst_pages) + kvfree(dst_pages); return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c index bfd41ce3c8f4..e5eb21a471a6 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c @@ -377,7 +377,8 @@ static int vmw_bo_init(struct vmw_private *dev_priv, { struct ttm_operation_ctx ctx = { .interruptible = params->bo_type != ttm_bo_type_kernel, - .no_wait_gpu = false + .no_wait_gpu = false, + .resv = params->resv, }; struct ttm_device *bdev = &dev_priv->bdev; struct drm_device *vdev = &dev_priv->drm; @@ -394,8 +395,8 @@ static int vmw_bo_init(struct vmw_private *dev_priv, vmw_bo_placement_set(vmw_bo, params->domain, params->busy_domain); ret = ttm_bo_init_reserved(bdev, &vmw_bo->tbo, params->bo_type, - &vmw_bo->placement, 0, &ctx, NULL, - NULL, destroy); + &vmw_bo->placement, 0, &ctx, + params->sg, params->resv, destroy); if (unlikely(ret)) return ret; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h index 0d496dc9c6af..f349642e6190 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h @@ -55,6 +55,8 @@ struct vmw_bo_params { enum ttm_bo_type bo_type; size_t size; bool pin; + struct dma_resv *resv; + struct sg_table *sg; }; /** diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 0a304706e013..58fb40c93100 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -1628,6 +1628,7 @@ static const struct drm_driver driver = { .prime_fd_to_handle = vmw_prime_fd_to_handle, .prime_handle_to_fd = vmw_prime_handle_to_fd, + .gem_prime_import_sg_table = vmw_prime_import_sg_table, .fops = &vmwgfx_driver_fops, .name = VMWGFX_DRIVER_NAME, diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index 12efecc17df6..b019a1a1787a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -1130,6 +1130,9 @@ extern int vmw_prime_handle_to_fd(struct drm_device *dev, struct drm_file *file_priv, uint32_t handle, uint32_t flags, int *prime_fd); +struct drm_gem_object *vmw_prime_import_sg_table(struct drm_device *dev, + struct dma_buf_attachment *attach, + struct sg_table *table); /* * MemoryOBject management - vmwgfx_mob.c diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c index 12787bb9c111..d6bcaf078b1f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c @@ -149,6 +149,38 @@ out_no_bo: return ret; } +struct drm_gem_object *vmw_prime_import_sg_table(struct drm_device *dev, + struct dma_buf_attachment *attach, + struct sg_table *table) +{ + int ret; + struct vmw_private *dev_priv = vmw_priv(dev); + struct drm_gem_object *gem = NULL; + struct vmw_bo *vbo; + struct vmw_bo_params params = { + .domain = (dev_priv->has_mob) ? VMW_BO_DOMAIN_SYS : VMW_BO_DOMAIN_VRAM, + .busy_domain = VMW_BO_DOMAIN_SYS, + .bo_type = ttm_bo_type_sg, + .size = attach->dmabuf->size, + .pin = false, + .resv = attach->dmabuf->resv, + .sg = table, + + }; + + dma_resv_lock(params.resv, NULL); + + ret = vmw_bo_create(dev_priv, ¶ms, &vbo); + if (ret != 0) + goto out_no_bo; + + vbo->tbo.base.funcs = &vmw_gem_object_funcs; + + gem = &vbo->tbo.base; +out_no_bo: + dma_resv_unlock(params.resv); + return gem; +} int vmw_gem_object_create_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c index 2d72a5ee7c0c..c99cad444991 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c @@ -75,8 +75,12 @@ int vmw_prime_fd_to_handle(struct drm_device *dev, int fd, u32 *handle) { struct ttm_object_file *tfile = vmw_fpriv(file_priv)->tfile; + int ret = ttm_prime_fd_to_handle(tfile, fd, handle); - return ttm_prime_fd_to_handle(tfile, fd, handle); + if (ret) + ret = drm_gem_prime_fd_to_handle(dev, file_priv, fd, handle); + + return ret; } int vmw_prime_handle_to_fd(struct drm_device *dev, @@ -85,5 +89,12 @@ int vmw_prime_handle_to_fd(struct drm_device *dev, int *prime_fd) { struct ttm_object_file *tfile = vmw_fpriv(file_priv)->tfile; - return ttm_prime_handle_to_fd(tfile, handle, flags, prime_fd); + int ret; + + if (handle > VMWGFX_NUM_MOB) + ret = ttm_prime_handle_to_fd(tfile, handle, flags, prime_fd); + else + ret = drm_gem_prime_handle_to_fd(dev, file_priv, handle, flags, prime_fd); + + return ret; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c index 4d23d0a70bcb..621d98b376bb 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c @@ -188,13 +188,18 @@ static int vmw_ttm_map_dma(struct vmw_ttm_tt *vmw_tt) switch (dev_priv->map_mode) { case vmw_dma_map_bind: case vmw_dma_map_populate: - vsgt->sgt = &vmw_tt->sgt; - ret = sg_alloc_table_from_pages_segment( - &vmw_tt->sgt, vsgt->pages, vsgt->num_pages, 0, - (unsigned long)vsgt->num_pages << PAGE_SHIFT, - dma_get_max_seg_size(dev_priv->drm.dev), GFP_KERNEL); - if (ret) - goto out_sg_alloc_fail; + if (vmw_tt->dma_ttm.page_flags & TTM_TT_FLAG_EXTERNAL) { + vsgt->sgt = vmw_tt->dma_ttm.sg; + } else { + vsgt->sgt = &vmw_tt->sgt; + ret = sg_alloc_table_from_pages_segment(&vmw_tt->sgt, + vsgt->pages, vsgt->num_pages, 0, + (unsigned long)vsgt->num_pages << PAGE_SHIFT, + dma_get_max_seg_size(dev_priv->drm.dev), + GFP_KERNEL); + if (ret) + goto out_sg_alloc_fail; + } ret = vmw_ttm_map_for_dma(vmw_tt); if (unlikely(ret != 0)) @@ -209,8 +214,9 @@ static int vmw_ttm_map_dma(struct vmw_ttm_tt *vmw_tt) return 0; out_map_fail: - sg_free_table(vmw_tt->vsgt.sgt); - vmw_tt->vsgt.sgt = NULL; + drm_warn(&dev_priv->drm, "VSG table map failed!"); + sg_free_table(vsgt->sgt); + vsgt->sgt = NULL; out_sg_alloc_fail: return ret; } @@ -356,15 +362,17 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) static int vmw_ttm_populate(struct ttm_device *bdev, struct ttm_tt *ttm, struct ttm_operation_ctx *ctx) { - int ret; + bool external = (ttm->page_flags & TTM_TT_FLAG_EXTERNAL) != 0; - /* TODO: maybe completely drop this ? */ if (ttm_tt_is_populated(ttm)) return 0; - ret = ttm_pool_alloc(&bdev->pool, ttm, ctx); + if (external && ttm->sg) + return drm_prime_sg_to_dma_addr_array(ttm->sg, + ttm->dma_address, + ttm->num_pages); - return ret; + return ttm_pool_alloc(&bdev->pool, ttm, ctx); } static void vmw_ttm_unpopulate(struct ttm_device *bdev, @@ -372,6 +380,10 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev, { struct vmw_ttm_tt *vmw_tt = container_of(ttm, struct vmw_ttm_tt, dma_ttm); + bool external = (ttm->page_flags & TTM_TT_FLAG_EXTERNAL) != 0; + + if (external) + return; vmw_ttm_unbind(bdev, ttm); @@ -390,6 +402,7 @@ static struct ttm_tt *vmw_ttm_tt_create(struct ttm_buffer_object *bo, { struct vmw_ttm_tt *vmw_be; int ret; + bool external = bo->type == ttm_bo_type_sg; vmw_be = kzalloc(sizeof(*vmw_be), GFP_KERNEL); if (!vmw_be) @@ -398,7 +411,10 @@ static struct ttm_tt *vmw_ttm_tt_create(struct ttm_buffer_object *bo, vmw_be->dev_priv = vmw_priv_from_ttm(bo->bdev); vmw_be->mob = NULL; - if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent) + if (external) + page_flags |= TTM_TT_FLAG_EXTERNAL | TTM_TT_FLAG_EXTERNAL_MAPPABLE; + + if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent || external) ret = ttm_sg_tt_init(&vmw_be->dma_ttm, bo, page_flags, ttm_cached); else -- cgit v1.2.3 From a60ccade88f926e871a57176e86a34bbf0db0098 Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Thu, 11 Apr 2024 22:55:10 -0400 Subject: drm/vmwgfx: Fix crtc's atomic check conditional The conditional was supposed to prevent enabling of a crtc state without a set primary plane. Accidently it also prevented disabling crtc state with a set primary plane. Neither is correct. Fix the conditional and just driver-warn when a crtc state has been enabled without a primary plane which will help debug broken userspace. Fixes IGT's kms_atomic_interruptible and kms_atomic_transition tests. Signed-off-by: Zack Rusin Fixes: 06ec41909e31 ("drm/vmwgfx: Add and connect CRTC helper functions") Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v4.12+ Reviewed-by: Ian Forbes Reviewed-by: Martin Krastev Link: https://patchwork.freedesktop.org/patch/msgid/20240412025511.78553-5-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index cd4925346ed4..84ae4e10a2eb 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -933,6 +933,7 @@ int vmw_du_cursor_plane_atomic_check(struct drm_plane *plane, int vmw_du_crtc_atomic_check(struct drm_crtc *crtc, struct drm_atomic_state *state) { + struct vmw_private *vmw = vmw_priv(crtc->dev); struct drm_crtc_state *new_state = drm_atomic_get_new_crtc_state(state, crtc); struct vmw_display_unit *du = vmw_crtc_to_du(new_state->crtc); @@ -940,9 +941,13 @@ int vmw_du_crtc_atomic_check(struct drm_crtc *crtc, bool has_primary = new_state->plane_mask & drm_plane_mask(crtc->primary); - /* We always want to have an active plane with an active CRTC */ - if (has_primary != new_state->enable) - return -EINVAL; + /* + * This is fine in general, but broken userspace might expect + * some actual rendering so give a clue as why it's blank. + */ + if (new_state->enable && !has_primary) + drm_dbg_driver(&vmw->drm, + "CRTC without a primary plane will be blank.\n"); if (new_state->connector_mask != connector_mask && -- cgit v1.2.3 From d4c972bff3129a9dd4c22a3999fd8eba1a81531a Mon Sep 17 00:00:00 2001 From: Zack Rusin Date: Thu, 11 Apr 2024 22:55:11 -0400 Subject: drm/vmwgfx: Sort primary plane formats by order of preference The table of primary plane formats wasn't sorted at all, leading to applications picking our least desirable formats by defaults. Sort the primary plane formats according to our order of preference. Nice side-effect of this change is that it makes IGT's kms_atomic plane-invalid-params pass because the test picks the first format which for vmwgfx was DRM_FORMAT_XRGB1555 and uses fb's with odd sizes which make Pixman, which IGT depends on assert due to the fact that our 16bpp formats aren't 32 bit aligned like Pixman requires all formats to be. Signed-off-by: Zack Rusin Fixes: 36cc79bc9077 ("drm/vmwgfx: Add universal plane support") Cc: Broadcom internal kernel review list Cc: dri-devel@lists.freedesktop.org Cc: # v4.12+ Acked-by: Pekka Paalanen Link: https://patchwork.freedesktop.org/patch/msgid/20240412025511.78553-6-zack.rusin@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h index a94947b588e8..19a843da87b7 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h @@ -243,10 +243,10 @@ struct vmw_framebuffer_bo { static const uint32_t __maybe_unused vmw_primary_plane_formats[] = { - DRM_FORMAT_XRGB1555, - DRM_FORMAT_RGB565, DRM_FORMAT_XRGB8888, DRM_FORMAT_ARGB8888, + DRM_FORMAT_RGB565, + DRM_FORMAT_XRGB1555, }; static const uint32_t __maybe_unused vmw_cursor_plane_formats[] = { -- cgit v1.2.3 From 015a12a4a6708d9cadcaea8334e270747923394c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 15 Apr 2024 15:10:03 +0530 Subject: arm64/hugetlb: Fix page table walk in huge_pte_alloc() Currently normal HugeTLB fault ends up crashing the kernel, as p4dp derived from p4d_offset() is an invalid address when PGTABLE_LEVEL = 5. A p4d level entry needs to be allocated when not available while walking the page table during HugeTLB faults. Let's call p4d_alloc() to allocate such entries when required instead of current p4d_offset(). Unable to handle kernel paging request at virtual address ffffffff80000000 Mem abort info: ESR = 0x0000000096000005 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x05: level 1 translation fault Data abort info: ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 swapper pgtable: 4k pages, 52-bit VAs, pgdp=0000000081da9000 [ffffffff80000000] pgd=1000000082cec003, p4d=0000000082c32003, pud=0000000000000000 Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 108 Comm: high_addr_hugep Not tainted 6.9.0-rc4 #48 Hardware name: Foundation-v8A (DT) pstate: 01402005 (nzcv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : huge_pte_alloc+0xd4/0x334 lr : hugetlb_fault+0x1b8/0xc68 sp : ffff8000833bbc20 x29: ffff8000833bbc20 x28: fff000080080cb58 x27: ffff800082a7cc58 x26: 0000000000000000 x25: fff0000800378e40 x24: fff00008008d6c60 x23: 00000000de9dbf07 x22: fff0000800378e40 x21: 0004000000000000 x20: 0004000000000000 x19: ffffffff80000000 x18: 1ffe00010011d7a1 x17: 0000000000000001 x16: ffffffffffffffff x15: 0000000000000001 x14: 0000000000000000 x13: ffff8000816120d0 x12: ffffffffffffffff x11: 0000000000000000 x10: fff00008008ebd0c x9 : 0004000000000000 x8 : 0000000000001255 x7 : fff00008003e2000 x6 : 00000000061d54b0 x5 : 0000000000001000 x4 : ffffffff80000000 x3 : 0000000000200000 x2 : 0000000000000004 x1 : 0000000080000000 x0 : 0000000000000000 Call trace: huge_pte_alloc+0xd4/0x334 hugetlb_fault+0x1b8/0xc68 handle_mm_fault+0x260/0x29c do_page_fault+0xfc/0x47c do_translation_fault+0x68/0x74 do_mem_abort+0x44/0x94 el0_da+0x2c/0x9c el0t_64_sync_handler+0x70/0xc4 el0t_64_sync+0x190/0x194 Code: aa000084 cb010084 b24c2c84 8b130c93 (f9400260) ---[ end trace 0000000000000000 ]--- Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Fixes: a6bbf5d4d9d1 ("arm64: mm: Add definitions to support 5 levels of paging") Reported-by: Dev Jain Acked-by: Ard Biesheuvel Acked-by: Mark Rutland Signed-off-by: Anshuman Khandual Reviewed-by: Ryan Roberts Link: https://lore.kernel.org/r/20240415094003.1812018-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/hugetlbpage.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 0f0e10bb0a95..b872b003a55f 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -276,7 +276,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep = NULL; pgdp = pgd_offset(mm, addr); - p4dp = p4d_offset(pgdp, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + return NULL; + pudp = pud_alloc(mm, p4dp, addr); if (!pudp) return NULL; -- cgit v1.2.3 From 3078e059a5e984663c5c2b04485375c84c2700f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 12 Apr 2024 14:36:38 +0800 Subject: bcachefs: fix error path of __bch2_read_super() In __bch2_read_super(), if kstrdup() fails, it needs to release memory in sb->holder, fix to call bch2_free_super() in the error path. Signed-off-by: Chao Yu Reviewed-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e0aa3655b63b..648986a7dbde 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -700,8 +700,11 @@ retry: return -ENOMEM; sb->sb_name = kstrdup(path, GFP_KERNEL); - if (!sb->sb_name) - return -ENOMEM; + if (!sb->sb_name) { + ret = -ENOMEM; + prt_printf(&err, "error allocating memory for sb_name"); + goto err; + } #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) -- cgit v1.2.3 From ad29cf999a9161e7849aa229d2028854f90728c2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 13 Apr 2024 18:02:15 -0400 Subject: bcachefs: set_btree_iter_dontneed also clears should_be_locked This is part of a larger series cleaning up the semantics of should_be_locked and adding assertions around it; if we don't need an iterator/path anymore, it clearly doesn't need to be locked. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1d58d447b386..1c70836dd7cc 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -498,8 +498,13 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - if (!trans->restarted) - btree_iter_path(trans, iter)->preserve = false; + if (!iter->path || trans->restarted) + return; + + struct btree_path *path = btree_iter_path(trans, iter); + path->preserve = false; + if (path->ref == 1) + path->should_be_locked = false; } void *__bch2_trans_kmalloc(struct btree_trans *, size_t); -- cgit v1.2.3 From 35f4f8c9fc972248055096d63b782060e473311b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Wed, 3 Apr 2024 17:24:50 -0300 Subject: drm/v3d: Don't increment `enabled_ns` twice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 509433d8146c ("drm/v3d: Expose the total GPU usage stats on sysfs") introduced the calculation of global GPU stats. For the regards, it used the already existing infrastructure provided by commit 09a93cc4f7d1 ("drm/v3d: Implement show_fdinfo() callback for GPU usage stats"). While adding global GPU stats calculation ability, the author forgot to delete the existing one. Currently, the value of `enabled_ns` is incremented twice by the end of the job, when it should be added just once. Therefore, delete the leftovers from commit 509433d8146c ("drm/v3d: Expose the total GPU usage stats on sysfs"). Fixes: 509433d8146c ("drm/v3d: Expose the total GPU usage stats on sysfs") Reported-by: Tvrtko Ursulin Signed-off-by: Maíra Canal Reviewed-by: Tvrtko Ursulin Reviewed-by: Jose Maria Casanova Crespo Link: https://patchwork.freedesktop.org/patch/msgid/20240403203517.731876-2-mcanal@igalia.com --- drivers/gpu/drm/v3d/v3d_irq.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c index 2e04f6cb661e..ce6b2fb341d1 100644 --- a/drivers/gpu/drm/v3d/v3d_irq.c +++ b/drivers/gpu/drm/v3d/v3d_irq.c @@ -105,7 +105,6 @@ v3d_irq(int irq, void *arg) struct v3d_file_priv *file = v3d->bin_job->base.file->driver_priv; u64 runtime = local_clock() - file->start_ns[V3D_BIN]; - file->enabled_ns[V3D_BIN] += local_clock() - file->start_ns[V3D_BIN]; file->jobs_sent[V3D_BIN]++; v3d->queue[V3D_BIN].jobs_sent++; @@ -126,7 +125,6 @@ v3d_irq(int irq, void *arg) struct v3d_file_priv *file = v3d->render_job->base.file->driver_priv; u64 runtime = local_clock() - file->start_ns[V3D_RENDER]; - file->enabled_ns[V3D_RENDER] += local_clock() - file->start_ns[V3D_RENDER]; file->jobs_sent[V3D_RENDER]++; v3d->queue[V3D_RENDER].jobs_sent++; @@ -147,7 +145,6 @@ v3d_irq(int irq, void *arg) struct v3d_file_priv *file = v3d->csd_job->base.file->driver_priv; u64 runtime = local_clock() - file->start_ns[V3D_CSD]; - file->enabled_ns[V3D_CSD] += local_clock() - file->start_ns[V3D_CSD]; file->jobs_sent[V3D_CSD]++; v3d->queue[V3D_CSD].jobs_sent++; @@ -195,7 +192,6 @@ v3d_hub_irq(int irq, void *arg) struct v3d_file_priv *file = v3d->tfu_job->base.file->driver_priv; u64 runtime = local_clock() - file->start_ns[V3D_TFU]; - file->enabled_ns[V3D_TFU] += local_clock() - file->start_ns[V3D_TFU]; file->jobs_sent[V3D_TFU]++; v3d->queue[V3D_TFU].jobs_sent++; -- cgit v1.2.3 From e048d668f2969cf2b76e0fa21882a1b3bb323eca Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 11 Apr 2024 11:11:06 -0700 Subject: configs/hardening: Fix disabling UBSAN configurations The initial change that added kernel/configs/hardening.config attempted to disable all UBSAN sanitizers except for the array bounds one while turning on UBSAN_TRAP. Unfortunately, it only got the syntax for CONFIG_UBSAN_SHIFT correct, so configurations that are on by default with CONFIG_UBSAN=y such as CONFIG_UBSAN_{BOOL,ENUM} do not get disabled properly. CONFIG_ARCH_HAS_UBSAN=y CONFIG_UBSAN=y CONFIG_UBSAN_TRAP=y CONFIG_CC_HAS_UBSAN_BOUNDS_STRICT=y CONFIG_UBSAN_BOUNDS=y CONFIG_UBSAN_BOUNDS_STRICT=y # CONFIG_UBSAN_SHIFT is not set # CONFIG_UBSAN_DIV_ZERO is not set # CONFIG_UBSAN_UNREACHABLE is not set CONFIG_UBSAN_SIGNED_WRAP=y CONFIG_UBSAN_BOOL=y CONFIG_UBSAN_ENUM=y # CONFIG_TEST_UBSAN is not set Add the missing 'is not set' to each configuration that needs it so that they get disabled as intended. CONFIG_ARCH_HAS_UBSAN=y CONFIG_UBSAN=y CONFIG_UBSAN_TRAP=y CONFIG_CC_HAS_UBSAN_BOUNDS_STRICT=y CONFIG_UBSAN_BOUNDS=y CONFIG_UBSAN_BOUNDS_STRICT=y # CONFIG_UBSAN_SHIFT is not set # CONFIG_UBSAN_DIV_ZERO is not set # CONFIG_UBSAN_UNREACHABLE is not set CONFIG_UBSAN_SIGNED_WRAP=y # CONFIG_UBSAN_BOOL is not set # CONFIG_UBSAN_ENUM is not set # CONFIG_TEST_UBSAN is not set Fixes: 215199e3d9f3 ("hardening: Provide Kconfig fragments for basic options") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240411-fix-ubsan-in-hardening-config-v1-1-e0177c80ffaa@kernel.org Signed-off-by: Kees Cook --- kernel/configs/hardening.config | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 7a5bbfc024b7..d6f6dc45628a 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -39,11 +39,11 @@ CONFIG_UBSAN=y CONFIG_UBSAN_TRAP=y CONFIG_UBSAN_BOUNDS=y # CONFIG_UBSAN_SHIFT is not set -# CONFIG_UBSAN_DIV_ZERO -# CONFIG_UBSAN_UNREACHABLE -# CONFIG_UBSAN_BOOL -# CONFIG_UBSAN_ENUM -# CONFIG_UBSAN_ALIGNMENT +# CONFIG_UBSAN_DIV_ZERO is not set +# CONFIG_UBSAN_UNREACHABLE is not set +# CONFIG_UBSAN_BOOL is not set +# CONFIG_UBSAN_ENUM is not set +# CONFIG_UBSAN_ALIGNMENT is not set # Sampling-based heap out-of-bounds and use-after-free detection. CONFIG_KFENCE=y -- cgit v1.2.3 From 7fcb91d94e897413c0345bb32ea11293f33efbb1 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 11 Apr 2024 11:11:07 -0700 Subject: configs/hardening: Disable CONFIG_UBSAN_SIGNED_WRAP kernel/configs/hardening.config turns on UBSAN for the bounds sanitizer, as that in combination with trapping can stop the exploitation of buffer overflows within the kernel. At the same time, hardening.config turns off every other UBSAN sanitizer because trapping means all UBSAN reports will be fatal and the problems brought up by other sanitizers generally do not have security implications. The signed integer overflow sanitizer was recently added back to the kernel and it is default on with just CONFIG_UBSAN=y, meaning that it gets enabled when merging hardening.config into another configuration. While this sanitizer does have security implications like the array bounds sanitizer, work to clean up enough instances to allow this to run in production environments is still ramping up, which means regular users and testers may be broken by these instances with CONFIG_UBSAN_TRAP=y. Disable CONFIG_UBSAN_SIGNED_WRAP in hardening.config to avoid this situation. Fixes: 557f8c582a9b ("ubsan: Reintroduce signed overflow sanitizer") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240411-fix-ubsan-in-hardening-config-v1-2-e0177c80ffaa@kernel.org Signed-off-by: Kees Cook --- kernel/configs/hardening.config | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index d6f6dc45628a..4b4cfcba3190 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -41,6 +41,7 @@ CONFIG_UBSAN_BOUNDS=y # CONFIG_UBSAN_SHIFT is not set # CONFIG_UBSAN_DIV_ZERO is not set # CONFIG_UBSAN_UNREACHABLE is not set +# CONFIG_UBSAN_SIGNED_WRAP is not set # CONFIG_UBSAN_BOOL is not set # CONFIG_UBSAN_ENUM is not set # CONFIG_UBSAN_ALIGNMENT is not set -- cgit v1.2.3 From ee7e980dc7c9f22c142807c5f582a6524138f57a Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 4 Apr 2024 19:35:53 -0400 Subject: drm/nouveau/kms/nv50-: Disable AUX bus for disconnected DP ports GSP has its own state for keeping track of whether or not a given display connector is plugged in or not, and enforces this state on the driver. In particular, AUX transactions on a DisplayPort connector which GSP says is disconnected can never succeed - and can in some cases even cause unexpected timeouts, which can trickle up to cause other problems. A good example of this is runtime power management: where we can actually get stuck trying to resume the GPU if a userspace application like fwupd tries accessing a drm_aux_dev for a disconnected port. This was an issue I hit a few times with my Slimbook Executive 16 - where trying to offload something to the discrete GPU would wake it up, and then potentially cause it to timeout as fwupd tried to immediately access the dp_aux_dev nodes for nouveau. Likewise: we don't really have any cases I know of where we'd want to ignore this state and try an aux transaction anyway - and failing pointless aux transactions immediately can even speed things up. So - let's start enabling/disabling the aux bus in nouveau_dp_detect() to fix this. We enable the aux bus during connector probing, and leave it enabled if we discover something is actually on the connector. Otherwise, we just shut it off. This should fix some people's runtime PM issues (like myself), and also get rid of quite of a lot of GSP error spam in dmesg. Signed-off-by: Lyude Paul Reviewed-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20240404233736.7946-2-lyude@redhat.com (cherry picked from commit 9c8a10bf1f3467b2c16f6848249bdc7692ace825) --- drivers/gpu/drm/nouveau/nouveau_dp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_dp.c b/drivers/gpu/drm/nouveau/nouveau_dp.c index 7de7707ec6a8..3f6ccce20ea0 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dp.c +++ b/drivers/gpu/drm/nouveau/nouveau_dp.c @@ -232,6 +232,9 @@ nouveau_dp_detect(struct nouveau_connector *nv_connector, dpcd[DP_DPCD_REV] != 0) return NOUVEAU_DP_SST; + // Ensure that the aux bus is enabled for probing + drm_dp_dpcd_set_powered(&nv_connector->aux, true); + mutex_lock(&nv_encoder->dp.hpd_irq_lock); if (mstm) { /* If we're not ready to handle MST state changes yet, just @@ -293,6 +296,13 @@ out: if (mstm && !mstm->suspended && ret != NOUVEAU_DP_MST) nv50_mstm_remove(mstm); + /* GSP doesn't like when we try to do aux transactions on a port it considers disconnected, + * and since we don't really have a usecase for that anyway - just disable the aux bus here + * if we've decided the connector is disconnected + */ + if (ret == NOUVEAU_DP_NONE) + drm_dp_dpcd_set_powered(&nv_connector->aux, false); + mutex_unlock(&nv_encoder->dp.hpd_irq_lock); return ret; } -- cgit v1.2.3 From bf52d7f9b2067f02efe7e32697479097aba4a055 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 4 Apr 2024 19:35:54 -0400 Subject: drm/nouveau/dp: Don't probe eDP ports twice harder I didn't pay close enough attention the last time I tried to fix this problem - while we currently do correctly take care to make sure we don't probe a connected eDP port more then once, we don't do the same thing for eDP ports we found to be disconnected. So, fix this and make sure we only ever probe eDP ports once and then leave them at that connector state forever (since without HPD, it's not going to change on its own anyway). This should get rid of the last few GSP errors getting spit out during runtime suspend and resume on some machines, as we tried to reprobe eDP ports in response to ACPI hotplug probe events. Signed-off-by: Lyude Paul Reviewed-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20240404233736.7946-3-lyude@redhat.com (cherry picked from commit fe6660b661c3397af0867d5d098f5b26581f1290) --- drivers/gpu/drm/nouveau/nouveau_dp.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dp.c b/drivers/gpu/drm/nouveau/nouveau_dp.c index 3f6ccce20ea0..a72c45809484 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dp.c +++ b/drivers/gpu/drm/nouveau/nouveau_dp.c @@ -225,12 +225,15 @@ nouveau_dp_detect(struct nouveau_connector *nv_connector, u8 *dpcd = nv_encoder->dp.dpcd; int ret = NOUVEAU_DP_NONE, hpd; - /* If we've already read the DPCD on an eDP device, we don't need to - * reread it as it won't change + /* eDP ports don't support hotplugging - so there's no point in probing eDP ports unless we + * haven't probed them once before. */ - if (connector->connector_type == DRM_MODE_CONNECTOR_eDP && - dpcd[DP_DPCD_REV] != 0) - return NOUVEAU_DP_SST; + if (connector->connector_type == DRM_MODE_CONNECTOR_eDP) { + if (connector->status == connector_status_connected) + return NOUVEAU_DP_SST; + else if (connector->status == connector_status_disconnected) + return NOUVEAU_DP_NONE; + } // Ensure that the aux bus is enabled for probing drm_dp_dpcd_set_powered(&nv_connector->aux, true); -- cgit v1.2.3 From f4626c12e4b538f757a73d08f4d86d564175b4f7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Apr 2024 11:28:35 -0700 Subject: ubsan: Add awareness of signed integer overflow traps On arm64, UBSAN traps can be decoded from the trap instruction. Add the add, sub, and mul overflow trap codes now that CONFIG_UBSAN_SIGNED_WRAP exists. Seen under clang 19: Internal error: UBSAN: unrecognized failure code: 00000000f2005515 [#1] PREEMPT SMP Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/lkml/20240411-fix-ubsan-in-hardening-config-v1-0-e0177c80ffaa@kernel.org Fixes: 557f8c582a9b ("ubsan: Reintroduce signed overflow sanitizer") Tested-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240415182832.work.932-kees@kernel.org Signed-off-by: Kees Cook --- lib/ubsan.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/ubsan.c b/lib/ubsan.c index 5fc107f61934..a1c983d148f1 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -44,9 +44,10 @@ const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type) case ubsan_shift_out_of_bounds: return "UBSAN: shift out of bounds"; #endif -#ifdef CONFIG_UBSAN_DIV_ZERO +#if defined(CONFIG_UBSAN_DIV_ZERO) || defined(CONFIG_UBSAN_SIGNED_WRAP) /* - * SanitizerKind::IntegerDivideByZero emits + * SanitizerKind::IntegerDivideByZero and + * SanitizerKind::SignedIntegerOverflow emit * SanitizerHandler::DivremOverflow. */ case ubsan_divrem_overflow: @@ -77,6 +78,19 @@ const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type) return "UBSAN: alignment assumption"; case ubsan_type_mismatch: return "UBSAN: type mismatch"; +#endif +#ifdef CONFIG_UBSAN_SIGNED_WRAP + /* + * SanitizerKind::SignedIntegerOverflow emits + * SanitizerHandler::AddOverflow, SanitizerHandler::SubOverflow, + * or SanitizerHandler::MulOverflow. + */ + case ubsan_add_overflow: + return "UBSAN: integer addition overflow"; + case ubsan_sub_overflow: + return "UBSAN: integer subtraction overflow"; + case ubsan_mul_overflow: + return "UBSAN: integer multiplication overflow"; #endif default: return "UBSAN: unrecognized failure code"; -- cgit v1.2.3 From 4225dfa4535f219b03ae14147d9c6e7e82ec8df4 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:52 +0100 Subject: selftests/tcp_ao: Make RST tests less flaky Currently, "active reset" cases are flaky, because select() is called for 3 sockets, while only 2 are expected to receive RST. The idea of the third socket was to get into request_sock_queue, but the test mistakenly attempted to connect() after the listener socket was shut down. Repair this test, it's important to check the different kernel code-paths for signing RST TCP-AO segments. Fixes: c6df7b2361d7 ("selftests/net: Add TCP-AO RST test") Reported-by: Jakub Kicinski Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/rst.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c index 7df8b8700e39..a2fe88d35ac0 100644 --- a/tools/testing/selftests/net/tcp_ao/rst.c +++ b/tools/testing/selftests/net/tcp_ao/rst.c @@ -256,8 +256,6 @@ static int test_wait_fds(int sk[], size_t nr, bool is_writable[], static void test_client_active_rst(unsigned int port) { - /* one in queue, another accept()ed */ - unsigned int wait_for = backlog + 2; int i, sk[3], err; bool is_writable[ARRAY_SIZE(sk)] = {false}; unsigned int last = ARRAY_SIZE(sk) - 1; @@ -275,16 +273,20 @@ static void test_client_active_rst(unsigned int port) for (i = 0; i < last; i++) { err = _test_connect_socket(sk[i], this_ip_dest, port, (i == 0) ? TEST_TIMEOUT_SEC : -1); - if (err < 0) test_error("failed to connect()"); } - synchronize_threads(); /* 2: connection accept()ed, another queued */ - err = test_wait_fds(sk, last, is_writable, wait_for, TEST_TIMEOUT_SEC); + synchronize_threads(); /* 2: two connections: one accept()ed, another queued */ + err = test_wait_fds(sk, last, is_writable, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("test_wait_fds(): %d", err); + /* async connect() with third sk to get into request_sock_queue */ + err = _test_connect_socket(sk[last], this_ip_dest, port, -1); + if (err < 0) + test_error("failed to connect()"); + synchronize_threads(); /* 3: close listen socket */ if (test_client_verify(sk[0], packet_sz, quota / packet_sz, TEST_TIMEOUT_SEC)) test_fail("Failed to send data on connected socket"); @@ -292,13 +294,14 @@ static void test_client_active_rst(unsigned int port) test_ok("Verified established tcp connection"); synchronize_threads(); /* 4: finishing up */ - err = _test_connect_socket(sk[last], this_ip_dest, port, -1); - if (err < 0) - test_error("failed to connect()"); synchronize_threads(); /* 5: closed active sk */ - err = test_wait_fds(sk, ARRAY_SIZE(sk), NULL, - wait_for, TEST_TIMEOUT_SEC); + /* + * Wait for 2 connections: one accepted, another in the accept queue, + * the one in request_sock_queue won't get fully established, so + * doesn't receive an active RST, see inet_csk_listen_stop(). + */ + err = test_wait_fds(sk, last, NULL, last, TEST_TIMEOUT_SEC); if (err < 0) test_error("select(): %d", err); -- cgit v1.2.3 From b089b3bead532419cdcbd8e4e0a3e23c49d11573 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:53 +0100 Subject: selftests/tcp_ao: Zero-init tcp_ao_info_opt The structure is on the stack and has to be zero-initialized as the kernel checks for: > if (in.reserved != 0 || in.reserved2 != 0) > return -EINVAL; Fixes: b26660531cf6 ("selftests/net: Add test for TCP-AO add setsockopt() command") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/setsockopt-closed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c index 452de131fa3a..517930f9721b 100644 --- a/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c +++ b/tools/testing/selftests/net/tcp_ao/setsockopt-closed.c @@ -21,7 +21,7 @@ static void make_listen(int sk) static void test_vefify_ao_info(int sk, struct tcp_ao_info_opt *info, const char *tst) { - struct tcp_ao_info_opt tmp; + struct tcp_ao_info_opt tmp = {}; socklen_t len = sizeof(tmp); if (getsockopt(sk, IPPROTO_TCP, TCP_AO_INFO, &tmp, &len)) -- cgit v1.2.3 From beb78cd1329d039d73487ca05633d1b92e1ab2ea Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:54 +0100 Subject: selftests/tcp_ao: Fix fscanf() call for format-security MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On my new laptop with packages from nixos-unstable, gcc 12.3.0 produces: > lib/proc.c: In function ‘netstat_read_type’: > lib/proc.c:89:9: error: format not a string literal and no format arguments [-Werror=format-security] > 89 | if (fscanf(fnetstat, type->header_name) == EOF) > | ^~ > cc1: some warnings being treated as errors Here the selftests lib parses header name, while expectes non-space word ending with a column. Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reported-by: Muhammad Usama Anjum Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/lib/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tcp_ao/lib/proc.c b/tools/testing/selftests/net/tcp_ao/lib/proc.c index 2fb6dd8adba6..8b984fa04286 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/proc.c +++ b/tools/testing/selftests/net/tcp_ao/lib/proc.c @@ -86,7 +86,7 @@ static void netstat_read_type(FILE *fnetstat, struct netstat **dest, char *line) pos = strchr(line, ' ') + 1; - if (fscanf(fnetstat, type->header_name) == EOF) + if (fscanf(fnetstat, "%[^ :]", type->header_name) == EOF) test_error("fscanf(%s)", type->header_name); if (fread(&tmp, 1, 1, fnetstat) != 1 || tmp != ':') test_error("Unexpected netstat format (%c)", tmp); -- cgit v1.2.3 From b476c93654d748c13624f7c7d0ba191c56a8092e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Sat, 13 Apr 2024 02:42:55 +0100 Subject: selftests/tcp_ao: Printing fixes to confirm with format-security MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On my new laptop with packages from nixos-unstable, gcc 12.3.0 produces > lib/setup.c: In function ‘__test_msg’: > lib/setup.c:20:9: error: format not a string literal and no format arguments [-Werror=format-security] > 20 | ksft_print_msg(buf); > | ^~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_ok’: > lib/setup.c:26:9: error: format not a string literal and no format arguments [-Werror=format-security] > 26 | ksft_test_result_pass(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_fail’: > lib/setup.c:32:9: error: format not a string literal and no format arguments [-Werror=format-security] > 32 | ksft_test_result_fail(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_xfail’: > lib/setup.c:38:9: error: format not a string literal and no format arguments [-Werror=format-security] > 38 | ksft_test_result_xfail(buf); > | ^~~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_error’: > lib/setup.c:44:9: error: format not a string literal and no format arguments [-Werror=format-security] > 44 | ksft_test_result_error(buf); > | ^~~~~~~~~~~~~~~~~~~~~~ > lib/setup.c: In function ‘__test_skip’: > lib/setup.c:50:9: error: format not a string literal and no format arguments [-Werror=format-security] > 50 | ksft_test_result_skip(buf); > | ^~~~~~~~~~~~~~~~~~~~~ > cc1: some warnings being treated as errors As the buffer was already pre-printed into, print it as a string rather than a format-string. Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Reported-by: Muhammad Usama Anjum Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/lib/setup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/tcp_ao/lib/setup.c b/tools/testing/selftests/net/tcp_ao/lib/setup.c index 92276f916f2f..e408b9243b2c 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/setup.c +++ b/tools/testing/selftests/net/tcp_ao/lib/setup.c @@ -17,37 +17,37 @@ static pthread_mutex_t ksft_print_lock = PTHREAD_MUTEX_INITIALIZER; void __test_msg(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_print_msg(buf); + ksft_print_msg("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_ok(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_pass(buf); + ksft_test_result_pass("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_fail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_fail(buf); + ksft_test_result_fail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_xfail(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_xfail(buf); + ksft_test_result_xfail("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_error(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_error(buf); + ksft_test_result_error("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } void __test_skip(const char *buf) { pthread_mutex_lock(&ksft_print_lock); - ksft_test_result_skip(buf); + ksft_test_result_skip("%s", buf); pthread_mutex_unlock(&ksft_print_lock); } -- cgit v1.2.3 From fe90f3967bdb3e13f133e5f44025e15f943a99c5 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 15 Apr 2024 11:21:13 -0400 Subject: sched: Add missing memory barrier in switch_mm_cid Many architectures' switch_mm() (e.g. arm64) do not have an smp_mb() which the core scheduler code has depended upon since commit: commit 223baf9d17f25 ("sched: Fix performance regression introduced by mm_cid") If switch_mm() doesn't call smp_mb(), sched_mm_cid_remote_clear() can unset the actively used cid when it fails to observe active task after it sets lazy_put. There *is* a memory barrier between storing to rq->curr and _return to userspace_ (as required by membarrier), but the rseq mm_cid has stricter requirements: the barrier needs to be issued between store to rq->curr and switch_mm_cid(), which happens earlier than: - spin_unlock(), - switch_to(). So it's fine when the architecture switch_mm() happens to have that barrier already, but less so when the architecture only provides the full barrier in switch_to() or spin_unlock(). It is a bug in the rseq switch_mm_cid() implementation. All architectures that don't have memory barriers in switch_mm(), but rather have the full barrier either in finish_lock_switch() or switch_to() have them too late for the needs of switch_mm_cid(). Introduce a new smp_mb__after_switch_mm(), defined as smp_mb() in the generic barrier.h header, and use it in switch_mm_cid() for scheduler transitions where switch_mm() is expected to provide a memory barrier. Architectures can override smp_mb__after_switch_mm() if their switch_mm() implementation provides an implicit memory barrier. Override it with a no-op on x86 which implicitly provide this memory barrier by writing to CR3. Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") Reported-by: levi.yun Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Reviewed-by: Catalin Marinas # for arm64 Acked-by: Dave Hansen # for x86 Cc: # 6.4.x Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240415152114.59122-2-mathieu.desnoyers@efficios.com --- arch/x86/include/asm/barrier.h | 3 +++ include/asm-generic/barrier.h | 8 ++++++++ kernel/sched/sched.h | 20 ++++++++++++++------ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index fe1e7e3cc844..63bdc6b85219 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -79,6 +79,9 @@ do { \ #define __smp_mb__before_atomic() do { } while (0) #define __smp_mb__after_atomic() do { } while (0) +/* Writing to CR3 provides a full memory barrier in switch_mm(). */ +#define smp_mb__after_switch_mm() do { } while (0) + #include #endif /* _ASM_X86_BARRIER_H */ diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 0c0695763bea..d4f581c1e21d 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -294,5 +294,13 @@ do { \ #define io_stop_wc() do { } while (0) #endif +/* + * Architectures that guarantee an implicit smp_mb() in switch_mm() + * can override smp_mb__after_switch_mm. + */ +#ifndef smp_mb__after_switch_mm +# define smp_mb__after_switch_mm() smp_mb() +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_GENERIC_BARRIER_H */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d2242679239e..ae50f212775e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -79,6 +79,8 @@ # include #endif +#include + #include "cpupri.h" #include "cpudeadline.h" @@ -3445,13 +3447,19 @@ static inline void switch_mm_cid(struct rq *rq, * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. * Provide it here. */ - if (!prev->mm) // from kernel + if (!prev->mm) { // from kernel smp_mb(); - /* - * user -> user transition guarantees a memory barrier through - * switch_mm() when current->mm changes. If current->mm is - * unchanged, no barrier is needed. - */ + } else { // from user + /* + * user->user transition relies on an implicit + * memory barrier in switch_mm() when + * current->mm changes. If the architecture + * switch_mm() does not have an implicit memory + * barrier, it is emitted here. If current->mm + * is unchanged, no barrier is needed. + */ + smp_mb__after_switch_mm(); + } } if (prev->mm_cid_active) { mm_cid_snapshot_time(rq, prev->mm); -- cgit v1.2.3 From 03cea821b82cbf0ee4a285f9dca6cc1d660dbef3 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:09:54 -0500 Subject: platform/x86/amd: pmf: Decrease error message to debug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASUS ROG Zephyrus G14 doesn't have _CRS in AMDI0102 device and so there are no resources to walk. This is expected behavior because it doesn't support Smart PC. Decrease error message to debug. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218685 Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240410140956.385-1-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/pmf/acpi.c b/drivers/platform/x86/amd/pmf/acpi.c index d0cf46e2fc8e..60fc71c9fb0f 100644 --- a/drivers/platform/x86/amd/pmf/acpi.c +++ b/drivers/platform/x86/amd/pmf/acpi.c @@ -437,7 +437,7 @@ int apmf_check_smart_pc(struct amd_pmf_dev *pmf_dev) status = acpi_walk_resources(ahandle, METHOD_NAME__CRS, apmf_walk_resources, pmf_dev); if (ACPI_FAILURE(status)) { - dev_err(pmf_dev->dev, "acpi_walk_resources failed :%d\n", status); + dev_dbg(pmf_dev->dev, "acpi_walk_resources failed :%d\n", status); return -EINVAL; } -- cgit v1.2.3 From ed13f622bcd594d6cefd6239b1722ed8b84ba98f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:09:55 -0500 Subject: platform/x86/amd: pmf: Add infrastructure for quirking supported funcs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the event of a BIOS bug add infrastructure that will be utilized to override the return value for supported_funcs to avoid enabling features. Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20240410140956.385-2-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/Makefile | 2 +- drivers/platform/x86/amd/pmf/acpi.c | 5 +++- drivers/platform/x86/amd/pmf/core.c | 1 + drivers/platform/x86/amd/pmf/pmf-quirks.c | 43 +++++++++++++++++++++++++++++++ drivers/platform/x86/amd/pmf/pmf.h | 3 +++ 5 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 drivers/platform/x86/amd/pmf/pmf-quirks.c diff --git a/drivers/platform/x86/amd/pmf/Makefile b/drivers/platform/x86/amd/pmf/Makefile index 6b26e48ce8ad..7d6079b02589 100644 --- a/drivers/platform/x86/amd/pmf/Makefile +++ b/drivers/platform/x86/amd/pmf/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_AMD_PMF) += amd-pmf.o amd-pmf-objs := core.o acpi.o sps.o \ auto-mode.o cnqf.o \ - tee-if.o spc.o + tee-if.o spc.o pmf-quirks.o diff --git a/drivers/platform/x86/amd/pmf/acpi.c b/drivers/platform/x86/amd/pmf/acpi.c index 60fc71c9fb0f..1157ec148880 100644 --- a/drivers/platform/x86/amd/pmf/acpi.c +++ b/drivers/platform/x86/amd/pmf/acpi.c @@ -343,7 +343,10 @@ static int apmf_if_verify_interface(struct amd_pmf_dev *pdev) if (err) return err; - pdev->supported_func = output.supported_functions; + /* only set if not already set by a quirk */ + if (!pdev->supported_func) + pdev->supported_func = output.supported_functions; + dev_dbg(pdev->dev, "supported functions:0x%x notifications:0x%x version:%u\n", output.supported_functions, output.notification_mask, output.version); diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index 5d4f80698a8b..64e6e34a2a9a 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -445,6 +445,7 @@ static int amd_pmf_probe(struct platform_device *pdev) mutex_init(&dev->lock); mutex_init(&dev->update_mutex); + amd_pmf_quirks_init(dev); apmf_acpi_init(dev); platform_set_drvdata(pdev, dev); amd_pmf_dbgfs_register(dev); diff --git a/drivers/platform/x86/amd/pmf/pmf-quirks.c b/drivers/platform/x86/amd/pmf/pmf-quirks.c new file mode 100644 index 000000000000..9f3790eaaa30 --- /dev/null +++ b/drivers/platform/x86/amd/pmf/pmf-quirks.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Platform Management Framework Driver Quirks + * + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Mario Limonciello + */ + +#include + +#include "pmf.h" + +struct quirk_entry { + u32 supported_func; +}; + +static struct quirk_entry quirk_no_sps_bug = { + .supported_func = 0x4003, +}; + +static const struct dmi_system_id fwbug_list[] = { + {} +}; + +void amd_pmf_quirks_init(struct amd_pmf_dev *dev) +{ + const struct dmi_system_id *dmi_id; + struct quirk_entry *quirks; + + dmi_id = dmi_first_match(fwbug_list); + if (!dmi_id) + return; + + quirks = dmi_id->driver_data; + if (quirks->supported_func) { + dev->supported_func = quirks->supported_func; + pr_info("Using supported funcs quirk to avoid %s platform firmware bug\n", + dmi_id->ident); + } +} + diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 8c4df5753f40..eeedd0c0395a 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -720,4 +720,7 @@ int apmf_check_smart_pc(struct amd_pmf_dev *pmf_dev); void amd_pmf_populate_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in); void amd_pmf_dump_ta_inputs(struct amd_pmf_dev *dev, struct ta_pmf_enact_table *in); +/* Quirk infrastructure */ +void amd_pmf_quirks_init(struct amd_pmf_dev *dev); + #endif /* PMF_H */ -- cgit v1.2.3 From 9d893061ed68820de24b572d1e193b5e4737f2e0 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:09:56 -0500 Subject: platform/x86/amd: pmf: Add quirk for ROG Zephyrus G14 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ROG Zephyrus G14 advertises support for SPS notifications to the BIOS but doesn't actually use them. Instead the asus-nb-wmi driver utilizes such events. Add a quirk to prevent the system from registering for ACPI platform profile when this system is found to avoid conflicts. Reported-by: al0uette@outlook.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218685 Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20240410140956.385-3-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/pmf-quirks.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/platform/x86/amd/pmf/pmf-quirks.c b/drivers/platform/x86/amd/pmf/pmf-quirks.c index 9f3790eaaa30..0b2eb0ae85fe 100644 --- a/drivers/platform/x86/amd/pmf/pmf-quirks.c +++ b/drivers/platform/x86/amd/pmf/pmf-quirks.c @@ -21,6 +21,14 @@ static struct quirk_entry quirk_no_sps_bug = { }; static const struct dmi_system_id fwbug_list[] = { + { + .ident = "ROG Zephyrus G14", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "GA403UV"), + }, + .driver_data = &quirk_no_sps_bug, + }, {} }; -- cgit v1.2.3 From d8c2d38c4d1dee8fe8e015b9ebf65bdd8e4da99b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 15 Apr 2024 14:28:53 -0700 Subject: platform/x86: ISST: Add Granite Rapids-D to HPM CPU list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Granite Rapids-D to hpm_cpu_ids, so that MSR 0x54 can be used. Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20240415212853.2820470-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/speed_select_if/isst_if_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c index 08df9494603c..30951f7131cd 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c @@ -719,6 +719,7 @@ static struct miscdevice isst_if_char_driver = { }; static const struct x86_cpu_id hpm_cpu_ids[] = { + X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, NULL), X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, NULL), X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, NULL), {} -- cgit v1.2.3 From bc774d46b41482534c7ba92f6342ca0a355c13af Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 15 Apr 2024 15:06:25 -0700 Subject: platform/x86/intel-uncore-freq: Increase minor number support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No new changes will be added for minor version 2. Change the minor version number to 2 and stop displaying log message for unsupported minor version 2. Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20240415220625.2828339-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c index bd75d61ff8a6..ef730200a04b 100644 --- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c @@ -29,7 +29,7 @@ #include "uncore-frequency-common.h" #define UNCORE_MAJOR_VERSION 0 -#define UNCORE_MINOR_VERSION 1 +#define UNCORE_MINOR_VERSION 2 #define UNCORE_HEADER_INDEX 0 #define UNCORE_FABRIC_CLUSTER_OFFSET 8 @@ -329,7 +329,7 @@ static int uncore_probe(struct auxiliary_device *auxdev, const struct auxiliary_ goto remove_clusters; } - if (TPMI_MINOR_VERSION(pd_info->ufs_header_ver) != UNCORE_MINOR_VERSION) + if (TPMI_MINOR_VERSION(pd_info->ufs_header_ver) > UNCORE_MINOR_VERSION) dev_info(&auxdev->dev, "Uncore: Ignore: Unsupported minor version:%lx\n", TPMI_MINOR_VERSION(pd_info->ufs_header_ver)); -- cgit v1.2.3 From 0ebd96f5da4410c0cb8fc75e44f1009530b2f90b Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:14 +0300 Subject: net: stmmac: Apply half-duplex-less constraint for DW QoS Eth only There are three DW MAC IP-cores which can have the multiple Tx/Rx queues enabled: DW GMAC v3.7+ with AV feature, DW QoS Eth v4.x/v5.x, DW XGMAC/XLGMAC Based on the respective HW databooks, only the DW QoS Eth IP-core doesn't support the half-duplex link mode in case if more than one queues enabled: "In multiple queue/channel configurations, for half-duplex operation, enable only the Q0/CH0 on Tx and Rx. For single queue/channel in full-duplex operation, any queue/channel can be enabled." The rest of the IP-cores don't have such constraint. Thus in order to have the constraint applied for the DW QoS Eth MACs only, let's move the it' implementation to the respective MAC-capabilities getter and make sure the getter is called in the queues re-init procedure. Fixes: b6cfffa7ad92 ("stmmac: fix DMA channel hang in half-duplex mode") Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 7 +++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 19 +++---------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index cef25efbdff9..ec6a13e644b3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -71,6 +71,13 @@ static void dwmac4_core_init(struct mac_device_info *hw, static void dwmac4_phylink_get_caps(struct stmmac_priv *priv) { priv->phylink_config.mac_capabilities |= MAC_2500FD; + + if (priv->plat->tx_queues_to_use > 1) + priv->phylink_config.mac_capabilities &= + ~(MAC_10HD | MAC_100HD | MAC_1000HD); + else + priv->phylink_config.mac_capabilities |= + (MAC_10HD | MAC_100HD | MAC_1000HD); } static void dwmac4_rx_queue_enable(struct mac_device_info *hw, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 24cd80490d19..dd58c21b53ee 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1198,17 +1198,6 @@ static int stmmac_init_phy(struct net_device *dev) return ret; } -static void stmmac_set_half_duplex(struct stmmac_priv *priv) -{ - /* Half-Duplex can only work with single tx queue */ - if (priv->plat->tx_queues_to_use > 1) - priv->phylink_config.mac_capabilities &= - ~(MAC_10HD | MAC_100HD | MAC_1000HD); - else - priv->phylink_config.mac_capabilities |= - (MAC_10HD | MAC_100HD | MAC_1000HD); -} - static int stmmac_phy_setup(struct stmmac_priv *priv) { struct stmmac_mdio_bus_data *mdio_bus_data; @@ -1237,10 +1226,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) priv->phylink_config.supported_interfaces); priv->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | - MAC_10FD | MAC_100FD | - MAC_1000FD; - - stmmac_set_half_duplex(priv); + MAC_10 | MAC_100 | MAC_1000; /* Get the MAC specific capabilities */ stmmac_mac_phylink_get_caps(priv); @@ -7355,7 +7341,8 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) priv->rss.table[i] = ethtool_rxfh_indir_default(i, rx_cnt); - stmmac_set_half_duplex(priv); + stmmac_mac_phylink_get_caps(priv); + stmmac_napi_add(dev); if (netif_running(dev)) -- cgit v1.2.3 From 59c3d6ca6cbded6c6599e975b42a9d6a27fcbaf2 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:15 +0300 Subject: net: stmmac: Fix max-speed being ignored on queue re-init It's possible to have the maximum link speed being artificially limited on the platform-specific basis. It's done either by setting up the plat_stmmacenet_data::max_speed field or by specifying the "max-speed" DT-property. In such cases it's required that any specific MAC-capabilities re-initializations would take the limit into account. In particular the link speed capabilities may change during the number of active Tx/Rx queues re-initialization. But the currently implemented procedure doesn't take the speed limit into account. Fix that by calling phylink_limit_mac_speed() in the stmmac_reinit_queues() method if the speed limitation was required in the same way as it's done in the stmmac_phy_setup() function. Fixes: 95201f36f395 ("net: stmmac: update MAC capabilities when tx queues are updated") Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index dd58c21b53ee..b8a1f02398ee 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7328,6 +7328,7 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) { struct stmmac_priv *priv = netdev_priv(dev); int ret = 0, i; + int max_speed; if (netif_running(dev)) stmmac_release(dev); @@ -7343,6 +7344,10 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) stmmac_mac_phylink_get_caps(priv); + max_speed = priv->plat->max_speed; + if (max_speed) + phylink_limit_mac_speed(&priv->phylink_config, max_speed); + stmmac_napi_add(dev); if (netif_running(dev)) -- cgit v1.2.3 From 9cb54af214a7cdc91577ec083e5569f2ce2c86d8 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 12 Apr 2024 21:03:16 +0300 Subject: net: stmmac: Fix IP-cores specific MAC capabilities Here is the list of the MAC capabilities specific to the particular DW MAC IP-cores currently supported by the driver: DW MAC100: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 DW GMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 Allwinner sun8i MAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 DW QoS Eth: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD if there is more than 1 active Tx/Rx queues: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10FD | MAC_100FD | MAC_1000FD | MAC_2500FD DW XGMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_1000FD | MAC_2500FD | MAC_5000FD | MAC_10000FD DW XLGMAC: MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_1000FD | MAC_2500FD | MAC_5000FD | MAC_10000FD | MAC_25000FD | MAC_40000FD | MAC_50000FD | MAC_100000FD As you can see there are only two common capabilities: MAC_ASYM_PAUSE | MAC_SYM_PAUSE. Meanwhile what is currently implemented defines 10/100/1000 link speeds for all IP-cores, which is definitely incorrect for DW MAC100, DW XGMAC and DW XLGMAC devices. Seeing the flow-control is implemented as a callback for each MAC IP-core (see dwmac100_flow_ctrl(), dwmac1000_flow_ctrl(), sun8i_dwmac_flow_ctrl(), etc) and since the MAC-specific setup() method is supposed to be called for each available DW MAC-based device, the capabilities initialization can be freely moved to these setup() functions, thus correctly setting up the MAC-capabilities for each IP-core (including the Allwinner Sun8i). A new stmmac_link::caps field was specifically introduced for that so to have all link-specific info preserved in a single structure. Note the suggested change fixes three earlier commits at a time. The commit 5b0d7d7da64b ("net: stmmac: Add the missing speeds that XGMAC supports") permitted the 10-100 link speeds and 1G half-duplex mode for DW XGMAC IP-core even though it doesn't support them. The commit df7699c70c1b ("net: stmmac: Do not cut down 1G modes") incorrectly added the MAC1000 capability to the DW MAC100 IP-core. Similarly to the DW XGMAC the commit 8a880936e902 ("net: stmmac: Add XLGMII support") incorrectly permitted the 10-100 link speeds and 1G half-duplex mode for DW XLGMAC IP-core. Fixes: 5b0d7d7da64b ("net: stmmac: Add the missing speeds that XGMAC supports") Fixes: df7699c70c1b ("net: stmmac: Do not cut down 1G modes") Fixes: 8a880936e902 ("net: stmmac: Add XLGMII support") Suggested-by: Russell King (Oracle) Signed-off-by: Serge Semin Reviewed-by: Romain Gantois Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/common.h | 1 + drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 10 ++++------ drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 18 ++++++++---------- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 ++++--- 7 files changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index a6fefe675ef1..3b7d4ac1e7be 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -553,6 +553,7 @@ extern const struct stmmac_hwtimestamp stmmac_ptp; extern const struct stmmac_mode_ops dwmac4_ring_mode_ops; struct mac_link { + u32 caps; u32 speed_mask; u32 speed10; u32 speed100; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index b21d99faa2d0..e1b761dcfa1d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1096,6 +1096,8 @@ static struct mac_device_info *sun8i_dwmac_setup(void *ppriv) priv->dev->priv_flags |= IFF_UNICAST_FLT; + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000; /* The loopback bit seems to be re-set when link change * Simply mask it each time * Speed 10/100/1000 are set in BIT(2)/BIT(3) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index 3927609abc44..8555299443f4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -539,6 +539,8 @@ int dwmac1000_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000; mac->link.duplex = GMAC_CONTROL_DM; mac->link.speed10 = GMAC_CONTROL_PS; mac->link.speed100 = GMAC_CONTROL_PS | GMAC_CONTROL_FES; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c index a6e8d7bd9588..7667d103cd0e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c @@ -175,6 +175,8 @@ int dwmac100_setup(struct stmmac_priv *priv) dev_info(priv->device, "\tDWMAC100\n"); mac->pcsr = priv->ioaddr; + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100; mac->link.duplex = MAC_CONTROL_F; mac->link.speed10 = 0; mac->link.speed100 = 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index ec6a13e644b3..a38226d7cc6a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -70,14 +70,10 @@ static void dwmac4_core_init(struct mac_device_info *hw, static void dwmac4_phylink_get_caps(struct stmmac_priv *priv) { - priv->phylink_config.mac_capabilities |= MAC_2500FD; - if (priv->plat->tx_queues_to_use > 1) - priv->phylink_config.mac_capabilities &= - ~(MAC_10HD | MAC_100HD | MAC_1000HD); + priv->hw->link.caps &= ~(MAC_10HD | MAC_100HD | MAC_1000HD); else - priv->phylink_config.mac_capabilities |= - (MAC_10HD | MAC_100HD | MAC_1000HD); + priv->hw->link.caps |= (MAC_10HD | MAC_100HD | MAC_1000HD); } static void dwmac4_rx_queue_enable(struct mac_device_info *hw, @@ -1385,6 +1381,8 @@ int dwmac4_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_10 | MAC_100 | MAC_1000 | MAC_2500FD; mac->link.duplex = GMAC_CONFIG_DM; mac->link.speed10 = GMAC_CONFIG_PS; mac->link.speed100 = GMAC_CONFIG_FES | GMAC_CONFIG_PS; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index e841e312077e..f8e7775bb633 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -47,14 +47,6 @@ static void dwxgmac2_core_init(struct mac_device_info *hw, writel(XGMAC_INT_DEFAULT_EN, ioaddr + XGMAC_INT_EN); } -static void xgmac_phylink_get_caps(struct stmmac_priv *priv) -{ - priv->phylink_config.mac_capabilities |= MAC_2500FD | MAC_5000FD | - MAC_10000FD | MAC_25000FD | - MAC_40000FD | MAC_50000FD | - MAC_100000FD; -} - static void dwxgmac2_set_mac(void __iomem *ioaddr, bool enable) { u32 tx = readl(ioaddr + XGMAC_TX_CONFIG); @@ -1540,7 +1532,6 @@ static void dwxgmac3_fpe_configure(void __iomem *ioaddr, struct stmmac_fpe_cfg * const struct stmmac_ops dwxgmac210_ops = { .core_init = dwxgmac2_core_init, - .phylink_get_caps = xgmac_phylink_get_caps, .set_mac = dwxgmac2_set_mac, .rx_ipc = dwxgmac2_rx_ipc, .rx_queue_enable = dwxgmac2_rx_queue_enable, @@ -1601,7 +1592,6 @@ static void dwxlgmac2_rx_queue_enable(struct mac_device_info *hw, u8 mode, const struct stmmac_ops dwxlgmac2_ops = { .core_init = dwxgmac2_core_init, - .phylink_get_caps = xgmac_phylink_get_caps, .set_mac = dwxgmac2_set_mac, .rx_ipc = dwxgmac2_rx_ipc, .rx_queue_enable = dwxlgmac2_rx_queue_enable, @@ -1661,6 +1651,9 @@ int dwxgmac2_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_1000FD | MAC_2500FD | MAC_5000FD | + MAC_10000FD; mac->link.duplex = 0; mac->link.speed10 = XGMAC_CONFIG_SS_10_MII; mac->link.speed100 = XGMAC_CONFIG_SS_100_MII; @@ -1698,6 +1691,11 @@ int dwxlgmac2_setup(struct stmmac_priv *priv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); + mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | + MAC_1000FD | MAC_2500FD | MAC_5000FD | + MAC_10000FD | MAC_25000FD | + MAC_40000FD | MAC_50000FD | + MAC_100000FD; mac->link.duplex = 0; mac->link.speed1000 = XLGMAC_CONFIG_SS_1000; mac->link.speed2500 = XLGMAC_CONFIG_SS_2500; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b8a1f02398ee..7c6fb14b5555 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1225,12 +1225,11 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) xpcs_get_interfaces(priv->hw->xpcs, priv->phylink_config.supported_interfaces); - priv->phylink_config.mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | - MAC_10 | MAC_100 | MAC_1000; - /* Get the MAC specific capabilities */ stmmac_mac_phylink_get_caps(priv); + priv->phylink_config.mac_capabilities = priv->hw->link.caps; + max_speed = priv->plat->max_speed; if (max_speed) phylink_limit_mac_speed(&priv->phylink_config, max_speed); @@ -7344,6 +7343,8 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) stmmac_mac_phylink_get_caps(priv); + priv->phylink_config.mac_capabilities = priv->hw->link.caps; + max_speed = priv->plat->max_speed; if (max_speed) phylink_limit_mac_speed(&priv->phylink_config, max_speed); -- cgit v1.2.3 From 428051600cb4e5a61d81aba3f8009b6c4f5e7582 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 15 Mar 2024 12:08:20 +0100 Subject: ice: tc: check src_vsi in case of traffic from VF In case of traffic going from the VF (so ingress for port representor) source VSI should be consider during packet classification. It is needed for hardware to not match packets from different ports with filters added on other port. It is only for "from VF" traffic, because other traffic direction doesn't have source VSI. Set correct ::src_vsi in rule_info to pass it to the hardware filter. For example this rule should drop only ipv4 packets from eth10, not from the others VF PRs. It is needed to check source VSI in this case. $tc filter add dev eth10 ingress protocol ip flower skip_sw action drop Fixes: 0d08a441fb1a ("ice: ndo_setup_tc implementation for PF") Reviewed-by: Jedrzej Jagielski Reviewed-by: Sridhar Samudrala Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index b890410a2bc0..49ed5fd7db10 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -28,6 +28,8 @@ ice_tc_count_lkups(u32 flags, struct ice_tc_flower_lyr_2_4_hdrs *headers, * - ICE_TC_FLWR_FIELD_VLAN_TPID (present if specified) * - Tunnel flag (present if tunnel) */ + if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS) + lkups_cnt++; if (flags & ICE_TC_FLWR_FIELD_TENANT_ID) lkups_cnt++; @@ -363,6 +365,11 @@ ice_tc_fill_rules(struct ice_hw *hw, u32 flags, /* Always add direction metadata */ ice_rule_add_direction_metadata(&list[ICE_TC_METADATA_LKUP_IDX]); + if (tc_fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { + ice_rule_add_src_vsi_metadata(&list[i]); + i++; + } + rule_info->tun_type = ice_sw_type_from_tunnel(tc_fltr->tunnel_type); if (tc_fltr->tunnel_type != TNL_LAST) { i = ice_tc_fill_tunnel_outer(flags, tc_fltr, list, i); @@ -820,6 +827,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) /* specify the cookie as filter_rule_id */ rule_info.fltr_rule_id = fltr->cookie; + rule_info.src_vsi = vsi->idx; ret = ice_add_adv_rule(hw, list, lkups_cnt, &rule_info, &rule_added); if (ret == -EEXIST) { -- cgit v1.2.3 From 73278715725a8347032acf233082ca4eb31e6a56 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 15 Mar 2024 12:08:21 +0100 Subject: ice: tc: allow zero flags in parsing tc flower The check for flags is done to not pass empty lookups to adding switch rule functions. Since metadata is always added to lookups there is no need to check against the flag. It is also fixing the problem with such rule: $ tc filter add dev gtp_dev ingress protocol ip prio 0 flower \ enc_dst_port 2123 action drop Switch block in case of GTP can't parse the destination port, because it should always be set to GTP specific value. The same with ethertype. The result is that there is no other matching criteria than GTP tunnel. In this case flags is 0, rule can't be added only because of defensive check against flags. Fixes: 9a225f81f540 ("ice: Support GTP-U and GTP-C offload in switchdev") Reviewed-by: Wojciech Drewek Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index 49ed5fd7db10..bcbcfc67e560 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -779,7 +779,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) int ret; int i; - if (!flags || (flags & ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT)) { + if (flags & ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT) { NL_SET_ERR_MSG_MOD(fltr->extack, "Unsupported encap field(s)"); return -EOPNOTSUPP; } -- cgit v1.2.3 From 2cca35f5dd78b9f8297c879c5db5ab137c5d86c3 Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Tue, 9 Apr 2024 17:45:44 +0200 Subject: ice: Fix checking for unsupported keys on non-tunnel device Add missing FLOW_DISSECTOR_KEY_ENC_* checks to TC flower filter parsing. Without these checks, it would be possible to add filters with tunnel options on non-tunnel devices. enc_* options are only valid for tunnel devices. Example: devlink dev eswitch set $PF1_PCI mode switchdev echo 1 > /sys/class/net/$PF1/device/sriov_numvfs tc qdisc add dev $VF1_PR ingress ethtool -K $PF1 hw-tc-offload on tc filter add dev $VF1_PR ingress flower enc_ttl 12 skip_sw action drop Fixes: 9e300987d4a8 ("ice: VXLAN and Geneve TC support") Reviewed-by: Michal Swiatkowski Signed-off-by: Marcin Szycik Reviewed-by: Jacob Keller Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index bcbcfc67e560..688ccb0615ab 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -1489,7 +1489,10 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, (BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS))) { + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_OPTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL))) { NL_SET_ERR_MSG_MOD(fltr->extack, "Tunnel key used, but device isn't a tunnel"); return -EOPNOTSUPP; } else { -- cgit v1.2.3 From 9e4d3f4f34455abbaa9930bf6b7575a5cd081496 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 4 Apr 2024 13:07:59 +0300 Subject: drm/panel: visionox-rm69299: don't unregister DSI device The DSI device for the panel was registered by the DSI host, so it is an error to unregister it from the panel driver. Drop the call to mipi_dsi_device_unregister(). Fixes: c7f66d32dd43 ("drm/panel: add support for rm69299 visionox panel") Reviewed-by: Jessica Zhang Signed-off-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20240404-drop-panel-unregister-v1-1-9f56953c5fb9@linaro.org --- drivers/gpu/drm/panel/panel-visionox-rm69299.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-visionox-rm69299.c b/drivers/gpu/drm/panel/panel-visionox-rm69299.c index 775144695283..b15ca56a09a7 100644 --- a/drivers/gpu/drm/panel/panel-visionox-rm69299.c +++ b/drivers/gpu/drm/panel/panel-visionox-rm69299.c @@ -253,8 +253,6 @@ static void visionox_rm69299_remove(struct mipi_dsi_device *dsi) struct visionox_rm69299 *ctx = mipi_dsi_get_drvdata(dsi); mipi_dsi_detach(ctx->dsi); - mipi_dsi_device_unregister(ctx->dsi); - drm_panel_remove(&ctx->panel); } -- cgit v1.2.3 From 941c0bdbc176df825adf77052263b2d63db6fef7 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 4 Apr 2024 13:08:00 +0300 Subject: drm/panel: novatek-nt36682e: don't unregister DSI device The DSI device for the panel was registered by the DSI host, so it is an error to unregister it from the panel driver. Drop the call to mipi_dsi_device_unregister(). Fixes: ea4f9975625a ("drm/panel: Add support for Novatek NT36672E panel driver") Reviewed-by: Jessica Zhang Signed-off-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20240404-drop-panel-unregister-v1-2-9f56953c5fb9@linaro.org --- drivers/gpu/drm/panel/panel-novatek-nt36672e.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-novatek-nt36672e.c b/drivers/gpu/drm/panel/panel-novatek-nt36672e.c index cb7406d74466..c39fe0fc5d69 100644 --- a/drivers/gpu/drm/panel/panel-novatek-nt36672e.c +++ b/drivers/gpu/drm/panel/panel-novatek-nt36672e.c @@ -614,8 +614,6 @@ static void nt36672e_panel_remove(struct mipi_dsi_device *dsi) struct nt36672e_panel *ctx = mipi_dsi_get_drvdata(dsi); mipi_dsi_detach(ctx->dsi); - mipi_dsi_device_unregister(ctx->dsi); - drm_panel_remove(&ctx->panel); } -- cgit v1.2.3 From 631426ba1d45a8672b177ee85ad4cabe760dd131 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 14 Mar 2024 17:12:59 +0100 Subject: mm/madvise: make MADV_POPULATE_(READ|WRITE) handle VM_FAULT_RETRY properly Darrick reports that in some cases where pread() would fail with -EIO and mmap()+access would generate a SIGBUS signal, MADV_POPULATE_READ / MADV_POPULATE_WRITE will keep retrying forever and not fail with -EFAULT. While the madvise() call can be interrupted by a signal, this is not the desired behavior. MADV_POPULATE_READ / MADV_POPULATE_WRITE should behave like page faults in that case: fail and not retry forever. A reproducer can be found at [1]. The reason is that __get_user_pages(), as called by faultin_vma_page_range(), will not handle VM_FAULT_RETRY in a proper way: it will simply return 0 when VM_FAULT_RETRY happened, making madvise_populate()->faultin_vma_page_range() retry again and again, never setting FOLL_TRIED->FAULT_FLAG_TRIED for __get_user_pages(). __get_user_pages_locked() does what we want, but duplicating that logic in faultin_vma_page_range() feels wrong. So let's use __get_user_pages_locked() instead, that will detect VM_FAULT_RETRY and set FOLL_TRIED when retrying, making the fault handler return VM_FAULT_SIGBUS (VM_FAULT_ERROR) at some point, propagating -EFAULT from faultin_page() to __get_user_pages(), all the way to madvise_populate(). But, there is an issue: __get_user_pages_locked() will end up re-taking the MM lock and then __get_user_pages() will do another VMA lookup. In the meantime, the VMA layout could have changed and we'd fail with different error codes than we'd want to. As __get_user_pages() will currently do a new VMA lookup either way, let it do the VMA handling in a different way, controlled by a new FOLL_MADV_POPULATE flag, effectively moving these checks from madvise_populate() + faultin_page_range() in there. With this change, Darricks reproducer properly fails with -EFAULT, as documented for MADV_POPULATE_READ / MADV_POPULATE_WRITE. [1] https://lore.kernel.org/all/20240313171936.GN1927156@frogsfrogsfrogs/ Link: https://lkml.kernel.org/r/20240314161300.382526-1-david@redhat.com Link: https://lkml.kernel.org/r/20240314161300.382526-2-david@redhat.com Fixes: 4ca9b3859dac ("mm/madvise: introduce MADV_POPULATE_(READ|WRITE) to prefault page tables") Signed-off-by: David Hildenbrand Reported-by: Darrick J. Wong Closes: https://lore.kernel.org/all/20240311223815.GW1927156@frogsfrogsfrogs/ Cc: Darrick J. Wong Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 54 ++++++++++++++++++++++++++++++++---------------------- mm/internal.h | 10 ++++++---- mm/madvise.c | 17 ++--------------- 3 files changed, 40 insertions(+), 41 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index af8edadc05d1..1611e73b1121 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1206,6 +1206,22 @@ static long __get_user_pages(struct mm_struct *mm, /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { + /* + * MADV_POPULATE_(READ|WRITE) wants to handle VMA + * lookups+error reporting differently. + */ + if (gup_flags & FOLL_MADV_POPULATE) { + vma = vma_lookup(mm, start); + if (!vma) { + ret = -ENOMEM; + goto out; + } + if (check_vma_flags(vma, gup_flags)) { + ret = -EINVAL; + goto out; + } + goto retry; + } vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, @@ -1685,35 +1701,35 @@ long populate_vma_page_range(struct vm_area_struct *vma, } /* - * faultin_vma_page_range() - populate (prefault) page tables inside the - * given VMA range readable/writable + * faultin_page_range() - populate (prefault) page tables inside the + * given range readable/writable * * This takes care of mlocking the pages, too, if VM_LOCKED is set. * - * @vma: target vma + * @mm: the mm to populate page tables in * @start: start address * @end: end address * @write: whether to prefault readable or writable * @locked: whether the mmap_lock is still held * - * Returns either number of processed pages in the vma, or a negative error - * code on error (see __get_user_pages()). + * Returns either number of processed pages in the MM, or a negative error + * code on error (see __get_user_pages()). Note that this function reports + * errors related to VMAs, such as incompatible mappings, as expected by + * MADV_POPULATE_(READ|WRITE). * - * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and - * covered by the VMA. If it's released, *@locked will be set to 0. + * The range must be page-aligned. + * + * mm->mmap_lock must be held. If it's released, *@locked will be set to 0. */ -long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, bool write, int *locked) +long faultin_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool write, int *locked) { - struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* @@ -1725,19 +1741,13 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, * a poisoned page. * !FOLL_FORCE: Require proper access permissions. */ - gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE; + gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE | + FOLL_MADV_POPULATE; if (write) gup_flags |= FOLL_WRITE; - /* - * We want to report -EINVAL instead of -EFAULT for any permission - * problems or incompatible mappings. - */ - if (check_vma_flags(vma, gup_flags)) - return -EINVAL; - - ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, locked); + ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked, + gup_flags); lru_add_drain(); return ret; } diff --git a/mm/internal.h b/mm/internal.h index 7e486f2c502c..07ad2675a88b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -686,9 +686,8 @@ struct anon_vma *folio_anon_vma(struct folio *folio); void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); -extern long faultin_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - bool write, int *locked); +extern long faultin_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); @@ -1127,10 +1126,13 @@ enum { FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, + /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ + FOLL_MADV_POPULATE = 1 << 22, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ - FOLL_FAST_ONLY | FOLL_UNLOCKABLE) + FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ + FOLL_MADV_POPULATE) /* * Indicates for which pages that are write-protected in the page table, diff --git a/mm/madvise.c b/mm/madvise.c index 44a498c94158..1a073fcc4c0c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -908,27 +908,14 @@ static long madvise_populate(struct vm_area_struct *vma, { const bool write = behavior == MADV_POPULATE_WRITE; struct mm_struct *mm = vma->vm_mm; - unsigned long tmp_end; int locked = 1; long pages; *prev = vma; while (start < end) { - /* - * We might have temporarily dropped the lock. For example, - * our VMA might have been split. - */ - if (!vma || start >= vma->vm_end) { - vma = vma_lookup(mm, start); - if (!vma) - return -ENOMEM; - } - - tmp_end = min_t(unsigned long, end, vma->vm_end); /* Populate (prefault) page tables readable/writable. */ - pages = faultin_vma_page_range(vma, start, tmp_end, write, - &locked); + pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; @@ -949,7 +936,7 @@ static long madvise_populate(struct vm_area_struct *vma, pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; - case -ENOMEM: + case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } -- cgit v1.2.3 From c0205eaf3af9f5db14d4b5ee4abacf4a583c3c50 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 4 Apr 2024 10:17:26 -0700 Subject: userfaultfd: change src_folio after ensuring it's unpinned in UFFDIO_MOVE Commit d7a08838ab74 ("mm: userfaultfd: fix unexpected change to src_folio when UFFDIO_MOVE fails") moved the src_folio->{mapping, index} changing to after clearing the page-table and ensuring that it's not pinned. This avoids failure of swapout+migration and possibly memory corruption. However, the commit missed fixing it in the huge-page case. Link: https://lkml.kernel.org/r/20240404171726.2302435-1-lokeshgidra@google.com Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") Signed-off-by: Lokesh Gidra Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Kalesh Singh Cc: Lokesh Gidra Cc: Nicolas Geoffray Cc: Peter Xu Cc: Qi Zheng Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9859aa4f7553..89f58c7603b2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2259,9 +2259,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm goto unlock_ptls; } - folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); - src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); /* Folio got pinned from under us. Put it back and fail the move. */ if (folio_maybe_dma_pinned(src_folio)) { @@ -2270,6 +2267,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm goto unlock_ptls; } + folio_move_anon_rmap(src_folio, dst_vma); + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); -- cgit v1.2.3 From ea4b5b33bf8a4936cfb07759d926db697c43fb1e Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:06:59 +0200 Subject: mm,page_owner: update metadata for tail pages Patch series "page_owner: Fix refcount imbalance and print fixup", v4. This series consists of a refactoring/correctness of updating the metadata of tail pages, a couple of fixups for the refcounting part and a fixup for the stack_start() function. From this series on, instead of counting the stacks, we count the outstanding nr_base_pages each stack has, which gives us a much better memory overview. The other fixup is for the migration part. A more detailed explanation can be found in the changelog of the respective patches. This patch (of 4): __set_page_owner_handle() and __reset_page_owner() update the metadata of all pages when the page is of a higher-order, but we miss to do the same when the pages are migrated. __folio_copy_owner() only updates the metadata of the head page, meaning that the information stored in the first page and the tail pages will not match. Strictly speaking that is not a big problem because 1) we do not print tail pages and 2) upon splitting all tail pages will inherit the metadata of the head page, but it is better to have all metadata in check should there be any problem, so it can ease debugging. For that purpose, a couple of helpers are created __update_page_owner_handle() which updates the metadata on allocation, and __update_page_owner_free_handle() which does the same when the page is freed. __folio_copy_owner() will make use of both as it needs to entirely replace the page_owner metadata for the new page. Link: https://lkml.kernel.org/r/20240404070702.2744-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20240404070702.2744-2-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Tested-by: Kefeng Wang Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Alexandre Ghiti Signed-off-by: Andrew Morton --- mm/page_owner.c | 137 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 74 insertions(+), 63 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index d17d1351ec84..52d1ced0b57f 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -228,9 +228,58 @@ static void dec_stack_record_count(depot_stack_handle_t handle) refcount_dec(&stack_record->count); } -void __reset_page_owner(struct page *page, unsigned short order) +static inline void __update_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned short order, + gfp_t gfp_mask, + short last_migrate_reason, u64 ts_nsec, + pid_t pid, pid_t tgid, char *comm) { int i; + struct page_owner *page_owner; + + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + page_owner->handle = handle; + page_owner->order = order; + page_owner->gfp_mask = gfp_mask; + page_owner->last_migrate_reason = last_migrate_reason; + page_owner->pid = pid; + page_owner->tgid = tgid; + page_owner->ts_nsec = ts_nsec; + strscpy(page_owner->comm, comm, + sizeof(page_owner->comm)); + __set_bit(PAGE_EXT_OWNER, &page_ext->flags); + __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_ext = page_ext_next(page_ext); + } +} + +static inline void __update_page_owner_free_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned short order, + pid_t pid, pid_t tgid, + u64 free_ts_nsec) +{ + int i; + struct page_owner *page_owner; + + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + /* Only __reset_page_owner() wants to clear the bit */ + if (handle) { + __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_owner->free_handle = handle; + } + page_owner->free_ts_nsec = free_ts_nsec; + page_owner->free_pid = current->pid; + page_owner->free_tgid = current->tgid; + page_ext = page_ext_next(page_ext); + } +} + +void __reset_page_owner(struct page *page, unsigned short order) +{ struct page_ext *page_ext; depot_stack_handle_t handle; depot_stack_handle_t alloc_handle; @@ -245,16 +294,10 @@ void __reset_page_owner(struct page *page, unsigned short order) alloc_handle = page_owner->handle; handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); - for (i = 0; i < (1 << order); i++) { - __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - page_owner->free_handle = handle; - page_owner->free_ts_nsec = free_ts_nsec; - page_owner->free_pid = current->pid; - page_owner->free_tgid = current->tgid; - page_ext = page_ext_next(page_ext); - page_owner = get_page_owner(page_ext); - } + __update_page_owner_free_handle(page_ext, handle, order, current->pid, + current->tgid, free_ts_nsec); page_ext_put(page_ext); + if (alloc_handle != early_handle) /* * early_handle is being set as a handle for all those @@ -266,36 +309,11 @@ void __reset_page_owner(struct page *page, unsigned short order) dec_stack_record_count(alloc_handle); } -static inline void __set_page_owner_handle(struct page_ext *page_ext, - depot_stack_handle_t handle, - unsigned short order, gfp_t gfp_mask) -{ - struct page_owner *page_owner; - int i; - u64 ts_nsec = local_clock(); - - for (i = 0; i < (1 << order); i++) { - page_owner = get_page_owner(page_ext); - page_owner->handle = handle; - page_owner->order = order; - page_owner->gfp_mask = gfp_mask; - page_owner->last_migrate_reason = -1; - page_owner->pid = current->pid; - page_owner->tgid = current->tgid; - page_owner->ts_nsec = ts_nsec; - strscpy(page_owner->comm, current->comm, - sizeof(page_owner->comm)); - __set_bit(PAGE_EXT_OWNER, &page_ext->flags); - __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - - page_ext = page_ext_next(page_ext); - } -} - noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { struct page_ext *page_ext; + u64 ts_nsec = local_clock(); depot_stack_handle_t handle; handle = save_stack(gfp_mask); @@ -303,7 +321,9 @@ noinline void __set_page_owner(struct page *page, unsigned short order, page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; - __set_page_owner_handle(page_ext, handle, order, gfp_mask); + __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1, + current->pid, current->tgid, ts_nsec, + current->comm); page_ext_put(page_ext); inc_stack_record_count(handle, gfp_mask); } @@ -342,7 +362,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) { struct page_ext *old_ext; struct page_ext *new_ext; - struct page_owner *old_page_owner, *new_page_owner; + struct page_owner *old_page_owner; old_ext = page_ext_get(&old->page); if (unlikely(!old_ext)) @@ -355,31 +375,21 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) } old_page_owner = get_page_owner(old_ext); - new_page_owner = get_page_owner(new_ext); - new_page_owner->order = old_page_owner->order; - new_page_owner->gfp_mask = old_page_owner->gfp_mask; - new_page_owner->last_migrate_reason = - old_page_owner->last_migrate_reason; - new_page_owner->handle = old_page_owner->handle; - new_page_owner->pid = old_page_owner->pid; - new_page_owner->tgid = old_page_owner->tgid; - new_page_owner->free_pid = old_page_owner->free_pid; - new_page_owner->free_tgid = old_page_owner->free_tgid; - new_page_owner->ts_nsec = old_page_owner->ts_nsec; - new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; - strcpy(new_page_owner->comm, old_page_owner->comm); - + __update_page_owner_handle(new_ext, old_page_owner->handle, + old_page_owner->order, old_page_owner->gfp_mask, + old_page_owner->last_migrate_reason, + old_page_owner->ts_nsec, old_page_owner->pid, + old_page_owner->tgid, old_page_owner->comm); /* - * We don't clear the bit on the old folio as it's going to be freed - * after migration. Until then, the info can be useful in case of - * a bug, and the overall stats will be off a bit only temporarily. - * Also, migrate_misplaced_transhuge_page() can still fail the - * migration and then we want the old folio to retain the info. But - * in that case we also don't need to explicitly clear the info from - * the new page, which will be freed. + * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio + * will be freed after migration. Keep them until then as they may be + * useful. */ - __set_bit(PAGE_EXT_OWNER, &new_ext->flags); - __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + __update_page_owner_free_handle(new_ext, 0, old_page_owner->order, + old_page_owner->free_pid, + old_page_owner->free_tgid, + old_page_owner->free_ts_nsec); + page_ext_put(new_ext); page_ext_put(old_ext); } @@ -787,8 +797,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) goto ext_put_continue; /* Found early allocated page */ - __set_page_owner_handle(page_ext, early_handle, - 0, 0); + __update_page_owner_handle(page_ext, early_handle, 0, 0, + -1, local_clock(), current->pid, + current->tgid, current->comm); count++; ext_put_continue: page_ext_put(page_ext); -- cgit v1.2.3 From f5c12105c15f0ddf0ff37646290568dd986fa2f3 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:07:00 +0200 Subject: mm,page_owner: fix refcount imbalance Current code does not contemplate scenarios were an allocation and free operation on the same pages do not handle it in the same amount at once. To give an example, page_alloc_exact(), where we will allocate a page of enough order to stafisfy the size request, but we will free the remainings right away. In the above example, we will increment the stack_record refcount only once, but we will decrease it the same number of times as number of unused pages we have to free. This will lead to a warning because of refcount imbalance. Fix this by recording the number of base pages in the refcount field. Link: https://lkml.kernel.org/r/20240404070702.2744-3-osalvador@suse.de Reported-by: syzbot+41bbfdb8d41003d12c0f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/00000000000090e8ff0613eda0e5@google.com Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count") Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Tested-by: Alexandre Ghiti Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- Documentation/mm/page_owner.rst | 73 +++++++++++++++++++++-------------------- mm/page_owner.c | 34 +++++++++++-------- 2 files changed, 58 insertions(+), 49 deletions(-) diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index 0d0334cd5179..3a45a20fc05a 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -24,10 +24,10 @@ fragmentation statistics can be obtained through gfp flag information of each page. It is already implemented and activated if page owner is enabled. Other usages are more than welcome. -It can also be used to show all the stacks and their outstanding -allocations, which gives us a quick overview of where the memory is going -without the need to screen through all the pages and match the allocation -and free operation. +It can also be used to show all the stacks and their current number of +allocated base pages, which gives us a quick overview of where the memory +is going without the need to screen through all the pages and match the +allocation and free operation. page owner is disabled by default. So, if you'd like to use it, you need to add "page_owner=on" to your boot cmdline. If the kernel is built @@ -75,42 +75,45 @@ Usage cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt cat stacks.txt - prep_new_page+0xa9/0x120 - get_page_from_freelist+0x7e6/0x2140 - __alloc_pages+0x18a/0x370 - new_slab+0xc8/0x580 - ___slab_alloc+0x1f2/0xaf0 - __slab_alloc.isra.86+0x22/0x40 - kmem_cache_alloc+0x31b/0x350 - __khugepaged_enter+0x39/0x100 - dup_mmap+0x1c7/0x5ce - copy_process+0x1afe/0x1c90 - kernel_clone+0x9a/0x3c0 - __do_sys_clone+0x66/0x90 - do_syscall_64+0x7f/0x160 - entry_SYSCALL_64_after_hwframe+0x6c/0x74 - stack_count: 234 + post_alloc_hook+0x177/0x1a0 + get_page_from_freelist+0xd01/0xd80 + __alloc_pages+0x39e/0x7e0 + allocate_slab+0xbc/0x3f0 + ___slab_alloc+0x528/0x8a0 + kmem_cache_alloc+0x224/0x3b0 + sk_prot_alloc+0x58/0x1a0 + sk_alloc+0x32/0x4f0 + inet_create+0x427/0xb50 + __sock_create+0x2e4/0x650 + inet_ctl_sock_create+0x30/0x180 + igmp_net_init+0xc1/0x130 + ops_init+0x167/0x410 + setup_net+0x304/0xa60 + copy_net_ns+0x29b/0x4a0 + create_new_namespaces+0x4a1/0x820 + nr_base_pages: 16 ... ... echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt cat stacks_7000.txt - prep_new_page+0xa9/0x120 - get_page_from_freelist+0x7e6/0x2140 - __alloc_pages+0x18a/0x370 - alloc_pages_mpol+0xdf/0x1e0 - folio_alloc+0x14/0x50 - filemap_alloc_folio+0xb0/0x100 - page_cache_ra_unbounded+0x97/0x180 - filemap_fault+0x4b4/0x1200 - __do_fault+0x2d/0x110 - do_pte_missing+0x4b0/0xa30 - __handle_mm_fault+0x7fa/0xb70 - handle_mm_fault+0x125/0x300 - do_user_addr_fault+0x3c9/0x840 - exc_page_fault+0x68/0x150 - asm_exc_page_fault+0x22/0x30 - stack_count: 8248 + post_alloc_hook+0x177/0x1a0 + get_page_from_freelist+0xd01/0xd80 + __alloc_pages+0x39e/0x7e0 + alloc_pages_mpol+0x22e/0x490 + folio_alloc+0xd5/0x110 + filemap_alloc_folio+0x78/0x230 + page_cache_ra_order+0x287/0x6f0 + filemap_get_pages+0x517/0x1160 + filemap_read+0x304/0x9f0 + xfs_file_buffered_read+0xe6/0x1d0 [xfs] + xfs_file_read_iter+0x1f0/0x380 [xfs] + __kernel_read+0x3b9/0x730 + kernel_read_file+0x309/0x4d0 + __do_sys_finit_module+0x381/0x730 + do_syscall_64+0x8d/0x150 + entry_SYSCALL_64_after_hwframe+0x62/0x6a + nr_base_pages: 20824 ... cat /sys/kernel/debug/page_owner > page_owner_full.txt diff --git a/mm/page_owner.c b/mm/page_owner.c index 52d1ced0b57f..5df0d6892bdc 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -196,7 +196,8 @@ static void add_stack_record_to_list(struct stack_record *stack_record, spin_unlock_irqrestore(&stack_list_lock, flags); } -static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) +static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask, + int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); @@ -217,15 +218,20 @@ static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) /* Add the new stack_record to our list */ add_stack_record_to_list(stack_record, gfp_mask); } - refcount_inc(&stack_record->count); + refcount_add(nr_base_pages, &stack_record->count); } -static void dec_stack_record_count(depot_stack_handle_t handle) +static void dec_stack_record_count(depot_stack_handle_t handle, + int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); - if (stack_record) - refcount_dec(&stack_record->count); + if (!stack_record) + return; + + if (refcount_sub_and_test(nr_base_pages, &stack_record->count)) + pr_warn("%s: refcount went to 0 for %u handle\n", __func__, + handle); } static inline void __update_page_owner_handle(struct page_ext *page_ext, @@ -306,7 +312,7 @@ void __reset_page_owner(struct page *page, unsigned short order) * the machinery is not ready yet, we cannot decrement * their refcount either. */ - dec_stack_record_count(alloc_handle); + dec_stack_record_count(alloc_handle, 1 << order); } noinline void __set_page_owner(struct page *page, unsigned short order, @@ -325,7 +331,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order, current->pid, current->tgid, ts_nsec, current->comm); page_ext_put(page_ext); - inc_stack_record_count(handle, gfp_mask); + inc_stack_record_count(handle, gfp_mask, 1 << order); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -872,11 +878,11 @@ static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) return stack; } -static unsigned long page_owner_stack_threshold; +static unsigned long page_owner_pages_threshold; static int stack_print(struct seq_file *m, void *v) { - int i, stack_count; + int i, nr_base_pages; struct stack *stack = v; unsigned long *entries; unsigned long nr_entries; @@ -887,14 +893,14 @@ static int stack_print(struct seq_file *m, void *v) nr_entries = stack_record->size; entries = stack_record->entries; - stack_count = refcount_read(&stack_record->count) - 1; + nr_base_pages = refcount_read(&stack_record->count) - 1; - if (stack_count < 1 || stack_count < page_owner_stack_threshold) + if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold) return 0; for (i = 0; i < nr_entries; i++) seq_printf(m, " %pS\n", (void *)entries[i]); - seq_printf(m, "stack_count: %d\n\n", stack_count); + seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); return 0; } @@ -924,13 +930,13 @@ static const struct file_operations page_owner_stack_operations = { static int page_owner_threshold_get(void *data, u64 *val) { - *val = READ_ONCE(page_owner_stack_threshold); + *val = READ_ONCE(page_owner_pages_threshold); return 0; } static int page_owner_threshold_set(void *data, u64 val) { - WRITE_ONCE(page_owner_stack_threshold, val); + WRITE_ONCE(page_owner_pages_threshold, val); return 0; } -- cgit v1.2.3 From 718b1f3373a7999f77e617c17abdcb98a3c001ea Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:07:01 +0200 Subject: mm,page_owner: fix accounting of pages when migrating Upon migration, new allocated pages are being given the handle of the old pages. This is problematic because it means that for the stack which allocated the old page, we will be substracting the old page + the new one when that page is freed, creating an accounting imbalance. There is an interest in keeping it that way, as otherwise the output will biased towards migration stacks should those operations occur often, but that is not really helpful. The link from the new page to the old stack is being performed by calling __update_page_owner_handle() in __folio_copy_owner(). The only thing that is left is to link the migrate stack to the old page, so the old page will be subtracted from the migrate stack, avoiding by doing so any possible imbalance. Link: https://lkml.kernel.org/r/20240404070702.2744-4-osalvador@suse.de Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count") Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- mm/page_owner.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/page_owner.c b/mm/page_owner.c index 5df0d6892bdc..b4476f45b376 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -366,9 +366,12 @@ void __split_page_owner(struct page *page, int old_order, int new_order) void __folio_copy_owner(struct folio *newfolio, struct folio *old) { + int i; struct page_ext *old_ext; struct page_ext *new_ext; struct page_owner *old_page_owner; + struct page_owner *new_page_owner; + depot_stack_handle_t migrate_handle; old_ext = page_ext_get(&old->page); if (unlikely(!old_ext)) @@ -381,6 +384,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) } old_page_owner = get_page_owner(old_ext); + new_page_owner = get_page_owner(new_ext); + migrate_handle = new_page_owner->handle; __update_page_owner_handle(new_ext, old_page_owner->handle, old_page_owner->order, old_page_owner->gfp_mask, old_page_owner->last_migrate_reason, @@ -395,6 +400,16 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) old_page_owner->free_pid, old_page_owner->free_tgid, old_page_owner->free_ts_nsec); + /* + * We linked the original stack to the new folio, we need to do the same + * for the new one and the old folio otherwise there will be an imbalance + * when subtracting those pages from the stack. + */ + for (i = 0; i < (1 << new_page_owner->order); i++) { + old_page_owner->handle = migrate_handle; + old_ext = page_ext_next(old_ext); + old_page_owner = get_page_owner(old_ext); + } page_ext_put(new_ext); page_ext_put(old_ext); -- cgit v1.2.3 From 74017458017127ca6bf14b1f9fda69e03f43389b Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:07:02 +0200 Subject: mm,page_owner: fix printing of stack records When seq_* code sees that its buffer overflowed, it re-allocates a bigger onecand calls seq_operations->start() callback again. stack_start() naively though that if it got called again, it meant that the old record got already printed so it returned the next object, but that is not true. The consequence of that is that every time stack_stop() -> stack_start() get called because we needed a bigger buffer, stack_start() will skip entries, and those will not be printed. Fix it by not advancing to the next object in stack_start(). Link: https://lkml.kernel.org/r/20240404070702.2744-5-osalvador@suse.de Fixes: 765973a09803 ("mm,page_owner: display all stacks and their count") Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- mm/page_owner.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index b4476f45b376..9bef0b442863 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -872,13 +872,11 @@ static void *stack_start(struct seq_file *m, loff_t *ppos) * value of stack_list. */ stack = smp_load_acquire(&stack_list); + m->private = stack; } else { stack = m->private; - stack = stack->next; } - m->private = stack; - return stack; } -- cgit v1.2.3 From c5977c95dff182d6ee06f4d6f60bcb0284912969 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Apr 2024 19:19:20 -0400 Subject: mm/userfaultfd: allow hugetlb change protection upon poison entry After UFFDIO_POISON, there can be two kinds of hugetlb pte markers, either the POISON one or UFFD_WP one. Allow change protection to run on a poisoned marker just like !hugetlb cases, ignoring the marker irrelevant of the permission. Here the two bits are mutual exclusive. For example, when install a poisoned entry it must not be UFFD_WP already (by checking pte_none() before such install). And it also means if UFFD_WP is set there must have no POISON bit set. It makes sense because UFFD_WP is a bit to reflect permission, and permissions do not apply if the pte is poisoned and destined to sigbus. So here we simply check uffd_wp bit set first, do nothing otherwise. Attach the Fixes to UFFDIO_POISON work, as before that it should not be possible to have poison entry for hugetlb (e.g., hugetlb doesn't do swap, so no chance of swapin errors). Link: https://lkml.kernel.org/r/20240405231920.1772199-1-peterx@redhat.com Link: https://lore.kernel.org/r/000000000000920d5e0615602dd1@google.com Fixes: fc71884a5f59 ("mm: userfaultfd: add new UFFDIO_POISON ioctl") Signed-off-by: Peter Xu Reported-by: syzbot+b07c8ac8eee3d4d8440f@syzkaller.appspotmail.com Reviewed-by: David Hildenbrand Reviewed-by: Axel Rasmussen Cc: [6.6+] Signed-off-by: Andrew Morton --- mm/hugetlb.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 23ef240ba48a..31d00eee028f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7044,9 +7044,13 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(is_pte_marker(pte))) { - /* No other markers apply for now. */ - WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); - if (uffd_wp_resolve) + /* + * Do nothing on a poison marker; page is + * corrupted, permissons do not apply. Here + * pte_marker_uffd_wp()==true implies !poison + * because they're mutual exclusive. + */ + if (pte_marker_uffd_wp(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else if (!huge_pte_none(pte)) { -- cgit v1.2.3 From 1983184c22dd84a4d95a71e5c6775c2638557dc7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 7 Apr 2024 16:54:56 +0800 Subject: mm/memory-failure: fix deadlock when hugetlb_optimize_vmemmap is enabled When I did hard offline test with hugetlb pages, below deadlock occurs: ====================================================== WARNING: possible circular locking dependency detected 6.8.0-11409-gf6cef5f8c37f #1 Not tainted ------------------------------------------------------ bash/46904 is trying to acquire lock: ffffffffabe68910 (cpu_hotplug_lock){++++}-{0:0}, at: static_key_slow_dec+0x16/0x60 but task is already holding lock: ffffffffabf92ea8 (pcp_batch_high_lock){+.+.}-{3:3}, at: zone_pcp_disable+0x16/0x40 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (pcp_batch_high_lock){+.+.}-{3:3}: __mutex_lock+0x6c/0x770 page_alloc_cpu_online+0x3c/0x70 cpuhp_invoke_callback+0x397/0x5f0 __cpuhp_invoke_callback_range+0x71/0xe0 _cpu_up+0xeb/0x210 cpu_up+0x91/0xe0 cpuhp_bringup_mask+0x49/0xb0 bringup_nonboot_cpus+0xb7/0xe0 smp_init+0x25/0xa0 kernel_init_freeable+0x15f/0x3e0 kernel_init+0x15/0x1b0 ret_from_fork+0x2f/0x50 ret_from_fork_asm+0x1a/0x30 -> #0 (cpu_hotplug_lock){++++}-{0:0}: __lock_acquire+0x1298/0x1cd0 lock_acquire+0xc0/0x2b0 cpus_read_lock+0x2a/0xc0 static_key_slow_dec+0x16/0x60 __hugetlb_vmemmap_restore_folio+0x1b9/0x200 dissolve_free_huge_page+0x211/0x260 __page_handle_poison+0x45/0xc0 memory_failure+0x65e/0xc70 hard_offline_page_store+0x55/0xa0 kernfs_fop_write_iter+0x12c/0x1d0 vfs_write+0x387/0x550 ksys_write+0x64/0xe0 do_syscall_64+0xca/0x1e0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(pcp_batch_high_lock); lock(cpu_hotplug_lock); lock(pcp_batch_high_lock); rlock(cpu_hotplug_lock); *** DEADLOCK *** 5 locks held by bash/46904: #0: ffff98f6c3bb23f0 (sb_writers#5){.+.+}-{0:0}, at: ksys_write+0x64/0xe0 #1: ffff98f6c328e488 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter+0xf8/0x1d0 #2: ffff98ef83b31890 (kn->active#113){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x100/0x1d0 #3: ffffffffabf9db48 (mf_mutex){+.+.}-{3:3}, at: memory_failure+0x44/0xc70 #4: ffffffffabf92ea8 (pcp_batch_high_lock){+.+.}-{3:3}, at: zone_pcp_disable+0x16/0x40 stack backtrace: CPU: 10 PID: 46904 Comm: bash Kdump: loaded Not tainted 6.8.0-11409-gf6cef5f8c37f #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x68/0xa0 check_noncircular+0x129/0x140 __lock_acquire+0x1298/0x1cd0 lock_acquire+0xc0/0x2b0 cpus_read_lock+0x2a/0xc0 static_key_slow_dec+0x16/0x60 __hugetlb_vmemmap_restore_folio+0x1b9/0x200 dissolve_free_huge_page+0x211/0x260 __page_handle_poison+0x45/0xc0 memory_failure+0x65e/0xc70 hard_offline_page_store+0x55/0xa0 kernfs_fop_write_iter+0x12c/0x1d0 vfs_write+0x387/0x550 ksys_write+0x64/0xe0 do_syscall_64+0xca/0x1e0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7fc862314887 Code: 10 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007fff19311268 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007fc862314887 RDX: 000000000000000c RSI: 000056405645fe10 RDI: 0000000000000001 RBP: 000056405645fe10 R08: 00007fc8623d1460 R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c R13: 00007fc86241b780 R14: 00007fc862417600 R15: 00007fc862416a00 In short, below scene breaks the lock dependency chain: memory_failure __page_handle_poison zone_pcp_disable -- lock(pcp_batch_high_lock) dissolve_free_huge_page __hugetlb_vmemmap_restore_folio static_key_slow_dec cpus_read_lock -- rlock(cpu_hotplug_lock) Fix this by calling drain_all_pages() instead. This issue won't occur until commit a6b40850c442 ("mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key"). As it introduced rlock(cpu_hotplug_lock) in dissolve_free_huge_page() code path while lock(pcp_batch_high_lock) is already in the __page_handle_poison(). [linmiaohe@huawei.com: extend comment per Oscar] [akpm@linux-foundation.org: reflow block comment] Link: https://lkml.kernel.org/r/20240407085456.2798193-1-linmiaohe@huawei.com Fixes: a6b40850c442 ("mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key") Signed-off-by: Miaohe Lin Acked-by: Oscar Salvador Reviewed-by: Jane Chu Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9349948f1abf..9e62a00b46dd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -154,11 +154,23 @@ static int __page_handle_poison(struct page *page) { int ret; - zone_pcp_disable(page_zone(page)); + /* + * zone_pcp_disable() can't be used here. It will + * hold pcp_batch_high_lock and dissolve_free_huge_page() might hold + * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap + * optimization is enabled. This will break current lock dependency + * chain and leads to deadlock. + * Disabling pcp before dissolving the page was a deterministic + * approach because we made sure that those pages cannot end up in any + * PCP list. Draining PCP lists expels those pages to the buddy system, + * but nothing guarantees that those pages do not get back to a PCP + * queue if we need to refill those. + */ ret = dissolve_free_huge_page(page); - if (!ret) + if (!ret) { + drain_all_pages(page_zone(page)); ret = take_page_off_buddy(page); - zone_pcp_enable(page_zone(page)); + } return ret; } -- cgit v1.2.3 From 07a57a338adb6ec9e766d6a6790f76527f45ceb5 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Sun, 7 Apr 2024 15:05:37 +0200 Subject: mm,swapops: update check in is_pfn_swap_entry for hwpoison entries Tony reported that the Machine check recovery was broken in v6.9-rc1, as he was hitting a VM_BUG_ON when injecting uncorrectable memory errors to DRAM. After some more digging and debugging on his side, he realized that this went back to v6.1, with the introduction of 'commit 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry")'. That commit, among other things, introduced swp_offset_pfn(), replacing hwpoison_entry_to_pfn() in its favour. The patch also introduced a VM_BUG_ON() check for is_pfn_swap_entry(), but is_pfn_swap_entry() never got updated to cover hwpoison entries, which means that we would hit the VM_BUG_ON whenever we would call swp_offset_pfn() for such entries on environments with CONFIG_DEBUG_VM set. Fix this by updating the check to cover hwpoison entries as well, and update the comment while we are it. Link: https://lkml.kernel.org/r/20240407130537.16977-1-osalvador@suse.de Fixes: 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry") Signed-off-by: Oscar Salvador Reported-by: Tony Luck Closes: https://lore.kernel.org/all/Zg8kLSl2yAlA3o5D@agluck-desk3/ Tested-by: Tony Luck Reviewed-by: Peter Xu Reviewed-by: David Hildenbrand Acked-by: Miaohe Lin Cc: [6.1.x] Signed-off-by: Andrew Morton --- include/linux/swapops.h | 65 +++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 48b700ba1d18..a5c560a2f8c2 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -390,6 +390,35 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) } #endif /* CONFIG_MIGRATION */ +#ifdef CONFIG_MEMORY_FAILURE + +/* + * Support for hardware poisoned pages + */ +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + BUG_ON(!PageLocked(page)); + return swp_entry(SWP_HWPOISON, page_to_pfn(page)); +} + +static inline int is_hwpoison_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_HWPOISON; +} + +#else + +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + return swp_entry(0, 0); +} + +static inline int is_hwpoison_entry(swp_entry_t swp) +{ + return 0; +} +#endif + typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) @@ -483,8 +512,9 @@ static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) /* * A pfn swap entry is a special type of swap entry that always has a pfn stored - * in the swap offset. They are used to represent unaddressable device memory - * and to restrict access to a page undergoing migration. + * in the swap offset. They can either be used to represent unaddressable device + * memory, to restrict access to a page undergoing migration or to represent a + * pfn which has been hwpoisoned and unmapped. */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { @@ -492,7 +522,7 @@ static inline bool is_pfn_swap_entry(swp_entry_t entry) BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); return is_migration_entry(entry) || is_device_private_entry(entry) || - is_device_exclusive_entry(entry); + is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); } struct page_vma_mapped_walk; @@ -561,35 +591,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd) } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -#ifdef CONFIG_MEMORY_FAILURE - -/* - * Support for hardware poisoned pages - */ -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - BUG_ON(!PageLocked(page)); - return swp_entry(SWP_HWPOISON, page_to_pfn(page)); -} - -static inline int is_hwpoison_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_HWPOISON; -} - -#else - -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - return swp_entry(0, 0); -} - -static inline int is_hwpoison_entry(swp_entry_t swp) -{ - return 0; -} -#endif - static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; -- cgit v1.2.3 From 9253c54e01b6505d348afbc02abaa4d9f8a01395 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 8 Apr 2024 23:02:06 +0100 Subject: Squashfs: check the inode number is not the invalid value of zero Syskiller has produced an out of bounds access in fill_meta_index(). That out of bounds access is ultimately caused because the inode has an inode number with the invalid value of zero, which was not checked. The reason this causes the out of bounds access is due to following sequence of events: 1. Fill_meta_index() is called to allocate (via empty_meta_index()) and fill a metadata index. It however suffers a data read error and aborts, invalidating the newly returned empty metadata index. It does this by setting the inode number of the index to zero, which means unused (zero is not a valid inode number). 2. When fill_meta_index() is subsequently called again on another read operation, locate_meta_index() returns the previous index because it matches the inode number of 0. Because this index has been returned it is expected to have been filled, and because it hasn't been, an out of bounds access is performed. This patch adds a sanity check which checks that the inode number is not zero when the inode is created and returns -EINVAL if it is. [phillip@squashfs.org.uk: whitespace fix] Link: https://lkml.kernel.org/r/20240409204723.446925-1-phillip@squashfs.org.uk Link: https://lkml.kernel.org/r/20240408220206.435788-1-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Reported-by: "Ubisectech Sirius" Closes: https://lore.kernel.org/lkml/87f5c007-b8a5-41ae-8b57-431e924c5915.bugreport@ubisectech.com/ Cc: Christian Brauner Cc: Signed-off-by: Andrew Morton --- fs/squashfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index aa3411354e66..16bd693d0b3a 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -48,6 +48,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, gid_t i_gid; int err; + inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); + if (inode->i_ino == 0) + return -EINVAL; + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid); if (err) return err; @@ -58,7 +62,6 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); - inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0); inode_set_atime(inode, inode_get_mtime_sec(inode), 0); inode_set_ctime(inode, inode_get_mtime_sec(inode), 0); -- cgit v1.2.3 From 0b2cf0a45e06d9538a2371f90150297a87b20eea Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 9 Apr 2024 15:17:15 +0200 Subject: mm,page_owner: defer enablement of static branch Kefeng Wang reported that he was seeing some memory leaks with kmemleak with page_owner enabled. The reason is that we enable the page_owner_inited static branch and then proceed with the linking of stack_list struct to dummy_stack, which means that exists a race window between these two steps where we can have pages already being allocated calling add_stack_record_to_list(), allocating objects and linking them to stack_list, but then we set stack_list pointing to dummy_stack in init_page_owner. Which means that the objects that have been allocated during that time window are unreferenced and lost. Fix this by deferring the enablement of the branch until we have properly set up the list. Link: https://lkml.kernel.org/r/20240409131715.13632-1-osalvador@suse.de Fixes: 4bedfb314bdd ("mm,page_owner: maintain own list of stack_records structs") Signed-off-by: Oscar Salvador Reported-by: Kefeng Wang Closes: https://lore.kernel.org/linux-mm/74b147b0-718d-4d50-be75-d6afc801cd24@huawei.com/ Tested-by: Kefeng Wang Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 9bef0b442863..742f432e5bf0 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -118,7 +118,6 @@ static __init void init_page_owner(void) register_dummy_stack(); register_failure_stack(); register_early_stack(); - static_branch_enable(&page_owner_inited); init_early_allocated_pages(); /* Initialize dummy and failure stacks and link them to stack_list */ dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle); @@ -129,6 +128,7 @@ static __init void init_page_owner(void) refcount_set(&failure_stack.stack_record->count, 1); dummy_stack.next = &failure_stack; stack_list = &dummy_stack; + static_branch_enable(&page_owner_inited); } struct page_ext_operations page_owner_ops = { -- cgit v1.2.3 From 1f737846aa3c45f07a06fa0d018b39e1afb8084a Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Tue, 9 Apr 2024 17:54:07 +0200 Subject: mm/shmem: inline shmem_is_huge() for disabled transparent hugepages In order to minimize code size (CONFIG_CC_OPTIMIZE_FOR_SIZE=y), compiler might choose to make a regular function call (out-of-line) for shmem_is_huge() instead of inlining it. When transparent hugepages are disabled (CONFIG_TRANSPARENT_HUGEPAGE=n), it can cause compilation error. mm/shmem.c: In function `shmem_getattr': ./include/linux/huge_mm.h:383:27: note: in expansion of macro `BUILD_BUG' 383 | #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) | ^~~~~~~~~ mm/shmem.c:1148:33: note: in expansion of macro `HPAGE_PMD_SIZE' 1148 | stat->blksize = HPAGE_PMD_SIZE; To prevent the possible error, always inline shmem_is_huge() when transparent hugepages are disabled. Link: https://lkml.kernel.org/r/20240409155407.2322714-1-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ilya Leoshkevich Cc: Vasily Gorbik Cc: Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 9 +++++++++ mm/shmem.c | 6 ------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a4c15db2f5e5..3fb18f7eb73e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -110,8 +110,17 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags); +#else +static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags) +{ + return false; +} +#endif + #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); #else diff --git a/mm/shmem.c b/mm/shmem.c index 0aad0d9a621b..94ab99b6b574 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -748,12 +748,6 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) -{ - return false; -} - static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { -- cgit v1.2.3 From 35e351780fa9d8240dd6f7e4f245f9ea37e96c19 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 10 Apr 2024 17:14:41 +0800 Subject: fork: defer linking file vma until vma is fully initialized Thorvald reported a WARNING [1]. And the root cause is below race: CPU 1 CPU 2 fork hugetlbfs_fallocate dup_mmap hugetlbfs_punch_hole i_mmap_lock_write(mapping); vma_interval_tree_insert_after -- Child vma is visible through i_mmap tree. i_mmap_unlock_write(mapping); hugetlb_dup_vma_private -- Clear vma_lock outside i_mmap_rwsem! i_mmap_lock_write(mapping); hugetlb_vmdelete_list vma_interval_tree_foreach hugetlb_vma_trylock_write -- Vma_lock is cleared. tmp->vm_ops->open -- Alloc new vma_lock outside i_mmap_rwsem! hugetlb_vma_unlock_write -- Vma_lock is assigned!!! i_mmap_unlock_write(mapping); hugetlb_dup_vma_private() and hugetlb_vm_op_open() are called outside i_mmap_rwsem lock while vma lock can be used in the same time. Fix this by deferring linking file vma until vma is fully initialized. Those vmas should be initialized first before they can be used. Link: https://lkml.kernel.org/r/20240410091441.3539905-1-linmiaohe@huawei.com Fixes: 8d9bfb260814 ("hugetlb: add vma based lock for pmd sharing") Signed-off-by: Miaohe Lin Reported-by: Thorvald Natvig Closes: https://lore.kernel.org/linux-mm/20240129161735.6gmjsswx62o4pbja@revolver/T/ [1] Reviewed-by: Jane Chu Cc: Christian Brauner Cc: Heiko Carstens Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Oleg Nesterov Cc: Peng Zhang Cc: Tycho Andersen Cc: Signed-off-by: Andrew Morton --- kernel/fork.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 39a5046c2f0b..aebb3e6c96dc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -714,6 +714,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; vm_flags_clear(tmp, VM_LOCKED_MASK); + /* + * Copy/update hugetlb private vma information. + */ + if (is_vm_hugetlb_page(tmp)) + hugetlb_dup_vma_private(tmp); + + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ + vma_iter_bulk_store(&vmi, tmp); + + mm->map_count++; + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; @@ -730,25 +747,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, i_mmap_unlock_write(mapping); } - /* - * Copy/update hugetlb private vma information. - */ - if (is_vm_hugetlb_page(tmp)) - hugetlb_dup_vma_private(tmp); - - /* - * Link the vma into the MT. After using __mt_dup(), memory - * allocation is not necessary here, so it cannot fail. - */ - vma_iter_bulk_store(&vmi, tmp); - - mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - if (retval) { mpnt = vma_next(&vmi); goto loop_out; -- cgit v1.2.3 From 8247bf1db92a9697d4ee26db3259d2a1959d1366 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Sat, 13 Apr 2024 03:17:20 +0900 Subject: MAINTAINERS: update Naoya Horiguchi's email address My old NEC address has been removed, so update MAINTAINERS and .mailmap to map it to my gmail address. Link: https://lkml.kernel.org/r/20240412181720.18452-1-nao.horiguchi@gmail.com Signed-off-by: Naoya Horiguchi Acked-by: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton --- .mailmap | 3 ++- MAINTAINERS | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 8284692f9610..625b496bf5f4 100644 --- a/.mailmap +++ b/.mailmap @@ -446,7 +446,8 @@ Mythri P K Nadav Amit Nadav Amit Nadia Yvette Chambers William Lee Irwin III -Naoya Horiguchi +Naoya Horiguchi +Naoya Horiguchi Nathan Chancellor Neeraj Upadhyay Neil Armstrong diff --git a/MAINTAINERS b/MAINTAINERS index c23fda1aa1f0..ee9cc2b40409 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10024,7 +10024,7 @@ F: drivers/media/platform/st/sti/hva HWPOISON MEMORY FAILURE HANDLING M: Miaohe Lin -R: Naoya Horiguchi +R: Naoya Horiguchi L: linux-mm@kvack.org S: Maintained F: mm/hwpoison-inject.c -- cgit v1.2.3 From c4a7dc9523b59b3e73fd522c73e95e072f876b16 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Tue, 16 Apr 2024 03:20:48 +0900 Subject: nilfs2: fix OOB in nilfs_set_de_type The size of the nilfs_type_by_mode array in the fs/nilfs2/dir.c file is defined as "S_IFMT >> S_SHIFT", but the nilfs_set_de_type() function, which uses this array, specifies the index to read from the array in the same way as "(mode & S_IFMT) >> S_SHIFT". static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) { umode_t mode = inode->i_mode; de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; // oob } However, when the index is determined this way, an out-of-bounds (OOB) error occurs by referring to an index that is 1 larger than the array size when the condition "mode & S_IFMT == S_IFMT" is satisfied. Therefore, a patch to resize the nilfs_type_by_mode array should be applied to prevent OOB errors. Link: https://lkml.kernel.org/r/20240415182048.7144-1-konishi.ryusuke@gmail.com Reported-by: syzbot+2e22057de05b9f3b30d8@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2e22057de05b9f3b30d8 Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations") Signed-off-by: Jeongjun Park Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index bc846b904b68..aee40db7a036 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -240,7 +240,7 @@ nilfs_filetype_table[NILFS_FT_MAX] = { #define S_SHIFT 12 static unsigned char -nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { +nilfs_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, -- cgit v1.2.3 From f8bbc07ac535593139c875ffa19af924b1084540 Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Sun, 14 Apr 2024 22:02:46 -0400 Subject: tun: limit printing rate when illegal packet received by tun dev vhost_worker will call tun call backs to receive packets. If too many illegal packets arrives, tun_do_read will keep dumping packet contents. When console is enabled, it will costs much more cpu time to dump packet and soft lockup will be detected. net_ratelimit mechanism can be used to limit the dumping rate. PID: 33036 TASK: ffff949da6f20000 CPU: 23 COMMAND: "vhost-32980" #0 [fffffe00003fce50] crash_nmi_callback at ffffffff89249253 #1 [fffffe00003fce58] nmi_handle at ffffffff89225fa3 #2 [fffffe00003fceb0] default_do_nmi at ffffffff8922642e #3 [fffffe00003fced0] do_nmi at ffffffff8922660d #4 [fffffe00003fcef0] end_repeat_nmi at ffffffff89c01663 [exception RIP: io_serial_in+20] RIP: ffffffff89792594 RSP: ffffa655314979e8 RFLAGS: 00000002 RAX: ffffffff89792500 RBX: ffffffff8af428a0 RCX: 0000000000000000 RDX: 00000000000003fd RSI: 0000000000000005 RDI: ffffffff8af428a0 RBP: 0000000000002710 R8: 0000000000000004 R9: 000000000000000f R10: 0000000000000000 R11: ffffffff8acbf64f R12: 0000000000000020 R13: ffffffff8acbf698 R14: 0000000000000058 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #5 [ffffa655314979e8] io_serial_in at ffffffff89792594 #6 [ffffa655314979e8] wait_for_xmitr at ffffffff89793470 #7 [ffffa65531497a08] serial8250_console_putchar at ffffffff897934f6 #8 [ffffa65531497a20] uart_console_write at ffffffff8978b605 #9 [ffffa65531497a48] serial8250_console_write at ffffffff89796558 #10 [ffffa65531497ac8] console_unlock at ffffffff89316124 #11 [ffffa65531497b10] vprintk_emit at ffffffff89317c07 #12 [ffffa65531497b68] printk at ffffffff89318306 #13 [ffffa65531497bc8] print_hex_dump at ffffffff89650765 #14 [ffffa65531497ca8] tun_do_read at ffffffffc0b06c27 [tun] #15 [ffffa65531497d38] tun_recvmsg at ffffffffc0b06e34 [tun] #16 [ffffa65531497d68] handle_rx at ffffffffc0c5d682 [vhost_net] #17 [ffffa65531497ed0] vhost_worker at ffffffffc0c644dc [vhost] #18 [ffffa65531497f10] kthread at ffffffff892d2e72 #19 [ffffa65531497f50] ret_from_fork at ffffffff89c0022f Fixes: ef3db4a59542 ("tun: avoid BUG, dump packet on GSO errors") Signed-off-by: Lei Chen Reviewed-by: Willem de Bruijn Acked-by: Jason Wang Reviewed-by: Eric Dumazet Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20240415020247.2207781-1-lei.chen@smartx.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 0b3f21cba552..92da8c03d960 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2125,14 +2125,16 @@ static ssize_t tun_put_user(struct tun_struct *tun, tun_is_little_endian(tun), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); - pr_err("unexpected GSO type: " - "0x%x, gso_size %d, hdr_len %d\n", - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), - tun16_to_cpu(tun, gso.hdr_len)); - print_hex_dump(KERN_ERR, "tun: ", - DUMP_PREFIX_NONE, - 16, 1, skb->head, - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + + if (net_ratelimit()) { + netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", + sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), + tun16_to_cpu(tun, gso.hdr_len)); + print_hex_dump(KERN_ERR, "tun: ", + DUMP_PREFIX_NONE, + 16, 1, skb->head, + min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); + } WARN_ON_ONCE(1); return -EINVAL; } -- cgit v1.2.3 From fb1f4584b1215e8c209f6b3a4028ed8351a0e961 Mon Sep 17 00:00:00 2001 From: Chuanhong Guo Date: Tue, 12 Mar 2024 14:29:12 +0800 Subject: USB: serial: option: add support for Fibocom FM650/FG650 Fibocom FM650/FG650 are 5G modems with ECM/NCM/RNDIS/MBIM modes. This patch adds support to all 4 modes. In all 4 modes, the first serial port is the AT console while the other 3 appear to be diagnostic interfaces for dumping modem logs. usb-devices output for all modes: ECM: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 5 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2cb7 ProdID=0a04 Rev=04.04 S: Manufacturer=Fibocom Wireless Inc. S: Product=FG650 Module S: SerialNumber=0123456789ABCDEF C: #Ifs= 5 Cfg#= 1 Atr=c0 MxPwr=504mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=06 Prot=00 Driver=cdc_ether E: Ad=82(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms NCM: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 6 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2cb7 ProdID=0a05 Rev=04.04 S: Manufacturer=Fibocom Wireless Inc. S: Product=FG650 Module S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=c0 MxPwr=504mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0d Prot=00 Driver=cdc_ncm E: Ad=82(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=01 Driver=cdc_ncm E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms RNDIS: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 4 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2cb7 ProdID=0a06 Rev=04.04 S: Manufacturer=Fibocom Wireless Inc. S: Product=FG650 Module S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=c0 MxPwr=504mA I: If#= 0 Alt= 0 #EPs= 1 Cls=e0(wlcon) Sub=01 Prot=03 Driver=rndis_host E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms MBIM: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 7 Spd=5000 MxCh= 0 D: Ver= 3.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=2cb7 ProdID=0a07 Rev=04.04 S: Manufacturer=Fibocom Wireless Inc. S: Product=FG650 Module S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=c0 MxPwr=504mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms Signed-off-by: Chuanhong Guo Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 16cf0ddaae9b..ad0fff1e889d 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2295,6 +2295,10 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a3, 0xff) }, /* Fibocom FM101-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a4, 0xff), /* Fibocom FM101-GL (laptop MBIM) */ .driver_info = RSVD(4) }, + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a04, 0xff) }, /* Fibocom FM650-CN (ECM mode) */ + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a05, 0xff) }, /* Fibocom FM650-CN (NCM mode) */ + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a06, 0xff) }, /* Fibocom FM650-CN (RNDIS mode) */ + { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a07, 0xff) }, /* Fibocom FM650-CN (MBIM mode) */ { USB_DEVICE_INTERFACE_CLASS(0x2df3, 0x9d03, 0xff) }, /* LongSung M5710 */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1404, 0xff) }, /* GosunCn GM500 RNDIS */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1405, 0xff) }, /* GosunCn GM500 MBIM */ -- cgit v1.2.3 From d59cf049c8378677053703e724808836f180888e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Sat, 13 Apr 2024 16:01:39 +0300 Subject: net: dsa: mt7530: fix mirroring frames received on local port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This switch intellectual property provides a bit on the ARL global control register which controls allowing mirroring frames which are received on the local port (monitor port). This bit is unset after reset. This ability must be enabled to fully support the port mirroring feature on this switch intellectual property. Therefore, this patch fixes the traffic not being reflected on a port, which would be configured like below: tc qdisc add dev swp0 clsact tc filter add dev swp0 ingress matchall skip_sw \ action mirred egress mirror dev swp0 As a side note, this configuration provides the hairpinning feature for a single port. Fixes: 37feab6076aa ("net: dsa: mt7530: add support for port mirroring") Signed-off-by: Arınç ÜNAL Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 6 ++++++ drivers/net/dsa/mt7530.h | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index c0d0bce0b594..b84e1845fa02 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2480,6 +2480,9 @@ mt7530_setup(struct dsa_switch *ds) PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT)); } + /* Allow mirroring frames received on the local port (monitor port). */ + mt7530_set(priv, MT753X_AGC, LOCAL_EN); + /* Setup VLAN ID 0 for VLAN-unaware bridges */ ret = mt7530_setup_vlan0(priv); if (ret) @@ -2591,6 +2594,9 @@ mt7531_setup_common(struct dsa_switch *ds) PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT)); } + /* Allow mirroring frames received on the local port (monitor port). */ + mt7530_set(priv, MT753X_AGC, LOCAL_EN); + /* Flush the FDB table */ ret = mt7530_fdb_cmd(priv, MT7530_FDB_FLUSH, NULL); if (ret < 0) diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 585db03c0548..a08053390b28 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -32,6 +32,10 @@ enum mt753x_id { #define SYSC_REG_RSTCTRL 0x34 #define RESET_MCM BIT(2) +/* Register for ARL global control */ +#define MT753X_AGC 0xc +#define LOCAL_EN BIT(7) + /* Registers to mac forward control for unknown frames */ #define MT7530_MFC 0x10 #define BC_FFP(x) (((x) & 0xff) << 24) -- cgit v1.2.3 From 2c606d138518cc69f09c35929abc414a99e3a28f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Sat, 13 Apr 2024 16:01:40 +0300 Subject: net: dsa: mt7530: fix port mirroring for MT7988 SoC switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "MT7988A Wi-Fi 7 Generation Router Platform: Datasheet (Open Version) v0.1" document shows bits 16 to 18 as the MIRROR_PORT field of the CPU forward control register. Currently, the MT7530 DSA subdriver configures bits 0 to 2 of the CPU forward control register which breaks the port mirroring feature for the MT7988 SoC switch. Fix this by using the MT7531_MIRROR_PORT_GET() and MT7531_MIRROR_PORT_SET() macros which utilise the correct bits. Fixes: 110c18bfed41 ("net: dsa: mt7530: introduce driver for MT7988 built-in switch") Signed-off-by: Arınç ÜNAL Acked-by: Daniel Golle Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index b84e1845fa02..8090390edaf9 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1883,14 +1883,16 @@ mt7530_port_vlan_del(struct dsa_switch *ds, int port, static int mt753x_mirror_port_get(unsigned int id, u32 val) { - return (id == ID_MT7531) ? MT7531_MIRROR_PORT_GET(val) : - MIRROR_PORT(val); + return (id == ID_MT7531 || id == ID_MT7988) ? + MT7531_MIRROR_PORT_GET(val) : + MIRROR_PORT(val); } static int mt753x_mirror_port_set(unsigned int id, u32 val) { - return (id == ID_MT7531) ? MT7531_MIRROR_PORT_SET(val) : - MIRROR_PORT(val); + return (id == ID_MT7531 || id == ID_MT7988) ? + MT7531_MIRROR_PORT_SET(val) : + MIRROR_PORT(val); } static int mt753x_port_mirror_add(struct dsa_switch *ds, int port, -- cgit v1.2.3 From 13c785323b36b845300b256d0e5963c3727667d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 17 Apr 2024 11:03:27 +0200 Subject: serial: stm32: Return IRQ_NONE in the ISR if no handling happend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If there is a stuck irq that the handler doesn't address, returning IRQ_HANDLED unconditionally makes it impossible for the irq core to detect the problem and disable the irq. So only return IRQ_HANDLED if an event was handled. A stuck irq is still problematic, but with this change at least it only makes the UART nonfunctional instead of occupying the (usually only) CPU by 100% and so stall the whole machine. Fixes: 48a6092fb41f ("serial: stm32-usart: Add STM32 USART Driver") Cc: stable@vger.kernel.org Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/5f92603d0dfd8a5b8014b2b10a902d91e0bb881f.1713344161.git.u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/stm32-usart.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index 58d169e5c1db..d60cbac69194 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -861,6 +861,7 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) const struct stm32_usart_offsets *ofs = &stm32_port->info->ofs; u32 sr; unsigned int size; + irqreturn_t ret = IRQ_NONE; sr = readl_relaxed(port->membase + ofs->isr); @@ -869,11 +870,14 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) (sr & USART_SR_TC)) { stm32_usart_tc_interrupt_disable(port); stm32_usart_rs485_rts_disable(port); + ret = IRQ_HANDLED; } - if ((sr & USART_SR_RTOF) && ofs->icr != UNDEF_REG) + if ((sr & USART_SR_RTOF) && ofs->icr != UNDEF_REG) { writel_relaxed(USART_ICR_RTOCF, port->membase + ofs->icr); + ret = IRQ_HANDLED; + } if ((sr & USART_SR_WUF) && ofs->icr != UNDEF_REG) { /* Clear wake up flag and disable wake up interrupt */ @@ -882,6 +886,7 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) stm32_usart_clr_bits(port, ofs->cr3, USART_CR3_WUFIE); if (irqd_is_wakeup_set(irq_get_irq_data(port->irq))) pm_wakeup_event(tport->tty->dev, 0); + ret = IRQ_HANDLED; } /* @@ -896,6 +901,7 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) uart_unlock_and_check_sysrq(port); if (size) tty_flip_buffer_push(tport); + ret = IRQ_HANDLED; } } @@ -903,6 +909,7 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) uart_port_lock(port); stm32_usart_transmit_chars(port); uart_port_unlock(port); + ret = IRQ_HANDLED; } /* Receiver timeout irq for DMA RX */ @@ -912,9 +919,10 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) uart_unlock_and_check_sysrq(port); if (size) tty_flip_buffer_push(tport); + ret = IRQ_HANDLED; } - return IRQ_HANDLED; + return ret; } static void stm32_usart_set_mctrl(struct uart_port *port, unsigned int mctrl) -- cgit v1.2.3 From ea2624b5b829b8f93c0dce25721d835969b34faf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 17 Apr 2024 11:03:28 +0200 Subject: serial: stm32: Reset .throttled state in .startup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an UART is opened that still has .throttled set from a previous open, the RX interrupt is enabled but the irq handler doesn't consider it. This easily results in a stuck irq with the effect to occupy the CPU in a tight loop. So reset the throttle state in .startup() to ensure that RX irqs are handled. Fixes: d1ec8a2eabe9 ("serial: stm32: update throttle and unthrottle ops for dma mode") Cc: stable@vger.kernel.org Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/a784f80d3414f7db723b2ec66efc56e1ad666cbf.1713344161.git.u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/stm32-usart.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index d60cbac69194..4fa5a03ebac0 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -1092,6 +1092,7 @@ static int stm32_usart_startup(struct uart_port *port) val |= USART_CR2_SWAP; writel_relaxed(val, port->membase + ofs->cr2); } + stm32_port->throttled = false; /* RX FIFO Flush */ if (ofs->rqr != UNDEF_REG) -- cgit v1.2.3 From e871abcda3b67d0820b4182ebe93435624e9c6a4 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 17 Apr 2024 13:38:29 +0200 Subject: random: handle creditable entropy from atomic process context The entropy accounting changes a static key when the RNG has initialized, since it only ever initializes once. Static key changes, however, cannot be made from atomic context, so depending on where the last creditable entropy comes from, the static key change might need to be deferred to a worker. Previously the code used the execute_in_process_context() helper function, which accounts for whether or not the caller is in_interrupt(). However, that doesn't account for the case where the caller is actually in process context but is holding a spinlock. This turned out to be the case with input_handle_event() in drivers/input/input.c contributing entropy: [] die+0xa8/0x2fc [] bug_handler+0x44/0xec [] brk_handler+0x90/0x144 [] do_debug_exception+0xa0/0x148 [] el1_dbg+0x60/0x7c [] el1h_64_sync_handler+0x38/0x90 [] el1h_64_sync+0x64/0x6c [] __might_resched+0x1fc/0x2e8 [] __might_sleep+0x44/0x7c [] cpus_read_lock+0x1c/0xec [] static_key_enable+0x14/0x38 [] crng_set_ready+0x14/0x28 [] execute_in_process_context+0xb8/0xf8 [] _credit_init_bits+0x118/0x1dc [] add_timer_randomness+0x264/0x270 [] add_input_randomness+0x38/0x48 [] input_handle_event+0x2b8/0x490 [] input_event+0x6c/0x98 According to Guoyong, it's not really possible to refactor the various drivers to never hold a spinlock there. And in_atomic() isn't reliable. So, rather than trying to be too fancy, just punt the change in the static key to a workqueue always. There's basically no drawback of doing this, as the code already needed to account for the static key not changing immediately, and given that it's just an optimization, there's not exactly a hurry to change the static key right away, so deferal is fine. Reported-by: Guoyong Wang Cc: stable@vger.kernel.org Fixes: f5bda35fba61 ("random: use static branch for crng_ready()") Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 456be28ba67c..2597cb43f438 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -702,7 +702,7 @@ static void extract_entropy(void *buf, size_t len) static void __cold _credit_init_bits(size_t bits) { - static struct execute_work set_ready; + static DECLARE_WORK(set_ready, crng_set_ready); unsigned int new, orig, add; unsigned long flags; @@ -718,8 +718,8 @@ static void __cold _credit_init_bits(size_t bits) if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) { crng_reseed(NULL); /* Sets crng_init to CRNG_READY under base_crng.lock. */ - if (static_key_initialized) - execute_in_process_context(crng_set_ready, &set_ready); + if (static_key_initialized && system_unbound_wq) + queue_work(system_unbound_wq, &set_ready); atomic_notifier_call_chain(&random_ready_notifier, 0, NULL); wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); @@ -890,8 +890,8 @@ void __init random_init(void) /* * If we were initialized by the cpu or bootloader before jump labels - * are initialized, then we should enable the static branch here, where - * it's guaranteed that jump labels have been initialized. + * or workqueues are initialized, then we should enable the static + * branch here, where it's guaranteed that these have been initialized. */ if (!static_branch_likely(&crng_is_ready) && crng_init >= CRNG_READY) crng_set_ready(NULL); -- cgit v1.2.3 From 83781384a96b95e2b6403d3c8a002b2c89031770 Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Mon, 15 Apr 2024 15:15:07 +0200 Subject: s390/ism: Properly fix receive message buffer allocation Since [1], dma_alloc_coherent() does not accept requests for GFP_COMP anymore, even on archs that may be able to fulfill this. Functionality that relied on the receive buffer being a compound page broke at that point: The SMC-D protocol, that utilizes the ism device driver, passes receive buffers to the splice processor in a struct splice_pipe_desc with a single entry list of struct pages. As the buffer is no longer a compound page, the splice processor now rejects requests to handle more than a page worth of data. Replace dma_alloc_coherent() and allocate a buffer with folio_alloc and create a DMA map for it with dma_map_page(). Since only receive buffers on ISM devices use DMA, qualify the mapping as FROM_DEVICE. Since ISM devices are available on arch s390, only, and on that arch all DMA is coherent, there is no need to introduce and export some kind of dma_sync_to_cpu() method to be called by the SMC-D protocol layer. Analogously, replace dma_free_coherent by a two step dma_unmap_page, then folio_put to free the receive buffer. [1] https://lore.kernel.org/all/20221113163535.884299-1-hch@lst.de/ Fixes: c08004eede4b ("s390/ism: don't pass bogus GFP_ flags to dma_alloc_coherent") Signed-off-by: Gerd Bayer Signed-off-by: David S. Miller --- drivers/s390/net/ism_drv.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2c8e964425dc..43778b088ffa 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -292,13 +292,16 @@ out: static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); - dma_free_coherent(&ism->pdev->dev, dmb->dmb_len, - dmb->cpu_addr, dmb->dma_addr); + dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, + DMA_FROM_DEVICE); + folio_put(virt_to_folio(dmb->cpu_addr)); } static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { + struct folio *folio; unsigned long bit; + int rc; if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; @@ -315,14 +318,30 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) return -EINVAL; - dmb->cpu_addr = dma_alloc_coherent(&ism->pdev->dev, dmb->dmb_len, - &dmb->dma_addr, - GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!dmb->cpu_addr) - clear_bit(dmb->sba_idx, ism->sba_bitmap); + folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | + __GFP_NORETRY, get_order(dmb->dmb_len)); - return dmb->cpu_addr ? 0 : -ENOMEM; + if (!folio) { + rc = -ENOMEM; + goto out_bit; + } + + dmb->cpu_addr = folio_address(folio); + dmb->dma_addr = dma_map_page(&ism->pdev->dev, + virt_to_page(dmb->cpu_addr), 0, + dmb->dmb_len, DMA_FROM_DEVICE); + if (dma_mapping_error(&ism->pdev->dev, dmb->dma_addr)) { + rc = -ENOMEM; + goto out_free; + } + + return 0; + +out_free: + kfree(dmb->cpu_addr); +out_bit: + clear_bit(dmb->sba_idx, ism->sba_bitmap); + return rc; } int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, -- cgit v1.2.3 From 652ead9b746a63e4e79d7ad66d3edf0a8a5b0c2f Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 4 Apr 2024 11:03:02 +0200 Subject: drm/xe: Fix bo leak in intel_fb_bo_framebuffer_init Add a unreference bo in the error path, to prevent leaking a bo ref. Return 0 on success to clarify the success path. Signed-off-by: Maarten Lankhorst Fixes: 44e694958b95 ("drm/xe/display: Implement display support") Cc: # v6.8+ Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240404090302.68422-1-maarten.lankhorst@linux.intel.com (cherry picked from commit a2f3d731be3893e730417ae3190760fcaffdf549) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/display/intel_fb_bo.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/display/intel_fb_bo.c b/drivers/gpu/drm/xe/display/intel_fb_bo.c index b21da7b745a5..a9c1f9885c6b 100644 --- a/drivers/gpu/drm/xe/display/intel_fb_bo.c +++ b/drivers/gpu/drm/xe/display/intel_fb_bo.c @@ -31,7 +31,7 @@ int intel_fb_bo_framebuffer_init(struct intel_framebuffer *intel_fb, ret = ttm_bo_reserve(&bo->ttm, true, false, NULL); if (ret) - return ret; + goto err; if (!(bo->flags & XE_BO_SCANOUT_BIT)) { /* @@ -42,12 +42,16 @@ int intel_fb_bo_framebuffer_init(struct intel_framebuffer *intel_fb, */ if (XE_IOCTL_DBG(i915, !list_empty(&bo->ttm.base.gpuva.list))) { ttm_bo_unreserve(&bo->ttm); - return -EINVAL; + ret = -EINVAL; + goto err; } bo->flags |= XE_BO_SCANOUT_BIT; } ttm_bo_unreserve(&bo->ttm); + return 0; +err: + xe_bo_put(bo); return ret; } -- cgit v1.2.3 From ca7c52ac7ad384bcf299d89482c45fec7cd00da9 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 12 Apr 2024 12:31:45 +0100 Subject: drm/xe/vm: prevent UAF with asid based lookup The asid is only erased from the xarray when the vm refcount reaches zero, however this leads to potential UAF since the xe_vm_get() only works on a vm with refcount != 0. Since the asid is allocated in the vm create ioctl, rather erase it when closing the vm, prior to dropping the potential last ref. This should also work when user closes driver fd without explicit vm destroy. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1594 Signed-off-by: Matthew Auld Cc: Matthew Brost Cc: # v6.8+ Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240412113144.259426-4-matthew.auld@intel.com (cherry picked from commit 83967c57320d0d01ae512f10e79213f81e4bf594) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_vm.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 62d1ef8867a8..3d4c8f342e21 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1577,6 +1577,16 @@ void xe_vm_close_and_put(struct xe_vm *vm) xe->usm.num_vm_in_fault_mode--; else if (!(vm->flags & XE_VM_FLAG_MIGRATION)) xe->usm.num_vm_in_non_fault_mode--; + + if (vm->usm.asid) { + void *lookup; + + xe_assert(xe, xe->info.has_asid); + xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION)); + + lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid); + xe_assert(xe, lookup == vm); + } mutex_unlock(&xe->usm.lock); for_each_tile(tile, xe, id) @@ -1592,24 +1602,15 @@ static void vm_destroy_work_func(struct work_struct *w) struct xe_device *xe = vm->xe; struct xe_tile *tile; u8 id; - void *lookup; /* xe_vm_close_and_put was not called? */ xe_assert(xe, !vm->size); mutex_destroy(&vm->snap_mutex); - if (!(vm->flags & XE_VM_FLAG_MIGRATION)) { + if (!(vm->flags & XE_VM_FLAG_MIGRATION)) xe_device_mem_access_put(xe); - if (xe->info.has_asid && vm->usm.asid) { - mutex_lock(&xe->usm.lock); - lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid); - xe_assert(xe, lookup == vm); - mutex_unlock(&xe->usm.lock); - } - } - for_each_tile(tile, xe, id) XE_WARN_ON(vm->pt_root[id]); -- cgit v1.2.3 From f609e7b1b49e4d15cf107d2069673ee63860c398 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 10 Apr 2024 09:10:46 -0500 Subject: platform/x86/amd/pmc: Extend Framework 13 quirk to more BIOSes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BIOS 03.05 still hasn't fixed the spurious IRQ1 issue. As it's still being worked on there is still a possibility that it won't need to apply to future BIOS releases. Add a quirk for BIOS 03.05 as well. Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240410141046.433-1-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc-quirks.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index b456370166b6..b4f49720c87f 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -208,6 +208,15 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_BIOS_VERSION, "03.03"), } }, + { + .ident = "Framework Laptop 13 (Phoenix)", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Framework"), + DMI_MATCH(DMI_PRODUCT_NAME, "Laptop 13 (AMD Ryzen 7040Series)"), + DMI_MATCH(DMI_BIOS_VERSION, "03.05"), + } + }, {} }; -- cgit v1.2.3 From ca7c4507ba87e9fc22e0ecfa819c3664b3e8287b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 15 Mar 2024 13:07:53 +0100 Subject: drm/amdgpu: remove invalid resource->start check v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The majority of those where removed in the commit aed01a68047b ("drm/amdgpu: Remove TTM resource->start visible VRAM condition v2") But this one was missed because it's working on the resource and not the BO. Since we also no longer use a fake start address for visible BOs this will now trigger invalid mapping errors. v2: also remove the unused variable Signed-off-by: Christian König Fixes: aed01a68047b ("drm/amdgpu: Remove TTM resource->start visible VRAM condition v2") CC: stable@vger.kernel.org Acked-by: Pierre-Eric Pelloux-Prayer Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index fc418e670fda..6417cb76ccd4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -557,7 +557,6 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_device *bdev, struct ttm_resource *mem) { struct amdgpu_device *adev = amdgpu_ttm_adev(bdev); - size_t bus_size = (size_t)mem->size; switch (mem->mem_type) { case TTM_PL_SYSTEM: @@ -568,9 +567,6 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_device *bdev, break; case TTM_PL_VRAM: mem->bus.offset = mem->start << PAGE_SHIFT; - /* check if it's visible */ - if ((mem->bus.offset + bus_size) > adev->gmc.visible_vram_size) - return -EINVAL; if (adev->mman.aper_base_kaddr && mem->placement & TTM_PL_FLAG_CONTIGUOUS) -- cgit v1.2.3 From 18921b205012568b45760753ad3146ddb9e2d4e2 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 10 Apr 2024 15:52:10 -0400 Subject: drm/amdkfd: Fix memory leak in create_process failure Fix memory leak due to a leaked mmget reference on an error handling code path that is triggered when attempting to create KFD processes while a GPU reset is in progress. Fixes: 0ab2d7532b05 ("drm/amdkfd: prepare per-process debug enable and disable") CC: Xiaogang Chen Signed-off-by: Felix Kuehling Tested-by: Harish Kasiviswanthan Reviewed-by: Mukul Joshi Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 717a60d7a4ea..b79986412cd8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -819,9 +819,9 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) mutex_lock(&kfd_processes_mutex); if (kfd_is_locked()) { - mutex_unlock(&kfd_processes_mutex); pr_debug("KFD is locked! Cannot create process"); - return ERR_PTR(-EINVAL); + process = ERR_PTR(-EINVAL); + goto out; } /* A prior open of /dev/kfd could have already created the process. */ -- cgit v1.2.3 From 91f10a3d21f2313485178d49efef8a3ba02bd8c7 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 29 Mar 2024 18:03:03 -0400 Subject: Revert "drm/amd/display: fix USB-C flag update after enc10 feature init" This reverts commit b5abd7f983e14054593dc91d6df2aa5f8cc67652. This change breaks DSC on 4k monitors at 144Hz over USB-C. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3254 Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher Cc: Muhammad Ahmed Cc: Tom Chung Cc: Charlene Liu Cc: Hamza Mahfooz Cc: Harry Wentland Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c | 8 +++----- drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_link_encoder.c | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c index e224a028d68a..8a0460e86309 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dio_link_encoder.c @@ -248,14 +248,12 @@ void dcn32_link_encoder_construct( enc10->base.hpd_source = init_data->hpd_source; enc10->base.connector = init_data->connector; - enc10->base.preferred_engine = ENGINE_ID_UNKNOWN; - - enc10->base.features = *enc_features; if (enc10->base.connector.id == CONNECTOR_ID_USBC) enc10->base.features.flags.bits.DP_IS_USB_C = 1; - if (enc10->base.connector.id == CONNECTOR_ID_USBC) - enc10->base.features.flags.bits.DP_IS_USB_C = 1; + enc10->base.preferred_engine = ENGINE_ID_UNKNOWN; + + enc10->base.features = *enc_features; enc10->base.transmitter = init_data->transmitter; diff --git a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_link_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_link_encoder.c index 81e349d5835b..da94e5309fba 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_link_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_link_encoder.c @@ -184,6 +184,8 @@ void dcn35_link_encoder_construct( enc10->base.hpd_source = init_data->hpd_source; enc10->base.connector = init_data->connector; + if (enc10->base.connector.id == CONNECTOR_ID_USBC) + enc10->base.features.flags.bits.DP_IS_USB_C = 1; enc10->base.preferred_engine = ENGINE_ID_UNKNOWN; @@ -238,8 +240,6 @@ void dcn35_link_encoder_construct( } enc10->base.features.flags.bits.HDMI_6GB_EN = 1; - if (enc10->base.connector.id == CONNECTOR_ID_USBC) - enc10->base.features.flags.bits.DP_IS_USB_C = 1; if (bp_funcs->get_connector_speed_cap_info) result = bp_funcs->get_connector_speed_cap_info(enc10->base.ctx->dc_bios, -- cgit v1.2.3 From d111855ab7ffffc552f6a475259dc392f2319b6d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 15 Apr 2024 07:52:13 +0200 Subject: s390/mm: Fix NULL pointer dereference The recently added check to figure out if a fault happened on gmap ASCE dereferences the gmap pointer in lowcore without checking that it is not NULL. For all non-KVM processes the pointer is NULL, so that some value from lowcore will be read. With the current layouts of struct gmap and struct lowcore the read value (aka ASCE) is zero, so that this doesn't lead to any observable bug; at least currently. Fix this by adding the missing NULL pointer check. Fixes: 64c3431808bd ("s390/entry: compare gmap asce to determine guest/host fault") Signed-off-by: Sven Schnelle Reviewed-by: Claudio Imbrenda Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/kernel/entry.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 3dc85638bc63..6a1e0fbbaa15 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -340,7 +340,8 @@ SYM_CODE_START(pgm_check_handler) mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK stctg %c1,%c1,__PT_CR1(%r11) #if IS_ENABLED(CONFIG_KVM) - lg %r12,__LC_GMAP + ltg %r12,__LC_GMAP + jz 5f clc __GMAP_ASCE(8,%r12), __PT_CR1(%r11) jne 5f BPENTER __SF_SIE_FLAGS(%r10),_TIF_ISOLATE_BP_GUEST -- cgit v1.2.3 From efefd4f00c967d00ad7abe092554ffbb70c1a793 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:01 +0200 Subject: netfilter: nf_tables: missing iterator type in lookup walk Add missing decorator type to lookup expression and tighten WARN_ON_ONCE check in pipapo to spot earlier that this is unset. Fixes: 29b359cf6d95 ("netfilter: nft_set_pipapo: walk over current view on netlink dump") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_lookup.c | 1 + net/netfilter/nft_set_pipapo.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index a0055f510e31..b314ca728a29 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -216,6 +216,7 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, return 0; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index eeaf05ffba95..0f903d18bbea 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2123,7 +2123,8 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_pipapo_field *f; unsigned int i, r; - WARN_ON_ONCE(iter->type == NFT_ITER_UNSPEC); + WARN_ON_ONCE(iter->type != NFT_ITER_READ && + iter->type != NFT_ITER_UPDATE); rcu_read_lock(); if (iter->type == NFT_ITER_READ) -- cgit v1.2.3 From e79b47a8615d42c68aaeb68971593333667382ed Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:11 +0200 Subject: netfilter: nf_tables: restore set elements when delete set fails From abort path, nft_mapelem_activate() needs to restore refcounters to the original state. Currently, it uses the set->ops->walk() to iterate over these set elements. The existing set iterator skips inactive elements in the next generation, this does not work from the abort path to restore the original state since it has to skip active elements instead (not inactive ones). This patch moves the check for inactive elements to the set iterator callback, then it reverses the logic for the .activate case which needs to skip active elements. Toggle next generation bit for elements when delete set command is invoked and call nft_clear() from .activate (abort) path to restore the next generation bit. The splat below shows an object in mappings memleak: [43929.457523] ------------[ cut here ]------------ [43929.457532] WARNING: CPU: 0 PID: 1139 at include/net/netfilter/nf_tables.h:1237 nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [...] [43929.458014] RIP: 0010:nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458076] Code: 83 f8 01 77 ab 49 8d 7c 24 08 e8 37 5e d0 de 49 8b 6c 24 08 48 8d 7d 50 e8 e9 5c d0 de 8b 45 50 8d 50 ff 89 55 50 85 c0 75 86 <0f> 0b eb 82 0f 0b eb b3 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 [43929.458081] RSP: 0018:ffff888140f9f4b0 EFLAGS: 00010246 [43929.458086] RAX: 0000000000000000 RBX: ffff8881434f5288 RCX: dffffc0000000000 [43929.458090] RDX: 00000000ffffffff RSI: ffffffffa26d28a7 RDI: ffff88810ecc9550 [43929.458093] RBP: ffff88810ecc9500 R08: 0000000000000001 R09: ffffed10281f3e8f [43929.458096] R10: 0000000000000003 R11: ffff0000ffff0000 R12: ffff8881434f52a0 [43929.458100] R13: ffff888140f9f5f4 R14: ffff888151c7a800 R15: 0000000000000002 [43929.458103] FS: 00007f0c687c4740(0000) GS:ffff888390800000(0000) knlGS:0000000000000000 [43929.458107] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [43929.458111] CR2: 00007f58dbe5b008 CR3: 0000000123602005 CR4: 00000000001706f0 [43929.458114] Call Trace: [43929.458118] [43929.458121] ? __warn+0x9f/0x1a0 [43929.458127] ? nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458188] ? report_bug+0x1b1/0x1e0 [43929.458196] ? handle_bug+0x3c/0x70 [43929.458200] ? exc_invalid_op+0x17/0x40 [43929.458211] ? nft_setelem_data_deactivate+0xd7/0xf0 [nf_tables] [43929.458271] ? nft_setelem_data_deactivate+0xe4/0xf0 [nf_tables] [43929.458332] nft_mapelem_deactivate+0x24/0x30 [nf_tables] [43929.458392] nft_rhash_walk+0xdd/0x180 [nf_tables] [43929.458453] ? __pfx_nft_rhash_walk+0x10/0x10 [nf_tables] [43929.458512] ? rb_insert_color+0x2e/0x280 [43929.458520] nft_map_deactivate+0xdc/0x1e0 [nf_tables] [43929.458582] ? __pfx_nft_map_deactivate+0x10/0x10 [nf_tables] [43929.458642] ? __pfx_nft_mapelem_deactivate+0x10/0x10 [nf_tables] [43929.458701] ? __rcu_read_unlock+0x46/0x70 [43929.458709] nft_delset+0xff/0x110 [nf_tables] [43929.458769] nft_flush_table+0x16f/0x460 [nf_tables] [43929.458830] nf_tables_deltable+0x501/0x580 [nf_tables] Fixes: 628bd3e49cba ("netfilter: nf_tables: drop map element references from preparation phase") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 44 ++++++++++++++++++++++++++++++++++++++---- net/netfilter/nft_set_bitmap.c | 4 +--- net/netfilter/nft_set_hash.c | 8 ++------ net/netfilter/nft_set_pipapo.c | 5 +---- net/netfilter/nft_set_rbtree.c | 4 +--- 5 files changed, 45 insertions(+), 20 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a7a34db62ea9..d0c09f899e80 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -594,6 +594,12 @@ static int nft_mapelem_deactivate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, elem_priv); return 0; @@ -617,6 +623,7 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, catchall->elem); break; } @@ -3880,6 +3887,9 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_data *data; int err; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; @@ -3903,17 +3913,20 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { - u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_iter dummy_iter = { + .genmask = nft_genmask_next(ctx->net), + }; struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask)) + if (!nft_set_elem_active(ext, dummy_iter.genmask)) continue; - ret = nft_setelem_validate(ctx, set, NULL, catchall->elem); + ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem); if (ret < 0) return ret; } @@ -5402,6 +5415,11 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + return nft_setelem_data_validate(ctx, set, elem_priv); } @@ -5494,6 +5512,13 @@ static int nft_mapelem_activate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + /* called from abort path, reverse check to undo changes. */ + if (nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, elem_priv); return 0; @@ -5511,6 +5536,7 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, catchall->elem); break; } @@ -5785,6 +5811,9 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; @@ -6635,7 +6664,7 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_setelem_is_catchall(set, elem_priv)) { - nft_set_elem_change_active(net, set, ext); + nft_clear(net, ext); } else { set->ops->activate(net, set, elem_priv); } @@ -7317,8 +7346,12 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_trans *trans; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, sizeof(struct nft_trans_elem), GFP_ATOMIC); if (!trans) @@ -10800,6 +10833,9 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 32df7a16835d..1caa04619dc6 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -172,7 +172,7 @@ static void nft_bitmap_activate(const struct net *net, nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); - nft_set_elem_change_active(net, set, &be->ext); + nft_clear(net, &be->ext); } static void nft_bitmap_flush(const struct net *net, @@ -222,8 +222,6 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, list_for_each_entry_rcu(be, &priv->list, head) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&be->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &be->priv); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6968a3b34236..daa56dda737a 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -199,7 +199,7 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, { struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_rhash_flush(const struct net *net, @@ -286,8 +286,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) @@ -599,7 +597,7 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set, { struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_hash_flush(const struct net *net, @@ -652,8 +650,6 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, hlist_for_each_entry_rcu(he, &priv->table[i], node) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 0f903d18bbea..187138afac45 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1847,7 +1847,7 @@ static void nft_pipapo_activate(const struct net *net, { struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &e->ext); + nft_clear(net, &e->ext); } /** @@ -2149,9 +2149,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, e = f->mt[r].e; - if (!nft_set_elem_active(&e->ext, iter->genmask)) - goto cont; - iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) goto out; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9944fe479e53..b7ea21327549 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -532,7 +532,7 @@ static void nft_rbtree_activate(const struct net *net, { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &rbe->ext); + nft_clear(net, &rbe->ext); } static void nft_rbtree_flush(const struct net *net, @@ -600,8 +600,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&rbe->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); if (iter->err < 0) { -- cgit v1.2.3 From 6fef2d4c00b5b8561ad68dd2b68173f5c6af1e75 Mon Sep 17 00:00:00 2001 From: xinhui pan Date: Thu, 11 Apr 2024 11:11:38 +0800 Subject: drm/amdgpu: validate the parameters of bo mapping operations more clearly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verify the parameters of amdgpu_vm_bo_(map/replace_map/clearing_mappings) in one common place. Fixes: dc54d3d1744d ("drm/amdgpu: implement AMDGPU_VA_OP_CLEAR v2") Cc: stable@vger.kernel.org Reported-by: Vlad Stolyarov Suggested-by: Christian König Signed-off-by: xinhui pan Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 72 ++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 4299ce386322..94089069c9ad 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1613,6 +1613,37 @@ static void amdgpu_vm_bo_insert_map(struct amdgpu_device *adev, trace_amdgpu_vm_bo_map(bo_va, mapping); } +/* Validate operation parameters to prevent potential abuse */ +static int amdgpu_vm_verify_parameters(struct amdgpu_device *adev, + struct amdgpu_bo *bo, + uint64_t saddr, + uint64_t offset, + uint64_t size) +{ + uint64_t tmp, lpfn; + + if (saddr & AMDGPU_GPU_PAGE_MASK + || offset & AMDGPU_GPU_PAGE_MASK + || size & AMDGPU_GPU_PAGE_MASK) + return -EINVAL; + + if (check_add_overflow(saddr, size, &tmp) + || check_add_overflow(offset, size, &tmp) + || size == 0 /* which also leads to end < begin */) + return -EINVAL; + + /* make sure object fit at this offset */ + if (bo && offset + size > amdgpu_bo_size(bo)) + return -EINVAL; + + /* Ensure last pfn not exceed max_pfn */ + lpfn = (saddr + size - 1) >> AMDGPU_GPU_PAGE_SHIFT; + if (lpfn >= adev->vm_manager.max_pfn) + return -EINVAL; + + return 0; +} + /** * amdgpu_vm_bo_map - map bo inside a vm * @@ -1639,21 +1670,14 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev, struct amdgpu_bo *bo = bo_va->base.bo; struct amdgpu_vm *vm = bo_va->base.vm; uint64_t eaddr; + int r; - /* validate the parameters */ - if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK || size & ~PAGE_MASK) - return -EINVAL; - if (saddr + size <= saddr || offset + size <= offset) - return -EINVAL; - - /* make sure object fit at this offset */ - eaddr = saddr + size - 1; - if ((bo && offset + size > amdgpu_bo_size(bo)) || - (eaddr >= adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT)) - return -EINVAL; + r = amdgpu_vm_verify_parameters(adev, bo, saddr, offset, size); + if (r) + return r; saddr /= AMDGPU_GPU_PAGE_SIZE; - eaddr /= AMDGPU_GPU_PAGE_SIZE; + eaddr = saddr + (size - 1) / AMDGPU_GPU_PAGE_SIZE; tmp = amdgpu_vm_it_iter_first(&vm->va, saddr, eaddr); if (tmp) { @@ -1706,17 +1730,9 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev, uint64_t eaddr; int r; - /* validate the parameters */ - if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK || size & ~PAGE_MASK) - return -EINVAL; - if (saddr + size <= saddr || offset + size <= offset) - return -EINVAL; - - /* make sure object fit at this offset */ - eaddr = saddr + size - 1; - if ((bo && offset + size > amdgpu_bo_size(bo)) || - (eaddr >= adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT)) - return -EINVAL; + r = amdgpu_vm_verify_parameters(adev, bo, saddr, offset, size); + if (r) + return r; /* Allocate all the needed memory */ mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); @@ -1730,7 +1746,7 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev, } saddr /= AMDGPU_GPU_PAGE_SIZE; - eaddr /= AMDGPU_GPU_PAGE_SIZE; + eaddr = saddr + (size - 1) / AMDGPU_GPU_PAGE_SIZE; mapping->start = saddr; mapping->last = eaddr; @@ -1817,10 +1833,14 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev, struct amdgpu_bo_va_mapping *before, *after, *tmp, *next; LIST_HEAD(removed); uint64_t eaddr; + int r; + + r = amdgpu_vm_verify_parameters(adev, NULL, saddr, 0, size); + if (r) + return r; - eaddr = saddr + size - 1; saddr /= AMDGPU_GPU_PAGE_SIZE; - eaddr /= AMDGPU_GPU_PAGE_SIZE; + eaddr = saddr + (size - 1) / AMDGPU_GPU_PAGE_SIZE; /* Allocate all the needed memory */ before = kzalloc(sizeof(*before), GFP_KERNEL); -- cgit v1.2.3 From a6ff969fe9cbf369e3cd0ac54261fec1122682ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 4 Apr 2024 16:25:40 +0200 Subject: drm/amdgpu: fix visible VRAM handling during faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we removed the hacky start code check we actually didn't took into account that *all* VRAM pages needs to be CPU accessible. Clean up the code and unify the handling into a single helper which checks if the whole resource is CPU accessible. The only place where a partial check would make sense is during eviction, but that is neglitible. Signed-off-by: Christian König Fixes: aed01a68047b ("drm/amdgpu: Remove TTM resource->start visible VRAM condition v2") Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher CC: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 22 +++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 22 ----------- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 61 +++++++++++++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 3 ++ 5 files changed, 53 insertions(+), 57 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 0a4b09709cfb..ec888fc6ead8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -819,7 +819,7 @@ retry: p->bytes_moved += ctx.bytes_moved; if (!amdgpu_gmc_vram_full_visible(&adev->gmc) && - amdgpu_bo_in_cpu_visible_vram(bo)) + amdgpu_res_cpu_visible(adev, bo->tbo.resource)) p->bytes_moved_vis += ctx.bytes_moved; if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 010b0cb7693c..2099159a693f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -617,8 +617,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev, return r; if (!amdgpu_gmc_vram_full_visible(&adev->gmc) && - bo->tbo.resource->mem_type == TTM_PL_VRAM && - amdgpu_bo_in_cpu_visible_vram(bo)) + amdgpu_res_cpu_visible(adev, bo->tbo.resource)) amdgpu_cs_report_moved_bytes(adev, ctx.bytes_moved, ctx.bytes_moved); else @@ -1272,23 +1271,25 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict) void amdgpu_bo_get_memory(struct amdgpu_bo *bo, struct amdgpu_mem_stats *stats) { + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); + struct ttm_resource *res = bo->tbo.resource; uint64_t size = amdgpu_bo_size(bo); struct drm_gem_object *obj; unsigned int domain; bool shared; /* Abort if the BO doesn't currently have a backing store */ - if (!bo->tbo.resource) + if (!res) return; obj = &bo->tbo.base; shared = drm_gem_object_is_shared_for_memory_stats(obj); - domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); + domain = amdgpu_mem_type_to_domain(res->mem_type); switch (domain) { case AMDGPU_GEM_DOMAIN_VRAM: stats->vram += size; - if (amdgpu_bo_in_cpu_visible_vram(bo)) + if (amdgpu_res_cpu_visible(adev, bo->tbo.resource)) stats->visible_vram += size; if (shared) stats->vram_shared += size; @@ -1389,10 +1390,7 @@ vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo) /* Remember that this BO was accessed by the CPU */ abo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; - if (bo->resource->mem_type != TTM_PL_VRAM) - return 0; - - if (amdgpu_bo_in_cpu_visible_vram(abo)) + if (amdgpu_res_cpu_visible(adev, bo->resource)) return 0; /* Can't move a pinned BO to visible VRAM */ @@ -1415,7 +1413,7 @@ vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo) /* this should never happen */ if (bo->resource->mem_type == TTM_PL_VRAM && - !amdgpu_bo_in_cpu_visible_vram(abo)) + !amdgpu_res_cpu_visible(adev, bo->resource)) return VM_FAULT_SIGBUS; ttm_bo_move_to_lru_tail_unlocked(bo); @@ -1579,6 +1577,7 @@ uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev, */ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) { + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct dma_buf_attachment *attachment; struct dma_buf *dma_buf; const char *placement; @@ -1587,10 +1586,11 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) if (dma_resv_trylock(bo->tbo.base.resv)) { unsigned int domain; + domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); switch (domain) { case AMDGPU_GEM_DOMAIN_VRAM: - if (amdgpu_bo_in_cpu_visible_vram(bo)) + if (amdgpu_res_cpu_visible(adev, bo->tbo.resource)) placement = "VRAM VISIBLE"; else placement = "VRAM"; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index be679c42b0b8..fa03d9e4874c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -250,28 +250,6 @@ static inline u64 amdgpu_bo_mmap_offset(struct amdgpu_bo *bo) return drm_vma_node_offset_addr(&bo->tbo.base.vma_node); } -/** - * amdgpu_bo_in_cpu_visible_vram - check if BO is (partly) in visible VRAM - */ -static inline bool amdgpu_bo_in_cpu_visible_vram(struct amdgpu_bo *bo) -{ - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); - struct amdgpu_res_cursor cursor; - - if (!bo->tbo.resource || bo->tbo.resource->mem_type != TTM_PL_VRAM) - return false; - - amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor); - while (cursor.remaining) { - if (cursor.start < adev->gmc.visible_vram_size) - return true; - - amdgpu_res_next(&cursor, cursor.size); - } - - return false; -} - /** * amdgpu_bo_explicit_sync - return whether the bo is explicitly synced */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 6417cb76ccd4..1d71729e3f6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -133,7 +133,7 @@ static void amdgpu_evict_flags(struct ttm_buffer_object *bo, } else if (!amdgpu_gmc_vram_full_visible(&adev->gmc) && !(abo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) && - amdgpu_bo_in_cpu_visible_vram(abo)) { + amdgpu_res_cpu_visible(adev, bo->resource)) { /* Try evicting to the CPU inaccessible part of VRAM * first, but only set GTT as busy placement, so this @@ -403,40 +403,55 @@ error: return r; } -/* - * amdgpu_mem_visible - Check that memory can be accessed by ttm_bo_move_memcpy +/** + * amdgpu_res_cpu_visible - Check that resource can be accessed by CPU + * @adev: amdgpu device + * @res: the resource to check * - * Called by amdgpu_bo_move() + * Returns: true if the full resource is CPU visible, false otherwise. */ -static bool amdgpu_mem_visible(struct amdgpu_device *adev, - struct ttm_resource *mem) +bool amdgpu_res_cpu_visible(struct amdgpu_device *adev, + struct ttm_resource *res) { - u64 mem_size = (u64)mem->size; struct amdgpu_res_cursor cursor; - u64 end; - if (mem->mem_type == TTM_PL_SYSTEM || - mem->mem_type == TTM_PL_TT) + if (!res) + return false; + + if (res->mem_type == TTM_PL_SYSTEM || res->mem_type == TTM_PL_TT || + res->mem_type == AMDGPU_PL_PREEMPT) return true; - if (mem->mem_type != TTM_PL_VRAM) + + if (res->mem_type != TTM_PL_VRAM) return false; - amdgpu_res_first(mem, 0, mem_size, &cursor); - end = cursor.start + cursor.size; + amdgpu_res_first(res, 0, res->size, &cursor); while (cursor.remaining) { + if ((cursor.start + cursor.size) >= adev->gmc.visible_vram_size) + return false; amdgpu_res_next(&cursor, cursor.size); + } - if (!cursor.remaining) - break; + return true; +} - /* ttm_resource_ioremap only supports contiguous memory */ - if (end != cursor.start) - return false; +/* + * amdgpu_res_copyable - Check that memory can be accessed by ttm_bo_move_memcpy + * + * Called by amdgpu_bo_move() + */ +static bool amdgpu_res_copyable(struct amdgpu_device *adev, + struct ttm_resource *mem) +{ + if (!amdgpu_res_cpu_visible(adev, mem)) + return false; - end = cursor.start + cursor.size; - } + /* ttm_resource_ioremap only supports contiguous memory */ + if (mem->mem_type == TTM_PL_VRAM && + !(mem->placement & TTM_PL_FLAG_CONTIGUOUS)) + return false; - return end <= adev->gmc.visible_vram_size; + return true; } /* @@ -529,8 +544,8 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict, if (r) { /* Check that all memory is CPU accessible */ - if (!amdgpu_mem_visible(adev, old_mem) || - !amdgpu_mem_visible(adev, new_mem)) { + if (!amdgpu_res_copyable(adev, old_mem) || + !amdgpu_res_copyable(adev, new_mem)) { pr_err("Move buffer fallback to memcpy unavailable\n"); return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index 65ec82141a8e..32cf6b6f6efd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -139,6 +139,9 @@ int amdgpu_vram_mgr_reserve_range(struct amdgpu_vram_mgr *mgr, int amdgpu_vram_mgr_query_page_status(struct amdgpu_vram_mgr *mgr, uint64_t start); +bool amdgpu_res_cpu_visible(struct amdgpu_device *adev, + struct ttm_resource *res); + int amdgpu_ttm_init(struct amdgpu_device *adev); void amdgpu_ttm_fini(struct amdgpu_device *adev); void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, -- cgit v1.2.3 From 0ba753bc7e79e49556e81b0d09b2de1aa558553b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sun, 14 Apr 2024 22:06:08 -0400 Subject: drm/radeon: make -fstrict-flex-arrays=3 happy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver parses a union where the layout up through the first array is the same, however, the array has different sizes depending on the elements in the union. Be explicit to fix the UBSAN checker. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3323 Fixes: df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3") Acked-by: Christian König Reviewed-by: Kees Cook Signed-off-by: Alex Deucher Cc: Kees Cook --- drivers/gpu/drm/radeon/radeon_atombios.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c index bb1f0a3371ab..10793a433bf5 100644 --- a/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/drivers/gpu/drm/radeon/radeon_atombios.c @@ -923,8 +923,12 @@ bool radeon_get_atom_connector_info_from_supported_devices_table(struct max_device = ATOM_MAX_SUPPORTED_DEVICE_INFO; for (i = 0; i < max_device; i++) { - ATOM_CONNECTOR_INFO_I2C ci = - supported_devices->info.asConnInfo[i]; + ATOM_CONNECTOR_INFO_I2C ci; + + if (frev > 1) + ci = supported_devices->info_2d1.asConnInfo[i]; + else + ci = supported_devices->info.asConnInfo[i]; bios_connectors[i].valid = false; -- cgit v1.2.3 From 781d41fed19caf900c8405064676813dc9921d32 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 8 Apr 2024 13:30:15 -0400 Subject: drm/radeon: silence UBSAN warning (v3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert a variable sized array from [1] to []. v2: fix up a few more. v3: integrate comments from Kees. Reviewed-by: Kees Cook Tested-by: Jeff Johnson (v2) Acked-by: Christian König (v1) Signed-off-by: Alex Deucher Cc: keescook@chromium.org --- drivers/gpu/drm/radeon/pptable.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/radeon/pptable.h b/drivers/gpu/drm/radeon/pptable.h index 94947229888b..b7f22597ee95 100644 --- a/drivers/gpu/drm/radeon/pptable.h +++ b/drivers/gpu/drm/radeon/pptable.h @@ -424,7 +424,7 @@ typedef struct _ATOM_PPLIB_SUMO_CLOCK_INFO{ typedef struct _ATOM_PPLIB_STATE_V2 { //number of valid dpm levels in this state; Driver uses it to calculate the whole - //size of the state: sizeof(ATOM_PPLIB_STATE_V2) + (ucNumDPMLevels - 1) * sizeof(UCHAR) + //size of the state: struct_size(ATOM_PPLIB_STATE_V2, clockInfoIndex, ucNumDPMLevels) UCHAR ucNumDPMLevels; //a index to the array of nonClockInfos @@ -432,14 +432,14 @@ typedef struct _ATOM_PPLIB_STATE_V2 /** * Driver will read the first ucNumDPMLevels in this array */ - UCHAR clockInfoIndex[1]; + UCHAR clockInfoIndex[] __counted_by(ucNumDPMLevels); } ATOM_PPLIB_STATE_V2; typedef struct _StateArray{ //how many states we have UCHAR ucNumEntries; - ATOM_PPLIB_STATE_V2 states[1]; + ATOM_PPLIB_STATE_V2 states[] __counted_by(ucNumEntries); }StateArray; @@ -450,7 +450,7 @@ typedef struct _ClockInfoArray{ //sizeof(ATOM_PPLIB_CLOCK_INFO) UCHAR ucEntrySize; - UCHAR clockInfo[1]; + UCHAR clockInfo[] __counted_by(ucNumEntries); }ClockInfoArray; typedef struct _NonClockInfoArray{ @@ -460,7 +460,7 @@ typedef struct _NonClockInfoArray{ //sizeof(ATOM_PPLIB_NONCLOCK_INFO) UCHAR ucEntrySize; - ATOM_PPLIB_NONCLOCK_INFO nonClockInfo[1]; + ATOM_PPLIB_NONCLOCK_INFO nonClockInfo[] __counted_by(ucNumEntries); }NonClockInfoArray; typedef struct _ATOM_PPLIB_Clock_Voltage_Dependency_Record -- cgit v1.2.3 From 6376306adde5b252ee7c73572e35d13fb13f6f18 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Mon, 15 Apr 2024 18:15:43 +0200 Subject: x86/retpolines: Enable the default thunk warning only on relevant configs The using-default-thunk warning check makes sense only with configurations which actually enable the special return thunks. Otherwise, it fires on unrelated 32-bit configs on which the special return thunks won't even work (they're 64-bit only) and, what is more, those configs even go off into the weeds when booting in the alternatives patching code, leading to a dead machine. Fixes: 4461438a8405 ("x86/retpoline: Ensure default return thunk isn't used at runtime") Reported-by: Klara Modin Reported-by: Erhard Furtner Signed-off-by: Borislav Petkov (AMD) Tested-by: Klara Modin Link: https://lore.kernel.org/r/78e0d19c-b77a-4169-a80f-2eef91f4a1d6@gmail.com Link: https://lore.kernel.org/r/20240413024956.488d474e@yea --- arch/x86/lib/retpoline.S | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index e674ccf720b9..391059b2c6fb 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -382,8 +382,15 @@ SYM_FUNC_END(call_depth_return_thunk) SYM_CODE_START(__x86_return_thunk) UNWIND_HINT_FUNC ANNOTATE_NOENDBR +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || \ + defined(CONFIG_MITIGATION_SRSO) || \ + defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) ALTERNATIVE __stringify(ANNOTATE_UNRET_SAFE; ret), \ "jmp warn_thunk_thunk", X86_FEATURE_ALWAYS +#else + ANNOTATE_UNRET_SAFE + ret +#endif int3 SYM_CODE_END(__x86_return_thunk) EXPORT_SYMBOL(__x86_return_thunk) -- cgit v1.2.3 From 298b871cd55a607037ac8af0011b9fdeb54c1e65 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 16 Apr 2024 06:44:04 +0900 Subject: bootconfig: Fix the kerneldoc of _xbc_exit() Fix the kerneldoc of _xbc_exit() which is updated to have an @early argument and the function name is changed. Link: https://lore.kernel.org/all/171321744474.599864.13532445969528690358.stgit@devnote2/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404150036.kPJ3HEFA-lkp@intel.com/ Fixes: 89f9a1e876b5 ("bootconfig: use memblock_free_late to free xbc memory to buddy") Signed-off-by: Masami Hiramatsu (Google) --- lib/bootconfig.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 8841554432d5..97f8911ea339 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -901,7 +901,8 @@ static int __init xbc_parse_tree(void) } /** - * xbc_exit() - Clean up all parsed bootconfig + * _xbc_exit() - Clean up all parsed bootconfig + * @early: Set true if this is called before budy system is initialized. * * This clears all data structures of parsed bootconfig on memory. * If you need to reuse xbc_init() with new boot config, you can -- cgit v1.2.3 From 69ffed4b62523bbc85511f150500329d28aba356 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 17 Apr 2024 17:19:13 +0300 Subject: gpiolib: swnode: Remove wrong header inclusion The flags in the software node properties are supposed to be the GPIO lookup flags, which are provided by gpio/machine.h, as the software nodes are the kernel internal thing and doesn't need to rely to any of ABIs. Fixes: e7f9ff5dc90c ("gpiolib: add support for software nodes") Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/property.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/gpio/property.h b/include/linux/gpio/property.h index 6c75c8bd44a0..1a14e239221f 100644 --- a/include/linux/gpio/property.h +++ b/include/linux/gpio/property.h @@ -2,7 +2,6 @@ #ifndef __LINUX_GPIO_PROPERTY_H #define __LINUX_GPIO_PROPERTY_H -#include /* for GPIO_* flags */ #include #define PROPERTY_ENTRY_GPIO(_name_, _chip_node_, _idx_, _flags_) \ -- cgit v1.2.3 From 86a1471d7cde792941109b93b558b5dc078b9ee9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Apr 2024 17:43:21 +0200 Subject: netfilter: nf_tables: fix memleak in map from abort path The delete set command does not rely on the transaction object for element removal, therefore, a combination of delete element + delete set from the abort path could result in restoring twice the refcount of the mapping. Check for inactive element in the next generation for the delete element command in the abort path, skip restoring state if next generation bit has been already cleared. This is similar to the activate logic using the set walk iterator. [ 6170.286929] ------------[ cut here ]------------ [ 6170.286939] WARNING: CPU: 6 PID: 790302 at net/netfilter/nf_tables_api.c:2086 nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.287071] Modules linked in: [...] [ 6170.287633] CPU: 6 PID: 790302 Comm: kworker/6:2 Not tainted 6.9.0-rc3+ #365 [ 6170.287768] RIP: 0010:nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.287886] Code: df 48 8d 7d 58 e8 69 2e 3b df 48 8b 7d 58 e8 80 1b 37 df 48 8d 7d 68 e8 57 2e 3b df 48 8b 7d 68 e8 6e 1b 37 df 48 89 ef eb c4 <0f> 0b 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 0f [ 6170.287895] RSP: 0018:ffff888134b8fd08 EFLAGS: 00010202 [ 6170.287904] RAX: 0000000000000001 RBX: ffff888125bffb28 RCX: dffffc0000000000 [ 6170.287912] RDX: 0000000000000003 RSI: ffffffffa20298ab RDI: ffff88811ebe4750 [ 6170.287919] RBP: ffff88811ebe4700 R08: ffff88838e812650 R09: fffffbfff0623a55 [ 6170.287926] R10: ffffffff8311d2af R11: 0000000000000001 R12: ffff888125bffb10 [ 6170.287933] R13: ffff888125bffb10 R14: dead000000000122 R15: dead000000000100 [ 6170.287940] FS: 0000000000000000(0000) GS:ffff888390b00000(0000) knlGS:0000000000000000 [ 6170.287948] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6170.287955] CR2: 00007fd31fc00710 CR3: 0000000133f60004 CR4: 00000000001706f0 [ 6170.287962] Call Trace: [ 6170.287967] [ 6170.287973] ? __warn+0x9f/0x1a0 [ 6170.287986] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288092] ? report_bug+0x1b1/0x1e0 [ 6170.287986] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288092] ? report_bug+0x1b1/0x1e0 [ 6170.288104] ? handle_bug+0x3c/0x70 [ 6170.288112] ? exc_invalid_op+0x17/0x40 [ 6170.288120] ? asm_exc_invalid_op+0x1a/0x20 [ 6170.288132] ? nf_tables_chain_destroy+0x2b/0x220 [nf_tables] [ 6170.288243] ? nf_tables_chain_destroy+0x1f7/0x220 [nf_tables] [ 6170.288366] ? nf_tables_chain_destroy+0x2b/0x220 [nf_tables] [ 6170.288483] nf_tables_trans_destroy_work+0x588/0x590 [nf_tables] Fixes: 591054469b3e ("netfilter: nf_tables: revisit chain/object refcounting from elements") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d0c09f899e80..167074283ea9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7223,6 +7223,16 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) } } +static int nft_setelem_active_next(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + u8 genmask = nft_genmask_next(net); + + return nft_set_elem_active(ext, genmask); +} + static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) @@ -10644,8 +10654,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; - nft_setelem_data_activate(net, te->set, te->elem_priv); - nft_setelem_activate(net, te->set, te->elem_priv); + if (!nft_setelem_active_next(net, te->set, te->elem_priv)) { + nft_setelem_data_activate(net, te->set, te->elem_priv); + nft_setelem_activate(net, te->set, te->elem_priv); + } if (!nft_setelem_is_catchall(te->set, te->elem_priv)) te->set->ndeact--; -- cgit v1.2.3 From 0f022d32c3eca477fbf79a205243a6123ed0fe11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Apr 2024 18:07:28 -0300 Subject: net/sched: Fix mirred deadlock on device recursion When the mirred action is used on a classful egress qdisc and a packet is mirrored or redirected to self we hit a qdisc lock deadlock. See trace below. [..... other info removed for brevity....] [ 82.890906] [ 82.890906] ============================================ [ 82.890906] WARNING: possible recursive locking detected [ 82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G W [ 82.890906] -------------------------------------------- [ 82.890906] ping/418 is trying to acquire lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] but task is already holding lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] other info that might help us debug this: [ 82.890906] Possible unsafe locking scenario: [ 82.890906] [ 82.890906] CPU0 [ 82.890906] ---- [ 82.890906] lock(&sch->q.lock); [ 82.890906] lock(&sch->q.lock); [ 82.890906] [ 82.890906] *** DEADLOCK *** [ 82.890906] [..... other info removed for brevity....] Example setup (eth0->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 Another example(eth0->eth1->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth1 tc qdisc add dev eth1 root handle 1: htb default 30 tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 We fix this by adding an owner field (CPU id) to struct Qdisc set after root qdisc is entered. When the softirq enters it a second time, if the qdisc owner is the same CPU, the packet is dropped to break the loop. Reported-by: Mingshuai Ren Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/ Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()") Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops") Signed-off-by: Eric Dumazet Reviewed-by: Victor Nogueira Reviewed-by: Pedro Tammela Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240415210728.36949-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 1 + net/core/dev.c | 6 ++++++ net/sched/sch_generic.c | 1 + 3 files changed, 8 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cefe0c4bdae3..41ca14e81d55 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -117,6 +117,7 @@ struct Qdisc { struct qdisc_skb_head q; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; + int owner; unsigned long state; unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; diff --git a/net/core/dev.c b/net/core/dev.c index 984ff8b9d0e1..331848eca7d3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3775,6 +3775,10 @@ no_lock_out: return rc; } + if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { + kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); + return NET_XMIT_DROP; + } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3814,7 +3818,9 @@ no_lock_out: qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { + WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ff5336493777..4a2c763e2d11 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -974,6 +974,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; + sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); -- cgit v1.2.3 From caed8eba221533123192d39cc947f45cbb1e1db5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Apr 2024 08:10:48 -0700 Subject: selftests: kselftest_harness: fix Clang warning about zero-length format Apparently it's more legal to pass the format as NULL, than it is to use an empty string. Clang complains about empty formats: ./../kselftest_harness.h:1207:30: warning: format string is empty [-Wformat-zero-length] 1207 | diagnostic ? "%s" : "", diagnostic); | ^~ 1 warning generated. Reported-by: Sean Christopherson Link: https://lore.kernel.org/all/20240409224256.1581292-1-seanjc@google.com Fixes: 378193eff339 ("selftests: kselftest_harness: let PASS / FAIL provide diagnostic") Tested-by: Sean Christopherson Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240416151048.1682352-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/kselftest.h | 10 ++++++---- tools/testing/selftests/kselftest_harness.h | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 541bf192e30e..4eca3fd1292c 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -288,15 +288,17 @@ void ksft_test_result_code(int exit_code, const char *test_name, } /* Docs seem to call for double space if directive is absent */ - if (!directive[0] && msg[0]) + if (!directive[0] && msg) directive = " # "; - va_start(args, msg); printf("%s %u %s%s", tap_code, ksft_test_num(), test_name, directive); errno = saved_errno; - vprintf(msg, args); + if (msg) { + va_start(args, msg); + vprintf(msg, args); + va_end(args); + } printf("\n"); - va_end(args); } static inline int ksft_exit_pass(void) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 4fd735e48ee7..adb15cae79ab 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -1202,7 +1202,7 @@ void __run_test(struct __fixture_metadata *f, diagnostic = "unknown"; ksft_test_result_code(t->exit_code, test_name, - diagnostic ? "%s" : "", diagnostic); + diagnostic ? "%s" : NULL, diagnostic); } static int test_harness_run(int argc, char **argv) -- cgit v1.2.3 From d362046021ea122309da8c8e0b6850c792ca97b5 Mon Sep 17 00:00:00 2001 From: Vanillan Wang Date: Tue, 16 Apr 2024 20:07:13 +0800 Subject: net:usb:qmi_wwan: support Rolling modules Update the qmi_wwan driver support for the Rolling LTE modules. - VID:PID 33f8:0104, RW101-GL for laptop debug M.2 cards(with RMNET interface for /Linux/Chrome OS) 0x0104: RMNET, diag, at, pipe Here are the outputs of usb-devices: T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=33f8 ProdID=0104 Rev=05.04 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=ba2eb033 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms Signed-off-by: Vanillan Wang Link: https://lore.kernel.org/r/20240416120713.24777-1-vanillanwang@163.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index e2e181378f41..edc34402e787 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1431,6 +1431,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2692, 0x9025, 4)}, /* Cellient MPL200 (rebranded Qualcomm 05c6:9025) */ {QMI_QUIRK_SET_DTR(0x1546, 0x1312, 4)}, /* u-blox LARA-R6 01B */ {QMI_QUIRK_SET_DTR(0x1546, 0x1342, 4)}, /* u-blox LARA-L6 */ + {QMI_QUIRK_SET_DTR(0x33f8, 0x0104, 4)}, /* Rolling RW101 RMNET */ /* 4. Gobi 1000 devices */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */ -- cgit v1.2.3 From 94667949ec3bbb2218c46ad0a0e7274c8832e494 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 16 Apr 2024 10:23:29 +0200 Subject: net: ethernet: mtk_eth_soc: fix WED + wifi reset The WLAN + WED reset sequence relies on being able to receive interrupts from the card, in order to synchronize individual steps with the firmware. When WED is stopped, leave interrupts running and rely on the driver turning off unwanted ones. WED DMA also needs to be disabled before resetting. Fixes: f78cd9c783e0 ("net: ethernet: mtk_wed: update mtk_wed_stop") Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20240416082330.82564-1-nbd@nbd.name Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_wed.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index c895e265ae0e..61334a71058c 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -1074,13 +1074,13 @@ mtk_wed_dma_disable(struct mtk_wed_device *dev) static void mtk_wed_stop(struct mtk_wed_device *dev) { + mtk_wed_dma_disable(dev); mtk_wed_set_ext_int(dev, false); wed_w32(dev, MTK_WED_WPDMA_INT_TRIGGER, 0); wed_w32(dev, MTK_WED_WDMA_INT_TRIGGER, 0); wdma_w32(dev, MTK_WDMA_INT_MASK, 0); wdma_w32(dev, MTK_WDMA_INT_GRP2, 0); - wed_w32(dev, MTK_WED_WPDMA_INT_MASK, 0); if (!mtk_wed_get_rx_capa(dev)) return; @@ -1093,7 +1093,6 @@ static void mtk_wed_deinit(struct mtk_wed_device *dev) { mtk_wed_stop(dev); - mtk_wed_dma_disable(dev); wed_clr(dev, MTK_WED_CTRL, MTK_WED_CTRL_WDMA_INT_AGENT_EN | @@ -2605,9 +2604,6 @@ mtk_wed_irq_get(struct mtk_wed_device *dev, u32 mask) static void mtk_wed_irq_set_mask(struct mtk_wed_device *dev, u32 mask) { - if (!dev->running) - return; - mtk_wed_set_ext_int(dev, !!mask); wed_w32(dev, MTK_WED_INT_MASK, mask); } -- cgit v1.2.3 From f74ab0c5e5947bcb3a400ab73d837974e76fad23 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Thu, 11 Apr 2024 17:18:22 +0800 Subject: ALSA: hda/tas2781: Add new vendor_id and subsystem_id to support ThinkPad ICE-1 Add new vendor_id and subsystem_id to support new Lenovo laptop ThinkPad ICE-1 Signed-off-by: Shenghao Ding Cc: Message-ID: <20240411091823.1644-1-shenghao-ding@ti.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d6940bc4ec39..d4c2c3d6c76c 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10335,6 +10335,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x222e, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x2231, "Thinkpad T560", ALC292_FIXUP_TPT460), SND_PCI_QUIRK(0x17aa, 0x2233, "Thinkpad", ALC292_FIXUP_TPT460), + SND_PCI_QUIRK(0x17aa, 0x2234, "Thinkpad ICE-1", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x2245, "Thinkpad T470", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x2246, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x2247, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), -- cgit v1.2.3 From 0672b017324b1444f12d008a3ba8bc0c6c9384fa Mon Sep 17 00:00:00 2001 From: Vitalii Torshyn Date: Thu, 11 Apr 2024 15:58:03 +0300 Subject: ALSA: hda/realtek: Fixes for Asus GU605M and GA403U sound Added the correct pin table for Asus GU605M and GA403U, enabling all speakers to be controlled with the master. Updated quirks for GU605M and GA403U by including the pin table patch in the chain. Co-developed-by: Luke D. Jones Signed-off-by: Luke D. Jones Signed-off-by: Vitalii Torshyn Message-ID: <20240411125803.18539-1-vitaly.torshyn@gmail.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d4c2c3d6c76c..4cf05a5c3aba 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7467,6 +7467,10 @@ enum { ALC285_FIXUP_CS35L56_I2C_2, ALC285_FIXUP_CS35L56_I2C_4, ALC285_FIXUP_ASUS_GA403U, + ALC285_FIXUP_ASUS_GA403U_HEADSET_MIC, + ALC285_FIXUP_ASUS_GA403U_I2C_SPEAKER2_TO_DAC1, + ALC285_FIXUP_ASUS_GU605_SPI_2_HEADSET_MIC, + ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1 }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -9690,6 +9694,38 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc285_fixup_asus_ga403u, }, + [ALC285_FIXUP_ASUS_GA403U_HEADSET_MIC] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x03a11050 }, + { 0x1b, 0x03a11c30 }, + { } + }, + .chained = true, + .chain_id = ALC285_FIXUP_ASUS_GA403U_I2C_SPEAKER2_TO_DAC1 + }, + [ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_speaker2_to_dac1, + .chained = true, + .chain_id = ALC285_FIXUP_ASUS_GU605_SPI_2_HEADSET_MIC, + }, + [ALC285_FIXUP_ASUS_GU605_SPI_2_HEADSET_MIC] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x03a11050 }, + { 0x1b, 0x03a11c30 }, + { } + }, + .chained = true, + .chain_id = ALC285_FIXUP_CS35L56_SPI_2 + }, + [ALC285_FIXUP_ASUS_GA403U_I2C_SPEAKER2_TO_DAC1] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_speaker2_to_dac1, + .chained = true, + .chain_id = ALC285_FIXUP_ASUS_GA403U, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -10145,7 +10181,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1a83, "ASUS UM5302LA", ALC294_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1a8f, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1b11, "ASUS UX431DA", ALC294_FIXUP_ASUS_COEF_1B), - SND_PCI_QUIRK(0x1043, 0x1b13, "ASUS U41SV/GA403U", ALC285_FIXUP_ASUS_GA403U), + SND_PCI_QUIRK(0x1043, 0x1b13, "ASUS U41SV/GA403U", ALC285_FIXUP_ASUS_GA403U_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1b93, "ASUS G614JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1bbd, "ASUS Z550MA", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1c03, "ASUS UM3406HA", ALC287_FIXUP_CS35L41_I2C_2), @@ -10153,7 +10189,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1c33, "ASUS UX5304MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c43, "ASUS UX8406MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c62, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), - SND_PCI_QUIRK(0x1043, 0x1c63, "ASUS GU605M", ALC285_FIXUP_CS35L56_SPI_2), + SND_PCI_QUIRK(0x1043, 0x1c63, "ASUS GU605M", ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1), SND_PCI_QUIRK(0x1043, 0x1c92, "ASUS ROG Strix G15", ALC285_FIXUP_ASUS_G533Z_PINS), SND_PCI_QUIRK(0x1043, 0x1c9f, "ASUS G614JU/JV/JI", ALC285_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1caf, "ASUS G634JY/JZ/JI/JG", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), -- cgit v1.2.3 From dca5f4dfa925b51becee65031869e917e6229620 Mon Sep 17 00:00:00 2001 From: Huayu Zhang Date: Sat, 13 Apr 2024 19:41:22 +0800 Subject: ALSA: hda/realtek: Fix volumn control of ThinkBook 16P Gen4 change HDA & AMP configuration from ALC287_FIXUP_CS35L41_I2C_2 to ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD for ThinkBook 16P Gen4 models to fix volumn control issue (cannot fully mute). Signed-off-by: Huayu Zhang Fixes: 6214e24cae9b ("ALSA: hda/realtek: Add quirks for Lenovo Thinkbook 16P laptops") Message-ID: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 4cf05a5c3aba..afe1238ca51b 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10433,8 +10433,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3886, "Y780 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38a7, "Y780P AMD YG dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38a8, "Y780P AMD VECO dual", ALC287_FIXUP_TAS2781_I2C), - SND_PCI_QUIRK(0x17aa, 0x38a9, "Thinkbook 16P", ALC287_FIXUP_CS35L41_I2C_2), - SND_PCI_QUIRK(0x17aa, 0x38ab, "Thinkbook 16P", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x17aa, 0x38a9, "Thinkbook 16P", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), + SND_PCI_QUIRK(0x17aa, 0x38ab, "Thinkbook 16P", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), SND_PCI_QUIRK(0x17aa, 0x38b4, "Legion Slim 7 16IRH8", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x38b5, "Legion Slim 7 16IRH8", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x38b6, "Legion Slim 7 16APH8", ALC287_FIXUP_CS35L41_I2C_2), -- cgit v1.2.3 From cf16ffa17c398434a77b8a373e69287c95b60de2 Mon Sep 17 00:00:00 2001 From: Coia Prant Date: Mon, 15 Apr 2024 07:26:25 -0700 Subject: USB: serial: option: add Lonsung U8300/U9300 product Update the USB serial option driver to support Longsung U8300/U9300. For U8300 Interface 4 is used by for QMI interface in stock firmware of U8300, the router which uses U8300 modem. Interface 5 is used by for ADB interface in stock firmware of U8300, the router which uses U8300 modem. Interface mapping is: 0: unknown (Debug), 1: AT (Modem), 2: AT, 3: PPP (NDIS / Pipe), 4: QMI, 5: ADB T: Bus=05 Lev=01 Prnt=03 Port=02 Cnt=01 Dev#= 4 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1c9e ProdID=9b05 Rev=03.18 S: Manufacturer=Android S: Product=Android C: #Ifs= 6 Cfg#= 1 Atr=80 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms For U9300 Interface 1 is used by for ADB interface in stock firmware of U9300, the router which uses U9300 modem. Interface 4 is used by for QMI interface in stock firmware of U9300, the router which uses U9300 modem. Interface mapping is: 0: ADB, 1: AT (Modem), 2: AT, 3: PPP (NDIS / Pipe), 4: QMI Note: Interface 3 of some models of the U9300 series can send AT commands. T: Bus=05 Lev=01 Prnt=05 Port=04 Cnt=01 Dev#= 6 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1c9e ProdID=9b3c Rev=03.18 S: Manufacturer=Android S: Product=Android C: #Ifs= 5 Cfg#= 1 Atr=80 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=03(Int.) MxPS= 8 Ivl=32ms Tested successfully using Modem Manager on U9300. Tested successfully AT commands using If=1, If=2 and If=3 on U9300. Signed-off-by: Coia Prant Reviewed-by: Lars Melin [ johan: drop product defines, trim commit message ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index ad0fff1e889d..8827ca3a6595 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2068,6 +2068,10 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(LONGCHEER_VENDOR_ID, 0x9803, 0xff), .driver_info = RSVD(4) }, + { USB_DEVICE(LONGCHEER_VENDOR_ID, 0x9b05), /* Longsung U8300 */ + .driver_info = RSVD(4) | RSVD(5) }, + { USB_DEVICE(LONGCHEER_VENDOR_ID, 0x9b3c), /* Longsung U9300 */ + .driver_info = RSVD(0) | RSVD(4) }, { USB_DEVICE(LONGCHEER_VENDOR_ID, ZOOM_PRODUCT_4597) }, { USB_DEVICE(LONGCHEER_VENDOR_ID, IBALL_3_5G_CONNECT) }, { USB_DEVICE(HAIER_VENDOR_ID, HAIER_PRODUCT_CE100) }, -- cgit v1.2.3 From 311f97a4c7c22a01f8897bddf00428dfd0668e79 Mon Sep 17 00:00:00 2001 From: Vanillan Wang Date: Tue, 16 Apr 2024 18:02:55 +0800 Subject: USB: serial: option: add Rolling RW101-GL and RW135-GL support Update the USB serial option driver support for the Rolling LTE modules. - VID:PID 33f8:01a2, RW101-GL for laptop debug M.2 cards(with MBIM interface for /Linux/Chrome OS) 0x01a2: mbim, diag, at, pipe - VID:PID 33f8:01a3, RW101-GL for laptop debug M.2 cards(with MBIM interface for /Linux/Chrome OS) 0x01a3: mbim, pipe - VID:PID 33f8:01a4, RW101-GL for laptop debug M.2 cards(with MBIM interface for /Linux/Chrome OS) 0x01a4: mbim, diag, at, pipe - VID:PID 33f8:0104, RW101-GL for laptop debug M.2 cards(with RMNET interface for /Linux/Chrome OS) 0x0104: RMNET, diag, at, pipe - VID:PID 33f8:0115, RW135-GL for laptop debug M.2 cards(with MBIM interface for /Linux/Chrome OS) 0x0115: MBIM, diag, at, pipe Here are the outputs of usb-devices: T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 5 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=33f8 ProdID=01a2 Rev=05.15 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=12345678 C: #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=33f8 ProdID=01a3 Rev=05.15 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=12345678 C: #Ifs= 3 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 17 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=33f8 ProdID=01a4 Rev=05.15 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=12345678 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=33f8 ProdID=0104 Rev=05.04 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=ba2eb033 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 16 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=33f8 ProdID=0115 Rev=05.15 S: Manufacturer=Rolling Wireless S.a.r.l. S: Product=Rolling Module S: SerialNumber=12345678 C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Vanillan Wang Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 8827ca3a6595..e22612017f66 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2307,6 +2307,14 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1404, 0xff) }, /* GosunCn GM500 RNDIS */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1405, 0xff) }, /* GosunCn GM500 MBIM */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1406, 0xff) }, /* GosunCn GM500 ECM/NCM */ + { USB_DEVICE(0x33f8, 0x0104), /* Rolling RW101-GL (laptop RMNET) */ + .driver_info = RSVD(4) | RSVD(5) }, + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a2, 0xff) }, /* Rolling RW101-GL (laptop MBIM) */ + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a3, 0xff) }, /* Rolling RW101-GL (laptop MBIM) */ + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a4, 0xff), /* Rolling RW101-GL (laptop MBIM) */ + .driver_info = RSVD(4) }, + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0115, 0xff), /* Rolling RW135-GL (laptop MBIM) */ + .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(OPPO_VENDOR_ID, OPPO_PRODUCT_R11, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x40) }, -- cgit v1.2.3 From 7caf3daaaf0436fe370834c72c667a97d3671d1a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 17 Apr 2024 17:16:33 +0100 Subject: ALSA: hda/realtek: Add quirks for Huawei Matebook D14 NBLB-WAX9N The headset mic requires a fixup to be properly detected/used. As a reference, this specific model from 2021 reports the following devices: https://alsa-project.org/db/?f=1a5ddeb0b151db8fe051407f5bb1c075b7dd3e4a Signed-off-by: Mauro Carvalho Chehab Cc: Message-ID: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index afe1238ca51b..f71460847492 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10266,6 +10266,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x152d, 0x1082, "Quanta NL3", ALC269_FIXUP_LIFEBOOK), + SND_PCI_QUIRK(0x152d, 0x1262, "Huawei NBLB-WAX9N", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1558, 0x0353, "Clevo V35[05]SN[CDE]Q", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x1323, "Clevo N130ZU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x1325, "Clevo N15[01][CW]U", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), -- cgit v1.2.3 From 32f5f73b79ffdef215e2e1bcb6ad74387c0f925c Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Wed, 17 Apr 2024 10:47:31 -0700 Subject: x86/fred: Fix INT80 emulation for FRED Add a FRED-specific INT80 handler and document why it differs from the current one. Eventually, the common bits will be unified once FRED hw is available and it turns out that no further changes are needed but for now, keep the handlers separate for everyone's sanity's sake. [ bp: Zap duplicated commit message, massage. ] Fixes: 55617fb991df ("x86/entry: Do not allow external 0x80 interrupts") Suggested-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li (Intel) Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240417174731.4189592-1-xin@zytor.com --- arch/x86/entry/common.c | 65 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/entry/entry_fred.c | 2 +- 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 6de50b80702e..51cc9c7cb9bd 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -255,6 +255,71 @@ __visible noinstr void do_int80_emulation(struct pt_regs *regs) instrumentation_end(); syscall_exit_to_user_mode(regs); } + +#ifdef CONFIG_X86_FRED +/* + * A FRED-specific INT80 handler is warranted for the follwing reasons: + * + * 1) As INT instructions and hardware interrupts are separate event + * types, FRED does not preclude the use of vector 0x80 for external + * interrupts. As a result, the FRED setup code does not reserve + * vector 0x80 and calling int80_is_external() is not merely + * suboptimal but actively incorrect: it could cause a system call + * to be incorrectly ignored. + * + * 2) It is called only for handling vector 0x80 of event type + * EVENT_TYPE_SWINT and will never be called to handle any external + * interrupt (event type EVENT_TYPE_EXTINT). + * + * 3) FRED has separate entry flows depending on if the event came from + * user space or kernel space, and because the kernel does not use + * INT insns, the FRED kernel entry handler fred_entry_from_kernel() + * falls through to fred_bad_type() if the event type is + * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling + * an INT insn, it can only be from a user level. + * + * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will + * likely take a different approach if it is ever needed: it + * probably belongs in either fred_intx()/ fred_other() or + * asm_fred_entrypoint_user(), depending on if this ought to be done + * for all entries from userspace or only system + * calls. + * + * 5) INT $0x80 is the fast path for 32-bit system calls under FRED. + */ +DEFINE_FREDENTRY_RAW(int80_emulation) +{ + int nr; + + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* + * FRED pushed 0 into regs::orig_ax and regs::ax contains the + * syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax. + */ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#endif #else /* CONFIG_IA32_EMULATION */ /* Handles int $0x80 on a 32bit kernel */ diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index ac120cbdaaf2..9fa18b8c7f26 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -66,7 +66,7 @@ static noinstr void fred_intx(struct pt_regs *regs) /* INT80 */ case IA32_SYSCALL_VECTOR: if (ia32_enabled()) - return int80_emulation(regs); + return fred_int80_emulation(regs); fallthrough; #endif -- cgit v1.2.3 From a4b37f5033fa812f02f3b7bd1242393d347ba791 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Wed, 17 Apr 2024 19:34:25 +0800 Subject: x86/fred: Fix incorrect error code printout in fred_bad_type() regs->orig_ax has been set to -1 on entry so in the printout, fred_bad_type() should use the passed parameter error_code. Fixes: 14619d912b65 ("x86/fred: FRED entry/exit and dispatch code") Signed-off-by: Hou Wenlong Signed-off-by: Borislav Petkov (AMD) Acked-by: H. Peter Anvin (Intel) Link: https://lore.kernel.org/r/b2a8f0a41449d25240e314a2ddfbf6549511fb04.1713353612.git.houwenlong.hwl@antgroup.com --- arch/x86/entry/entry_fred.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 9fa18b8c7f26..89c1476fcdd9 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -28,9 +28,9 @@ static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code if (regs->fred_cs.sl > 0) { pr_emerg("PANIC: invalid or fatal FRED event; event type %u " "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", - regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + regs->fred_ss.type, regs->fred_ss.vector, error_code, fred_event_data(regs), regs->cs, regs->ip); - die("invalid or fatal FRED event", regs, regs->orig_ax); + die("invalid or fatal FRED event", regs, error_code); panic("invalid or fatal FRED event"); } else { unsigned long flags = oops_begin(); @@ -38,10 +38,10 @@ static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code pr_alert("BUG: invalid or fatal FRED event; event type %u " "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", - regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + regs->fred_ss.type, regs->fred_ss.vector, error_code, fred_event_data(regs), regs->cs, regs->ip); - if (__die("Invalid or fatal FRED event", regs, regs->orig_ax)) + if (__die("Invalid or fatal FRED event", regs, error_code)) sig = 0; oops_end(flags, regs, sig); -- cgit v1.2.3 From def52db470df28d6f43cacbd21137f03b9502073 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:51 +0100 Subject: net: ravb: Count packets instead of descriptors in R-Car RX path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The units of "work done" in the RX path should be packets instead of descriptors. Descriptors which are used by the hardware to record error conditions or are empty in the case of a DMA mapping error should not count towards our RX work budget. Also make the limit variable unsigned as it can never be negative. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Reviewed-by: Niklas Söderlund Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index ba01c8cc3c90..7d231d011bff 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -892,29 +892,24 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) struct ravb_private *priv = netdev_priv(ndev); const struct ravb_hw_info *info = priv->info; int entry = priv->cur_rx[q] % priv->num_rx_ring[q]; - int boguscnt = (priv->dirty_rx[q] + priv->num_rx_ring[q]) - - priv->cur_rx[q]; struct net_device_stats *stats = &priv->stats[q]; struct ravb_ex_rx_desc *desc; + unsigned int limit, i; struct sk_buff *skb; dma_addr_t dma_addr; struct timespec64 ts; + int rx_packets = 0; u8 desc_status; u16 pkt_len; - int limit; - boguscnt = min(boguscnt, *quota); - limit = boguscnt; + limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; desc = &priv->rx_ring[q].ex_desc[entry]; - while (desc->die_dt != DT_FEMPTY) { + for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; pkt_len = le16_to_cpu(desc->ds_cc) & RX_DS; - if (--boguscnt < 0) - break; - /* We use 0-byte descriptors to mark the DMA mapping errors */ if (!pkt_len) continue; @@ -960,7 +955,7 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum(skb); napi_gro_receive(&priv->napi[q], skb); - stats->rx_packets++; + rx_packets++; stats->rx_bytes += pkt_len; } @@ -995,9 +990,9 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) desc->die_dt = DT_FEMPTY; } - *quota -= limit - (++boguscnt); - - return boguscnt <= 0; + stats->rx_packets += rx_packets; + *quota -= rx_packets; + return *quota == 0; } /* Packet receive function for Ethernet AVB */ -- cgit v1.2.3 From a892493a343494bd6bab9d098593932077ff3c43 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:52 +0100 Subject: net: ravb: Allow RX loop to move past DMA mapping errors The RX loops in ravb_rx_gbeth() and ravb_rx_rcar() skip to the next loop iteration if a zero-length descriptor is seen (indicating a DMA mapping error). However, the current RX descriptor index `priv->cur_rx[q]` was incremented at the end of the loop and so would not be incremented when we skip to the next loop iteration. This would cause the loop to keep seeing the same zero-length descriptor instead of moving on to the next descriptor. As the loop counter `i` still increments, the loop would eventually terminate so there is no risk of being stuck here forever - but we should still fix this to avoid wasting cycles. To fix this, the RX descriptor index is incremented at the top of the loop, in the for statement itself. The assignments of `entry` and `desc` are brought into the loop to avoid the need for duplication. Fixes: d8b48911fd24 ("ravb: fix ring memory allocation") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 7d231d011bff..3b870926af14 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -775,12 +775,15 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) int limit; int i; - entry = priv->cur_rx[q] % priv->num_rx_ring[q]; limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; stats = &priv->stats[q]; - desc = &priv->rx_ring[q].desc[entry]; - for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { + for (i = 0; i < limit; i++, priv->cur_rx[q]++) { + entry = priv->cur_rx[q] % priv->num_rx_ring[q]; + desc = &priv->rx_ring[q].desc[entry]; + if (rx_packets == *quota || desc->die_dt == DT_FEMPTY) + break; + /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; @@ -848,9 +851,6 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) break; } } - - entry = (++priv->cur_rx[q]) % priv->num_rx_ring[q]; - desc = &priv->rx_ring[q].desc[entry]; } /* Refill the RX ring buffers. */ @@ -891,7 +891,6 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) { struct ravb_private *priv = netdev_priv(ndev); const struct ravb_hw_info *info = priv->info; - int entry = priv->cur_rx[q] % priv->num_rx_ring[q]; struct net_device_stats *stats = &priv->stats[q]; struct ravb_ex_rx_desc *desc; unsigned int limit, i; @@ -901,10 +900,15 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) int rx_packets = 0; u8 desc_status; u16 pkt_len; + int entry; limit = priv->dirty_rx[q] + priv->num_rx_ring[q] - priv->cur_rx[q]; - desc = &priv->rx_ring[q].ex_desc[entry]; - for (i = 0; i < limit && rx_packets < *quota && desc->die_dt != DT_FEMPTY; i++) { + for (i = 0; i < limit; i++, priv->cur_rx[q]++) { + entry = priv->cur_rx[q] % priv->num_rx_ring[q]; + desc = &priv->rx_ring[q].ex_desc[entry]; + if (rx_packets == *quota || desc->die_dt == DT_FEMPTY) + break; + /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; @@ -958,9 +962,6 @@ static bool ravb_rx_rcar(struct net_device *ndev, int *quota, int q) rx_packets++; stats->rx_bytes += pkt_len; } - - entry = (++priv->cur_rx[q]) % priv->num_rx_ring[q]; - desc = &priv->rx_ring[q].ex_desc[entry]; } /* Refill the RX ring buffers. */ -- cgit v1.2.3 From c7c449502b51c5b5de79f97a42be750b28f6ecee Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:53 +0100 Subject: net: ravb: Fix GbEth jumbo packet RX checksum handling Sending a 7kB ping packet to the RZ/G2L in v6.9-rc2 causes the following backtrace: WARNING: CPU: 0 PID: 0 at include/linux/skbuff.h:3127 skb_trim+0x30/0x38 Hardware name: Renesas SMARC EVK based on r9a07g044l2 (DT) pc : skb_trim+0x30/0x38 lr : ravb_rx_csum_gbeth+0x40/0x90 Call trace: skb_trim+0x30/0x38 ravb_rx_gbeth+0x56c/0x5cc ravb_poll+0xa0/0x204 __napi_poll+0x38/0x17c This is caused by ravb_rx_gbeth() calling ravb_rx_csum_gbeth() with the wrong skb for a packet which spans multiple descriptors. To fix this, use the correct skb. Fixes: c2da9408579d ("ravb: Add Rx checksum offload support for GbEth") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 3b870926af14..6969cdeeb67a 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -843,7 +843,7 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) priv->rx_1st_skb->protocol = eth_type_trans(priv->rx_1st_skb, ndev); if (ndev->features & NETIF_F_RXCSUM) - ravb_rx_csum_gbeth(skb); + ravb_rx_csum_gbeth(priv->rx_1st_skb); napi_gro_receive(&priv->napi[q], priv->rx_1st_skb); rx_packets++; -- cgit v1.2.3 From 2e36c9fbc476f95a1b19e3fa0a2cdf408475ff56 Mon Sep 17 00:00:00 2001 From: Paul Barker Date: Tue, 16 Apr 2024 13:02:54 +0100 Subject: net: ravb: Fix RX byte accounting for jumbo packets The RX byte accounting for jumbo packets was changed to fix a potential use-after-free bug. However, that fix used the wrong variable and so only accounted for the number of bytes in the final descriptor, not the number of bytes in the whole packet. To fix this, we can simply update our stats with the correct number of bytes before calling napi_gro_receive(). Also rename pkt_len to desc_len in ravb_rx_gbeth() to avoid any future confusion. The variable name pkt_len is correct in ravb_rx_rcar() as that function does not handle packets spanning multiple descriptors. Fixes: 5a5a3e564de6 ("ravb: Fix potential use-after-free in ravb_rx_gbeth()") Signed-off-by: Paul Barker Reviewed-by: Sergey Shtylyov Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 6969cdeeb67a..fcb756d77681 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -769,7 +769,7 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) dma_addr_t dma_addr; int rx_packets = 0; u8 desc_status; - u16 pkt_len; + u16 desc_len; u8 die_dt; int entry; int limit; @@ -787,10 +787,10 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) /* Descriptor type must be checked before all other reads */ dma_rmb(); desc_status = desc->msc; - pkt_len = le16_to_cpu(desc->ds_cc) & RX_DS; + desc_len = le16_to_cpu(desc->ds_cc) & RX_DS; /* We use 0-byte descriptors to mark the DMA mapping errors */ - if (!pkt_len) + if (!desc_len) continue; if (desc_status & MSC_MC) @@ -811,25 +811,25 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) switch (die_dt) { case DT_FSINGLE: skb = ravb_get_skb_gbeth(ndev, entry, desc); - skb_put(skb, pkt_len); + skb_put(skb, desc_len); skb->protocol = eth_type_trans(skb, ndev); if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum_gbeth(skb); napi_gro_receive(&priv->napi[q], skb); rx_packets++; - stats->rx_bytes += pkt_len; + stats->rx_bytes += desc_len; break; case DT_FSTART: priv->rx_1st_skb = ravb_get_skb_gbeth(ndev, entry, desc); - skb_put(priv->rx_1st_skb, pkt_len); + skb_put(priv->rx_1st_skb, desc_len); break; case DT_FMID: skb = ravb_get_skb_gbeth(ndev, entry, desc); skb_copy_to_linear_data_offset(priv->rx_1st_skb, priv->rx_1st_skb->len, skb->data, - pkt_len); - skb_put(priv->rx_1st_skb, pkt_len); + desc_len); + skb_put(priv->rx_1st_skb, desc_len); dev_kfree_skb(skb); break; case DT_FEND: @@ -837,17 +837,17 @@ static bool ravb_rx_gbeth(struct net_device *ndev, int *quota, int q) skb_copy_to_linear_data_offset(priv->rx_1st_skb, priv->rx_1st_skb->len, skb->data, - pkt_len); - skb_put(priv->rx_1st_skb, pkt_len); + desc_len); + skb_put(priv->rx_1st_skb, desc_len); dev_kfree_skb(skb); priv->rx_1st_skb->protocol = eth_type_trans(priv->rx_1st_skb, ndev); if (ndev->features & NETIF_F_RXCSUM) ravb_rx_csum_gbeth(priv->rx_1st_skb); + stats->rx_bytes += priv->rx_1st_skb->len; napi_gro_receive(&priv->napi[q], priv->rx_1st_skb); rx_packets++; - stats->rx_bytes += pkt_len; break; } } -- cgit v1.2.3 From 3aadf100f93d80815685493d60cd8cab206403df Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 18 Apr 2024 13:45:17 +0200 Subject: Revert "vmgenid: emit uevent when VMGENID updates" This reverts commit ad6bcdad2b6724e113f191a12f859a9e8456b26d. I had nak'd it, and Greg said on the thread that it links that he wasn't going to take it either, especially since it's not his code or his tree, but then, seemingly accidentally, it got pushed up some months later, in what looks like a mistake, with no further discussion in the linked thread. So revert it, since it's clearly not intended. Fixes: ad6bcdad2b67 ("vmgenid: emit uevent when VMGENID updates") Cc: stable@vger.kernel.org Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230531095119.11202-2-bchalios@amazon.es Signed-off-by: Jason A. Donenfeld --- drivers/virt/vmgenid.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/virt/vmgenid.c b/drivers/virt/vmgenid.c index b67a28da4702..a1c467a0e9f7 100644 --- a/drivers/virt/vmgenid.c +++ b/drivers/virt/vmgenid.c @@ -68,7 +68,6 @@ out: static void vmgenid_notify(struct acpi_device *device, u32 event) { struct vmgenid_state *state = acpi_driver_data(device); - char *envp[] = { "NEW_VMGENID=1", NULL }; u8 old_id[VMGENID_SIZE]; memcpy(old_id, state->this_id, sizeof(old_id)); @@ -76,7 +75,6 @@ static void vmgenid_notify(struct acpi_device *device, u32 event) if (!memcmp(old_id, state->this_id, sizeof(old_id))) return; add_vmfork_randomness(state->this_id, sizeof(state->this_id)); - kobject_uevent_env(&device->dev.kobj, KOBJ_CHANGE, envp); } static const struct acpi_device_id vmgenid_ids[] = { -- cgit v1.2.3 From 1607830dadeefc407e4956336d9fcd9e9defd810 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 18 Apr 2024 16:33:28 +0200 Subject: Revert "usb: cdc-wdm: close race between read and workqueue" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 339f83612f3a569b194680768b22bf113c26a29d. It has been found to cause problems in a number of Chromebook devices, so revert the change until it can be brought back in a safe way. Link: https://lore.kernel.org/r/385a3519-b45d-48c5-a6fd-a3fdb6bec92f@chromium.org Reported-by:: Aleksander Morgado Fixes: 339f83612f3a ("usb: cdc-wdm: close race between read and workqueue") Cc: stable Cc: Oliver Neukum Cc: Bjørn Mork Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index c8262e2f2917..c553decb5461 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -485,7 +485,6 @@ out_free_mem: static int service_outstanding_interrupt(struct wdm_device *desc) { int rv = 0; - int used; /* submit read urb only if the device is waiting for it */ if (!desc->resp_count || !--desc->resp_count) @@ -500,10 +499,7 @@ static int service_outstanding_interrupt(struct wdm_device *desc) goto out; } - used = test_and_set_bit(WDM_RESPONDING, &desc->flags); - if (used) - goto out; - + set_bit(WDM_RESPONDING, &desc->flags); spin_unlock_irq(&desc->iuspin); rv = usb_submit_urb(desc->response, GFP_KERNEL); spin_lock_irq(&desc->iuspin); -- cgit v1.2.3 From f2e0eee4703869dc5edb5302a919861566ca7797 Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Tue, 16 Apr 2024 01:23:07 +0000 Subject: usb: dwc3: ep0: Don't reset resource alloc flag The DWC3_EP_RESOURCE_ALLOCATED flag ensures that the resource of an endpoint is only assigned once. Unless the endpoint is reset, don't clear this flag. Otherwise we may set endpoint resource again, which prevents the driver from initiate transfer after handling a STALL or endpoint halt to the control endpoint. Cc: stable@vger.kernel.org Fixes: b311048c174d ("usb: dwc3: gadget: Rewrite endpoint allocation flow") Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/00122b7cc5be06abef461776e7cc9f5ebc8bc1cb.1713229786.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/ep0.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index 72bb722da2f2..d96ffbe52039 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -226,7 +226,8 @@ void dwc3_ep0_stall_and_restart(struct dwc3 *dwc) /* reinitialize physical ep1 */ dep = dwc->eps[1]; - dep->flags = DWC3_EP_ENABLED; + dep->flags &= DWC3_EP_RESOURCE_ALLOCATED; + dep->flags |= DWC3_EP_ENABLED; /* stall is always issued on EP0 */ dep = dwc->eps[0]; -- cgit v1.2.3 From 582ee2f9d268d302595db3e36b985e5cbb93284d Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Thu, 18 Apr 2024 13:34:30 +0200 Subject: USB: serial: option: add Telit FN920C04 rmnet compositions Add the following Telit FN920C04 compositions: 0x10a0: rmnet + tty (AT/NMEA) + tty (AT) + tty (diag) T: Bus=03 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 5 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10a0 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN920 S: SerialNumber=92c4c4d8 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10a4: rmnet + tty (AT) + tty (AT) + tty (diag) T: Bus=03 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10a4 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN920 S: SerialNumber=92c4c4d8 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10a9: rmnet + tty (AT) + tty (diag) + DPL (data packet logging) + adb T: Bus=03 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 9 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10a9 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN920 S: SerialNumber=92c4c4d8 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Daniele Palmas Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index e22612017f66..8a5846d4adf6 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1376,6 +1376,12 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1083, 0xff), /* Telit FE990 (ECM) */ .driver_info = NCTRL(0) | RSVD(1) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a0, 0xff), /* Telit FN20C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a4, 0xff), /* Telit FN20C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a9, 0xff), /* Telit FN20C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), -- cgit v1.2.3 From 9543f6e26634537997b6e909c20911b7bf4876de Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 16 Apr 2024 23:04:34 -0700 Subject: x86/cpufeatures: Fix dependencies for GFNI, VAES, and VPCLMULQDQ Fix cpuid_deps[] to list the correct dependencies for GFNI, VAES, and VPCLMULQDQ. These features don't depend on AVX512, and there exist CPUs that support these features but not AVX512. GFNI actually doesn't even depend on AVX. This prevents GFNI from being unnecessarily disabled if AVX is disabled to mitigate the GDS vulnerability. This also prevents all three features from being unnecessarily disabled if AVX512VL (or its dependency AVX512F) were to be disabled, but it looks like there isn't any case where this happens anyway. Fixes: c128dbfa0f87 ("x86/cpufeatures: Enable new SSE/AVX/AVX512 CPU features") Signed-off-by: Eric Biggers Signed-off-by: Borislav Petkov (AMD) Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20240417060434.47101-1-ebiggers@kernel.org --- arch/x86/kernel/cpu/cpuid-deps.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index b7174209d855..946813d816bf 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -44,7 +44,10 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, { X86_FEATURE_AES, X86_FEATURE_XMM2 }, { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_GFNI, X86_FEATURE_XMM2 }, { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_VAES, X86_FEATURE_AVX }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX }, { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, @@ -56,9 +59,6 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, - { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, -- cgit v1.2.3 From 752863bddacab6b5c5164b1df8c8b2e3a175ee28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Apr 2024 16:47:43 +0200 Subject: block: propagate partition scanning errors to the BLKRRPART ioctl Commit 4601b4b130de ("block: reopen the device in blkdev_reread_part") lost the propagation of I/O errors from the low-level read of the partition table to the user space caller of the BLKRRPART. Apparently some user space relies on, so restore the propagation. This isn't exactly pretty as other block device open calls explicitly do not are about these errors, so add a new BLK_OPEN_STRICT_SCAN to opt into the error propagation. Fixes: 4601b4b130de ("block: reopen the device in blkdev_reread_part") Reported-by: Saranya Muruganandam Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Shin'ichiro Kawasaki Tested-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20240417144743.2277601-1-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 29 +++++++++++++++++++---------- block/ioctl.c | 3 ++- include/linux/blkdev.h | 2 ++ 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 7a5f611c3d2e..cea51dca8753 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -652,6 +652,14 @@ static void blkdev_flush_mapping(struct block_device *bdev) bdev_write_inode(bdev); } +static void blkdev_put_whole(struct block_device *bdev) +{ + if (atomic_dec_and_test(&bdev->bd_openers)) + blkdev_flush_mapping(bdev); + if (bdev->bd_disk->fops->release) + bdev->bd_disk->fops->release(bdev->bd_disk); +} + static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) { struct gendisk *disk = bdev->bd_disk; @@ -670,20 +678,21 @@ static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) if (!atomic_read(&bdev->bd_openers)) set_init_blocksize(bdev); - if (test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(disk, false); atomic_inc(&bdev->bd_openers); + if (test_bit(GD_NEED_PART_SCAN, &disk->state)) { + /* + * Only return scanning errors if we are called from contexts + * that explicitly want them, e.g. the BLKRRPART ioctl. + */ + ret = bdev_disk_changed(disk, false); + if (ret && (mode & BLK_OPEN_STRICT_SCAN)) { + blkdev_put_whole(bdev); + return ret; + } + } return 0; } -static void blkdev_put_whole(struct block_device *bdev) -{ - if (atomic_dec_and_test(&bdev->bd_openers)) - blkdev_flush_mapping(bdev); - if (bdev->bd_disk->fops->release) - bdev->bd_disk->fops->release(bdev->bd_disk); -} - static int blkdev_get_part(struct block_device *part, blk_mode_t mode) { struct gendisk *disk = part->bd_disk; diff --git a/block/ioctl.c b/block/ioctl.c index a9028a2c2db5..f505f9c341eb 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -563,7 +563,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; - return disk_scan_partitions(bdev->bd_disk, mode); + return disk_scan_partitions(bdev->bd_disk, + mode | BLK_OPEN_STRICT_SCAN); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be..d16320852c4b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -128,6 +128,8 @@ typedef unsigned int __bitwise blk_mode_t; #define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) /* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */ #define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5)) +/* return partition scanning errors */ +#define BLK_OPEN_STRICT_SCAN ((__force blk_mode_t)(1 << 6)) struct gendisk { /* -- cgit v1.2.3 From 56f78615bcb1c3ba58a5d9911bad3d9185cf141b Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Wed, 17 Apr 2024 10:55:13 +0200 Subject: net: usb: ax88179_178a: avoid writing the mac address before first reading After the commit d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets"), reset operation, in which the default mac address from the device is read, is not executed from bind operation and the random address, that is pregenerated just in case, is direclty written the first time in the device, so the default one from the device is not even read. This writing is not dangerous because is volatile and the default mac address is not missed. In order to avoid this and keep the simplification to have only one reset and reduce the delays, restore the reset from bind operation and remove the reset that is commanded from open operation. The behavior is the same but everything is ready for usbnet_probe. Tested with ASIX AX88179 USB Gigabit Ethernet devices. Restore the old behavior for the rest of possible devices because I don't have the hardware to test. cc: stable@vger.kernel.org # 6.6+ Fixes: d2689b6a86b9 ("net: usb: ax88179_178a: avoid two consecutive device resets") Reported-by: Jarkko Palviainen Signed-off-by: Jose Ignacio Tornos Martinez Link: https://lore.kernel.org/r/20240417085524.219532-1-jtornosm@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/ax88179_178a.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index a9c418890a1c..752f821a1990 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1317,6 +1317,8 @@ static int ax88179_bind(struct usbnet *dev, struct usb_interface *intf) netif_set_tso_max_size(dev->net, 16384); + ax88179_reset(dev); + return 0; } @@ -1695,7 +1697,6 @@ static const struct driver_info ax88179_info = { .unbind = ax88179_unbind, .status = ax88179_status, .link_reset = ax88179_link_reset, - .reset = ax88179_reset, .stop = ax88179_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX, .rx_fixup = ax88179_rx_fixup, @@ -1708,7 +1709,6 @@ static const struct driver_info ax88178a_info = { .unbind = ax88179_unbind, .status = ax88179_status, .link_reset = ax88179_link_reset, - .reset = ax88179_reset, .stop = ax88179_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX, .rx_fixup = ax88179_rx_fixup, -- cgit v1.2.3 From c24cd679b075b0e953ea167b0aa2b2d59e4eba7f Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Wed, 17 Apr 2024 15:24:25 +0530 Subject: net: ethernet: ti: am65-cpsw-nuss: cleanup DMA Channels before using them The TX and RX DMA Channels used by the driver to exchange data with CPSW are not guaranteed to be in a clean state during driver initialization. The Bootloader could have used the same DMA Channels without cleaning them up in the event of failure. Thus, reset and disable the DMA Channels to ensure that they are in a clean state before using them. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Reported-by: Schuyler Patton Signed-off-by: Siddharth Vadapalli Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20240417095425.2253876-1-s-vadapalli@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 2939a21ca74f..1d00e21808c1 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2793,6 +2793,8 @@ static void am65_cpsw_unregister_devlink(struct am65_cpsw_common *common) static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common) { + struct am65_cpsw_rx_chn *rx_chan = &common->rx_chns; + struct am65_cpsw_tx_chn *tx_chan = common->tx_chns; struct device *dev = common->dev; struct am65_cpsw_port *port; int ret = 0, i; @@ -2805,6 +2807,22 @@ static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common) if (ret) return ret; + /* The DMA Channels are not guaranteed to be in a clean state. + * Reset and disable them to ensure that they are back to the + * clean state and ready to be used. + */ + for (i = 0; i < common->tx_ch_num; i++) { + k3_udma_glue_reset_tx_chn(tx_chan[i].tx_chn, &tx_chan[i], + am65_cpsw_nuss_tx_cleanup); + k3_udma_glue_disable_tx_chn(tx_chan[i].tx_chn); + } + + for (i = 0; i < AM65_CPSW_MAX_RX_FLOWS; i++) + k3_udma_glue_reset_rx_chn(rx_chan->rx_chn, i, rx_chan, + am65_cpsw_nuss_rx_cleanup, !!i); + + k3_udma_glue_disable_rx_chn(rx_chan->rx_chn); + ret = am65_cpsw_nuss_register_devlink(common); if (ret) return ret; -- cgit v1.2.3 From 2b504e1620376052744ebee408a84394bdaef40a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Apr 2024 09:54:14 +0200 Subject: arm64/head: Drop unnecessary pre-disable-MMU workaround The Falkor erratum that results in the need for an ISB before clearing the M bit in SCTLR_ELx only applies to execution at exception level x, and so the workaround is not needed when disabling the EL1 MMU while running at EL2. Signed-off-by: Ard Biesheuvel Acked-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240415075412.2347624-5-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 06234c3a15f3..b8bbd72cb194 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -323,13 +323,11 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) cbz x0, 2f /* Set a sane SCTLR_EL1, the VHE way */ - pre_disable_mmu_workaround msr_s SYS_SCTLR_EL12, x1 mov x2, #BOOT_CPU_FLAG_E2H b 3f 2: - pre_disable_mmu_workaround msr sctlr_el1, x1 mov x2, xzr 3: -- cgit v1.2.3 From 34e526cb7d46726b2ae5f83f2892d00ebb088509 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Apr 2024 09:54:15 +0200 Subject: arm64/head: Disable MMU at EL2 before clearing HCR_EL2.E2H Even though the boot protocol stipulates otherwise, an exception has been made for the EFI stub, and entering the core kernel with the MMU enabled is permitted. This allows a substantial amount of cache maintenance to be elided, wich is significant when fast boot times are critical (e.g., for booting micro-VMs) Once the initial ID map has been populated, the MMU is disabled as part of the logic sequence that puts all system registers into a known state. Any code that needs to execute within the window where the MMU is off is cleaned to the PoC explicitly, which includes all of HYP text when entering at EL2. However, the current sequence of initializing the EL2 system registers is not safe: HCR_EL2 is set to its nVHE initial state before SCTLR_EL2 is reprogrammed, and this means that a VHE-to-nVHE switch may occur while the MMU is enabled. This switch causes some system registers as well as page table descriptors to be interpreted in a different way, potentially resulting in spurious exceptions relating to MMU translation. So disable the MMU explicitly first when entering in EL2 with the MMU and caches enabled. Fixes: 617861703830 ("efi: arm64: enter with MMU and caches enabled") Signed-off-by: Ard Biesheuvel Cc: # 6.3.x Acked-by: Mark Rutland Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240415075412.2347624-6-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index b8bbd72cb194..cb68adcabe07 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -289,6 +289,11 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) adr_l x1, __hyp_text_end adr_l x2, dcache_clean_poc blr x2 + + mov_q x0, INIT_SCTLR_EL2_MMU_OFF + pre_disable_mmu_workaround + msr sctlr_el2, x0 + isb 0: mov_q x0, HCR_HOST_NVHE_FLAGS -- cgit v1.2.3 From 7ee5faad0f8c3ad86c8cfc2f6aac91d2ba29790f Mon Sep 17 00:00:00 2001 From: Ai Chao Date: Fri, 19 Apr 2024 16:21:59 +0800 Subject: ALSA: hda/realtek - Enable audio jacks of Haier Boyue G42 with ALC269VC The Haier Boyue G42 with ALC269VC cannot detect the MIC of headset, the line out and internal speaker until ALC269VC_FIXUP_ACER_VCOPPERBOX_PINS quirk applied. Signed-off-by: Ai Chao Cc: Message-ID: <20240419082159.476879-1-aichao@kylinos.cn> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index f71460847492..70d80b6af3fe 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10497,6 +10497,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1d05, 0x115c, "TongFang GMxTGxx", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x1d05, 0x121b, "TongFang GMxAGxx", ALC269_FIXUP_NO_SHUTUP), SND_PCI_QUIRK(0x1d05, 0x1387, "TongFang GMxIXxx", ALC2XX_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x1d17, 0x3288, "Haier Boyue G42", ALC269VC_FIXUP_ACER_VCOPPERBOX_PINS), SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC), -- cgit v1.2.3 From f25f17dc5c6a5e3f2014d44635f0c0db45224efe Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 19 Apr 2024 12:04:39 +0200 Subject: ALSA: seq: ump: Fix conversion from MIDI2 to MIDI1 UMP messages The conversion from MIDI2 to MIDI1 UMP messages had a leftover artifact (superfluous bit shift), and this resulted in the bogus type check, leading to empty outputs. Let's fix it. Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") Cc: Link: https://github.com/alsa-project/alsa-utils/issues/262 Message-ID: <20240419100442.14806-1-tiwai@suse.de> Signed-off-by: Takashi Iwai --- sound/core/seq/seq_ump_convert.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c index b141024830ec..ee6ac649df83 100644 --- a/sound/core/seq/seq_ump_convert.c +++ b/sound/core/seq/seq_ump_convert.c @@ -428,7 +428,7 @@ static int cvt_ump_midi2_to_midi1(struct snd_seq_client *dest, midi1->note.group = midi2->note.group; midi1->note.status = midi2->note.status; midi1->note.channel = midi2->note.channel; - switch (midi2->note.status << 4) { + switch (midi2->note.status) { case UMP_MSG_STATUS_NOTE_ON: case UMP_MSG_STATUS_NOTE_OFF: midi1->note.note = midi2->note.note; -- cgit v1.2.3 From b552f63cd43735048bbe9bfbb7a9dcfce166fbdd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Apr 2024 21:02:12 +0200 Subject: thermal/debugfs: Add missing count increment to thermal_debug_tz_trip_up() The count field in struct trip_stats, representing the number of times the zone temperature was above the trip point, needs to be incremented in thermal_debug_tz_trip_up(), for two reasons. First, if a trip point is crossed on the way up for the first time, thermal_debug_update_temp() called from update_temperature() does not see it because it has not been added to trips_crossed[] array in the thermal zone's struct tz_debugfs object yet. Therefore, when thermal_debug_tz_trip_up() is called after that, the trip point's count value is 0, and the attempt to divide by it during the average temperature computation leads to a divide error which causes the kernel to crash. Setting the count to 1 before the division by incrementing it fixes this problem. Second, if a trip point is crossed on the way up, but it has been crossed on the way up already before, its count value needs to be incremented to make a record of the fact that the zone temperature is above the trip now. Without doing that, if the mitigations applied after crossing the trip cause the zone temperature to drop below its threshold, the count will not be updated for this episode at all and the average temperature in the trip statistics record will be somewhat higher than it should be. Fixes: 7ef01f228c9f ("thermal/debugfs: Add thermal debugfs information for mitigation episodes") Cc :6.8+ # 6.8+ Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_debugfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thermal/thermal_debugfs.c b/drivers/thermal/thermal_debugfs.c index c617e8b9f0dd..d78d54ae2605 100644 --- a/drivers/thermal/thermal_debugfs.c +++ b/drivers/thermal/thermal_debugfs.c @@ -616,6 +616,7 @@ void thermal_debug_tz_trip_up(struct thermal_zone_device *tz, tze->trip_stats[trip_id].timestamp = now; tze->trip_stats[trip_id].max = max(tze->trip_stats[trip_id].max, temperature); tze->trip_stats[trip_id].min = min(tze->trip_stats[trip_id].min, temperature); + tze->trip_stats[trip_id].count++; tze->trip_stats[trip_id].avg = tze->trip_stats[trip_id].avg + (temperature - tze->trip_stats[trip_id].avg) / tze->trip_stats[trip_id].count; -- cgit v1.2.3 From 01bc4fda9ea0a6b52f12326486f07a4910666cf6 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 19 Apr 2024 17:32:57 +0800 Subject: blk-iocost: do not WARN if iocg was already offlined In iocg_pay_debt(), warn is triggered if 'active_list' is empty, which is intended to confirm iocg is active when it has debt. However, warn can be triggered during a blkcg or disk removal, if iocg_waitq_timer_fn() is run at that time: WARNING: CPU: 0 PID: 2344971 at block/blk-iocost.c:1402 iocg_pay_debt+0x14c/0x190 Call trace: iocg_pay_debt+0x14c/0x190 iocg_kick_waitq+0x438/0x4c0 iocg_waitq_timer_fn+0xd8/0x130 __run_hrtimer+0x144/0x45c __hrtimer_run_queues+0x16c/0x244 hrtimer_interrupt+0x2cc/0x7b0 The warn in this situation is meaningless. Since this iocg is being removed, the state of the 'active_list' is irrelevant, and 'waitq_timer' is canceled after removing 'active_list' in ioc_pd_free(), which ensures iocg is freed after iocg_waitq_timer_fn() returns. Therefore, add the check if iocg was already offlined to avoid warn when removing a blkcg or disk. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240419093257.3004211-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index baa20c85799d..690ca99dfaca 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1439,8 +1439,11 @@ static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, lockdep_assert_held(&iocg->ioc->lock); lockdep_assert_held(&iocg->waitq.lock); - /* make sure that nobody messed with @iocg */ - WARN_ON_ONCE(list_empty(&iocg->active_list)); + /* + * make sure that nobody messed with @iocg. Check iocg->pd.online + * to avoid warn when removing blkcg or disk. + */ + WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online); WARN_ON_ONCE(iocg->inuse > 1); iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); -- cgit v1.2.3 From 50449ca66cc5a8cbc64749cf4b9f3d3fc5f4b457 Mon Sep 17 00:00:00 2001 From: Yaxiong Tian Date: Wed, 17 Apr 2024 10:52:48 +0800 Subject: arm64: hibernate: Fix level3 translation fault in swsusp_save() On arm64 machines, swsusp_save() faults if it attempts to access MEMBLOCK_NOMAP memory ranges. This can be reproduced in QEMU using UEFI when booting with rodata=off debug_pagealloc=off and CONFIG_KFENCE=n: Unable to handle kernel paging request at virtual address ffffff8000000000 Mem abort info: ESR = 0x0000000096000007 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x07: level 3 translation fault Data abort info: ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 swapper pgtable: 4k pages, 39-bit VAs, pgdp=00000000eeb0b000 [ffffff8000000000] pgd=180000217fff9803, p4d=180000217fff9803, pud=180000217fff9803, pmd=180000217fff8803, pte=0000000000000000 Internal error: Oops: 0000000096000007 [#1] SMP Internal error: Oops: 0000000096000007 [#1] SMP Modules linked in: xt_multiport ipt_REJECT nf_reject_ipv4 xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c iptable_filter bpfilter rfkill at803x snd_hda_codec_hdmi snd_hda_intel snd_intel_dspcfg dwmac_generic stmmac_platform snd_hda_codec stmmac joydev pcs_xpcs snd_hda_core phylink ppdev lp parport ramoops reed_solomon ip_tables x_tables nls_iso8859_1 vfat multipath linear amdgpu amdxcp drm_exec gpu_sched drm_buddy hid_generic usbhid hid radeon video drm_suballoc_helper drm_ttm_helper ttm i2c_algo_bit drm_display_helper cec drm_kms_helper drm CPU: 0 PID: 3663 Comm: systemd-sleep Not tainted 6.6.2+ #76 Source Version: 4e22ed63a0a48e7a7cff9b98b7806d8d4add7dc0 Hardware name: Greatwall GW-XXXXXX-XXX/GW-XXXXXX-XXX, BIOS KunLun BIOS V4.0 01/19/2021 pstate: 600003c5 (nZCv DAIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : swsusp_save+0x280/0x538 lr : swsusp_save+0x280/0x538 sp : ffffffa034a3fa40 x29: ffffffa034a3fa40 x28: ffffff8000001000 x27: 0000000000000000 x26: ffffff8001400000 x25: ffffffc08113e248 x24: 0000000000000000 x23: 0000000000080000 x22: ffffffc08113e280 x21: 00000000000c69f2 x20: ffffff8000000000 x19: ffffffc081ae2500 x18: 0000000000000000 x17: 6666662074736420 x16: 3030303030303030 x15: 3038666666666666 x14: 0000000000000b69 x13: ffffff9f89088530 x12: 00000000ffffffea x11: 00000000ffff7fff x10: 00000000ffff7fff x9 : ffffffc08193f0d0 x8 : 00000000000bffe8 x7 : c0000000ffff7fff x6 : 0000000000000001 x5 : ffffffa0fff09dc8 x4 : 0000000000000000 x3 : 0000000000000027 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 000000000000004e Call trace: swsusp_save+0x280/0x538 swsusp_arch_suspend+0x148/0x190 hibernation_snapshot+0x240/0x39c hibernate+0xc4/0x378 state_store+0xf0/0x10c kobj_attr_store+0x14/0x24 The reason is swsusp_save() -> copy_data_pages() -> page_is_saveable() -> kernel_page_present() assuming that a page is always present when can_set_direct_map() is false (all of rodata_full, debug_pagealloc_enabled() and arm64_kfence_can_set_direct_map() false), irrespective of the MEMBLOCK_NOMAP ranges. Such MEMBLOCK_NOMAP regions should not be saved during hibernation. This problem was introduced by changes to the pfn_valid() logic in commit a7d9f306ba70 ("arm64: drop pfn_valid_within() and simplify pfn_valid()"). Similar to other architectures, drop the !can_set_direct_map() check in kernel_page_present() so that page_is_savable() skips such pages. Fixes: a7d9f306ba70 ("arm64: drop pfn_valid_within() and simplify pfn_valid()") Cc: # 5.14.x Suggested-by: Mike Rapoport Suggested-by: Catalin Marinas Co-developed-by: xiongxin Signed-off-by: xiongxin Signed-off-by: Yaxiong Tian Acked-by: Mike Rapoport (IBM) Link: https://lore.kernel.org/r/20240417025248.386622-1-tianyaxiong@kylinos.cn [catalin.marinas@arm.com: rework commit message] Signed-off-by: Catalin Marinas --- arch/arm64/mm/pageattr.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 0c4e3ecf989d..0e270a1c51e6 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -219,9 +219,6 @@ bool kernel_page_present(struct page *page) pte_t *ptep; unsigned long addr = (unsigned long)page_address(page); - if (!can_set_direct_map()) - return true; - pgdp = pgd_offset_k(addr); if (pgd_none(READ_ONCE(*pgdp))) return false; -- cgit v1.2.3 From 366c5cec9ce473f68925d703a07cac56e1d16956 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sat, 20 Apr 2024 08:34:09 -0400 Subject: MAINTAINERS: update to working email address jejb@linux.ibm.com no longer works. Signed-off-by: James Bottomley --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c23fda1aa1f0..e5121ff5e60e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11995,7 +11995,7 @@ F: include/keys/encrypted-type.h F: security/keys/encrypted-keys/ KEYS-TRUSTED -M: James Bottomley +M: James Bottomley M: Jarkko Sakkinen M: Mimi Zohar L: linux-integrity@vger.kernel.org @@ -19669,7 +19669,7 @@ F: drivers/scsi/sg.c F: include/scsi/sg.h SCSI SUBSYSTEM -M: "James E.J. Bottomley" +M: "James E.J. Bottomley" M: "Martin K. Petersen" L: linux-scsi@vger.kernel.org S: Maintained -- cgit v1.2.3 From ed30a4a51bb196781c8058073ea720133a65596f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 21 Apr 2024 12:35:54 -0700 Subject: Linux 6.9-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 59d8a7f95d0a..43b10f3d438c 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- cgit v1.2.3