-rw-r--r-- Documentation/ABI/testing/sysfs-class-led-trigger-netdev | 20
-rw-r--r-- Documentation/ABI/testing/sysfs-devices-system-cpu | 1
-rw-r--r-- Documentation/ABI/testing/sysfs-driver-chromeos-acpi | 17
-rw-r--r-- Documentation/RCU/lockdep-splat.rst | 2
-rw-r--r-- Documentation/RCU/rculist_nulls.rst | 38
-rw-r--r-- Documentation/admin-guide/kernel-parameters.txt | 68
-rw-r--r-- Documentation/arch/arm64/silicon-errata.rst | 19
-rw-r--r-- Documentation/arch/arm64/sme.rst | 2
-rw-r--r-- Documentation/arch/index.rst | 2
-rw-r--r-- Documentation/arch/s390/3270.ChangeLog (renamed from Documentation/s390/3270.ChangeLog) | 0
-rw-r--r-- Documentation/arch/s390/3270.rst (renamed from Documentation/s390/3270.rst) | 4
-rw-r--r-- Documentation/arch/s390/cds.rst (renamed from Documentation/s390/cds.rst) | 2
-rw-r--r-- Documentation/arch/s390/common_io.rst (renamed from Documentation/s390/common_io.rst) | 2
-rw-r--r-- Documentation/arch/s390/config3270.sh (renamed from Documentation/s390/config3270.sh) | 0
-rw-r--r-- Documentation/arch/s390/driver-model.rst (renamed from Documentation/s390/driver-model.rst) | 0
-rw-r--r-- Documentation/arch/s390/features.rst (renamed from Documentation/s390/features.rst) | 0
-rw-r--r-- Documentation/arch/s390/index.rst (renamed from Documentation/s390/index.rst) | 0
-rw-r--r-- Documentation/arch/s390/monreader.rst (renamed from Documentation/s390/monreader.rst) | 0
-rw-r--r-- Documentation/arch/s390/pci.rst (renamed from Documentation/s390/pci.rst) | 2
-rw-r--r-- Documentation/arch/s390/qeth.rst (renamed from Documentation/s390/qeth.rst) | 0
-rw-r--r-- Documentation/arch/s390/s390dbf.rst (renamed from Documentation/s390/s390dbf.rst) | 0
-rw-r--r-- Documentation/arch/s390/text_files.rst (renamed from Documentation/s390/text_files.rst) | 0
-rw-r--r-- Documentation/arch/s390/vfio-ap-locking.rst (renamed from Documentation/s390/vfio-ap-locking.rst) | 0
-rw-r--r-- Documentation/arch/s390/vfio-ap.rst (renamed from Documentation/s390/vfio-ap.rst) | 0
-rw-r--r-- Documentation/arch/s390/vfio-ccw.rst (renamed from Documentation/s390/vfio-ccw.rst) | 2
-rw-r--r-- Documentation/arch/s390/zfcpdump.rst (renamed from Documentation/s390/zfcpdump.rst) | 0
-rw-r--r-- Documentation/arch/x86/boot.rst | 2
-rw-r--r-- Documentation/core-api/cpu_hotplug.rst | 10
-rw-r--r-- Documentation/devicetree/bindings/arm/pmu.yaml | 5
-rw-r--r-- Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml | 1
-rw-r--r-- Documentation/driver-api/s390-drivers.rst | 4
-rw-r--r-- Documentation/filesystems/fscrypt.rst | 164
-rw-r--r-- Documentation/filesystems/idmappings.rst | 14
-rw-r--r-- Documentation/filesystems/locking.rst | 23
-rw-r--r-- Documentation/filesystems/tmpfs.rst | 38
-rw-r--r-- Documentation/filesystems/vfs.rst | 12
-rw-r--r-- Documentation/firmware-guide/acpi/chromeos-acpi-device.rst | 5
-rw-r--r-- Documentation/scheduler/sched-design-CFS.rst | 2
-rw-r--r-- MAINTAINERS | 70
-rw-r--r-- Makefile | 2
-rw-r--r-- arch/Kconfig | 3
-rw-r--r-- arch/alpha/kernel/osf_sys.c | 2
-rw-r--r-- arch/alpha/kernel/syscalls/syscall.tbl | 1
-rw-r--r-- arch/arm/include/asm/syscall.h | 3
-rw-r--r-- arch/arm/kernel/entry-common.S | 1
-rw-r--r-- arch/arm/kernel/hw_breakpoint.c | 8
-rw-r--r-- arch/arm/kernel/ptrace.c | 5
-rw-r--r-- arch/arm/tools/syscall.tbl | 1
-rw-r--r-- arch/arm64/Kconfig | 6
-rw-r--r-- arch/arm64/include/asm/acpi.h | 3
-rw-r--r-- arch/arm64/include/asm/efi.h | 18
-rw-r--r-- arch/arm64/include/asm/hwcap.h | 1
-rw-r--r-- arch/arm64/include/asm/kernel-pgtable.h | 27
-rw-r--r-- arch/arm64/include/asm/mmu.h | 1
-rw-r--r-- arch/arm64/include/asm/pgtable.h | 8
-rw-r--r-- arch/arm64/include/asm/sdei.h | 6
-rw-r--r-- arch/arm64/include/asm/sysreg.h | 6
-rw-r--r-- arch/arm64/include/asm/unistd.h | 2
-rw-r--r-- arch/arm64/include/asm/unistd32.h | 2
-rw-r--r-- arch/arm64/include/uapi/asm/hwcap.h | 1
-rw-r--r-- arch/arm64/kernel/cpufeature.c | 9
-rw-r--r-- arch/arm64/kernel/cpuidle.c | 2
-rw-r--r-- arch/arm64/kernel/cpuinfo.c | 1
-rw-r--r-- arch/arm64/kernel/efi.c | 16
-rw-r--r-- arch/arm64/kernel/entry-common.c | 32
-rw-r--r-- arch/arm64/kernel/entry.S | 27
-rw-r--r-- arch/arm64/kernel/fpsimd.c | 22
-rw-r--r-- arch/arm64/kernel/head.S | 4
-rw-r--r-- arch/arm64/kernel/hw_breakpoint.c | 4
-rw-r--r-- arch/arm64/kernel/idreg-override.c | 6
-rw-r--r-- arch/arm64/kernel/pci.c | 2
-rw-r--r-- arch/arm64/kernel/ptrace.c | 3
-rw-r--r-- arch/arm64/kernel/sdei.c | 3
-rw-r--r-- arch/arm64/kernel/smp.c | 8
-rw-r--r-- arch/arm64/kernel/syscall.c | 33
-rw-r--r-- arch/arm64/kernel/vdso/vdso.lds.S | 2
-rw-r--r-- arch/arm64/kvm/hyp/nvhe/Makefile | 2
-rw-r--r-- arch/arm64/kvm/hyp/nvhe/list_debug.c | 8
-rw-r--r-- arch/arm64/mm/init.c | 27
-rw-r--r-- arch/arm64/mm/proc.S | 2
-rw-r--r-- arch/ia64/include/asm/acpi.h | 6
-rw-r--r-- arch/ia64/kernel/syscalls/syscall.tbl | 1
-rw-r--r-- arch/loongarch/Kconfig | 2
-rw-r--r-- arch/loongarch/Makefile | 2
-rw-r--r-- arch/loongarch/include/asm/Kbuild | 1
-rw-r--r-- arch/loongarch/include/asm/fpu.h | 22
-rw-r--r-- arch/loongarch/include/asm/local.h | 4
-rw-r--r-- arch/loongarch/include/asm/ptrace.h | 2
-rw-r--r-- arch/loongarch/include/asm/smp.h | 2
-rw-r--r-- arch/loongarch/kernel/fpu.S | 2
-rw-r--r-- arch/loongarch/kernel/hw_breakpoint.c | 3
-rw-r--r-- arch/loongarch/kernel/mcount.S | 2
-rw-r--r-- arch/loongarch/kernel/mcount_dyn.S | 1
-rw-r--r-- arch/loongarch/kernel/process.c | 7
-rw-r--r-- arch/loongarch/kernel/ptrace.c | 4
-rw-r--r-- arch/loongarch/kernel/smp.c | 2
-rw-r--r-- arch/loongarch/kernel/traps.c | 14
-rw-r--r-- arch/loongarch/lib/clear_user.S | 2
-rw-r--r-- arch/loongarch/lib/copy_user.S | 2
-rw-r--r-- arch/loongarch/lib/memcpy.S | 2
-rw-r--r-- arch/loongarch/lib/memmove.S | 2
-rw-r--r-- arch/loongarch/lib/memset.S | 2
-rw-r--r-- arch/loongarch/lib/unaligned.S | 1
-rw-r--r-- arch/loongarch/mm/page.S | 2
-rw-r--r-- arch/loongarch/mm/tlbex.S | 1
-rw-r--r-- arch/m68k/configs/amiga_defconfig | 2
-rw-r--r-- arch/m68k/configs/apollo_defconfig | 2
-rw-r--r-- arch/m68k/configs/atari_defconfig | 2
-rw-r--r-- arch/m68k/configs/bvme6000_defconfig | 2
-rw-r--r-- arch/m68k/configs/hp300_defconfig | 2
-rw-r--r-- arch/m68k/configs/mac_defconfig | 2
-rw-r--r-- arch/m68k/configs/multi_defconfig | 2
-rw-r--r-- arch/m68k/configs/mvme147_defconfig | 2
-rw-r--r-- arch/m68k/configs/mvme16x_defconfig | 2
-rw-r--r-- arch/m68k/configs/q40_defconfig | 2
-rw-r--r-- arch/m68k/configs/sun3_defconfig | 2
-rw-r--r-- arch/m68k/configs/sun3x_defconfig | 2
-rw-r--r-- arch/m68k/include/asm/Kbuild | 1
-rw-r--r-- arch/m68k/include/asm/div64.h | 3
-rw-r--r-- arch/m68k/include/asm/string.h | 1
-rw-r--r-- arch/m68k/kernel/syscalls/syscall.tbl | 1
-rw-r--r-- arch/m68k/lib/divsi3.S | 2
-rw-r--r-- arch/m68k/lib/modsi3.S | 2
-rw-r--r-- arch/m68k/lib/mulsi3.S | 2
-rw-r--r-- arch/m68k/lib/udivsi3.S | 2
-rw-r--r-- arch/m68k/lib/umodsi3.S | 2
-rw-r--r-- arch/microblaze/kernel/syscalls/syscall.tbl | 1
-rw-r--r-- arch/mips/include/asm/local.h | 4
-rw-r--r-- arch/mips/kernel/syscalls/syscall_n32.tbl | 1
-rw-r--r-- arch/mips/kernel/syscalls/syscall_n64.tbl | 1
-rw-r--r-- arch/mips/kernel/syscalls/syscall_o32.tbl | 1
-rw-r--r-- arch/parisc/kernel/syscalls/syscall.tbl | 1
-rw-r--r-- arch/powerpc/kernel/syscalls/syscall.tbl | 1
-rw-r--r-- arch/powerpc/mm/book3s64/subpage_prot.c | 1
-rw-r--r-- arch/powerpc/platforms/cell/spufs/inode.c | 2
-rw-r--r-- arch/riscv/Kconfig | 8
-rw-r--r-- arch/riscv/include/asm/efi.h | 10
-rw-r--r-- arch/riscv/include/asm/vector.h | 3
-rw-r--r-- arch/riscv/include/uapi/asm/ptrace.h | 1
-rw-r--r-- arch/riscv/kernel/ptrace.c | 69
-rw-r--r-- arch/riscv/mm/pageattr.c | 1
-rw-r--r-- arch/s390/Kbuild | 2
-rw-r--r-- arch/s390/Kconfig | 31
-rw-r--r-- arch/s390/Makefile | 1
-rw-r--r-- arch/s390/boot/startup.c | 22
-rw-r--r-- arch/s390/configs/debug_defconfig | 1
-rw-r--r-- arch/s390/configs/defconfig | 1
-rw-r--r-- arch/s390/crypto/paes_s390.c | 2
-rw-r--r-- arch/s390/hypfs/Makefile | 11
-rw-r--r-- arch/s390/hypfs/hypfs.h | 10
-rw-r--r-- arch/s390/hypfs/hypfs_dbfs.c | 31
-rw-r--r-- arch/s390/hypfs/hypfs_diag.c | 453
-rw-r--r-- arch/s390/hypfs/hypfs_diag.h | 35
-rw-r--r-- arch/s390/hypfs/hypfs_diag_fs.c | 393
-rw-r--r-- arch/s390/hypfs/hypfs_vm.c | 175
-rw-r--r-- arch/s390/hypfs/hypfs_vm.h | 50
-rw-r--r-- arch/s390/hypfs/hypfs_vm_fs.c | 139
-rw-r--r-- arch/s390/hypfs/inode.c | 39
-rw-r--r-- arch/s390/include/asm/Kbuild | 1
-rw-r--r-- arch/s390/include/asm/debug.h | 4
-rw-r--r-- arch/s390/include/asm/diag.h | 3
-rw-r--r-- arch/s390/include/asm/ftrace.h | 17
-rw-r--r-- arch/s390/include/asm/kfence.h | 2
-rw-r--r-- arch/s390/include/asm/kvm_host.h | 3
-rw-r--r-- arch/s390/include/asm/maccess.h | 3
-rw-r--r-- arch/s390/include/asm/page.h | 12
-rw-r--r-- arch/s390/include/asm/pfault.h | 26
-rw-r--r-- arch/s390/include/asm/pgtable.h | 2
-rw-r--r-- arch/s390/include/asm/sclp.h | 1
-rw-r--r-- arch/s390/include/asm/setup.h | 9
-rw-r--r-- arch/s390/include/asm/uv.h | 6
-rw-r--r-- arch/s390/include/uapi/asm/pkey.h | 2
-rw-r--r-- arch/s390/kernel/Makefile | 7
-rw-r--r-- arch/s390/kernel/asm-offsets.c | 9
-rw-r--r-- arch/s390/kernel/cert_store.c | 811
-rw-r--r-- arch/s390/kernel/diag.c | 25
-rw-r--r-- arch/s390/kernel/ebcdic.c | 2
-rw-r--r-- arch/s390/kernel/entry.S | 2
-rw-r--r-- arch/s390/kernel/ipl.c | 32
-rw-r--r-- arch/s390/kernel/machine_kexec.c | 1
-rw-r--r-- arch/s390/kernel/machine_kexec_file.c | 4
-rw-r--r-- arch/s390/kernel/mcount.S | 65
-rw-r--r-- arch/s390/kernel/setup.c | 3
-rw-r--r-- arch/s390/kernel/smp.c | 16
-rw-r--r-- arch/s390/kernel/sthyi.c | 4
-rw-r--r-- arch/s390/kernel/syscalls/syscall.tbl | 1
-rw-r--r-- arch/s390/kernel/uv.c | 3
-rw-r--r-- arch/s390/kvm/kvm-s390.h | 12
-rw-r--r-- arch/s390/kvm/pv.c | 14
-rw-r--r-- arch/s390/lib/mem.S | 2
-rw-r--r-- arch/s390/lib/tishift.S | 2
-rw-r--r-- arch/s390/mm/Makefile | 1
-rw-r--r-- arch/s390/mm/cmm.c | 2
-rw-r--r-- arch/s390/mm/dump_pagetables.c | 2
-rw-r--r-- arch/s390/mm/extmem.c | 9
-rw-r--r-- arch/s390/mm/fault.c | 228
-rw-r--r-- arch/s390/mm/gmap.c | 5
-rw-r--r-- arch/s390/mm/maccess.c | 7
-rw-r--r-- arch/s390/mm/pfault.c | 248
-rw-r--r-- arch/s390/mm/vmem.c | 4
-rw-r--r-- arch/s390/pci/pci_clp.c | 7
-rw-r--r-- arch/sh/kernel/syscalls/syscall.tbl | 1
-rw-r--r-- arch/sparc/kernel/syscalls/syscall.tbl | 1
-rw-r--r-- arch/um/drivers/mconsole_kern.c | 4
-rw-r--r-- arch/um/drivers/vector_user.c | 4
-rw-r--r-- arch/um/include/shared/user.h | 1
-rw-r--r-- arch/um/os-Linux/umid.c | 6
-rw-r--r-- arch/x86/Kconfig | 38
-rw-r--r-- arch/x86/boot/compressed/Makefile | 5
-rw-r--r-- arch/x86/boot/compressed/efi_mixed.S | 107
-rw-r--r-- arch/x86/boot/compressed/error.c | 2
-rw-r--r-- arch/x86/boot/compressed/error.h | 2
-rw-r--r-- arch/x86/boot/compressed/head_32.S | 32
-rw-r--r-- arch/x86/boot/compressed/head_64.S | 280
-rw-r--r-- arch/x86/boot/compressed/misc.c | 44
-rw-r--r-- arch/x86/boot/compressed/misc.h | 2
-rw-r--r-- arch/x86/boot/compressed/pgtable.h | 10
-rw-r--r-- arch/x86/boot/compressed/pgtable_64.c | 87
-rw-r--r-- arch/x86/boot/compressed/sev.c | 114
-rw-r--r-- arch/x86/configs/i386_defconfig | 1
-rw-r--r-- arch/x86/configs/x86_64_defconfig | 1
-rw-r--r-- arch/x86/entry/syscalls/syscall_32.tbl | 1
-rw-r--r-- arch/x86/entry/syscalls/syscall_64.tbl | 1
-rw-r--r-- arch/x86/events/amd/ibs.c | 186
-rw-r--r-- arch/x86/events/core.c | 11
-rw-r--r-- arch/x86/events/intel/core.c | 54
-rw-r--r-- arch/x86/events/intel/cstate.c | 12
-rw-r--r-- arch/x86/events/intel/ds.c | 9
-rw-r--r-- arch/x86/events/intel/uncore.c | 2
-rw-r--r-- arch/x86/events/intel/uncore_snbep.c | 2
-rw-r--r-- arch/x86/events/msr.c | 10
-rw-r--r-- arch/x86/events/perf_event.h | 2
-rw-r--r-- arch/x86/events/rapl.c | 2
-rw-r--r-- arch/x86/include/asm/acpi.h | 24
-rw-r--r-- arch/x86/include/asm/boot.h | 8
-rw-r--r-- arch/x86/include/asm/div64.h | 6
-rw-r--r-- arch/x86/include/asm/efi.h | 7
-rw-r--r-- arch/x86/include/asm/intel-family.h | 18
-rw-r--r-- arch/x86/include/asm/local.h | 4
-rw-r--r-- arch/x86/include/asm/mem_encrypt.h | 6
-rw-r--r-- arch/x86/include/asm/microcode.h | 155
-rw-r--r-- arch/x86/include/asm/microcode_amd.h | 60
-rw-r--r-- arch/x86/include/asm/microcode_intel.h | 88
-rw-r--r-- arch/x86/include/asm/paravirt.h | 7
-rw-r--r-- arch/x86/include/asm/processor.h | 2
-rw-r--r-- arch/x86/include/asm/qspinlock.h | 7
-rw-r--r-- arch/x86/include/asm/qspinlock_paravirt.h | 2
-rw-r--r-- arch/x86/include/asm/sev.h | 6
-rw-r--r-- arch/x86/include/asm/topology.h | 4
-rw-r--r-- arch/x86/include/asm/uv/bios.h | 4
-rw-r--r-- arch/x86/include/asm/xen/page.h | 5
-rw-r--r-- arch/x86/kernel/alternative.c | 1
-rw-r--r-- arch/x86/kernel/amd_nb.c | 8
-rw-r--r-- arch/x86/kernel/apic/ipi.c | 2
-rw-r--r-- arch/x86/kernel/apic/x2apic_uv_x.c | 5
-rw-r--r-- arch/x86/kernel/cpu/common.c | 6
-rw-r--r-- arch/x86/kernel/cpu/intel.c | 176
-rw-r--r-- arch/x86/kernel/cpu/intel_epb.c | 2
-rw-r--r-- arch/x86/kernel/cpu/mce/core.c | 35
-rw-r--r-- arch/x86/kernel/cpu/mce/intel.c | 19
-rw-r--r-- arch/x86/kernel/cpu/mce/internal.h | 6
-rw-r--r-- arch/x86/kernel/cpu/microcode/Makefile | 4
-rw-r--r-- arch/x86/kernel/cpu/microcode/amd.c | 133
-rw-r--r-- arch/x86/kernel/cpu/microcode/core.c | 17
-rw-r--r-- arch/x86/kernel/cpu/microcode/intel.c | 304
-rw-r--r-- arch/x86/kernel/cpu/microcode/internal.h | 131
-rw-r--r-- arch/x86/kernel/fpu/context.h | 3
-rw-r--r-- arch/x86/kernel/fpu/core.c | 2
-rw-r--r-- arch/x86/kernel/fpu/xstate.c | 7
-rw-r--r-- arch/x86/kernel/head_64.S | 32
-rw-r--r-- arch/x86/kernel/hpet.c | 2
-rw-r--r-- arch/x86/kernel/kvm.c | 4
-rw-r--r-- arch/x86/kernel/paravirt.c | 11
-rw-r--r-- arch/x86/kernel/sev.c | 6
-rw-r--r-- arch/x86/kernel/smpboot.c | 19
-rw-r--r-- arch/x86/kernel/tsc.c | 2
-rw-r--r-- arch/x86/mm/init.c | 3
-rw-r--r-- arch/x86/mm/mem_encrypt_amd.c | 13
-rw-r--r-- arch/x86/platform/efi/memmap.c | 2
-rw-r--r-- arch/x86/platform/uv/uv_nmi.c | 12
-rw-r--r-- arch/x86/purgatory/purgatory.c | 1
-rw-r--r-- arch/x86/xen/enlighten_pv.c | 10
-rw-r--r-- arch/x86/xen/mmu_pv.c | 18
-rw-r--r-- arch/x86/xen/setup.c | 4
-rw-r--r-- arch/xtensa/kernel/syscalls/syscall.tbl | 1
-rw-r--r-- block/bdev.c | 69
-rw-r--r-- block/disk-events.c | 23
-rw-r--r-- block/genhd.c | 45
-rw-r--r-- block/ioctl.c | 9
-rw-r--r-- block/partitions/core.c | 5
-rw-r--r-- crypto/af_alg.c | 4
-rw-r--r-- drivers/Makefile | 2
-rw-r--r-- drivers/acpi/Kconfig | 2
-rw-r--r-- drivers/acpi/Makefile | 1
-rw-r--r-- drivers/acpi/ac.c | 27
-rw-r--r-- drivers/acpi/acpi_cmos_rtc.c | 25
-rw-r--r-- drivers/acpi/acpi_extlog.c | 2
-rw-r--r-- drivers/acpi/acpi_processor.c | 124
-rw-r--r-- drivers/acpi/acpi_tad.c | 27
-rw-r--r-- drivers/acpi/acpi_video.c | 26
-rw-r--r-- drivers/acpi/acpica/acdebug.h | 2
-rw-r--r-- drivers/acpi/acpica/acglobal.h | 1
-rw-r--r-- drivers/acpi/acpica/aclocal.h | 38
-rw-r--r-- drivers/acpi/acpica/acpredef.h | 3
-rw-r--r-- drivers/acpi/acpica/dbcmds.c | 58
-rw-r--r-- drivers/acpi/acpica/dbinput.c | 8
-rw-r--r-- drivers/acpi/acpica/dswstate.c | 4
-rw-r--r-- drivers/acpi/acpica/exserial.c | 3
-rw-r--r-- drivers/acpi/acpica/psopcode.c | 2
-rw-r--r-- drivers/acpi/acpica/utdebug.c | 5
-rw-r--r-- drivers/acpi/arm64/Makefile | 1
-rw-r--r-- drivers/acpi/arm64/amba.c (renamed from drivers/acpi/acpi_amba.c) | 2
-rw-r--r-- drivers/acpi/arm64/init.c | 2
-rw-r--r-- drivers/acpi/arm64/init.h | 1
-rw-r--r-- drivers/acpi/arm64/iort.c | 5
-rw-r--r-- drivers/acpi/battery.c | 24
-rw-r--r-- drivers/acpi/bus.c | 33
-rw-r--r-- drivers/acpi/hed.c | 15
-rw-r--r-- drivers/acpi/internal.h | 16
-rw-r--r-- drivers/acpi/nfit/core.c | 42
-rw-r--r-- drivers/acpi/prmt.c | 8
-rw-r--r-- drivers/acpi/processor_core.c | 29
-rw-r--r-- drivers/acpi/processor_pdc.c | 97
-rw-r--r-- drivers/acpi/resource.c | 6
-rw-r--r-- drivers/acpi/scan.c | 4
-rw-r--r-- drivers/acpi/thermal.c | 470
-rw-r--r-- drivers/acpi/video_detect.c | 27
-rw-r--r-- drivers/acpi/x86/s2idle.c | 99
-rw-r--r-- drivers/acpi/x86/utils.c | 35
-rw-r--r-- drivers/android/binderfs.c | 8
-rw-r--r-- drivers/block/amiflop.c | 1
-rw-r--r-- drivers/block/floppy.c | 2
-rw-r--r-- drivers/block/loop.c | 6
-rw-r--r-- drivers/block/nbd.c | 8
-rw-r--r-- drivers/clk/clk-devres.c | 13
-rw-r--r-- drivers/clk/keystone/syscon-clk.c | 6
-rw-r--r-- drivers/crypto/Kconfig | 7
-rw-r--r-- drivers/crypto/caam/ctrl.c | 4
-rw-r--r-- drivers/dma-buf/sw_sync.c | 18
-rw-r--r-- drivers/edac/amd64_edac.c | 15
-rw-r--r-- drivers/edac/i10nm_base.c | 2
-rw-r--r-- drivers/eisa/eisa-bus.c | 2
-rw-r--r-- drivers/firmware/arm_sdei.c | 19
-rw-r--r-- drivers/firmware/efi/libstub/Makefile | 3
-rw-r--r-- drivers/firmware/efi/libstub/arm64-stub.c | 2
-rw-r--r-- drivers/firmware/efi/libstub/efi-stub-helper.c | 2
-rw-r--r-- drivers/firmware/efi/libstub/efistub.h | 3
-rw-r--r-- drivers/firmware/efi/libstub/randomalloc.c | 10
-rw-r--r-- drivers/firmware/efi/libstub/x86-5lvl.c | 95
-rw-r--r-- drivers/firmware/efi/libstub/x86-stub.c | 283
-rw-r--r-- drivers/firmware/efi/libstub/x86-stub.h | 17
-rw-r--r-- drivers/firmware/efi/libstub/zboot.c | 2
-rw-r--r-- drivers/firmware/efi/riscv-runtime.c | 15
-rw-r--r-- drivers/firmware/efi/runtime-wrappers.c | 358
-rw-r--r-- drivers/gpio/gpio-sim.c | 15
-rw-r--r-- drivers/gpu/drm/bridge/samsung-dsim.c | 27
-rw-r--r-- drivers/gpu/drm/drm_probe_helper.c | 68
-rw-r--r-- drivers/gpu/drm/i915/display/intel_hotplug.c | 4
-rw-r--r-- drivers/gpu/drm/i915/gt/uc/intel_huc.c | 2
-rw-r--r-- drivers/gpu/drm/i915/i915_driver.c | 33
-rw-r--r-- drivers/gpu/drm/panfrost/panfrost_devfreq.c | 2
-rw-r--r-- drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 6
-rw-r--r-- drivers/gpu/drm/vmwgfx/vmwgfx_bo.h | 8
-rw-r--r-- drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 12
-rw-r--r-- drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 35
-rw-r--r-- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 6
-rw-r--r-- drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c | 3
-rw-r--r-- drivers/gpu/drm/vmwgfx/vmwgfx_shader.c | 3
-rw-r--r-- drivers/hwmon/k10temp.c | 8
-rw-r--r-- drivers/idle/intel_idle.c | 10
-rw-r--r-- drivers/infiniband/hw/qib/qib_fs.c | 3
-rw-r--r-- drivers/irqchip/irq-bcm6345-l1.c | 1
-rw-r--r-- drivers/irqchip/irq-bcm7038-l1.c | 1
-rw-r--r-- drivers/irqchip/irq-brcmstb-l2.c | 1
-rw-r--r-- drivers/irqchip/irq-gic-pm.c | 2
-rw-r--r-- drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c | 2
-rw-r--r-- drivers/irqchip/irq-i8259.c | 2
-rw-r--r-- drivers/irqchip/irq-imx-intmux.c | 3
-rw-r--r-- drivers/irqchip/irq-imx-irqsteer.c | 3
-rw-r--r-- drivers/irqchip/irq-imx-mu-msi.c | 4
-rw-r--r-- drivers/irqchip/irq-keystone.c | 2
-rw-r--r-- drivers/irqchip/irq-loongson-eiointc.c | 2
-rw-r--r-- drivers/irqchip/irq-loongson-htvec.c | 1
-rw-r--r-- drivers/irqchip/irq-loongson-pch-pic.c | 2
-rw-r--r-- drivers/irqchip/irq-ls-scfg-msi.c | 3
-rw-r--r-- drivers/irqchip/irq-madera.c | 4
-rw-r--r-- drivers/irqchip/irq-meson-gpio.c | 5
-rw-r--r-- drivers/irqchip/irq-mips-gic.c | 2
-rw-r--r-- drivers/irqchip/irq-mvebu-sei.c | 3
-rw-r--r-- drivers/irqchip/irq-orion.c | 3
-rw-r--r-- drivers/irqchip/irq-pruss-intc.c | 6
-rw-r--r-- drivers/irqchip/irq-qcom-mpm.c | 2
-rw-r--r-- drivers/irqchip/irq-renesas-intc-irqpin.c | 1
-rw-r--r-- drivers/irqchip/irq-st.c | 2
-rw-r--r-- drivers/irqchip/irq-stm32-exti.c | 3
-rw-r--r-- drivers/irqchip/irq-sunxi-nmi.c | 1
-rw-r--r-- drivers/irqchip/irq-tb10x.c | 1
-rw-r--r-- drivers/irqchip/irq-ti-sci-inta.c | 4
-rw-r--r-- drivers/irqchip/irq-ti-sci-intr.c | 4
-rw-r--r-- drivers/irqchip/irq-uniphier-aidet.c | 1
-rw-r--r-- drivers/irqchip/irq-xtensa-pic.c | 1
-rw-r--r-- drivers/irqchip/irqchip.c | 2
-rw-r--r-- drivers/irqchip/qcom-pdc.c | 1
-rw-r--r-- drivers/leds/trigger/ledtrig-netdev.c | 8
-rw-r--r-- drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc.c | 2
-rw-r--r-- drivers/misc/ibmasm/ibmasmfs.c | 2
-rw-r--r-- drivers/misc/ibmvmc.c | 2
-rw-r--r-- drivers/misc/lkdtm/bugs.c | 51
-rw-r--r-- drivers/net/bonding/bond_alb.c | 6
-rw-r--r-- drivers/net/can/vxcan.c | 7
-rw-r--r-- drivers/net/dsa/mt7530.c | 4
-rw-r--r-- drivers/net/dsa/mt7530.h | 2
-rw-r--r-- drivers/net/dsa/ocelot/felix_vsc9959.c | 3
-rw-r--r-- drivers/net/ethernet/broadcom/bgmac.c | 2
-rw-r--r-- drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 2
-rw-r--r-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 21
-rw-r--r-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 32
-rw-r--r-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c | 17
-rw-r--r-- drivers/net/ethernet/broadcom/genet/bcmmii.c | 2
-rw-r--r-- drivers/net/ethernet/broadcom/tg3.c | 5
-rw-r--r-- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 2
-rw-r--r-- drivers/net/ethernet/ibm/ibmveth.c | 2
-rw-r--r-- drivers/net/ethernet/intel/i40e/i40e_main.c | 5
-rw-r--r-- drivers/net/ethernet/intel/ice/ice_base.c | 3
-rw-r--r-- drivers/net/ethernet/intel/ice/ice_sriov.c | 8
-rw-r--r-- drivers/net/ethernet/intel/ice/ice_vf_lib.c | 34
-rw-r--r-- drivers/net/ethernet/intel/ice/ice_vf_lib.h | 1
-rw-r--r-- drivers/net/ethernet/intel/ice/ice_virtchnl.c | 1
-rw-r--r-- drivers/net/ethernet/intel/igb/igb_ptp.c | 24
-rw-r--r-- drivers/net/ethernet/intel/igc/igc_defines.h | 2
-rw-r--r-- drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 3
-rw-r--r-- drivers/net/ethernet/mediatek/mtk_wed.c | 12
-rw-r--r-- drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c | 4
-rw-r--r-- drivers/net/ethernet/mellanox/mlxsw/pci.c | 8
-rw-r--r-- drivers/net/ethernet/mellanox/mlxsw/reg.h | 9
-rw-r--r-- drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c | 2
-rw-r--r-- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.c | 4
-rw-r--r-- drivers/net/ethernet/sfc/falcon/selftest.c | 2
-rw-r--r-- drivers/net/ethernet/sfc/selftest.c | 2
-rw-r--r-- drivers/net/ethernet/sfc/siena/selftest.c | 2
-rw-r--r-- drivers/net/ipvlan/ipvlan_main.c | 3
-rw-r--r-- drivers/net/mdio/mdio-bitbang.c | 4
-rw-r--r-- drivers/net/phy/phy.c | 11
-rw-r--r-- drivers/net/phy/sfp-bus.c | 10
-rw-r--r-- drivers/net/veth.c | 5
-rw-r--r-- drivers/net/wireless/intel/iwlwifi/Kconfig | 1
-rw-r--r-- drivers/of/dynamic.c | 31
-rw-r--r-- drivers/of/kexec.c | 3
-rw-r--r-- drivers/of/platform.c | 4
-rw-r--r-- drivers/of/unittest.c | 4
-rw-r--r-- drivers/perf/Kconfig | 2
-rw-r--r-- drivers/perf/alibaba_uncore_drw_pmu.c | 27
-rw-r--r-- drivers/perf/amlogic/meson_ddr_pmu_core.c | 2
-rw-r--r-- drivers/perf/arm-cci.c | 5
-rw-r--r-- drivers/perf/arm-cmn.c | 161
-rw-r--r-- drivers/perf/arm_dmc620_pmu.c | 19
-rw-r--r-- drivers/perf/arm_dsu_pmu.c | 2
-rw-r--r-- drivers/perf/arm_pmu.c | 10
-rw-r--r-- drivers/perf/arm_pmu_acpi.c | 137
-rw-r--r-- drivers/perf/arm_pmu_platform.c | 1
-rw-r--r-- drivers/perf/arm_pmuv3.c | 33
-rw-r--r-- drivers/perf/arm_smmuv3_pmu.c | 47
-rw-r--r-- drivers/perf/arm_spe_pmu.c | 3
-rw-r--r-- drivers/perf/fsl_imx8_ddr_perf.c | 48
-rw-r--r-- drivers/perf/fsl_imx9_ddr_perf.c | 4
-rw-r--r-- drivers/perf/hisilicon/hisi_pcie_pmu.c | 17
-rw-r--r-- drivers/perf/marvell_cn10k_ddr_pmu.c | 3
-rw-r--r-- drivers/perf/marvell_cn10k_tad_pmu.c | 3
-rw-r--r-- drivers/perf/xgene_pmu.c | 4
-rw-r--r-- drivers/pinctrl/pinctrl-amd.c | 30
-rw-r--r-- drivers/pinctrl/renesas/pinctrl-rza2.c | 17
-rw-r--r-- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 15
-rw-r--r-- drivers/pinctrl/renesas/pinctrl-rzv2m.c | 13
-rw-r--r-- drivers/platform/chrome/chromeos_acpi.c | 33
-rw-r--r-- drivers/platform/chrome/cros_ec_lpc.c | 4
-rw-r--r-- drivers/platform/mellanox/mlxbf-tmfifo.c | 1
-rw-r--r-- drivers/platform/x86/ideapad-laptop.c | 5
-rw-r--r-- drivers/platform/x86/intel/ifs/load.c | 7
-rw-r--r-- drivers/platform/x86/intel/pmc/core.c | 2
-rw-r--r-- drivers/platform/x86/intel/speed_select_if/isst_if_common.c | 2
-rw-r--r-- drivers/platform/x86/lenovo-ymc.c | 7
-rw-r--r-- drivers/pnp/pnpacpi/core.c | 3
-rw-r--r-- drivers/powercap/intel_rapl_common.c | 2
-rw-r--r-- drivers/powercap/intel_rapl_msr.c | 2
-rw-r--r-- drivers/s390/block/dasd.c | 7
-rw-r--r-- drivers/s390/block/dcssblk.c | 26
-rw-r--r-- drivers/s390/block/scm_blk.c | 2
-rw-r--r-- drivers/s390/char/sclp_cmd.c | 4
-rw-r--r-- drivers/s390/char/sclp_early.c | 1
-rw-r--r-- drivers/s390/char/vmcp.c | 2
-rw-r--r-- drivers/s390/char/zcore.c | 2
-rw-r--r-- drivers/s390/crypto/Makefile | 2
-rw-r--r-- drivers/s390/crypto/ap_bus.c | 34
-rw-r--r-- drivers/s390/crypto/ap_bus.h | 20
-rw-r--r-- drivers/s390/crypto/ap_queue.c | 47
-rw-r--r-- drivers/s390/crypto/pkey_api.c | 119
-rw-r--r-- drivers/s390/crypto/vfio_ap_ops.c | 164
-rw-r--r-- drivers/s390/crypto/vfio_ap_private.h | 6
-rw-r--r-- drivers/s390/crypto/zcrypt_cex2a.c | 227
-rw-r--r-- drivers/s390/crypto/zcrypt_cex2a.h | 134
-rw-r--r-- drivers/s390/crypto/zcrypt_cex2c.c | 421
-rw-r--r-- drivers/s390/crypto/zcrypt_cex2c.h | 18
-rw-r--r-- drivers/s390/crypto/zcrypt_ep11misc.c | 347
-rw-r--r-- drivers/s390/crypto/zcrypt_ep11misc.h | 24
-rw-r--r-- drivers/s390/crypto/zcrypt_msgtype50.c | 64
-rw-r--r-- drivers/s390/crypto/zcrypt_msgtype50.h | 3
-rw-r--r-- drivers/s390/crypto/zcrypt_msgtype6.c | 14
-rw-r--r-- drivers/scsi/raid_class.c | 48
-rw-r--r-- drivers/scsi/snic/snic_disc.c | 3
-rw-r--r-- drivers/soc/fsl/qe/qe.c | 4
-rw-r--r-- drivers/spi/spi-cadence.c | 19
-rw-r--r-- drivers/spi/spi-stm32.c | 6
-rw-r--r-- drivers/thermal/intel/intel_tcc_cooling.c | 2
-rw-r--r-- drivers/thermal/thermal_core.c | 22
-rw-r--r-- drivers/thermal/thermal_core.h | 4
-rw-r--r-- drivers/thermal/thermal_trip.c | 18
-rw-r--r-- drivers/ufs/core/ufs-mcq.c | 6
-rw-r--r-- drivers/ufs/host/ufs-qcom.c | 2
-rw-r--r-- drivers/usb/core/devio.c | 16
-rw-r--r-- drivers/usb/gadget/function/f_fs.c | 3
-rw-r--r-- drivers/usb/gadget/legacy/inode.c | 3
-rw-r--r-- drivers/xen/Kconfig | 7
-rw-r--r-- drivers/xen/grant-table.c | 2
-rw-r--r-- drivers/xen/privcmd.c | 282
-rw-r--r-- drivers/xen/xen-acpi-processor.c | 7
-rw-r--r-- drivers/xen/xen-pciback/conf_space_quirks.h | 2
-rw-r--r-- drivers/xen/xen-pciback/pciback.h | 3
-rw-r--r-- drivers/xen/xenbus/xenbus_probe_frontend.c | 2
-rw-r--r-- drivers/xen/xenbus/xenbus_xs.c | 4
-rw-r--r-- drivers/zorro/names.c | 1
-rw-r--r-- fs/9p/vfs_inode.c | 8
-rw-r--r-- fs/9p/vfs_inode_dotl.c | 12
-rw-r--r-- fs/Kconfig | 16
-rw-r--r-- fs/adfs/inode.c | 4
-rw-r--r-- fs/affs/amigaffs.c | 6
-rw-r--r-- fs/affs/file.c | 14
-rw-r--r-- fs/affs/inode.c | 16
-rw-r--r-- fs/affs/namei.c | 20
-rw-r--r-- fs/afs/dynroot.c | 2
-rw-r--r-- fs/afs/inode.c | 8
-rw-r--r-- fs/aio.c | 20
-rw-r--r-- fs/attr.c | 22
-rw-r--r-- fs/autofs/inode.c | 2
-rw-r--r-- fs/autofs/root.c | 6
-rw-r--r-- fs/autofs/waitq.c | 5
-rw-r--r-- fs/bad_inode.c | 6
-rw-r--r-- fs/befs/linuxvfs.c | 2
-rw-r--r-- fs/bfs/dir.c | 16
-rw-r--r-- fs/bfs/inode.c | 5
-rw-r--r-- fs/binfmt_misc.c | 3
-rw-r--r-- fs/btrfs/Kconfig | 4
-rw-r--r-- fs/btrfs/accessors.h | 23
-rw-r--r-- fs/btrfs/backref.c | 29
-rw-r--r-- fs/btrfs/block-group.c | 73
-rw-r--r-- fs/btrfs/block-group.h | 4
-rw-r--r-- fs/btrfs/btrfs_inode.h | 6
-rw-r--r-- fs/btrfs/delayed-inode.c | 11
-rw-r--r-- fs/btrfs/dev-replace.c | 6
-rw-r--r-- fs/btrfs/disk-io.c | 175
-rw-r--r-- fs/btrfs/disk-io.h | 1
-rw-r--r-- fs/btrfs/extent-io-tree.c | 14
-rw-r--r-- fs/btrfs/extent-io-tree.h | 6
-rw-r--r-- fs/btrfs/extent-tree.c | 288
-rw-r--r-- fs/btrfs/extent-tree.h | 16
-rw-r--r-- fs/btrfs/extent_io.c | 680
-rw-r--r-- fs/btrfs/extent_io.h | 35
-rw-r--r-- fs/btrfs/file-item.c | 34
-rw-r--r-- fs/btrfs/file-item.h | 6
-rw-r--r-- fs/btrfs/file.c | 46
-rw-r--r-- fs/btrfs/free-space-cache.c | 18
-rw-r--r-- fs/btrfs/free-space-tree.c | 13
-rw-r--r-- fs/btrfs/fs.h | 15
-rw-r--r-- fs/btrfs/inode.c | 850
-rw-r--r-- fs/btrfs/ioctl.c | 2
-rw-r--r-- fs/btrfs/messages.c | 16
-rw-r--r-- fs/btrfs/messages.h | 2
-rw-r--r-- fs/btrfs/ordered-data.c | 8
-rw-r--r-- fs/btrfs/print-tree.c | 10
-rw-r--r-- fs/btrfs/qgroup.c | 19
-rw-r--r-- fs/btrfs/raid56.c | 29
-rw-r--r-- fs/btrfs/raid56.h | 1
-rw-r--r-- fs/btrfs/reflink.c | 3
-rw-r--r-- fs/btrfs/relocation.c | 33
-rw-r--r-- fs/btrfs/scrub.c | 240
-rw-r--r-- fs/btrfs/send.c | 6
-rw-r--r-- fs/btrfs/space-info.c | 85
-rw-r--r-- fs/btrfs/super.c | 11
-rw-r--r-- fs/btrfs/sysfs.c | 7
-rw-r--r-- fs/btrfs/tests/extent-io-tests.c | 302
-rw-r--r-- fs/btrfs/tests/extent-map-tests.c | 412
-rw-r--r-- fs/btrfs/transaction.c | 42
-rw-r--r-- fs/btrfs/tree-log.c | 12
-rw-r--r-- fs/btrfs/volumes.c | 98
-rw-r--r-- fs/btrfs/volumes.h | 3
-rw-r--r-- fs/btrfs/xattr.c | 4
-rw-r--r-- fs/btrfs/zoned.c | 292
-rw-r--r-- fs/btrfs/zoned.h | 28
-rw-r--r-- fs/buffer.c | 18
-rw-r--r-- fs/cachefiles/io.c | 16
-rw-r--r-- fs/ceph/acl.c | 2
-rw-r--r-- fs/ceph/caps.c | 2
-rw-r--r-- fs/ceph/inode.c | 18
-rw-r--r-- fs/ceph/snap.c | 2
-rw-r--r-- fs/ceph/xattr.c | 2
-rw-r--r-- fs/coda/coda_linux.c | 3
-rw-r--r-- fs/coda/dir.c | 2
-rw-r--r-- fs/coda/file.c | 2
-rw-r--r-- fs/coda/inode.c | 5
-rw-r--r-- fs/configfs/inode.c | 7
-rw-r--r-- fs/cramfs/inode.c | 11
-rw-r--r-- fs/dcache.c | 5
-rw-r--r-- fs/debugfs/inode.c | 3
-rw-r--r-- fs/devpts/inode.c | 16
-rw-r--r-- fs/ecryptfs/crypto.c | 8
-rw-r--r-- fs/ecryptfs/inode.c | 7
-rw-r--r-- fs/ecryptfs/mmap.c | 5
-rw-r--r-- fs/ecryptfs/read_write.c | 12
-rw-r--r-- fs/efivarfs/file.c | 2
-rw-r--r-- fs/efivarfs/inode.c | 2
-rw-r--r-- fs/efs/inode.c | 4
-rw-r--r-- fs/erofs/Kconfig | 16
-rw-r--r-- fs/erofs/Makefile | 1
-rw-r--r-- fs/erofs/compress.h | 2
-rw-r--r-- fs/erofs/decompressor.c | 6
-rw-r--r-- fs/erofs/decompressor_deflate.c | 247
-rw-r--r-- fs/erofs/erofs_fs.h | 17
-rw-r--r-- fs/erofs/inode.c | 14
-rw-r--r-- fs/erofs/internal.h | 23
-rw-r--r-- fs/erofs/super.c | 44
-rw-r--r-- fs/erofs/xattr.c | 14
-rw-r--r-- fs/erofs/zdata.c | 274
-rw-r--r-- fs/erofs/zmap.c | 5
-rw-r--r-- fs/eventfd.c | 2
-rw-r--r-- fs/eventpoll.c | 12
-rw-r--r-- fs/exfat/exfat_fs.h | 2
-rw-r--r-- fs/exfat/file.c | 6
-rw-r--r-- fs/exfat/inode.c | 6
-rw-r--r-- fs/exfat/namei.c | 26
-rw-r--r-- fs/exfat/super.c | 42
-rw-r--r-- fs/ext2/acl.c | 2
-rw-r--r-- fs/ext2/dir.c | 6
-rw-r--r-- fs/ext2/ialloc.c | 2
-rw-r--r-- fs/ext2/inode.c | 12
-rw-r--r-- fs/ext2/ioctl.c | 4
-rw-r--r-- fs/ext2/namei.c | 8
-rw-r--r-- fs/ext2/super.c | 2
-rw-r--r-- fs/ext2/xattr.c | 2
-rw-r--r-- fs/ext4/acl.c | 2
-rw-r--r-- fs/ext4/ext4.h | 90
-rw-r--r-- fs/ext4/ext4_jbd2.c | 3
-rw-r--r-- fs/ext4/extents.c | 12
-rw-r--r-- fs/ext4/ialloc.c | 2
-rw-r--r-- fs/ext4/inline.c | 4
-rw-r--r-- fs/ext4/inode-test.c | 6
-rw-r--r-- fs/ext4/inode.c | 18
-rw-r--r-- fs/ext4/ioctl.c | 9
-rw-r--r-- fs/ext4/namei.c | 26
-rw-r--r-- fs/ext4/super.c | 73
-rw-r--r-- fs/ext4/xattr.c | 6
-rw-r--r-- fs/f2fs/compress.c | 2
-rw-r--r-- fs/f2fs/dir.c | 8
-rw-r--r-- fs/f2fs/f2fs.h | 6
-rw-r--r-- fs/f2fs/file.c | 22
-rw-r--r-- fs/f2fs/gc.c | 8
-rw-r--r-- fs/f2fs/inline.c | 2
-rw-r--r-- fs/f2fs/inode.c | 10
-rw-r--r-- fs/f2fs/namei.c | 12
-rw-r--r-- fs/f2fs/recovery.c | 4
-rw-r--r-- fs/f2fs/super.c | 9
-rw-r--r-- fs/f2fs/xattr.c | 2
-rw-r--r-- fs/fat/fat.h | 3
-rw-r--r-- fs/fat/file.c | 2
-rw-r--r-- fs/fat/inode.c | 5
-rw-r--r-- fs/fat/misc.c | 10
-rw-r--r-- fs/fcntl.c | 29
-rw-r--r-- fs/file.c | 30
-rw-r--r-- fs/file_table.c | 5
-rw-r--r-- fs/freevxfs/vxfs_inode.c | 3
-rw-r--r-- fs/fs-writeback.c | 4
-rw-r--r-- fs/fs_context.c | 36
-rw-r--r-- fs/fsopen.c | 106
-rw-r--r-- fs/fuse/control.c | 2
-rw-r--r-- fs/fuse/dir.c | 10
-rw-r--r-- fs/fuse/inode.c | 16
-rw-r--r-- fs/gfs2/acl.c | 2
-rw-r--r-- fs/gfs2/aops.c | 2
-rw-r--r-- fs/gfs2/bmap.c | 13
-rw-r--r-- fs/gfs2/dir.c | 15
-rw-r--r-- fs/gfs2/file.c | 2
-rw-r--r-- fs/gfs2/glops.c | 4
-rw-r--r-- fs/gfs2/inode.c | 16
-rw-r--r-- fs/gfs2/quota.c | 2
-rw-r--r-- fs/gfs2/super.c | 16
-rw-r--r-- fs/gfs2/sys.c | 4
-rw-r--r-- fs/gfs2/xattr.c | 8
-rw-r--r-- fs/hfs/catalog.c | 8
-rw-r--r-- fs/hfs/dir.c | 2
-rw-r--r-- fs/hfs/inode.c | 13
-rw-r--r-- fs/hfs/sysdep.c | 4
-rw-r--r-- fs/hfsplus/catalog.c | 8
-rw-r--r-- fs/hfsplus/dir.c | 6
-rw-r--r-- fs/hfsplus/inode.c | 18
-rw-r--r-- fs/hostfs/hostfs_kern.c | 3
-rw-r--r-- fs/hpfs/dir.c | 8
-rw-r--r-- fs/hpfs/inode.c | 6
-rw-r--r-- fs/hpfs/namei.c | 29
-rw-r--r-- fs/hpfs/super.c | 5
-rw-r--r-- fs/hugetlbfs/inode.c | 12
-rw-r--r-- fs/inode.c | 233
-rw-r--r-- fs/internal.h | 4
-rw-r--r-- fs/ioctl.c | 18
-rw-r--r-- fs/iomap/buffered-io.c | 465
-rw-r--r-- fs/iomap/direct-io.c | 163
-rw-r--r-- fs/isofs/inode.c | 9
-rw-r--r-- fs/isofs/rock.c | 16
-rw-r--r-- fs/jffs2/dir.c | 24
-rw-r--r-- fs/jffs2/file.c | 3
-rw-r--r-- fs/jffs2/fs.c | 10
-rw-r--r-- fs/jffs2/os-linux.h | 2
-rw-r--r-- fs/jfs/acl.c | 2
-rw-r--r-- fs/jfs/inode.c | 2
-rw-r--r-- fs/jfs/ioctl.c | 2
-rw-r--r-- fs/jfs/jfs_imap.c | 8
-rw-r--r-- fs/jfs/jfs_inode.c | 4
-rw-r--r-- fs/jfs/namei.c | 24
-rw-r--r-- fs/jfs/super.c | 2
-rw-r--r-- fs/jfs/xattr.c | 2
-rw-r--r-- fs/kernel_read_file.c | 12
-rw-r--r-- fs/kernfs/dir.c | 2
-rw-r--r-- fs/kernfs/inode.c | 53
-rw-r--r-- fs/libfs.c | 349
-rw-r--r-- fs/locks.c | 47
-rw-r--r-- fs/minix/bitmap.c | 2
-rw-r--r-- fs/minix/dir.c | 6
-rw-r--r-- fs/minix/inode.c | 12
-rw-r--r-- fs/minix/itree_common.c | 4
-rw-r--r-- fs/minix/namei.c | 6
-rw-r--r-- fs/namei.c | 5
-rw-r--r-- fs/nfs/callback_proc.c | 2
-rw-r--r-- fs/nfs/direct.c | 26
-rw-r--r-- fs/nfs/fscache.h | 4
-rw-r--r-- fs/nfs/inode.c | 22
-rw-r--r-- fs/nfs/namespace.c | 3
-rw-r--r-- fs/nfs/nfs42proc.c | 5
-rw-r--r-- fs/nfs/nfs4_fs.h | 2
-rw-r--r-- fs/nfs/nfs4file.c | 2
-rw-r--r-- fs/nfs/nfs4proc.c | 18
-rw-r--r-- fs/nfs/sysfs.c | 4
-rw-r--r-- fs/nfsd/nfs4state.c | 2
-rw-r--r-- fs/nfsd/nfsctl.c | 3
-rw-r--r-- fs/nfsd/vfs.c | 2
-rw-r--r-- fs/nilfs2/dir.c | 6
-rw-r--r-- fs/nilfs2/inode.c | 12
-rw-r--r-- fs/nilfs2/ioctl.c | 2
-rw-r--r-- fs/nilfs2/namei.c | 8
-rw-r--r-- fs/nilfs2/segment.c | 5
-rw-r--r-- fs/nilfs2/super.c | 81
-rw-r--r-- fs/notify/dnotify/dnotify.c | 4
-rw-r--r-- fs/nsfs.c | 2
-rw-r--r-- fs/ntfs/inode.c | 15
-rw-r--r-- fs/ntfs/mft.c | 3
-rw-r--r-- fs/ntfs3/file.c | 8
-rw-r--r-- fs/ntfs3/frecord.c | 3
-rw-r--r-- fs/ntfs3/inode.c | 14
-rw-r--r-- fs/ntfs3/namei.c | 11
-rw-r--r-- fs/ntfs3/super.c | 33
-rw-r--r-- fs/ntfs3/xattr.c | 4
-rw-r--r-- fs/ocfs2/acl.c | 6
-rw-r--r-- fs/ocfs2/alloc.c | 6
-rw-r--r-- fs/ocfs2/aops.c | 2
-rw-r--r-- fs/ocfs2/dir.c | 8
-rw-r--r-- fs/ocfs2/dlmfs/dlmfs.c | 4
-rw-r--r-- fs/ocfs2/dlmglue.c | 7
-rw-r--r-- fs/ocfs2/file.c | 18
-rw-r--r-- fs/ocfs2/inode.c | 12
-rw-r--r-- fs/ocfs2/journal.c | 6
-rw-r--r-- fs/ocfs2/move_extents.c | 6
-rw-r--r-- fs/ocfs2/namei.c | 21
-rw-r--r-- fs/ocfs2/refcounttree.c | 14
-rw-r--r-- fs/ocfs2/xattr.c | 6
-rw-r--r-- fs/omfs/dir.c | 4
-rw-r--r-- fs/omfs/inode.c | 9
-rw-r--r-- fs/open.c | 54
-rw-r--r-- fs/openpromfs/inode.c | 5
-rw-r--r-- fs/orangefs/inode.c | 7
-rw-r--r-- fs/orangefs/namei.c | 2
-rw-r--r-- fs/orangefs/orangefs-kernel.h | 2
-rw-r--r-- fs/orangefs/orangefs-utils.c | 6
-rw-r--r-- fs/overlayfs/file.c | 17
-rw-r--r-- fs/overlayfs/inode.c | 2
-rw-r--r-- fs/overlayfs/overlayfs.h | 2
-rw-r--r-- fs/overlayfs/util.c | 2
-rw-r--r-- fs/pipe.c | 10
-rw-r--r-- fs/posix_acl.c | 2
-rw-r--r-- fs/proc/base.c | 9
-rw-r--r-- fs/proc/fd.c | 2
-rw-r--r-- fs/proc/generic.c | 2
-rw-r--r-- fs/proc/inode.c | 2
-rw-r--r-- fs/proc/proc_net.c | 3
-rw-r--r-- fs/proc/proc_sysctl.c | 4
-rw-r--r-- fs/proc/root.c | 3
-rw-r--r-- fs/proc/self.c | 2
-rw-r--r-- fs/proc/task_mmu.c | 8
-rw-r--r-- fs/proc/thread_self.c | 2
-rw-r--r-- fs/pstore/Kconfig | 100
-rw-r--r-- fs/pstore/inode.c | 6
-rw-r--r-- fs/pstore/platform.c | 353
-rw-r--r-- fs/pstore/ram.c | 11
-rw-r--r-- fs/pstore/ram_core.c | 17
-rw-r--r-- fs/qnx4/inode.c | 3
-rw-r--r-- fs/qnx6/inode.c | 3
-rw-r--r-- fs/quota/dquot.c | 2
-rw-r--r-- fs/ramfs/inode.c | 6
-rw-r--r-- fs/read_write.c | 2
-rw-r--r-- fs/reiserfs/inode.c | 12
-rw-r--r-- fs/reiserfs/ioctl.c | 4
-rw-r--r-- fs/reiserfs/journal.c | 4
-rw-r--r-- fs/reiserfs/namei.c | 18
-rw-r--r-- fs/reiserfs/stree.c | 4
-rw-r--r-- fs/reiserfs/super.c | 2
-rw-r--r-- fs/reiserfs/xattr.c | 5
-rw-r--r-- fs/reiserfs/xattr_acl.c | 2
-rw-r--r-- fs/romfs/super.c | 13
-rw-r--r-- fs/smb/client/cifsfs.c | 2
-rw-r--r-- fs/smb/client/file.c | 4
-rw-r--r-- fs/smb/client/fscache.h | 5
-rw-r--r-- fs/smb/client/inode.c | 16
-rw-r--r-- fs/smb/client/smb2ops.c | 3
-rw-r--r-- fs/smb/server/smb2pdu.c | 30
-rw-r--r-- fs/smb/server/vfs.c | 3
-rw-r--r-- fs/splice.c | 64
-rw-r--r-- fs/squashfs/inode.c | 2
-rw-r--r-- fs/stack.c | 2
-rw-r--r-- fs/stat.c | 65
-rw-r--r-- fs/super.c | 829
-rw-r--r-- fs/sysv/dir.c | 6
-rw-r--r-- fs/sysv/ialloc.c | 2
-rw-r--r-- fs/sysv/inode.c | 5
-rw-r--r-- fs/sysv/itree.c | 7
-rw-r--r-- fs/sysv/namei.c | 6
-rw-r--r-- fs/tracefs/inode.c | 2
-rw-r--r-- fs/ubifs/debug.c | 4
-rw-r--r-- fs/ubifs/dir.c | 41
-rw-r--r-- fs/ubifs/file.c | 31
-rw-r--r-- fs/ubifs/ioctl.c | 2
-rw-r--r-- fs/ubifs/journal.c | 4
-rw-r--r-- fs/ubifs/super.c | 4
-rw-r--r-- fs/ubifs/ubifs.h | 2
-rw-r--r-- fs/ubifs/xattr.c | 6
-rw-r--r-- fs/udf/ialloc.c | 2
-rw-r--r-- fs/udf/inode.c | 17
-rw-r--r-- fs/udf/namei.c | 24
-rw-r--r-- fs/udf/symlink.c | 2
-rw-r--r-- fs/ufs/dir.c | 6
-rw-r--r-- fs/ufs/ialloc.c | 2
-rw-r--r-- fs/ufs/inode.c | 23
-rw-r--r-- fs/ufs/namei.c | 8
-rw-r--r-- fs/vboxsf/utils.c | 6
-rw-r--r-- fs/verity/fsverity_private.h | 12
-rw-r--r-- fs/verity/hash_algs.c | 8
-rw-r--r-- fs/verity/init.c | 56
-rw-r--r-- fs/verity/open.c | 18
-rw-r--r-- fs/verity/signature.c | 77
-rw-r--r-- fs/verity/verify.c | 11
-rw-r--r-- fs/xattr.c | 83
-rw-r--r-- fs/xfs/libxfs/xfs_inode_buf.c | 5
-rw-r--r-- fs/xfs/libxfs/xfs_trans_inode.c | 6
-rw-r--r-- fs/xfs/scrub/fscounters.c | 188
-rw-r--r-- fs/xfs/scrub/scrub.c | 6
-rw-r--r-- fs/xfs/scrub/scrub.h | 1
-rw-r--r-- fs/xfs/scrub/trace.h | 26
-rw-r--r-- fs/xfs/xfs_acl.c | 2
-rw-r--r-- fs/xfs/xfs_aops.c | 2
-rw-r--r-- fs/xfs/xfs_bmap_util.c | 6
-rw-r--r-- fs/xfs/xfs_buf.c | 7
-rw-r--r-- fs/xfs/xfs_inode.c | 3
-rw-r--r-- fs/xfs/xfs_inode_item.c | 2
-rw-r--r-- fs/xfs/xfs_iops.c | 25
-rw-r--r-- fs/xfs/xfs_itable.c | 4
-rw-r--r-- fs/xfs/xfs_super.c | 138
-rw-r--r-- fs/zonefs/file.c | 2
-rw-r--r-- fs/zonefs/super.c | 8
-rw-r--r-- include/acpi/acnames.h | 1
-rw-r--r-- include/acpi/acpi_bus.h | 17
-rw-r--r-- include/acpi/acpixf.h | 4
-rw-r--r-- include/acpi/actbl1.h | 2
-rw-r--r-- include/acpi/actbl2.h | 76
-rw-r--r-- include/acpi/actbl3.h | 4
-rw-r--r-- include/acpi/pdc_intel.h | 36
-rw-r--r-- include/acpi/platform/aclinux.h | 1
-rw-r--r-- include/acpi/platform/aczephyr.h | 3
-rw-r--r-- include/acpi/proc_cap_intel.h | 40
-rw-r--r-- include/drm/display/drm_dp.h | 2
-rw-r--r-- include/drm/drm_probe_helper.h | 1
-rw-r--r-- include/linux/acpi.h | 12
-rw-r--r-- include/linux/acpi_iort.h | 1
-rw-r--r-- include/linux/arm_sdei.h | 2
-rw-r--r-- include/linux/blk_types.h | 1
-rw-r--r-- include/linux/blkdev.h | 15
-rw-r--r-- include/linux/cgroup-defs.h | 2
-rw-r--r-- include/linux/clk.h | 80
-rw-r--r-- include/linux/compiler_attributes.h | 26
-rw-r--r-- include/linux/compiler_types.h | 28
-rw-r--r-- include/linux/completion.h | 1
-rw-r--r-- include/linux/cpu.h | 26
-rw-r--r-- include/linux/cpu_smt.h | 33
-rw-r--r-- include/linux/cpuhotplug.h | 2
-rw-r--r-- include/linux/decompress/mm.h | 2
-rw-r--r-- include/linux/dm-verity-loadpin.h | 2
-rw-r--r-- include/linux/dnotify.h | 4
-rw-r--r-- include/linux/efi.h | 53
-rw-r--r-- include/linux/filelock.h | 12
-rw-r--r-- include/linux/fs.h | 231
-rw-r--r-- include/linux/fs_context.h | 6
-rw-r--r-- include/linux/fs_stack.h | 2
-rw-r--r-- include/linux/huge_mm.h | 3
-rw-r--r-- include/linux/iomap.h | 3
-rw-r--r-- include/linux/list.h | 89
-rw-r--r-- include/linux/lsm_hook_defs.h | 1
-rw-r--r-- include/linux/mm.h | 21
-rw-r--r-- include/linux/mm_types.h | 9
-rw-r--r-- include/linux/notifier.h | 11
-rw-r--r-- include/linux/nsproxy.h | 7
-rw-r--r-- include/linux/pagemap.h | 82
-rw-r--r-- include/linux/pagewalk.h | 11
-rw-r--r-- include/linux/pci_ids.h | 2
-rw-r--r-- include/linux/perf/arm_pmu.h | 1
-rw-r--r-- include/linux/perf_event.h | 36
-rw-r--r-- include/linux/pipe_fs_i.h | 4
-rw-r--r-- include/linux/raid_class.h | 4
-rw-r--r-- include/linux/rbtree_augmented.h | 26
-rw-r--r-- include/linux/rculist_nulls.h | 4
-rw-r--r-- include/linux/rcupdate_trace.h | 1
-rw-r--r-- include/linux/rcupdate_wait.h | 5
-rw-r--r-- include/linux/sched.h | 21
-rw-r--r-- include/linux/sched/task.h | 38
-rw-r--r-- include/linux/security.h | 6
-rw-r--r-- include/linux/seq_file.h | 7
-rw-r--r-- include/linux/shmem_fs.h | 31
-rw-r--r-- include/linux/srcutiny.h | 4
-rw-r--r-- include/linux/swait.h | 2
-rw-r--r-- include/linux/syscalls.h | 20
-rw-r--r-- include/linux/thermal.h | 9
-rw-r--r-- include/linux/torture.h | 7
-rw-r--r-- include/linux/trace_events.h | 11
-rw-r--r-- include/linux/uio.h | 9
-rw-r--r-- include/linux/wait.h | 3
-rw-r--r-- include/linux/writeback.h | 5
-rw-r--r-- include/linux/xattr.h | 10
-rw-r--r-- include/net/bonding.h | 11
-rw-r--r-- include/net/inet_sock.h | 2
-rw-r--r-- include/net/ip.h | 15
-rw-r--r-- include/net/mac80211.h | 1
-rw-r--r-- include/net/netfilter/nf_tables.h | 6
-rw-r--r-- include/net/rtnetlink.h | 4
-rw-r--r-- include/net/sock.h | 7
-rw-r--r-- include/trace/events/btrfs.h | 30
-rw-r--r-- include/trace/events/erofs.h | 16
-rw-r--r-- include/uapi/asm-generic/unistd.h | 5
-rw-r--r-- include/uapi/linux/btrfs_tree.h | 6
-rw-r--r-- include/uapi/linux/elf.h | 1
-rw-r--r-- include/uapi/linux/mount.h | 3
-rw-r--r-- include/uapi/linux/perf_event.h | 3
-rw-r--r-- include/uapi/linux/quota.h | 1
-rw-r--r-- include/uapi/linux/seccomp.h | 4
-rw-r--r-- include/uapi/linux/stddef.h | 4
-rw-r--r-- include/uapi/xen/privcmd.h | 14
-rw-r--r-- include/xen/events.h | 1
-rw-r--r-- init/Kconfig | 1
-rw-r--r-- init/do_mounts.c | 38
-rw-r--r-- io_uring/rw.c | 58
-rw-r--r-- ipc/mqueue.c | 23
-rw-r--r-- kernel/bpf/inode.c | 6
-rw-r--r-- kernel/cgroup/cgroup.c | 34
-rw-r--r-- kernel/cpu.c | 144
-rw-r--r-- kernel/entry/common.c | 3
-rw-r--r-- kernel/events/core.c | 24
-rw-r--r-- kernel/events/ring_buffer.c | 5
-rw-r--r-- kernel/fork.c | 8
-rw-r--r-- kernel/irq/chip.c | 11
-rw-r--r-- kernel/irq/internals.h | 4
-rw-r--r-- kernel/irq/manage.c | 26
-rw-r--r-- kernel/irq/resend.c | 7
-rw-r--r-- kernel/kallsyms.c | 27
-rw-r--r-- kernel/kallsyms_selftest.c | 23
-rw-r--r-- kernel/locking/locktorture.c | 12
-rw-r--r-- kernel/locking/qspinlock_paravirt.h | 20
-rw-r--r-- kernel/nsproxy.c | 4
-rw-r--r-- kernel/rcu/rcu.h | 8
-rw-r--r-- kernel/rcu/rcuscale.c | 83
-rw-r--r-- kernel/rcu/rcutorture.c | 7
-rw-r--r-- kernel/rcu/refscale.c | 37
-rw-r--r-- kernel/rcu/tasks.h | 136
-rw-r--r-- kernel/rcu/tree.c | 16
-rw-r--r-- kernel/rcu/tree_nocb.h | 4
-rw-r--r-- kernel/scftorture.c | 12
-rw-r--r-- kernel/sched/completion.c | 26
-rw-r--r-- kernel/sched/core.c | 501
-rw-r--r-- kernel/sched/debug.c | 49
-rw-r--r-- kernel/sched/fair.c | 1347
-rw-r--r-- kernel/sched/features.h | 24
-rw-r--r-- kernel/sched/psi.c | 2
-rw-r--r-- kernel/sched/rt.c | 5
-rw-r--r-- kernel/sched/sched.h | 72
-rw-r--r-- kernel/sched/swait.c | 8
-rw-r--r-- kernel/sched/topology.c | 15
-rw-r--r-- kernel/sched/wait.c | 5
-rw-r--r-- kernel/seccomp.c | 84
-rw-r--r-- kernel/smp.c | 13
-rw-r--r-- kernel/softirq.c | 2
-rw-r--r-- kernel/time/clocksource.c | 8
-rw-r--r-- kernel/torture.c | 39
-rw-r--r-- kernel/trace/trace.c | 70
-rw-r--r-- kernel/trace/trace.h | 10
-rw-r--r-- kernel/trace/trace_events_synth.c | 103
-rw-r--r-- kernel/trace/trace_irqsoff.c | 3
-rw-r--r-- kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r-- lib/Kconfig.debug | 21
-rw-r--r-- lib/Kconfig.ubsan | 10
-rw-r--r-- lib/Makefile | 2
-rw-r--r-- lib/clz_ctz.c | 32
-rw-r--r-- lib/iov_iter.c | 43
-rw-r--r-- lib/list_debug.c | 16
-rw-r--r-- lib/locking-selftest.c | 135
-rw-r--r-- lib/maple_tree.c | 7
-rw-r--r-- lib/radix-tree.c | 1
-rw-r--r-- mm/Makefile | 2
-rw-r--r-- mm/damon/vaddr.c | 2
-rw-r--r-- mm/filemap.c | 65
-rw-r--r-- mm/folio-compat.c | 2
-rw-r--r-- mm/gup.c | 30
-rw-r--r-- mm/hmm.c | 1
-rw-r--r-- mm/huge_memory.c | 11
-rw-r--r-- mm/internal.h | 17
-rw-r--r-- mm/khugepaged.c | 13
-rw-r--r-- mm/ksm.c | 25
-rw-r--r-- mm/madvise.c | 9
-rw-r--r-- mm/memcontrol.c | 2
-rw-r--r-- mm/memory-failure.c | 12
-rw-r--r-- mm/memory.c | 5
-rw-r--r-- mm/mempolicy.c | 22
-rw-r--r-- mm/migrate_device.c | 1
-rw-r--r-- mm/mincore.c | 1
-rw-r--r-- mm/mlock.c | 1
-rw-r--r-- mm/mprotect.c | 1
-rw-r--r-- mm/page-writeback.c | 49
-rw-r--r-- mm/pagewalk.c | 36
-rw-r--r-- mm/readahead.c | 13
-rw-r--r-- mm/shmem.c | 857
-rw-r--r-- mm/shmem_quota.c | 350
-rw-r--r-- mm/truncate.c | 4
-rw-r--r-- mm/vmalloc.c | 4
-rw-r--r-- mm/vmscan.c | 14
-rw-r--r-- net/batman-adv/bat_v_elp.c | 3
-rw-r--r-- net/batman-adv/bat_v_ogm.c | 7
-rw-r--r-- net/batman-adv/hard-interface.c | 14
-rw-r--r-- net/batman-adv/netlink.c | 3
-rw-r--r-- net/batman-adv/soft-interface.c | 3
-rw-r--r-- net/batman-adv/translation-table.c | 1
-rw-r--r-- net/batman-adv/types.h | 6
-rw-r--r-- net/can/isotp.c | 22
-rw-r--r-- net/can/raw.c | 35
-rw-r--r-- net/core/rtnetlink.c | 25
-rw-r--r-- net/dccp/ipv4.c | 4
-rw-r--r-- net/dccp/proto.c | 20
-rw-r--r-- net/devlink/leftover.c | 3
-rw-r--r-- net/ipv4/af_inet.c | 2
-rw-r--r-- net/ipv4/datagram.c | 2
-rw-r--r-- net/ipv4/tcp_ipv4.c | 4
-rw-r--r-- net/mac80211/rx.c | 12
-rw-r--r-- net/netfilter/nf_tables_api.c | 23
-rw-r--r-- net/netfilter/nft_set_hash.c | 3
-rw-r--r-- net/netfilter/nft_set_pipapo.c | 13
-rw-r--r-- net/netfilter/nft_set_rbtree.c | 3
-rw-r--r-- net/sched/sch_api.c | 53
-rw-r--r-- net/sctp/socket.c | 4
-rw-r--r-- net/sunrpc/rpc_pipe.c | 2
-rw-r--r-- net/sunrpc/xprtrdma/verbs.c | 9
-rw-r--r-- samples/ftrace/ftrace-direct-modify.c | 4
-rw-r--r-- samples/ftrace/ftrace-direct-multi-modify.c | 4
-rw-r--r-- samples/ftrace/ftrace-direct-multi.c | 2
-rw-r--r-- samples/ftrace/ftrace-direct-too.c | 2
-rw-r--r-- samples/ftrace/ftrace-direct.c | 2
-rwxr-xr-x scripts/checkpatch.pl | 24
-rw-r--r-- scripts/gcc-plugins/gcc-common.h | 4
-rw-r--r-- security/Kconfig.hardening | 23
-rw-r--r-- security/apparmor/apparmorfs.c | 11
-rw-r--r-- security/apparmor/policy_unpack.c | 11
-rw-r--r-- security/inode.c | 2
-rw-r--r-- security/integrity/ima/ima_policy.c | 4
-rw-r--r-- security/integrity/platform_certs/load_ipl_s390.c | 4
-rw-r--r-- security/loadpin/loadpin.c | 3
-rw-r--r-- security/security.c | 14
-rw-r--r-- security/selinux/hooks.c | 22
-rw-r--r-- security/selinux/selinuxfs.c | 2
-rw-r--r-- security/selinux/ss/policydb.c | 2
-rw-r--r-- security/smack/smack_lsm.c | 51
-rw-r--r-- sound/pci/ymfpci/ymfpci.c | 10
-rw-r--r-- sound/soc/amd/yc/acp6x-mach.c | 9
-rw-r--r-- sound/soc/codecs/cs35l41.c | 2
-rw-r--r-- sound/soc/codecs/cs35l56-i2c.c | 9
-rw-r--r-- sound/soc/codecs/cs35l56-spi.c | 9
-rw-r--r-- sound/soc/codecs/cs35l56.c | 31
-rw-r--r-- sound/soc/codecs/tas2781-comlib.c | 19
-rw-r--r-- sound/soc/sof/ipc4-pcm.c | 3
-rw-r--r-- tools/arch/x86/include/uapi/asm/unistd_32.h | 3
-rw-r--r-- tools/arch/x86/include/uapi/asm/unistd_64.h | 3
-rw-r--r-- tools/include/linux/compiler.h | 18
-rw-r--r-- tools/perf/bench/Build | 1
-rw-r--r-- tools/perf/bench/bench.h | 1
-rw-r--r-- tools/perf/bench/sched-seccomp-notify.c | 178
-rw-r--r-- tools/perf/builtin-bench.c | 1
-rw-r--r-- tools/power/x86/turbostat/turbostat.c | 2
-rw-r--r-- tools/testing/selftests/Makefile | 1
-rw-r--r-- tools/testing/selftests/arm64/Makefile | 2
-rw-r--r-- tools/testing/selftests/arm64/abi/hwcap.c | 319
-rw-r--r-- tools/testing/selftests/arm64/abi/syscall-abi.c | 38
-rw-r--r-- tools/testing/selftests/arm64/bti/Makefile | 45
-rw-r--r-- tools/testing/selftests/arm64/bti/compiler.h | 21
-rw-r--r-- tools/testing/selftests/arm64/bti/gen/.gitignore | 2
-rw-r--r-- tools/testing/selftests/arm64/bti/system.c | 4
-rw-r--r-- tools/testing/selftests/arm64/bti/system.h | 4
-rw-r--r-- tools/testing/selftests/arm64/bti/test.c | 1
-rw-r--r-- tools/testing/selftests/arm64/fp/vec-syscfg.c | 127
-rw-r--r-- tools/testing/selftests/arm64/signal/test_signals_utils.h | 27
-rw-r--r-- tools/testing/selftests/arm64/signal/testcases/zt_regs.c | 1
-rw-r--r-- tools/testing/selftests/cachestat/test_cachestat.c | 80
-rw-r--r-- tools/testing/selftests/cgroup/test_kmem.c | 4
-rw-r--r-- tools/testing/selftests/drivers/net/bonding/Makefile | 4
-rwxr-xr-x tools/testing/selftests/drivers/net/bonding/bond-break-lacpdu-tx.sh | 4
-rwxr-xr-x tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh | 99
-rwxr-xr-x tools/testing/selftests/drivers/net/bonding/bond_options.sh | 3
-rw-r--r-- tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh | 158
-rw-r--r-- tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh | 118
-rwxr-xr-x tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh | 16
-rw-r--r-- tools/testing/selftests/fchmodat2/.gitignore (renamed from tools/testing/selftests/rcutorture/formal/srcu-cbmc/.gitignore) | 2
-rw-r--r-- tools/testing/selftests/fchmodat2/Makefile | 6
-rw-r--r-- tools/testing/selftests/fchmodat2/fchmodat2_test.c | 142
-rw-r--r-- tools/testing/selftests/filelock/Makefile | 5
-rw-r--r-- tools/testing/selftests/filelock/ofdlocks.c | 132
-rw-r--r-- tools/testing/selftests/ftrace/test.d/00basic/snapshot1.tc | 31
-rw-r--r-- tools/testing/selftests/kselftest_harness.h | 11
-rw-r--r-- tools/testing/selftests/mm/hmm-tests.c | 7
-rw-r--r-- tools/testing/selftests/net/.gitignore | 2
-rw-r--r-- tools/testing/selftests/nolibc/nolibc-test.c | 1
-rwxr-xr-x tools/testing/selftests/rcutorture/bin/configcheck.sh | 61
-rw-r--r-- tools/testing/selftests/rcutorture/bin/functions.sh | 2
-rwxr-xr-x tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh | 8
-rwxr-xr-x tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 44
-rwxr-xr-x tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 12
-rwxr-xr-x tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 12
-rwxr-xr-x tools/testing/selftests/rcutorture/bin/kvm.sh | 2
-rwxr-xr-x tools/testing/selftests/rcutorture/bin/mkinitrd.sh | 17
-rwxr-xr-x tools/testing/selftests/rcutorture/bin/torture.sh | 127
-rw-r--r-- tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh | 5
-rw-r--r-- tools/testing/selftests/rcutorture/configs/rcu/TASKS03 | 1
-rw-r--r-- tools/testing/selftests/rcutorture/configs/rcu/TREE01 | 1
-rw-r--r-- tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh | 5
-rw-r--r-- tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon | 2
-rw-r--r-- tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 | 2
-rw-r--r-- tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh | 5
-rw-r--r-- tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT | 1
-rw-r--r-- tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh | 5
-rw-r--r-- tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT | 2
-rw-r--r-- tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh | 5
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/Makefile | 17
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/delay.h | 0
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/export.h | 0
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/mutex.h | 0
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/percpu.h | 0
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/preempt.h | 0
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/rcupdate.h | 0
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/sched.h | 0
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/smp.h | 0
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/workqueue.h | 0
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/uapi/linux/types.h | 0
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore | 2
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/kconfig.h | 1
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h | 152
-rwxr-xr-x tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk | 376
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/assume.h | 17
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h | 41
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/bug_on.h | 14
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/combined_source.c | 14
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h | 28
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/include_srcu.c | 32
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/int_typedefs.h | 34
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h | 221
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.c | 12
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.h | 58
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/percpu.h | 93
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.c | 79
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.h | 59
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/simple_sync_srcu.c | 51
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/workqueues.h | 103
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore | 2
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/Makefile | 12
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/assert_end.fail | 1
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force.fail | 1
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force2.fail | 1
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force3.fail | 1
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/main.pass | 0
-rw-r--r-- tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/test.c | 73
-rwxr-xr-x tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/test_script.sh | 103
-rw-r--r-- tools/testing/selftests/seccomp/seccomp_bpf.c | 67
1205 files changed, 19596 insertions, 13861 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-netdev b/Documentation/ABI/testing/sysfs-class-led-trigger-netdev
index 78b62a23b14a..f6d9d72ce77b 100644
--- a/Documentation/ABI/testing/sysfs-class-led-trigger-netdev
+++ b/Documentation/ABI/testing/sysfs-class-led-trigger-netdev
@@ -13,7 +13,7 @@ Description:
Specifies the duration of the LED blink in milliseconds.
Defaults to 50 ms.
- With hw_control ON, the interval value MUST be set to the
+ When offloaded is true, the interval value MUST be set to the
default value and cannot be changed.
Trying to set any value in this specific mode will return
an EINVAL error.
@@ -44,8 +44,8 @@ Description:
If set to 1, the LED will blink for the milliseconds specified
in interval to signal transmission.
- With hw_control ON, the blink interval is controlled by hardware
- and won't reflect the value set in interval.
+ When offloaded is true, the blink interval is controlled by
+ hardware and won't reflect the value set in interval.
What: /sys/class/leds/<led>/rx
Date: Dec 2017
@@ -59,21 +59,21 @@ Description:
If set to 1, the LED will blink for the milliseconds specified
in interval to signal reception.
- With hw_control ON, the blink interval is controlled by hardware
- and won't reflect the value set in interval.
+ When offloaded is true, the blink interval is controlled by
+ hardware and won't reflect the value set in interval.
-What: /sys/class/leds/<led>/hw_control
+What: /sys/class/leds/<led>/offloaded
Date: Jun 2023
KernelVersion: 6.5
Contact: linux-leds@vger.kernel.org
Description:
- Communicate whether the LED trigger modes are driven by hardware
- or software fallback is used.
+ Communicate whether the LED trigger modes are offloaded to
+ hardware or whether software fallback is used.
If 0, the LED is using software fallback to blink.
- If 1, the LED is using hardware control to blink and signal the
- requested modes.
+ If 1, the LED blinking in requested mode is offloaded to
+ hardware.
What: /sys/class/leds/<led>/link_10
Date: Jun 2023
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 77942eedf4f6..183a07c4f191 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -556,6 +556,7 @@ Description: Control Symmetric Multi Threading (SMT)
================ =========================================
"on" SMT is enabled
"off" SMT is disabled
+ "<N>" SMT is enabled with N threads per core.
"forceoff" SMT is force disabled. Cannot be changed.
"notsupported" SMT is not supported by the CPU
"notimplemented" SMT runtime toggling is not
diff --git a/Documentation/ABI/testing/sysfs-driver-chromeos-acpi b/Documentation/ABI/testing/sysfs-driver-chromeos-acpi
index c308926e1568..d46b1c85840d 100644
--- a/Documentation/ABI/testing/sysfs-driver-chromeos-acpi
+++ b/Documentation/ABI/testing/sysfs-driver-chromeos-acpi
@@ -1,4 +1,5 @@
What: /sys/bus/platform/devices/GGL0001:*/BINF.2
+ /sys/bus/platform/devices/GOOG0016:*/BINF.2
Date: May 2022
KernelVersion: 5.19
Description:
@@ -10,6 +11,7 @@ Description:
== ===============================
What: /sys/bus/platform/devices/GGL0001:*/BINF.3
+ /sys/bus/platform/devices/GOOG0016:*/BINF.3
Date: May 2022
KernelVersion: 5.19
Description:
@@ -23,6 +25,7 @@ Description:
== =====================================
What: /sys/bus/platform/devices/GGL0001:*/CHSW
+ /sys/bus/platform/devices/GOOG0016:*/CHSW
Date: May 2022
KernelVersion: 5.19
Description:
@@ -38,6 +41,7 @@ Description:
==== ===========================================
What: /sys/bus/platform/devices/GGL0001:*/FMAP
+ /sys/bus/platform/devices/GOOG0016:*/FMAP
Date: May 2022
KernelVersion: 5.19
Description:
@@ -45,6 +49,7 @@ Description:
processor firmware flashmap.
What: /sys/bus/platform/devices/GGL0001:*/FRID
+ /sys/bus/platform/devices/GOOG0016:*/FRID
Date: May 2022
KernelVersion: 5.19
Description:
@@ -52,6 +57,7 @@ Description:
main processor firmware.
What: /sys/bus/platform/devices/GGL0001:*/FWID
+ /sys/bus/platform/devices/GOOG0016:*/FWID
Date: May 2022
KernelVersion: 5.19
Description:
@@ -59,6 +65,7 @@ Description:
main processor firmware.
What: /sys/bus/platform/devices/GGL0001:*/GPIO.X/GPIO.0
+ /sys/bus/platform/devices/GOOG0016:*/GPIO.X/GPIO.0
Date: May 2022
KernelVersion: 5.19
Description:
@@ -73,6 +80,7 @@ Description:
=========== ==================================
What: /sys/bus/platform/devices/GGL0001:*/GPIO.X/GPIO.1
+ /sys/bus/platform/devices/GOOG0016:*/GPIO.X/GPIO.1
Date: May 2022
KernelVersion: 5.19
Description:
@@ -84,6 +92,7 @@ Description:
== =======================
What: /sys/bus/platform/devices/GGL0001:*/GPIO.X/GPIO.2
+ /sys/bus/platform/devices/GOOG0016:*/GPIO.X/GPIO.2
Date: May 2022
KernelVersion: 5.19
Description:
@@ -91,18 +100,21 @@ Description:
controller.
What: /sys/bus/platform/devices/GGL0001:*/GPIO.X/GPIO.3
+ /sys/bus/platform/devices/GOOG0016:*/GPIO.X/GPIO.3
Date: May 2022
KernelVersion: 5.19
Description:
Returns name of the GPIO controller.
What: /sys/bus/platform/devices/GGL0001:*/HWID
+ /sys/bus/platform/devices/GOOG0016:*/HWID
Date: May 2022
KernelVersion: 5.19
Description:
Returns hardware ID for the Chromebook.
What: /sys/bus/platform/devices/GGL0001:*/MECK
+ /sys/bus/platform/devices/GOOG0016:*/MECK
Date: May 2022
KernelVersion: 5.19
Description:
@@ -113,6 +125,7 @@ Description:
present, or if the firmware was unable to read the extended registers, this buffer size can be zero.
What: /sys/bus/platform/devices/GGL0001:*/VBNV.0
+ /sys/bus/platform/devices/GOOG0016:*/VBNV.0
Date: May 2022
KernelVersion: 5.19
Description:
@@ -122,6 +135,7 @@ Description:
clock data).
What: /sys/bus/platform/devices/GGL0001:*/VBNV.1
+ /sys/bus/platform/devices/GOOG0016:*/VBNV.1
Date: May 2022
KernelVersion: 5.19
Description:
@@ -129,9 +143,10 @@ Description:
storage block.
What: /sys/bus/platform/devices/GGL0001:*/VDAT
+ /sys/bus/platform/devices/GOOG0016:*/VDAT
Date: May 2022
KernelVersion: 5.19
Description:
Returns the verified boot data block shared between the
firmware verification step and the kernel verification step
- (binary).
+ (hex dump).
diff --git a/Documentation/RCU/lockdep-splat.rst b/Documentation/RCU/lockdep-splat.rst
index 2a5c79db57dc..bcbc4b3c88d7 100644
--- a/Documentation/RCU/lockdep-splat.rst
+++ b/Documentation/RCU/lockdep-splat.rst
@@ -10,7 +10,7 @@ misuses of the RCU API, most notably using one of the rcu_dereference()
family to access an RCU-protected pointer without the proper protection.
When such misuse is detected, a lockdep-RCU splat is emitted.
-The usual cause of a lockdep-RCU slat is someone accessing an
+The usual cause of a lockdep-RCU splat is someone accessing an
RCU-protected data structure without either (1) being in the right kind of
RCU read-side critical section or (2) holding the right update-side lock.
This problem can therefore be serious: it might result in random memory
diff --git a/Documentation/RCU/rculist_nulls.rst b/Documentation/RCU/rculist_nulls.rst
index 9a734bf54b76..21e40fcc08de 100644
--- a/Documentation/RCU/rculist_nulls.rst
+++ b/Documentation/RCU/rculist_nulls.rst
@@ -18,7 +18,16 @@ to solve following problem.
Without 'nulls', a typical RCU linked list managing objects which are
allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can use the following
-algorithms:
+algorithms. Following examples assume 'obj' is a pointer to such
+objects, which is having below type.
+
+::
+
+ struct object {
+ struct hlist_node obj_node;
+ atomic_t refcnt;
+ unsigned int key;
+ };
1) Lookup algorithm
-------------------
@@ -26,11 +35,13 @@ algorithms:
::
begin:
- rcu_read_lock()
+ rcu_read_lock();
obj = lockless_lookup(key);
if (obj) {
- if (!try_get_ref(obj)) // might fail for free objects
+ if (!try_get_ref(obj)) { // might fail for free objects
+ rcu_read_unlock();
goto begin;
+ }
/*
* Because a writer could delete object, and a writer could
* reuse these object before the RCU grace period, we
@@ -54,7 +65,7 @@ but a version with an additional memory barrier (smp_rmb())
struct hlist_node *node, *next;
for (pos = rcu_dereference((head)->first);
pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
- ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
+ ({ obj = hlist_entry(pos, typeof(*obj), obj_node); 1; });
pos = rcu_dereference(next))
if (obj->key == key)
return obj;
@@ -66,10 +77,10 @@ And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb()::
struct hlist_node *node;
for (pos = rcu_dereference((head)->first);
pos && ({ prefetch(pos->next); 1; }) &&
- ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
+ ({ obj = hlist_entry(pos, typeof(*obj), obj_node); 1; });
pos = rcu_dereference(pos->next))
- if (obj->key == key)
- return obj;
+ if (obj->key == key)
+ return obj;
return NULL;
Quoting Corey Minyard::
@@ -86,7 +97,7 @@ Quoting Corey Minyard::
2) Insertion algorithm
----------------------
-We need to make sure a reader cannot read the new 'obj->obj_next' value
+We need to make sure a reader cannot read the new 'obj->obj_node.next' value
and the previous value of 'obj->key'. Otherwise, an item could be deleted
from a chain, and inserted into another chain. If the new chain was empty
before the move, the 'next' pointer is NULL, and a lockless reader can not
@@ -129,8 +140,7 @@ very very fast (before the end of RCU grace period)
Avoiding extra smp_rmb()
========================
-With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup()
-and extra _release() in insert function.
+With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup().
For example, if we choose to store the slot number as the 'nulls'
end-of-list marker for each slot of the hash table, we can detect
@@ -142,6 +152,9 @@ the beginning. If the object was moved to the same chain,
then the reader doesn't care: It might occasionally
scan the list again without harm.
+Note that using hlist_nulls means the type of 'obj_node' field of
+'struct object' becomes 'struct hlist_nulls_node'.
+
1) lookup algorithm
-------------------
@@ -151,7 +164,7 @@ scan the list again without harm.
head = &table[slot];
begin:
rcu_read_lock();
- hlist_nulls_for_each_entry_rcu(obj, node, head, member) {
+ hlist_nulls_for_each_entry_rcu(obj, node, head, obj_node) {
if (obj->key == key) {
if (!try_get_ref(obj)) { // might fail for free objects
rcu_read_unlock();
@@ -182,6 +195,9 @@ scan the list again without harm.
2) Insert algorithm
-------------------
+The same as the insertion algorithm described above, but using
+hlist_nulls_add_head_rcu() instead of hlist_add_head_rcu().
+
::
/*
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 722b6eca2e93..0c38a8af95ce 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -553,7 +553,7 @@
others).
ccw_timeout_log [S390]
- See Documentation/s390/common_io.rst for details.
+ See Documentation/arch/s390/common_io.rst for details.
cgroup_disable= [KNL] Disable a particular controller or optional feature
Format: {name of the controller(s) or feature(s) to disable}
@@ -598,7 +598,7 @@
Setting checkreqprot to 1 is deprecated.
cio_ignore= [S390]
- See Documentation/s390/common_io.rst for details.
+ See Documentation/arch/s390/common_io.rst for details.
clearcpuid=X[,X...] [X86]
Disable CPUID feature X for the kernel. See
@@ -2938,6 +2938,10 @@
locktorture.torture_type= [KNL]
Specify the locking implementation to test.
+ locktorture.writer_fifo= [KNL]
+ Run the write-side locktorture kthreads at
+ sched_set_fifo() real-time priority.
+
locktorture.verbose= [KNL]
Enable additional printk() statements.
@@ -4949,6 +4953,15 @@
test until boot completes in order to avoid
interference.
+ rcuscale.kfree_by_call_rcu= [KNL]
+ In kernels built with CONFIG_RCU_LAZY=y, test
+ call_rcu() instead of kfree_rcu().
+
+ rcuscale.kfree_mult= [KNL]
+ Instead of allocating an object of size kfree_obj,
+ allocate one of kfree_mult * sizeof(kfree_obj).
+ Defaults to 1.
+
rcuscale.kfree_rcu_test= [KNL]
Set to measure performance of kfree_rcu() flooding.
@@ -4974,6 +4987,12 @@
Number of loops doing rcuscale.kfree_alloc_num number
of allocations and frees.
+ rcuscale.minruntime= [KNL]
+ Set the minimum test run time in seconds. This
+ does not affect the data-collection interval,
+ but instead allows better measurement of things
+ like CPU consumption.
+
rcuscale.nreaders= [KNL]
Set number of RCU readers. The value -1 selects
N, where N is the number of CPUs. A value
@@ -4988,7 +5007,7 @@
the same as for rcuscale.nreaders.
N, where N is the number of CPUs
- rcuscale.perf_type= [KNL]
+ rcuscale.scale_type= [KNL]
Specify the RCU implementation to test.
rcuscale.shutdown= [KNL]
@@ -5004,6 +5023,11 @@
in microseconds. The default of zero says
no holdoff.
+ rcuscale.writer_holdoff_jiffies= [KNL]
+ Additional write-side holdoff between grace
+ periods, but in jiffies. The default of zero
+ says no holdoff.
+
rcutorture.fqs_duration= [KNL]
Set duration of force_quiescent_state bursts
in microseconds.
@@ -5285,6 +5309,13 @@
number avoids disturbing real-time workloads,
but lengthens grace periods.
+ rcupdate.rcu_task_lazy_lim= [KNL]
+ Number of callbacks on a given CPU that will
+ cancel laziness on that CPU. Use -1 to disable
+ cancellation of laziness, but be advised that
+ doing so increases the danger of OOM due to
+ callback flooding.
+
rcupdate.rcu_task_stall_info= [KNL]
Set initial timeout in jiffies for RCU task stall
informational messages, which give some indication
@@ -5314,6 +5345,29 @@
A change in value does not take effect until
the beginning of the next grace period.
+ rcupdate.rcu_tasks_lazy_ms= [KNL]
+			Set the timeout in milliseconds for RCU Tasks
+			asynchronous callback batching for call_rcu_tasks().
+ A negative value will take the default. A value
+ of zero will disable batching. Batching is
+ always disabled for synchronize_rcu_tasks().
+
+ rcupdate.rcu_tasks_rude_lazy_ms= [KNL]
+			Set the timeout in milliseconds for RCU Tasks
+			Rude asynchronous callback batching for
+			call_rcu_tasks_rude(). A negative value
+ will take the default. A value of zero will
+ disable batching. Batching is always disabled
+ for synchronize_rcu_tasks_rude().
+
+ rcupdate.rcu_tasks_trace_lazy_ms= [KNL]
+			Set the timeout in milliseconds for RCU Tasks
+			Trace asynchronous callback batching for
+			call_rcu_tasks_trace(). A negative value
+ will take the default. A value of zero will
+ disable batching. Batching is always disabled
+ for synchronize_rcu_tasks_trace().
+
rcupdate.rcu_self_test= [KNL]
Run the RCU early boot self tests
@@ -5522,6 +5576,10 @@
Useful for devices that are detected asynchronously
(e.g. USB and MMC devices).
+ rootwait= [KNL] Maximum time (in seconds) to wait for root device
+ to show up before attempting to mount the root
+ filesystem.
+
rproc_mem=nn[KMG][@address]
[KNL,ARM,CMA] Remoteproc physical memory block.
Memory area to be used by remote processor image,
@@ -6275,10 +6333,6 @@
-1: disable all critical trip points in all thermal zones
<degrees C>: override all critical trip points
- thermal.nocrt= [HW,ACPI]
- Set to disable actions on ACPI thermal zone
- critical and hot trip points.
-
thermal.off= [HW,ACPI]
1: disable ACPI thermal control
diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst
index bedd3a1d7b42..e96f057ea2a0 100644
--- a/Documentation/arch/arm64/silicon-errata.rst
+++ b/Documentation/arch/arm64/silicon-errata.rst
@@ -63,6 +63,14 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A510 | #1902691 | ARM64_ERRATUM_1902691 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2051678 | ARM64_ERRATUM_2051678 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2077057 | ARM64_ERRATUM_2077057 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2441009 | ARM64_ERRATUM_2441009 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2658417 | ARM64_ERRATUM_2658417 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 |
@@ -109,14 +117,6 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A77 | #1508412 | ARM64_ERRATUM_1508412 |
+----------------+-----------------+-----------------+-----------------------------+
-| ARM | Cortex-A510 | #2051678 | ARM64_ERRATUM_2051678 |
-+----------------+-----------------+-----------------+-----------------------------+
-| ARM | Cortex-A510 | #2077057 | ARM64_ERRATUM_2077057 |
-+----------------+-----------------+-----------------+-----------------------------+
-| ARM | Cortex-A510 | #2441009 | ARM64_ERRATUM_2441009 |
-+----------------+-----------------+-----------------+-----------------------------+
-| ARM | Cortex-A510 | #2658417 | ARM64_ERRATUM_2658417 |
-+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 |
@@ -198,6 +198,9 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| Hisilicon | Hip08 SMMU PMCG | #162001800 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
+| Hisilicon | Hip08 SMMU PMCG | #162001900 | N/A |
+| | Hip09 SMMU PMCG | | |
++----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
| Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 |
+----------------+-----------------+-----------------+-----------------------------+
diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst
index ba529a1dc606..3d0e53ecac4f 100644
--- a/Documentation/arch/arm64/sme.rst
+++ b/Documentation/arch/arm64/sme.rst
@@ -322,7 +322,7 @@ The regset data starts with struct user_za_header, containing:
VL is supported.
* The size and layout of the payload depends on the header fields. The
- SME_PT_ZA_*() macros are provided to facilitate access to the data.
+ ZA_PT_ZA*() macros are provided to facilitate access to the data.
* In either case, for SETREGSET it is permissible to omit the payload, in which
case the vector length and flags are changed and PSTATE.ZA is set to 0
diff --git a/Documentation/arch/index.rst b/Documentation/arch/index.rst
index 8458b88e9b79..c9a209878cf3 100644
--- a/Documentation/arch/index.rst
+++ b/Documentation/arch/index.rst
@@ -21,7 +21,7 @@ implementation.
parisc/index
../powerpc/index
../riscv/index
- ../s390/index
+ s390/index
sh/index
sparc/index
x86/index
diff --git a/Documentation/s390/3270.ChangeLog b/Documentation/arch/s390/3270.ChangeLog
index ecaf60b6c381..ecaf60b6c381 100644
--- a/Documentation/s390/3270.ChangeLog
+++ b/Documentation/arch/s390/3270.ChangeLog
diff --git a/Documentation/s390/3270.rst b/Documentation/arch/s390/3270.rst
index e09e77954238..467eace91473 100644
--- a/Documentation/s390/3270.rst
+++ b/Documentation/arch/s390/3270.rst
@@ -116,7 +116,7 @@ Here are the installation steps in detail:
as a 3270, not a 3215.
5. Run the 3270 configuration script config3270. It is
- distributed in this same directory, Documentation/s390, as
+ distributed in this same directory, Documentation/arch/s390, as
config3270.sh. Inspect the output script it produces,
/tmp/mkdev3270, and then run that script. This will create the
necessary character special device files and make the necessary
@@ -125,7 +125,7 @@ Here are the installation steps in detail:
Then notify /sbin/init that /etc/inittab has changed, by issuing
the telinit command with the q operand::
- cd Documentation/s390
+ cd Documentation/arch/s390
sh config3270.sh
sh /tmp/mkdev3270
telinit q
diff --git a/Documentation/s390/cds.rst b/Documentation/arch/s390/cds.rst
index 7006d8209d2e..bcad2a14244a 100644
--- a/Documentation/s390/cds.rst
+++ b/Documentation/arch/s390/cds.rst
@@ -39,7 +39,7 @@ some of them are ESA/390 platform specific.
Note:
In order to write a driver for S/390, you also need to look into the interface
- described in Documentation/s390/driver-model.rst.
+ described in Documentation/arch/s390/driver-model.rst.
Note for porting drivers from 2.4:
diff --git a/Documentation/s390/common_io.rst b/Documentation/arch/s390/common_io.rst
index 846485681ce7..6dcb40cb7145 100644
--- a/Documentation/s390/common_io.rst
+++ b/Documentation/arch/s390/common_io.rst
@@ -136,5 +136,5 @@ debugfs entries
The level of logging can be changed to be more or less verbose by piping to
/sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the
- documentation on the S/390 debug feature (Documentation/s390/s390dbf.rst)
+ documentation on the S/390 debug feature (Documentation/arch/s390/s390dbf.rst)
for details.
diff --git a/Documentation/s390/config3270.sh b/Documentation/arch/s390/config3270.sh
index 515e2f431487..515e2f431487 100644
--- a/Documentation/s390/config3270.sh
+++ b/Documentation/arch/s390/config3270.sh
diff --git a/Documentation/s390/driver-model.rst b/Documentation/arch/s390/driver-model.rst
index ad4bc2dbea43..ad4bc2dbea43 100644
--- a/Documentation/s390/driver-model.rst
+++ b/Documentation/arch/s390/driver-model.rst
diff --git a/Documentation/s390/features.rst b/Documentation/arch/s390/features.rst
index 57c296a9d8f3..57c296a9d8f3 100644
--- a/Documentation/s390/features.rst
+++ b/Documentation/arch/s390/features.rst
diff --git a/Documentation/s390/index.rst b/Documentation/arch/s390/index.rst
index 73c79bf586fd..73c79bf586fd 100644
--- a/Documentation/s390/index.rst
+++ b/Documentation/arch/s390/index.rst
diff --git a/Documentation/s390/monreader.rst b/Documentation/arch/s390/monreader.rst
index 21cdfb699b49..21cdfb699b49 100644
--- a/Documentation/s390/monreader.rst
+++ b/Documentation/arch/s390/monreader.rst
diff --git a/Documentation/s390/pci.rst b/Documentation/arch/s390/pci.rst
index a1a72a47dc96..d5755484d8e7 100644
--- a/Documentation/s390/pci.rst
+++ b/Documentation/arch/s390/pci.rst
@@ -40,7 +40,7 @@ For example:
Change the level of logging to be more or less verbose by piping
a number between 0 and 6 to /sys/kernel/debug/s390dbf/pci_*/level. For
details, see the documentation on the S/390 debug feature at
- Documentation/s390/s390dbf.rst.
+ Documentation/arch/s390/s390dbf.rst.
Sysfs entries
=============
diff --git a/Documentation/s390/qeth.rst b/Documentation/arch/s390/qeth.rst
index f02fdaa68de0..f02fdaa68de0 100644
--- a/Documentation/s390/qeth.rst
+++ b/Documentation/arch/s390/qeth.rst
diff --git a/Documentation/s390/s390dbf.rst b/Documentation/arch/s390/s390dbf.rst
index af8bdc3629e7..af8bdc3629e7 100644
--- a/Documentation/s390/s390dbf.rst
+++ b/Documentation/arch/s390/s390dbf.rst
diff --git a/Documentation/s390/text_files.rst b/Documentation/arch/s390/text_files.rst
index c94d05d4fa17..c94d05d4fa17 100644
--- a/Documentation/s390/text_files.rst
+++ b/Documentation/arch/s390/text_files.rst
diff --git a/Documentation/s390/vfio-ap-locking.rst b/Documentation/arch/s390/vfio-ap-locking.rst
index 0dfcdb562e21..0dfcdb562e21 100644
--- a/Documentation/s390/vfio-ap-locking.rst
+++ b/Documentation/arch/s390/vfio-ap-locking.rst
diff --git a/Documentation/s390/vfio-ap.rst b/Documentation/arch/s390/vfio-ap.rst
index bb3f4c4e2885..bb3f4c4e2885 100644
--- a/Documentation/s390/vfio-ap.rst
+++ b/Documentation/arch/s390/vfio-ap.rst
diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/arch/s390/vfio-ccw.rst
index 37026fa18179..42960b7b0d70 100644
--- a/Documentation/s390/vfio-ccw.rst
+++ b/Documentation/arch/s390/vfio-ccw.rst
@@ -440,6 +440,6 @@ Reference
1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832)
2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204)
3. https://en.wikipedia.org/wiki/Channel_I/O
-4. Documentation/s390/cds.rst
+4. Documentation/arch/s390/cds.rst
5. Documentation/driver-api/vfio.rst
6. Documentation/driver-api/vfio-mediated-device.rst
diff --git a/Documentation/s390/zfcpdump.rst b/Documentation/arch/s390/zfcpdump.rst
index a61de7aa8778..a61de7aa8778 100644
--- a/Documentation/s390/zfcpdump.rst
+++ b/Documentation/arch/s390/zfcpdump.rst
diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst
index 33520ecdb37a..cdbca15a4fc2 100644
--- a/Documentation/arch/x86/boot.rst
+++ b/Documentation/arch/x86/boot.rst
@@ -1417,7 +1417,7 @@ execution context provided by the EFI firmware.
The function prototype for the handover entry point looks like this::
- efi_main(void *handle, efi_system_table_t *table, struct boot_params *bp)
+ efi_stub_entry(void *handle, efi_system_table_t *table, struct boot_params *bp)
'handle' is the EFI image handle passed to the boot loader by the EFI
firmware, 'table' is the EFI system table - these are the first two
diff --git a/Documentation/core-api/cpu_hotplug.rst b/Documentation/core-api/cpu_hotplug.rst
index e6f5bc39cf5c..b9ae591d0b18 100644
--- a/Documentation/core-api/cpu_hotplug.rst
+++ b/Documentation/core-api/cpu_hotplug.rst
@@ -395,8 +395,8 @@ multi-instance state the following function is available:
* cpuhp_setup_state_multi(state, name, startup, teardown)
The @state argument is either a statically allocated state or one of the
-constants for dynamically allocated states - CPUHP_PREPARE_DYN,
-CPUHP_ONLINE_DYN - depending on the state section (PREPARE, ONLINE) for
+constants for dynamically allocated states - CPUHP_BP_PREPARE_DYN,
+CPUHP_AP_ONLINE_DYN - depending on the state section (PREPARE, ONLINE) for
which a dynamic state should be allocated.
The @name argument is used for sysfs output and for instrumentation. The
@@ -588,7 +588,7 @@ notifications on online and offline operations::
Setup and teardown a dynamically allocated state in the ONLINE section
for notifications on offline operations::
- state = cpuhp_setup_state(CPUHP_ONLINE_DYN, "subsys:offline", NULL, subsys_cpu_offline);
+ state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "subsys:offline", NULL, subsys_cpu_offline);
if (state < 0)
return state;
....
@@ -597,7 +597,7 @@ for notifications on offline operations::
Setup and teardown a dynamically allocated state in the ONLINE section
for notifications on online operations without invoking the callbacks::
- state = cpuhp_setup_state_nocalls(CPUHP_ONLINE_DYN, "subsys:online", subsys_cpu_online, NULL);
+ state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "subsys:online", subsys_cpu_online, NULL);
if (state < 0)
return state;
....
@@ -606,7 +606,7 @@ for notifications on online operations without invoking the callbacks::
Setup, use and teardown a dynamically allocated multi-instance state in the
ONLINE section for notifications on online and offline operation::
- state = cpuhp_setup_state_multi(CPUHP_ONLINE_DYN, "subsys:online", subsys_cpu_online, subsys_cpu_offline);
+ state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "subsys:online", subsys_cpu_online, subsys_cpu_offline);
if (state < 0)
return state;
....
diff --git a/Documentation/devicetree/bindings/arm/pmu.yaml b/Documentation/devicetree/bindings/arm/pmu.yaml
index e14358bf0b9c..99b5e9530707 100644
--- a/Documentation/devicetree/bindings/arm/pmu.yaml
+++ b/Documentation/devicetree/bindings/arm/pmu.yaml
@@ -49,9 +49,14 @@ properties:
- arm,cortex-a77-pmu
- arm,cortex-a78-pmu
- arm,cortex-a510-pmu
+ - arm,cortex-a520-pmu
- arm,cortex-a710-pmu
+ - arm,cortex-a715-pmu
+ - arm,cortex-a720-pmu
- arm,cortex-x1-pmu
- arm,cortex-x2-pmu
+ - arm,cortex-x3-pmu
+ - arm,cortex-x4-pmu
- arm,neoverse-e1-pmu
- arm,neoverse-n1-pmu
- arm,neoverse-n2-pmu
diff --git a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml
index e84e4f33b358..3d06db98e978 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml
+++ b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml
@@ -35,6 +35,7 @@ properties:
- amlogic,meson-sm1-gpio-intc
- amlogic,meson-a1-gpio-intc
- amlogic,meson-s4-gpio-intc
+ - amlogic,c3-gpio-intc
- const: amlogic,meson-gpio-intc
reg:
diff --git a/Documentation/driver-api/s390-drivers.rst b/Documentation/driver-api/s390-drivers.rst
index 5158577bc29b..8c0845c4eee7 100644
--- a/Documentation/driver-api/s390-drivers.rst
+++ b/Documentation/driver-api/s390-drivers.rst
@@ -27,7 +27,7 @@ not strictly considered I/O devices. They are considered here as well,
although they are not the focus of this document.
Some additional information can also be found in the kernel source under
-Documentation/s390/driver-model.rst.
+Documentation/arch/s390/driver-model.rst.
The css bus
===========
@@ -38,7 +38,7 @@ into several categories:
* Standard I/O subchannels, for use by the system. They have a child
device on the ccw bus and are described below.
* I/O subchannels bound to the vfio-ccw driver. See
- Documentation/s390/vfio-ccw.rst.
+ Documentation/arch/s390/vfio-ccw.rst.
* Message subchannels. No Linux driver currently exists.
* CHSC subchannels (at most one). The chsc subchannel driver can be used
to send asynchronous chsc commands.
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index eccd327e6df5..a624e92f2687 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -332,54 +332,121 @@ Encryption modes and usage
fscrypt allows one encryption mode to be specified for file contents
and one encryption mode to be specified for filenames. Different
directory trees are permitted to use different encryption modes.
+
+Supported modes
+---------------
+
Currently, the following pairs of encryption modes are supported:
- AES-256-XTS for contents and AES-256-CTS-CBC for filenames
-- AES-128-CBC for contents and AES-128-CTS-CBC for filenames
+- AES-256-XTS for contents and AES-256-HCTR2 for filenames
- Adiantum for both contents and filenames
-- AES-256-XTS for contents and AES-256-HCTR2 for filenames (v2 policies only)
-- SM4-XTS for contents and SM4-CTS-CBC for filenames (v2 policies only)
-
-If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
-
-AES-128-CBC was added only for low-powered embedded devices with
-crypto accelerators such as CAAM or CESA that do not support XTS. To
-use AES-128-CBC, CONFIG_CRYPTO_ESSIV and CONFIG_CRYPTO_SHA256 (or
-another SHA-256 implementation) must be enabled so that ESSIV can be
-used.
-
-Adiantum is a (primarily) stream cipher-based mode that is fast even
-on CPUs without dedicated crypto instructions. It's also a true
-wide-block mode, unlike XTS. It can also eliminate the need to derive
-per-file encryption keys. However, it depends on the security of two
-primitives, XChaCha12 and AES-256, rather than just one. See the
-paper "Adiantum: length-preserving encryption for entry-level
-processors" (https://eprint.iacr.org/2018/720.pdf) for more details.
-To use Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled. Also, fast
-implementations of ChaCha and NHPoly1305 should be enabled, e.g.
-CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM.
-
-AES-256-HCTR2 is another true wide-block encryption mode that is intended for
-use on CPUs with dedicated crypto instructions. AES-256-HCTR2 has the property
-that a bitflip in the plaintext changes the entire ciphertext. This property
-makes it desirable for filename encryption since initialization vectors are
-reused within a directory. For more details on AES-256-HCTR2, see the paper
-"Length-preserving encryption with HCTR2"
-(https://eprint.iacr.org/2021/1441.pdf). To use AES-256-HCTR2,
-CONFIG_CRYPTO_HCTR2 must be enabled. Also, fast implementations of XCTR and
-POLYVAL should be enabled, e.g. CRYPTO_POLYVAL_ARM64_CE and
-CRYPTO_AES_ARM64_CE_BLK for ARM64.
-
-SM4 is a Chinese block cipher that is an alternative to AES. It has
-not seen as much security review as AES, and it only has a 128-bit key
-size. It may be useful in cases where its use is mandated.
-Otherwise, it should not be used. For SM4 support to be available, it
-also needs to be enabled in the kernel crypto API.
-
-New encryption modes can be added relatively easily, without changes
-to individual filesystems. However, authenticated encryption (AE)
-modes are not currently supported because of the difficulty of dealing
-with ciphertext expansion.
+- AES-128-CBC-ESSIV for contents and AES-128-CTS-CBC for filenames
+- SM4-XTS for contents and SM4-CTS-CBC for filenames
+
+Authenticated encryption modes are not currently supported because of
+the difficulty of dealing with ciphertext expansion. Therefore,
+contents encryption uses a block cipher in `XTS mode
+<https://en.wikipedia.org/wiki/Disk_encryption_theory#XTS>`_ or
+`CBC-ESSIV mode
+<https://en.wikipedia.org/wiki/Disk_encryption_theory#Encrypted_salt-sector_initialization_vector_(ESSIV)>`_,
+or a wide-block cipher. Filenames encryption uses a
+block cipher in `CTS-CBC mode
+<https://en.wikipedia.org/wiki/Ciphertext_stealing>`_ or a wide-block
+cipher.
+
+The (AES-256-XTS, AES-256-CTS-CBC) pair is the recommended default.
+It is also the only option that is *guaranteed* to always be supported
+if the kernel supports fscrypt at all; see `Kernel config options`_.
+
+The (AES-256-XTS, AES-256-HCTR2) pair is also a good choice that
+upgrades the filenames encryption to use a wide-block cipher. (A
+*wide-block cipher*, also called a tweakable super-pseudorandom
+permutation, has the property that changing one bit scrambles the
+entire result.) As described in `Filenames encryption`_, a wide-block
+cipher is the ideal mode for the problem domain, though CTS-CBC is the
+"least bad" choice among the alternatives. For more information about
+HCTR2, see `the HCTR2 paper <https://eprint.iacr.org/2021/1441.pdf>`_.
+
+Adiantum is recommended on systems where AES is too slow due to lack
+of hardware acceleration for AES. Adiantum is a wide-block cipher
+that uses XChaCha12 and AES-256 as its underlying components. Most of
+the work is done by XChaCha12, which is much faster than AES when AES
+acceleration is unavailable. For more information about Adiantum, see
+`the Adiantum paper <https://eprint.iacr.org/2018/720.pdf>`_.
+
+The (AES-128-CBC-ESSIV, AES-128-CTS-CBC) pair exists only to support
+systems whose only form of AES acceleration is an off-CPU crypto
+accelerator such as CAAM or CESA that does not support XTS.
+
+The remaining mode pairs are the "national pride ciphers":
+
+- (SM4-XTS, SM4-CTS-CBC)
+
+Generally speaking, these ciphers aren't "bad" per se, but they
+receive limited security review compared to the usual choices such as
+AES and ChaCha. They also don't bring much new to the table. It is
+suggested to only use these ciphers where their use is mandated.
+
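+To make this concrete, a userspace sketch applying the recommended
+default pair to an empty directory might look as follows (illustrative
+only; ``dirfd`` and ``key_id`` are assumed to exist, and the key must
+already have been added with FS_IOC_ADD_ENCRYPTION_KEY)::
+
+    #include <string.h>
+    #include <sys/ioctl.h>
+    #include <linux/fscrypt.h>
+
+    struct fscrypt_policy_v2 policy = { 0 };
+
+    policy.version = FSCRYPT_POLICY_V2;
+    policy.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+    policy.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+    policy.flags = FSCRYPT_POLICY_FLAGS_PAD_32;
+    memcpy(policy.master_key_identifier, key_id,
+           FSCRYPT_KEY_IDENTIFIER_SIZE);
+
+    if (ioctl(dirfd, FS_IOC_SET_ENCRYPTION_POLICY, &policy) != 0)
+        /* handle error */;
+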
+Kernel config options
+---------------------
+
+Enabling fscrypt support (CONFIG_FS_ENCRYPTION) automatically pulls in
+only the basic support from the crypto API needed to use AES-256-XTS
+and AES-256-CTS-CBC encryption. For optimal performance, it is
+strongly recommended to also enable any available platform-specific
+kconfig options that provide acceleration for the algorithm(s) you
+wish to use. Support for any "non-default" encryption modes typically
+requires extra kconfig options as well.
+
+Below, some relevant options are listed by encryption mode. Note that
+acceleration options not listed below may be available for your
+platform; refer to the kconfig menus. File contents encryption can
+also be configured to use inline encryption hardware instead of the
+kernel crypto API (see `Inline encryption support`_); in that case,
+the file contents mode doesn't need to be supported in the kernel
+crypto API, but the filenames mode still does.
+
+- AES-256-XTS and AES-256-CTS-CBC
+ - Recommended:
+ - arm64: CONFIG_CRYPTO_AES_ARM64_CE_BLK
+ - x86: CONFIG_CRYPTO_AES_NI_INTEL
+
+- AES-256-HCTR2
+ - Mandatory:
+ - CONFIG_CRYPTO_HCTR2
+ - Recommended:
+ - arm64: CONFIG_CRYPTO_AES_ARM64_CE_BLK
+ - arm64: CONFIG_CRYPTO_POLYVAL_ARM64_CE
+ - x86: CONFIG_CRYPTO_AES_NI_INTEL
+ - x86: CONFIG_CRYPTO_POLYVAL_CLMUL_NI
+
+- Adiantum
+ - Mandatory:
+ - CONFIG_CRYPTO_ADIANTUM
+ - Recommended:
+ - arm32: CONFIG_CRYPTO_CHACHA20_NEON
+ - arm32: CONFIG_CRYPTO_NHPOLY1305_NEON
+ - arm64: CONFIG_CRYPTO_CHACHA20_NEON
+ - arm64: CONFIG_CRYPTO_NHPOLY1305_NEON
+ - x86: CONFIG_CRYPTO_CHACHA20_X86_64
+ - x86: CONFIG_CRYPTO_NHPOLY1305_SSE2
+ - x86: CONFIG_CRYPTO_NHPOLY1305_AVX2
+
+- AES-128-CBC-ESSIV and AES-128-CTS-CBC:
+ - Mandatory:
+ - CONFIG_CRYPTO_ESSIV
+ - CONFIG_CRYPTO_SHA256 or another SHA-256 implementation
+ - Recommended:
+ - AES-CBC acceleration
+
+fscrypt also uses HMAC-SHA512 for key derivation, so enabling SHA-512
+acceleration is recommended:
+
+- SHA-512
+ - Recommended:
+ - arm64: CONFIG_CRYPTO_SHA512_ARM64_CE
+ - x86: CONFIG_CRYPTO_SHA512_SSSE3
Contents encryption
-------------------
@@ -493,7 +560,14 @@ This structure must be initialized as follows:
be set to constants from ``<linux/fscrypt.h>`` which identify the
encryption modes to use. If unsure, use FSCRYPT_MODE_AES_256_XTS
(1) for ``contents_encryption_mode`` and FSCRYPT_MODE_AES_256_CTS
- (4) for ``filenames_encryption_mode``.
+ (4) for ``filenames_encryption_mode``. For details, see `Encryption
+ modes and usage`_.
+
+ v1 encryption policies only support three combinations of modes:
+ (FSCRYPT_MODE_AES_256_XTS, FSCRYPT_MODE_AES_256_CTS),
+ (FSCRYPT_MODE_AES_128_CBC, FSCRYPT_MODE_AES_128_CTS), and
+ (FSCRYPT_MODE_ADIANTUM, FSCRYPT_MODE_ADIANTUM). v2 policies support
+ all combinations documented in `Supported modes`_.
- ``flags`` contains optional flags from ``<linux/fscrypt.h>``:
diff --git a/Documentation/filesystems/idmappings.rst b/Documentation/filesystems/idmappings.rst
index ad6d21640576..d095c5838f94 100644
--- a/Documentation/filesystems/idmappings.rst
+++ b/Documentation/filesystems/idmappings.rst
@@ -146,9 +146,10 @@ For the rest of this document we will prefix all userspace ids with ``u`` and
all kernel ids with ``k``. Ranges of idmappings will be prefixed with ``r``. So
an idmapping will be written as ``u0:k10000:r10000``.
-For example, the id ``u1000`` is an id in the upper idmapset or "userspace
-idmapset" starting with ``u1000``. And it is mapped to ``k11000`` which is a
-kernel id in the lower idmapset or "kernel idmapset" starting with ``k10000``.
+For example, within this idmapping, the id ``u1000`` is an id in the upper
+idmapset or "userspace idmapset" starting with ``u0``. And it is mapped to
+``k11000`` which is a kernel id in the lower idmapset or "kernel idmapset"
+starting with ``k10000``.
A kernel id is always created by an idmapping. Such idmappings are associated
with user namespaces. Since we mainly care about how idmappings work we're not
@@ -373,6 +374,13 @@ kernel maps the caller's userspace id down into a kernel id according to the
caller's idmapping and then maps that kernel id up according to the
filesystem's idmapping.
+From the implementation point of view, it's worth mentioning how
+idmappings are represented. All idmappings are taken from the
+corresponding user namespace.
+
+ - caller's idmapping (usually taken from ``current_user_ns()``)
+ - filesystem's idmapping (``sb->s_user_ns``)
+ - mount's idmapping (``mnt_idmap(vfsmnt)``)
+
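+In code, these would be picked up roughly as follows (a simplified
+sketch, not the actual VFS code; ``path`` and ``inode`` are assumed to
+be in scope)::
+
+    struct mnt_idmap *idmap = mnt_idmap(path->mnt);            /* mount */
+    struct user_namespace *fs_userns = inode->i_sb->s_user_ns; /* filesystem */
+    struct user_namespace *caller_ns = current_user_ns();      /* caller */
+
+    /* e.g. translate the inode's uid through the mount's idmapping: */
+    vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);
+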
Let's see some examples with caller/filesystem idmapping but without mount
idmappings. This will exhibit some problems we can hit. After that we will
revisit/reconsider these examples, this time using mount idmappings, to see how
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 0ca479dbb1cd..b7e5a3841aa4 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -85,13 +85,14 @@ prototypes::
struct dentry *dentry, struct fileattr *fa);
int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int);
+ struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
locking rules:
all may block
-============== =============================================
+============== ==================================================
ops i_rwsem(inode)
-============== =============================================
+============== ==================================================
lookup: shared
create: exclusive
link: exclusive (both)
@@ -115,7 +116,8 @@ atomic_open: shared (exclusive if O_CREAT is set in open flags)
tmpfile: no
fileattr_get: no or exclusive
fileattr_set: exclusive
-============== =============================================
+get_offset_ctx no
+============== ==================================================
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_rwsem
@@ -374,10 +376,17 @@ invalidate_lock before invalidating page cache in truncate / hole punch
path (and thus calling into ->invalidate_folio) to block races between page
cache invalidation and page cache filling functions (fault, read, ...).
-->release_folio() is called when the kernel is about to try to drop the
-buffers from the folio in preparation for freeing it. It returns false to
-indicate that the buffers are (or may be) freeable. If ->release_folio is
-NULL, the kernel assumes that the fs has no private interest in the buffers.
+->release_folio() is called when the MM wants to make a change to the
+folio that would invalidate the filesystem's private data. For example,
+it may be about to be removed from the address_space or split. The folio
+is locked and not under writeback. It may be dirty. The gfp parameter
+is not usually used for allocation, but rather to indicate what the
+filesystem may do to attempt to free the private data. The filesystem may
+return false to indicate that the folio's private data cannot be freed.
+If it returns true, it should have already removed the private data from
+the folio. If a filesystem does not provide a ->release_folio method,
+the pagecache will assume that private data is buffer_heads and call
+try_to_free_buffers().
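+
+A minimal sketch of such a handler, for a hypothetical filesystem whose
+private data is a single kmalloc'ed structure::
+
+	static bool examplefs_release_folio(struct folio *folio, gfp_t gfp)
+	{
+		/* Private data of a dirty folio is still needed. */
+		if (folio_test_dirty(folio))
+			return false;
+		kfree(folio_detach_private(folio));
+		return true;
+	}
+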
->free_folio() is called when the kernel has dropped the folio
from the page cache.
diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst
index 2cd8fa332feb..56a26c843dbe 100644
--- a/Documentation/filesystems/tmpfs.rst
+++ b/Documentation/filesystems/tmpfs.rst
@@ -21,8 +21,8 @@ explained further below, some of which can be reconfigured dynamically on the
fly using a remount ('mount -o remount ...') of the filesystem. A tmpfs
filesystem can be resized but it cannot be resized to a size below its current
usage. tmpfs also supports POSIX ACLs, and extended attributes for the
-trusted.* and security.* namespaces. ramfs does not use swap and you cannot
-modify any parameter for a ramfs filesystem. The size limit of a ramfs
+trusted.*, security.* and user.* namespaces. ramfs does not use swap and you
+cannot modify any parameter for a ramfs filesystem. The size limit of a ramfs
filesystem is how much memory you have available, and so care must be taken if
used, so as not to run out of memory.
@@ -97,6 +97,9 @@ mount with such options, since it allows any user with write access to
use up all the memory on the machine; but enhances the scalability of
that instance in a system with many CPUs making intensive use of it.
+If nr_inodes is not 0, that limited space for inodes is also used up by
+extended attributes: "df -i"'s IUsed and IUse% increase while IFree decreases.
+
tmpfs blocks may be swapped out, when there is a shortage of memory.
tmpfs has a mount option to disable its use of swap:
@@ -123,6 +126,37 @@ sysfs file /sys/kernel/mm/transparent_hugepage/shmem_enabled: which can
be used to deny huge pages on all tmpfs mounts in an emergency, or to
force huge pages on all tmpfs mounts for testing.
+tmpfs also supports quotas with the following mount options:
+
+======================== =================================================
+quota User and group quota accounting and enforcement
+                         is enabled on the mount. tmpfs uses hidden
+                         system quota files that are initialized on mount.
+usrquota User quota accounting and enforcement is enabled
+ on the mount.
+grpquota Group quota accounting and enforcement is enabled
+ on the mount.
+usrquota_block_hardlimit Set global user quota block hard limit.
+usrquota_inode_hardlimit Set global user quota inode hard limit.
+grpquota_block_hardlimit Set global group quota block hard limit.
+grpquota_inode_hardlimit Set global group quota inode hard limit.
+======================== =================================================
+
+None of the quota related mount options can be set or changed on remount.
+
+Quota limit parameters accept a suffix k, m or g for kilo, mega and giga
+and can't be changed on remount. Default global quota limits are taking
+effect for any and all user/group/project except root the first time the
+quota entry for user/group/project id is being accessed - typically the
+first time an inode with a particular id ownership is being created after
+the mount. In other words, instead of the limits being initialized to zero,
+they are initialized with the particular value provided with these mount
+options. The limits can be changed for any user/group id at any time as they
+normally can be.
+
+Note that tmpfs quotas do not support user namespaces so no uid/gid
+translation is done if quotas are enabled inside user namespaces.
+
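+For example, to mount an instance with user quotas enforced and default
+per-user hard limits of 1 GiB of blocks and 10000 inodes (illustrative
+values)::
+
+    mount -t tmpfs -o usrquota,usrquota_block_hardlimit=1g,usrquota_inode_hardlimit=10000 tmpfs /mnt
+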
tmpfs has a mount option to set the NUMA memory allocation policy for
all files in that instance (if CONFIG_NUMA is enabled) - which can be
adjusted on the fly via 'mount -o remount ...'
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index cb2a97e49872..f8fe815ab1f3 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -260,9 +260,11 @@ filesystem. The following members are defined:
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
- int (*freeze_super) (struct super_block *);
+ int (*freeze_super) (struct super_block *sb,
+ enum freeze_holder who);
int (*freeze_fs) (struct super_block *);
- int (*thaw_super) (struct super_block *);
+ int (*thaw_super) (struct super_block *sb,
+				enum freeze_holder who);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
@@ -515,6 +517,7 @@ As of kernel 2.6.22, the following members are defined:
int (*fileattr_set)(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
+ struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
};
Again, all methods are called without any locks being held, unless
@@ -675,7 +678,10 @@ otherwise noted.
called on ioctl(FS_IOC_SETFLAGS) and ioctl(FS_IOC_FSSETXATTR) to
change miscellaneous file flags and attributes. Callers hold
i_rwsem exclusive. If unset, then fall back to f_op->ioctl().
-
+``get_offset_ctx``
+ called to get the offset context for a directory inode. A
+ filesystem must define this operation to use
+ simple_offset_dir_operations.
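+
+	A hedged sketch of providing this operation, with
+	``EXAMPLEFS_I`` and its ``dir_offsets`` field as hypothetical
+	fs-private details::
+
+		static struct offset_ctx *
+		examplefs_get_offset_ctx(struct inode *inode)
+		{
+			return &EXAMPLEFS_I(inode)->dir_offsets;
+		}
+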
The Address Space Object
========================
diff --git a/Documentation/firmware-guide/acpi/chromeos-acpi-device.rst b/Documentation/firmware-guide/acpi/chromeos-acpi-device.rst
index f37fc90ce340..89419e116413 100644
--- a/Documentation/firmware-guide/acpi/chromeos-acpi-device.rst
+++ b/Documentation/firmware-guide/acpi/chromeos-acpi-device.rst
@@ -5,9 +5,8 @@ Chrome OS ACPI Device
=====================
Hardware functionality specific to Chrome OS is exposed through a Chrome OS ACPI device.
-The plug and play ID of a Chrome OS ACPI device is GGL0001. GGL is a valid PNP ID of Google.
-PNP ID can be used with the ACPI devices according to the guidelines. The following ACPI
-objects are supported:
+The plug and play ID of a Chrome OS ACPI device is GGL0001 and the hardware ID is
+GOOG0016. The following ACPI objects are supported:
.. flat-table:: Supported ACPI Objects
:widths: 1 2
diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
index 03db55504515..f68919800f05 100644
--- a/Documentation/scheduler/sched-design-CFS.rst
+++ b/Documentation/scheduler/sched-design-CFS.rst
@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
way the previous scheduler had, and has no heuristics whatsoever. There is
only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
- /sys/kernel/debug/sched/min_granularity_ns
+ /sys/kernel/debug/sched/base_slice_ns
which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
"server" (i.e., good batching) workloads. It defaults to a setting suitable
diff --git a/MAINTAINERS b/MAINTAINERS
index d590ce31aa72..07e40f03435e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4819,6 +4819,7 @@ F: drivers/input/touchscreen/chipone_icn8505.c
CHROME HARDWARE PLATFORM SUPPORT
M: Benson Leung <bleung@chromium.org>
+M: Tzung-Bi Shih <tzungbi@kernel.org>
L: chrome-platform@lists.linux.dev
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git
@@ -8769,6 +8770,15 @@ S: Supported
F: Documentation/networking/device_drivers/ethernet/google/gve.rst
F: drivers/net/ethernet/google
+GOOGLE FIRMWARE DRIVERS
+M: Tzung-Bi Shih <tzungbi@kernel.org>
+R: Brian Norris <briannorris@chromium.org>
+R: Julius Werner <jwerner@chromium.org>
+L: chrome-platform@lists.linux.dev
+S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git
+F: drivers/firmware/google/
+
GPD POCKET FAN DRIVER
M: Hans de Goede <hdegoede@redhat.com>
L: platform-driver-x86@vger.kernel.org
@@ -9306,7 +9316,7 @@ F: drivers/crypto/hisilicon/hpre/hpre_crypto.c
F: drivers/crypto/hisilicon/hpre/hpre_main.c
HISILICON HNS3 PMU DRIVER
-M: Guangbin Huang <huangguangbin2@huawei.com>
+M: Jijie Shao <shaojijie@huawei.com>
S: Supported
F: Documentation/admin-guide/perf/hns3-pmu.rst
F: drivers/perf/hisilicon/hns3_pmu.c
@@ -9344,7 +9354,7 @@ F: Documentation/devicetree/bindings/net/hisilicon*.txt
F: drivers/net/ethernet/hisilicon/
HISILICON PMU DRIVER
-M: Shaokun Zhang <zhangshaokun@hisilicon.com>
+M: Yicong Yang <yangyicong@hisilicon.com>
M: Jonathan Cameron <jonathan.cameron@huawei.com>
S: Supported
W: http://www.hisilicon.com
@@ -14803,6 +14813,16 @@ F: net/netfilter/xt_CONNSECMARK.c
F: net/netfilter/xt_SECMARK.c
F: net/netlabel/
+NETWORKING [MACSEC]
+M: Sabrina Dubroca <sd@queasysnail.net>
+L: netdev@vger.kernel.org
+S: Maintained
+F: drivers/net/macsec.c
+F: include/net/macsec.h
+F: include/uapi/linux/if_macsec.h
+K: macsec
+K: \bmdo_
+
NETWORKING [MPTCP]
M: Matthieu Baerts <matthieu.baerts@tessares.net>
M: Mat Martineau <martineau@kernel.org>
@@ -17047,6 +17067,7 @@ F: drivers/net/ppp/pptp.c
PRESSURE STALL INFORMATION (PSI)
M: Johannes Weiner <hannes@cmpxchg.org>
M: Suren Baghdasaryan <surenb@google.com>
+R:	Peter Zijlstra <peterz@infradead.org>
S: Maintained
F: include/linux/psi*
F: kernel/sched/psi.c
@@ -18594,7 +18615,7 @@ L: linux-s390@vger.kernel.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git
F: Documentation/driver-api/s390-drivers.rst
-F: Documentation/s390/
+F: Documentation/arch/s390/
F: arch/s390/
F: drivers/s390/
F: drivers/watchdog/diag288_wdt.c
@@ -18655,7 +18676,7 @@ M: Niklas Schnelle <schnelle@linux.ibm.com>
M: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
L: linux-s390@vger.kernel.org
S: Supported
-F: Documentation/s390/pci.rst
+F: Documentation/arch/s390/pci.rst
F: arch/s390/pci/
F: drivers/pci/hotplug/s390_pci_hpc.c
@@ -18672,7 +18693,7 @@ M: Halil Pasic <pasic@linux.ibm.com>
M: Jason Herne <jjherne@linux.ibm.com>
L: linux-s390@vger.kernel.org
S: Supported
-F: Documentation/s390/vfio-ap*
+F: Documentation/arch/s390/vfio-ap*
F: drivers/s390/crypto/vfio_ap*
S390 VFIO-CCW DRIVER
@@ -18682,7 +18703,7 @@ R: Halil Pasic <pasic@linux.ibm.com>
L: linux-s390@vger.kernel.org
L: kvm@vger.kernel.org
S: Supported
-F: Documentation/s390/vfio-ccw.rst
+F: Documentation/arch/s390/vfio-ccw.rst
F: drivers/s390/cio/vfio_ccw*
F: include/uapi/linux/vfio_ccw.h
@@ -19275,7 +19296,6 @@ F: drivers/misc/sgi-gru/
SGI XP/XPC/XPNET DRIVER
M: Robin Holt <robinmholt@gmail.com>
M: Steve Wahl <steve.wahl@hpe.com>
-R: Mike Travis <mike.travis@hpe.com>
S: Maintained
F: drivers/misc/sgi-xp/
@@ -21052,6 +21072,39 @@ S: Maintained
F: Documentation/devicetree/bindings/sound/davinci-mcasp-audio.yaml
F: sound/soc/ti/
+TEXAS INSTRUMENTS AUDIO (ASoC/HDA) DRIVERS
+M: Shenghao Ding <shenghao-ding@ti.com>
+M: Kevin Lu <kevin-lu@ti.com>
+M: Baojun Xu <x1077012@ti.com>
+L: alsa-devel@alsa-project.org (moderated for non-subscribers)
+S: Maintained
+F: Documentation/devicetree/bindings/sound/tas2552.txt
+F: Documentation/devicetree/bindings/sound/tas2562.yaml
+F: Documentation/devicetree/bindings/sound/tas2770.yaml
+F: Documentation/devicetree/bindings/sound/tas27xx.yaml
+F: Documentation/devicetree/bindings/sound/ti,pcm1681.txt
+F: Documentation/devicetree/bindings/sound/ti,pcm3168a.yaml
+F: Documentation/devicetree/bindings/sound/ti,tlv320*.yaml
+F: Documentation/devicetree/bindings/sound/tlv320adcx140.yaml
+F: Documentation/devicetree/bindings/sound/tlv320aic31xx.txt
+F: Documentation/devicetree/bindings/sound/tpa6130a2.txt
+F: include/sound/tas2*.h
+F: include/sound/tlv320*.h
+F: include/sound/tpa6130a2-plat.h
+F: sound/pci/hda/tas2781_hda_i2c.c
+F: sound/soc/codecs/pcm1681.c
+F: sound/soc/codecs/pcm1789*.*
+F: sound/soc/codecs/pcm179x*.*
+F: sound/soc/codecs/pcm186x*.*
+F: sound/soc/codecs/pcm3008.*
+F: sound/soc/codecs/pcm3060*.*
+F: sound/soc/codecs/pcm3168a*.*
+F: sound/soc/codecs/pcm5102a.c
+F: sound/soc/codecs/pcm512x*.*
+F: sound/soc/codecs/tas2*.*
+F: sound/soc/codecs/tlv320*.*
+F: sound/soc/codecs/tpa6130a2.*
+
TEXAS INSTRUMENTS DMA DRIVERS
M: Peter Ujfalusi <peter.ujfalusi@gmail.com>
L: dmaengine@vger.kernel.org
@@ -23139,7 +23192,8 @@ F: arch/x86/platform
X86 PLATFORM UV HPE SUPERDOME FLEX
M: Steve Wahl <steve.wahl@hpe.com>
-R: Mike Travis <mike.travis@hpe.com>
+R: Justin Ernst <justin.ernst@hpe.com>
+R: Kyle Meyer <kyle.meyer@hpe.com>
R: Dimitri Sivanich <dimitri.sivanich@hpe.com>
R: Russ Anderson <russ.anderson@hpe.com>
S: Supported
diff --git a/Makefile b/Makefile
index 4739c21a63e2..2fdd8b40b7e0 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 6
PATCHLEVEL = 5
SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION =
NAME = Hurr durr I'ma ninja sloth
# *DOCUMENTATION*
diff --git a/arch/Kconfig b/arch/Kconfig
index aff2746c8af2..63c5d6a2022b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -34,6 +34,9 @@ config ARCH_HAS_SUBPAGE_FAULTS
config HOTPLUG_SMT
bool
+config SMT_NUM_THREADS_DYNAMIC
+ bool
+
# Selected by HOTPLUG_CORE_SYNC_DEAD or HOTPLUG_CORE_SYNC_FULL
config HOTPLUG_CORE_SYNC
bool
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index d98701ee36c6..5db88b627439 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -97,7 +97,7 @@ struct osf_dirent {
unsigned int d_ino;
unsigned short d_reclen;
unsigned short d_namlen;
- char d_name[1];
+ char d_name[];
};
struct osf_dirent_callback {
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 1f13995d00d7..ad37569d0507 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -491,3 +491,4 @@
559 common futex_waitv sys_futex_waitv
560 common set_mempolicy_home_node sys_ni_syscall
561 common cachestat sys_cachestat
+562 common fchmodat2 sys_fchmodat2
diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h
index dfeed440254a..fe4326d938c1 100644
--- a/arch/arm/include/asm/syscall.h
+++ b/arch/arm/include/asm/syscall.h
@@ -25,6 +25,9 @@ static inline int syscall_get_nr(struct task_struct *task,
if (IS_ENABLED(CONFIG_AEABI) && !IS_ENABLED(CONFIG_OABI_COMPAT))
return task_thread_info(task)->abi_syscall;
+ if (task_thread_info(task)->abi_syscall == -1)
+ return -1;
+
return task_thread_info(task)->abi_syscall & __NR_SYSCALL_MASK;
}
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index bcc4c9ec3aa4..5c31e9de7a60 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -90,6 +90,7 @@ slow_work_pending:
cmp r0, #0
beq no_work_pending
movlt scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE)
+ str scno, [tsk, #TI_ABI_SYSCALL] @ make sure tracers see update
ldmia sp, {r0 - r6} @ have to reload r0 - r6
b local_restart @ ... and off we go
ENDPROC(ret_fast_syscall)
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 054e9199f30d..dc0fb7a81371 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -626,7 +626,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp,
hw->address &= ~alignment_mask;
hw->ctrl.len <<= offset;
- if (is_default_overflow_handler(bp)) {
+ if (uses_default_overflow_handler(bp)) {
/*
* Mismatch breakpoints are required for single-stepping
* breakpoints.
@@ -798,7 +798,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
* Otherwise, insert a temporary mismatch breakpoint so that
* we can single-step over the watchpoint trigger.
*/
- if (!is_default_overflow_handler(wp))
+ if (!uses_default_overflow_handler(wp))
continue;
step:
enable_single_step(wp, instruction_pointer(regs));
@@ -811,7 +811,7 @@ step:
info->trigger = addr;
pr_debug("watchpoint fired: address = 0x%x\n", info->trigger);
perf_bp_event(wp, regs);
- if (is_default_overflow_handler(wp))
+ if (uses_default_overflow_handler(wp))
enable_single_step(wp, instruction_pointer(regs));
}
@@ -886,7 +886,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs)
info->trigger = addr;
pr_debug("breakpoint fired: address = 0x%x\n", addr);
perf_bp_event(bp, regs);
- if (is_default_overflow_handler(bp))
+ if (uses_default_overflow_handler(bp))
enable_single_step(bp, addr);
goto unlock;
}
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 2d8e2516906b..fef32d73f912 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -783,8 +783,9 @@ long arch_ptrace(struct task_struct *child, long request,
break;
case PTRACE_SET_SYSCALL:
- task_thread_info(child)->abi_syscall = data &
- __NR_SYSCALL_MASK;
+ if (data != -1)
+ data &= __NR_SYSCALL_MASK;
+ task_thread_info(child)->abi_syscall = data;
ret = 0;
break;
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 8ebed8a13874..c572d6c3dee0 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -465,3 +465,4 @@
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a2511b30d0f6..29db061db9bb 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1793,9 +1793,6 @@ config ARM64_PAN
The feature is detected at runtime, and will remain as a 'nop'
instruction if the cpu does not implement the feature.
-config AS_HAS_LDAPR
- def_bool $(as-instr,.arch_extension rcpc)
-
config AS_HAS_LSE_ATOMICS
def_bool $(as-instr,.arch_extension lse)
@@ -1933,6 +1930,9 @@ config AS_HAS_ARMV8_3
config AS_HAS_CFI_NEGATE_RA_STATE
def_bool $(as-instr,.cfi_startproc\n.cfi_negate_ra_state\n.cfi_endproc\n)
+config AS_HAS_LDAPR
+ def_bool $(as-instr,.arch_extension rcpc)
+
endmenu # "ARMv8.3 architectural features"
menu "ARMv8.4 architectural features"
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index bd68e1b7f29f..4d537d56eb84 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -42,6 +42,9 @@
#define ACPI_MADT_GICC_SPE (offsetof(struct acpi_madt_generic_interrupt, \
spe_interrupt) + sizeof(u16))
+#define ACPI_MADT_GICC_TRBE (offsetof(struct acpi_madt_generic_interrupt, \
+ trbe_interrupt) + sizeof(u16))
+
/* Basic configuration for ACPI */
#ifdef CONFIG_ACPI
pgprot_t __acpi_get_mem_attribute(phys_addr_t addr);
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index 4cf2cb053bc8..f482b994c608 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -30,28 +30,16 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md);
int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md,
bool has_bti);
-#define arch_efi_call_virt_setup() \
-({ \
- efi_virtmap_load(); \
- __efi_fpsimd_begin(); \
- raw_spin_lock(&efi_rt_lock); \
-})
-
#undef arch_efi_call_virt
#define arch_efi_call_virt(p, f, args...) \
__efi_rt_asm_wrapper((p)->f, #f, args)
-#define arch_efi_call_virt_teardown() \
-({ \
- raw_spin_unlock(&efi_rt_lock); \
- __efi_fpsimd_end(); \
- efi_virtmap_unload(); \
-})
-
-extern raw_spinlock_t efi_rt_lock;
extern u64 *efi_rt_stack_top;
efi_status_t __efi_rt_asm_wrapper(void *, const char *, ...);
+void arch_efi_call_virt_setup(void);
+void arch_efi_call_virt_teardown(void);
+
/*
* efi_rt_stack_top[-1] contains the value the stack pointer had before
* switching to the EFI runtime stack.
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 692b1ec663b2..521267478d18 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -138,6 +138,7 @@
#define KERNEL_HWCAP_SME_B16B16 __khwcap2_feature(SME_B16B16)
#define KERNEL_HWCAP_SME_F16F16 __khwcap2_feature(SME_F16F16)
#define KERNEL_HWCAP_MOPS __khwcap2_feature(MOPS)
+#define KERNEL_HWCAP_HBC __khwcap2_feature(HBC)
/*
* This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index 577773870b66..85d26143faa5 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -118,31 +118,4 @@
#define SWAPPER_RX_MMUFLAGS (SWAPPER_RW_MMUFLAGS | PTE_RDONLY)
#endif
-/*
- * To make optimal use of block mappings when laying out the linear
- * mapping, round down the base of physical memory to a size that can
- * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE
- * (64k granule), or a multiple that can be mapped using contiguous bits
- * in the page tables: 32 * PMD_SIZE (16k granule)
- */
-#if defined(CONFIG_ARM64_4K_PAGES)
-#define ARM64_MEMSTART_SHIFT PUD_SHIFT
-#elif defined(CONFIG_ARM64_16K_PAGES)
-#define ARM64_MEMSTART_SHIFT CONT_PMD_SHIFT
-#else
-#define ARM64_MEMSTART_SHIFT PMD_SHIFT
-#endif
-
-/*
- * sparsemem vmemmap imposes an additional requirement on the alignment of
- * memstart_addr, due to the fact that the base of the vmemmap region
- * has a direct correspondence, and needs to appear sufficiently aligned
- * in the virtual address space.
- */
-#if ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS
-#define ARM64_MEMSTART_ALIGN (1UL << SECTION_SIZE_BITS)
-#else
-#define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT)
-#endif
-
#endif /* __ASM_KERNEL_PGTABLE_H */
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 4384eaa0aeb7..94b68850cb9f 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -64,7 +64,6 @@ extern void arm64_memblock_init(void);
extern void paging_init(void);
extern void bootmem_init(void);
extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
-extern void init_mem_pgprot(void);
extern void create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
phys_addr_t size, pgprot_t prot);
extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0bd18de9fd97..72c2e8431360 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -103,6 +103,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
#define pte_young(pte) (!!(pte_val(pte) & PTE_AF))
#define pte_special(pte) (!!(pte_val(pte) & PTE_SPECIAL))
#define pte_write(pte) (!!(pte_val(pte) & PTE_WRITE))
+#define pte_rdonly(pte) (!!(pte_val(pte) & PTE_RDONLY))
#define pte_user(pte) (!!(pte_val(pte) & PTE_USER))
#define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN))
#define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT))
@@ -120,7 +121,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
(__boundary - 1 < (end) - 1) ? __boundary : (end); \
})
-#define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
+#define pte_hw_dirty(pte) (pte_write(pte) && !pte_rdonly(pte))
#define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY))
#define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte))
@@ -212,7 +213,7 @@ static inline pte_t pte_wrprotect(pte_t pte)
* clear), set the PTE_DIRTY bit.
*/
if (pte_hw_dirty(pte))
- pte = pte_mkdirty(pte);
+ pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
pte = clear_pte_bit(pte, __pgprot(PTE_WRITE));
pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
@@ -823,7 +824,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
PTE_ATTRINDX_MASK;
/* preserve the hardware dirty information */
if (pte_hw_dirty(pte))
- pte = pte_mkdirty(pte);
+ pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
+
pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask);
return pte;
}
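
A note on the hunks above: arm64 considers a PTE dirty either via the software PTE_DIRTY bit or, when hardware DBM is in use, via the writable-and-not-read-only encoding; the new pte_rdonly() helper lets pte_hw_dirty() read that encoding directly. Restated as a single predicate that mirrors the definitions in this hunk (a sketch, not code from the patch):

    static inline bool sketch_pte_dirty(pte_t pte)
    {
            bool hw = pte_write(pte) && !pte_rdonly(pte);  /* pte_hw_dirty() */
            bool sw = !!(pte_val(pte) & PTE_DIRTY);        /* pte_sw_dirty() */

            return hw || sw;                               /* pte_dirty() */
    }

The switch from pte_mkdirty() to a bare set_pte_bit(pte, __pgprot(PTE_DIRTY)) plausibly avoids pte_mkdirty()'s side effects on bits other than PTE_DIRTY; that rationale is an inference from the helpers' definitions, not something stated in the hunk.
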
diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h
index 4292d9bafb9d..484cb6972e99 100644
--- a/arch/arm64/include/asm/sdei.h
+++ b/arch/arm64/include/asm/sdei.h
@@ -17,6 +17,9 @@
#include <asm/virt.h>
+DECLARE_PER_CPU(struct sdei_registered_event *, sdei_active_normal_event);
+DECLARE_PER_CPU(struct sdei_registered_event *, sdei_active_critical_event);
+
extern unsigned long sdei_exit_mode;
/* Software Delegated Exception entry point from firmware */
@@ -29,6 +32,9 @@ asmlinkage void __sdei_asm_entry_trampoline(unsigned long event_num,
unsigned long pc,
unsigned long pstate);
+/* Abort a running handler. Context is discarded. */
+void __sdei_handler_abort(void);
+
/*
* The above entry point does the minimum to call C code. This function does
* everything else, before calling the driver.
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index b481935e9314..16464bf9a8aa 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -803,15 +803,21 @@
/*
* For registers without architectural names, or simply unsupported by
* GAS.
+ *
+ * __check_r forces the compiler to evaluate r and emit any resulting
+ * warnings, which would not otherwise happen because r is only passed
+ * to the assembler via __stringify(r).
*/
#define read_sysreg_s(r) ({ \
u64 __val; \
+ u32 __maybe_unused __check_r = (u32)(r); \
asm volatile(__mrs_s("%0", r) : "=r" (__val)); \
__val; \
})
#define write_sysreg_s(v, r) do { \
u64 __val = (u64)(v); \
+ u32 __maybe_unused __check_r = (u32)(r); \
asm volatile(__msr_s(r, "%x0") : : "rZ" (__val)); \
} while (0)
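
The trick documented above is worth spelling out: because r only reaches the assembler as a string, the C compiler never evaluates it, so diagnostics such as shift-count or overflow warnings are silently lost. The dummy cast restores them. A self-contained user-space analogue of the pattern (macro and variable names here are illustrative, not from the kernel):

    #include <stdint.h>

    #define __stringify_1(x)  #x
    #define __stringify(x)    __stringify_1(x)

    /* stringifies r for later use, but also evaluates it once so that
     * any compiler diagnostics attached to the expression still fire */
    #define use_stringified(r) do {                                     \
            uint32_t check __attribute__((unused)) = (uint32_t)(r);     \
            const char *txt __attribute__((unused)) = __stringify(r);   \
    } while (0)

    int main(void)
    {
            use_stringified(0x1234);                /* fine */
            /* use_stringified(1u << 40) would now draw a warning */
            return 0;
    }
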
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 64a514f90131..bd77253b62e0 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 452
+#define __NR_compat_syscalls 453
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index d952a28463e0..78b68311ec81 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -909,6 +909,8 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
#define __NR_cachestat 451
__SYSCALL(__NR_cachestat, sys_cachestat)
+#define __NR_fchmodat2 452
+__SYSCALL(__NR_fchmodat2, sys_fchmodat2)
/*
* Please add new compat syscalls above this comment and update
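
fchmodat2() is brand new, so libc wrappers will lag; a quick user-space smoke test has to go through syscall(2) directly. A hedged sketch (the file name is a placeholder, and the syscall number is the one assigned in this series):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_fchmodat2
    #define __NR_fchmodat2 452
    #endif

    int main(void)
    {
            /* the flags argument is what plain fchmodat(2) could not honor */
            long ret = syscall(__NR_fchmodat2, AT_FDCWD, "testfile",
                               0640, AT_SYMLINK_NOFOLLOW);

            if (ret)
                    perror("fchmodat2");
            return ret ? 1 : 0;
    }
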
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index a2cac4305b1e..53026f45a509 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -103,5 +103,6 @@
#define HWCAP2_SME_B16B16 (1UL << 41)
#define HWCAP2_SME_F16F16 (1UL << 42)
#define HWCAP2_MOPS (1UL << 43)
+#define HWCAP2_HBC (1UL << 44)
#endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index f9d456fe132d..a5f533f63b60 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -222,7 +222,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_MOPS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, 0),
@@ -2708,12 +2708,8 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.desc = "Enhanced Virtualization Traps",
.capability = ARM64_HAS_EVT,
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
- .sys_reg = SYS_ID_AA64MMFR2_EL1,
- .sign = FTR_UNSIGNED,
- .field_pos = ID_AA64MMFR2_EL1_EVT_SHIFT,
- .field_width = 4,
- .min_field_value = ID_AA64MMFR2_EL1_EVT_IMP,
.matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP)
},
{},
};
@@ -2844,6 +2840,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(ID_AA64ISAR2_EL1, RPRES, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRES),
HWCAP_CAP(ID_AA64ISAR2_EL1, WFxT, IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT),
HWCAP_CAP(ID_AA64ISAR2_EL1, MOPS, IMP, CAP_HWCAP, KERNEL_HWCAP_MOPS),
+ HWCAP_CAP(ID_AA64ISAR2_EL1, BC, IMP, CAP_HWCAP, KERNEL_HWCAP_HBC),
#ifdef CONFIG_ARM64_SME
HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME),
HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c
index d1f68599c29f..f372295207fb 100644
--- a/arch/arm64/kernel/cpuidle.c
+++ b/arch/arm64/kernel/cpuidle.c
@@ -9,8 +9,6 @@
#include <linux/acpi.h>
#include <linux/cpuidle.h>
#include <linux/cpu_pm.h>
-#include <linux/of.h>
-#include <linux/of_device.h>
#include <linux/psci.h>
#ifdef CONFIG_ACPI_PROCESSOR_IDLE
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 58622dc85917..98fda8500535 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -126,6 +126,7 @@ static const char *const hwcap_str[] = {
[KERNEL_HWCAP_SME_B16B16] = "smeb16b16",
[KERNEL_HWCAP_SME_F16F16] = "smef16f16",
[KERNEL_HWCAP_MOPS] = "mops",
+ [KERNEL_HWCAP_HBC] = "hbc",
};
#ifdef CONFIG_COMPAT
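
The two hunks above (together with the hwcap.h changes earlier) make FEAT_HBC (the hinted conditional branch instruction, BC.cond) visible to user space as HWCAP2_HBC and as the "hbc" flag in /proc/cpuinfo. A minimal probe via the auxiliary vector:

    #include <stdio.h>
    #include <sys/auxv.h>

    #ifndef HWCAP2_HBC
    #define HWCAP2_HBC (1UL << 44)  /* bit assigned in this series */
    #endif

    int main(void)
    {
            unsigned long hwcap2 = getauxval(AT_HWCAP2);

            printf("FEAT_HBC: %s\n",
                   (hwcap2 & HWCAP2_HBC) ? "present" : "absent");
            return 0;
    }
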
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index baab8dd3ead3..49efbdbd6f7a 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -158,7 +158,21 @@ asmlinkage efi_status_t efi_handle_corrupted_x18(efi_status_t s, const char *f)
return s;
}
-DEFINE_RAW_SPINLOCK(efi_rt_lock);
+static DEFINE_RAW_SPINLOCK(efi_rt_lock);
+
+void arch_efi_call_virt_setup(void)
+{
+ efi_virtmap_load();
+ __efi_fpsimd_begin();
+ raw_spin_lock(&efi_rt_lock);
+}
+
+void arch_efi_call_virt_teardown(void)
+{
+ raw_spin_unlock(&efi_rt_lock);
+ __efi_fpsimd_end();
+ efi_virtmap_unload();
+}
asmlinkage u64 *efi_rt_stack_top __ro_after_init;
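
These out-of-line functions replace the macro pair removed from efi.h above. For readers unfamiliar with the calling convention: the generic EFI runtime wrappers bracket every service call with setup and teardown. A minimal sketch of that pairing, assuming the generic wrapper shape rather than copying it from this patch ('rt' stands in for the runtime-services table the real wrappers pass around):

    static efi_status_t sketch_get_time(efi_runtime_services_t *rt,
                                        efi_time_t *tm)
    {
            efi_status_t status;

            arch_efi_call_virt_setup();     /* load EFI mm, begin FP/SIMD, lock */
            status = arch_efi_call_virt(rt, get_time, tm, NULL);
            arch_efi_call_virt_teardown();  /* release everything in reverse */

            return status;
    }

Note that teardown mirrors setup exactly, which is easier to keep correct now that both sides live in one C file.
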
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 6b2e0c367702..0fc94207e69a 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -355,6 +355,35 @@ static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
}
#endif /* CONFIG_ARM64_ERRATUM_1463225 */
+/*
+ * As per the ABI, exit SME streaming mode and clear the SVE state not
+ * shared with FPSIMD on syscall entry.
+ */
+static inline void fp_user_discard(void)
+{
+ /*
+ * If SME is active then exit streaming mode. If ZA is active
+ * then flush the SVE registers but leave userspace access to
+ * both SVE and SME enabled, otherwise disable SME for the
+ * task and fall through to disabling SVE too. This means
+ * that after a syscall we never have any streaming mode
+ * register state to track; if this changes, the KVM code will
+ * need updating.
+ */
+ if (system_supports_sme())
+ sme_smstop_sm();
+
+ if (!system_supports_sve())
+ return;
+
+ if (test_thread_flag(TIF_SVE)) {
+ unsigned int sve_vq_minus_one;
+
+ sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1;
+ sve_flush_live(true, sve_vq_minus_one);
+ }
+}
+
UNHANDLED(el1t, 64, sync)
UNHANDLED(el1t, 64, irq)
UNHANDLED(el1t, 64, fiq)
@@ -644,6 +673,8 @@ static void noinstr el0_svc(struct pt_regs *regs)
{
enter_from_user_mode(regs);
cortex_a76_erratum_1463225_svc_handler();
+ fp_user_discard();
+ local_daif_restore(DAIF_PROCCTX);
do_el0_svc(regs);
exit_to_user_mode(regs);
}
@@ -783,6 +814,7 @@ static void noinstr el0_svc_compat(struct pt_regs *regs)
{
enter_from_user_mode(regs);
cortex_a76_erratum_1463225_svc_handler();
+ local_daif_restore(DAIF_PROCCTX);
do_el0_svc_compat(regs);
exit_to_user_mode(regs);
}
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a40e5e50fa55..6ad61de03d0a 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -986,9 +986,13 @@ SYM_CODE_START(__sdei_asm_handler)
mov x19, x1
-#if defined(CONFIG_VMAP_STACK) || defined(CONFIG_SHADOW_CALL_STACK)
+ /* Store the registered-event for crash_smp_send_stop() */
ldrb w4, [x19, #SDEI_EVENT_PRIORITY]
-#endif
+ cbnz w4, 1f
+ adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6
+ b 2f
+1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6
+2: str x19, [x5]
#ifdef CONFIG_VMAP_STACK
/*
@@ -1055,6 +1059,14 @@ SYM_CODE_START(__sdei_asm_handler)
ldr_l x2, sdei_exit_mode
+ /* Clear the registered-event seen by crash_smp_send_stop() */
+ ldrb w3, [x4, #SDEI_EVENT_PRIORITY]
+ cbnz w3, 1f
+ adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6
+ b 2f
+1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6
+2: str xzr, [x5]
+
alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0
sdei_handler_exit exit_mode=x2
alternative_else_nop_endif
@@ -1065,4 +1077,15 @@ alternative_else_nop_endif
#endif
SYM_CODE_END(__sdei_asm_handler)
NOKPROBE(__sdei_asm_handler)
+
+SYM_CODE_START(__sdei_handler_abort)
+ mov_q x0, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME
+ adr x1, 1f
+ ldr_l x2, sdei_exit_mode
+ sdei_handler_exit exit_mode=x2
+ // exit the handler and jump to the next instruction.
+ // Exit will stomp x0-x17, PSTATE, ELR_ELx, and SPSR_ELx.
+1: ret
+SYM_CODE_END(__sdei_handler_abort)
+NOKPROBE(__sdei_handler_abort)
#endif /* CONFIG_ARM_SDE_INTERFACE */
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 087c05aa960e..91e44ac7150f 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1179,9 +1179,6 @@ void sve_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
*/
u64 read_zcr_features(void)
{
- u64 zcr;
- unsigned int vq_max;
-
/*
* Set the maximum possible VL, and write zeroes to all other
* bits to see if they stick.
@@ -1189,12 +1186,8 @@ u64 read_zcr_features(void)
sve_kernel_enable(NULL);
write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL1);
- zcr = read_sysreg_s(SYS_ZCR_EL1);
- zcr &= ~(u64)ZCR_ELx_LEN_MASK; /* find sticky 1s outside LEN field */
- vq_max = sve_vq_from_vl(sve_get_vl());
- zcr |= vq_max - 1; /* set LEN field to maximum effective value */
-
- return zcr;
+ /* Return LEN value that would be written to get the maximum VL */
+ return sve_vq_from_vl(sve_get_vl()) - 1;
}
void __init sve_setup(void)
@@ -1349,9 +1342,6 @@ void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
*/
u64 read_smcr_features(void)
{
- u64 smcr;
- unsigned int vq_max;
-
sme_kernel_enable(NULL);
/*
@@ -1360,12 +1350,8 @@ u64 read_smcr_features(void)
write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_LEN_MASK,
SYS_SMCR_EL1);
- smcr = read_sysreg_s(SYS_SMCR_EL1);
- smcr &= ~(u64)SMCR_ELx_LEN_MASK; /* Only the LEN field */
- vq_max = sve_vq_from_vl(sme_get_vl());
- smcr |= vq_max - 1; /* set LEN field to maximum effective value */
-
- return smcr;
+ /* Return LEN value that would be written to get the maximum VL */
+ return sve_vq_from_vl(sme_get_vl()) - 1;
}
void __init sme_setup(void)
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 757a0de07f91..7b236994f0e1 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -113,7 +113,7 @@ SYM_CODE_START(primary_entry)
*/
#if VA_BITS > 48
mrs_s x0, SYS_ID_AA64MMFR2_EL1
- tst x0, #0xf << ID_AA64MMFR2_EL1_VARange_SHIFT
+ tst x0, ID_AA64MMFR2_EL1_VARange_MASK
mov x0, #VA_BITS
mov x25, #VA_BITS_MIN
csel x25, x25, x0, eq
@@ -756,7 +756,7 @@ SYM_FUNC_START(__cpu_secondary_check52bitva)
b.ne 2f
mrs_s x0, SYS_ID_AA64MMFR2_EL1
- and x0, x0, #(0xf << ID_AA64MMFR2_EL1_VARange_SHIFT)
+ and x0, x0, ID_AA64MMFR2_EL1_VARange_MASK
cbnz x0, 2f
update_early_cpu_boot_status \
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index db2a1861bb97..35225632d70a 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -654,7 +654,7 @@ static int breakpoint_handler(unsigned long unused, unsigned long esr,
perf_bp_event(bp, regs);
/* Do we need to handle the stepping? */
- if (is_default_overflow_handler(bp))
+ if (uses_default_overflow_handler(bp))
step = 1;
unlock:
rcu_read_unlock();
@@ -733,7 +733,7 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
static int watchpoint_report(struct perf_event *wp, unsigned long addr,
struct pt_regs *regs)
{
- int step = is_default_overflow_handler(wp);
+ int step = uses_default_overflow_handler(wp);
struct arch_hw_breakpoint *info = counter_arch_bp(wp);
info->trigger = addr;
diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c
index 2fe2491b692c..aee12c75b738 100644
--- a/arch/arm64/kernel/idreg-override.c
+++ b/arch/arm64/kernel/idreg-override.c
@@ -262,9 +262,9 @@ static __init void __parse_cmdline(const char *cmdline, bool parse_aliases)
if (!len)
return;
- len = min(len, ARRAY_SIZE(buf) - 1);
- strncpy(buf, cmdline, len);
- buf[len] = 0;
+ len = strscpy(buf, cmdline, ARRAY_SIZE(buf));
+ if (len == -E2BIG)
+ len = ARRAY_SIZE(buf) - 1;
if (strcmp(buf, "--") == 0)
return;
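
The conversion above leans on strscpy()'s contract, which is easy to misremember. Restated as a kernel-context sketch (buffer contents shown in the comment):

    #include <linux/errno.h>
    #include <linux/string.h>

    static void strscpy_contract_demo(void)
    {
            char buf[8];
            ssize_t n = strscpy(buf, "123456789", sizeof(buf));

            /* n == -E2BIG: source was truncated; buf == "1234567" + NUL */
            if (n == -E2BIG)
                    n = sizeof(buf) - 1;    /* same clamp as the hunk above */
    }

Unlike strncpy(), strscpy() always NUL-terminates a non-empty destination, which is why the manual buf[len] = 0 could be dropped.
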
diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
index 2276689b5411..f872c57e9909 100644
--- a/arch/arm64/kernel/pci.c
+++ b/arch/arm64/kernel/pci.c
@@ -11,8 +11,6 @@
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/of_pci.h>
-#include <linux/of_platform.h>
#include <linux/pci.h>
#include <linux/pci-acpi.h>
#include <linux/pci-ecam.h>
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 187aa2b175b4..20d7ef82de90 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -891,7 +891,8 @@ static int sve_set_common(struct task_struct *target,
break;
default:
WARN_ON_ONCE(1);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
/*
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index 830be01af32d..255d12f881c2 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -47,6 +47,9 @@ DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr);
DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr);
#endif
+DEFINE_PER_CPU(struct sdei_registered_event *, sdei_active_normal_event);
+DEFINE_PER_CPU(struct sdei_registered_event *, sdei_active_critical_event);
+
static void _free_sdei_stack(unsigned long * __percpu *ptr, int cpu)
{
unsigned long *p;
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index edd63894d61e..960b98b43506 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -1044,10 +1044,8 @@ void crash_smp_send_stop(void)
* If this cpu is the only one alive at this point in time, online or
* not, there are no stop messages to be sent around, so just back out.
*/
- if (num_other_online_cpus() == 0) {
- sdei_mask_local_cpu();
- return;
- }
+ if (num_other_online_cpus() == 0)
+ goto skip_ipi;
cpumask_copy(&mask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &mask);
@@ -1066,7 +1064,9 @@ void crash_smp_send_stop(void)
pr_warn("SMP: failed to stop secondary CPUs %*pbl\n",
cpumask_pr_args(&mask));
+skip_ipi:
sdei_mask_local_cpu();
+ sdei_handler_abort();
}
bool smp_crash_stop_failed(void)
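
crash_smp_send_stop() now ends with sdei_handler_abort(), whose driver-side body is not part of this diff. Based on the per-CPU bookkeeping added in the sdei.c and entry.S hunks above, it plausibly looks like the following sketch (an assumption about drivers/firmware/arm_sdei.c, not quoted from it):

    void sdei_handler_abort(void)
    {
            /* If the crash happened inside an SDEI handler, firmware still
             * considers that handler live; complete-and-resume it so the
             * panic path can run. Critical events preempt normal ones, so
             * deal with them first. */
            if (__this_cpu_read(sdei_active_critical_event)) {
                    pr_warn("still in SDEI critical event context, aborting\n");
                    __sdei_handler_abort();
                    __this_cpu_write(sdei_active_critical_event, NULL);
            }
            if (__this_cpu_read(sdei_active_normal_event)) {
                    pr_warn("still in SDEI normal event context, aborting\n");
                    __sdei_handler_abort();
                    __this_cpu_write(sdei_active_normal_event, NULL);
            }
    }
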
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index b1ae2f2eaf77..9a70d9746b66 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -8,7 +8,6 @@
#include <linux/randomize_kstack.h>
#include <linux/syscalls.h>
-#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/exception.h>
#include <asm/fpsimd.h>
@@ -101,8 +100,6 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
* (Similarly for HVC and SMC elsewhere.)
*/
- local_daif_restore(DAIF_PROCCTX);
-
if (flags & _TIF_MTE_ASYNC_FAULT) {
/*
* Process the asynchronous tag check fault before the actual
@@ -153,38 +150,8 @@ trace_exit:
syscall_trace_exit(regs);
}
-/*
- * As per the ABI exit SME streaming mode and clear the SVE state not
- * shared with FPSIMD on syscall entry.
- */
-static inline void fp_user_discard(void)
-{
- /*
- * If SME is active then exit streaming mode. If ZA is active
- * then flush the SVE registers but leave userspace access to
- * both SVE and SME enabled, otherwise disable SME for the
- * task and fall through to disabling SVE too. This means
- * that after a syscall we never have any streaming mode
- * register state to track, if this changes the KVM code will
- * need updating.
- */
- if (system_supports_sme())
- sme_smstop_sm();
-
- if (!system_supports_sve())
- return;
-
- if (test_thread_flag(TIF_SVE)) {
- unsigned int sve_vq_minus_one;
-
- sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1;
- sve_flush_live(true, sve_vq_minus_one);
- }
-}
-
void do_el0_svc(struct pt_regs *regs)
{
- fp_user_discard();
el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table);
}
diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S
index 6028f1fe2d1c..45354f2ddf70 100644
--- a/arch/arm64/kernel/vdso/vdso.lds.S
+++ b/arch/arm64/kernel/vdso/vdso.lds.S
@@ -50,9 +50,7 @@ SECTIONS
. = ALIGN(4);
.altinstructions : {
- __alt_instructions = .;
*(.altinstructions)
- __alt_instructions_end = .;
}
.dynamic : { *(.dynamic) } :text :dynamic
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 9ddc025e4b86..2250253a6429 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -25,7 +25,7 @@ hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o
cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o
hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
-hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o
+hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o
hyp-obj-y += $(lib-objs)
##
diff --git a/arch/arm64/kvm/hyp/nvhe/list_debug.c b/arch/arm64/kvm/hyp/nvhe/list_debug.c
index d68abd7ea124..46a2d4f2b3c6 100644
--- a/arch/arm64/kvm/hyp/nvhe/list_debug.c
+++ b/arch/arm64/kvm/hyp/nvhe/list_debug.c
@@ -26,8 +26,9 @@ static inline __must_check bool nvhe_check_data_corruption(bool v)
/* The predicates checked here are taken from lib/list_debug.c. */
-bool __list_add_valid(struct list_head *new, struct list_head *prev,
- struct list_head *next)
+__list_valid_slowpath
+bool __list_add_valid_or_report(struct list_head *new, struct list_head *prev,
+ struct list_head *next)
{
if (NVHE_CHECK_DATA_CORRUPTION(next->prev != prev) ||
NVHE_CHECK_DATA_CORRUPTION(prev->next != next) ||
@@ -37,7 +38,8 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev,
return true;
}
-bool __list_del_entry_valid(struct list_head *entry)
+__list_valid_slowpath
+bool __list_del_entry_valid_or_report(struct list_head *entry)
{
struct list_head *prev, *next;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index d31c3a9290c5..4fcb88a445ef 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -73,6 +73,33 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit;
#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
+/*
+ * To make optimal use of block mappings when laying out the linear
+ * mapping, round down the base of physical memory to a size that can
+ * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE
+ * (64k granule), or a multiple that can be mapped using contiguous bits
+ * in the page tables: 32 * PMD_SIZE (16k granule)
+ */
+#if defined(CONFIG_ARM64_4K_PAGES)
+#define ARM64_MEMSTART_SHIFT PUD_SHIFT
+#elif defined(CONFIG_ARM64_16K_PAGES)
+#define ARM64_MEMSTART_SHIFT CONT_PMD_SHIFT
+#else
+#define ARM64_MEMSTART_SHIFT PMD_SHIFT
+#endif
+
+/*
+ * sparsemem vmemmap imposes an additional requirement on the alignment of
+ * memstart_addr, due to the fact that the base of the vmemmap region
+ * has a direct correspondence, and needs to appear sufficiently aligned
+ * in the virtual address space.
+ */
+#if ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS
+#define ARM64_MEMSTART_ALIGN (1UL << SECTION_SIZE_BITS)
+#else
+#define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT)
+#endif
+
static int __init reserve_crashkernel_low(unsigned long long low_size)
{
unsigned long long low_base;
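
To put numbers on the constants relocated in the hunk above (typical values, stated as an estimate rather than taken from this patch): with 4K pages PUD_SHIFT is 30 and with 16K pages CONT_PMD_SHIFT is also 30, so ARM64_MEMSTART_ALIGN works out to 1 GiB in both cases; with 64K pages PMD_SHIFT is 29, giving 512 MiB. The base of physical memory is rounded down to that boundary so the linear map can be built from block or contiguous mappings.
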
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 2baeec419f62..14fdf645edc8 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -447,7 +447,7 @@ SYM_FUNC_START(__cpu_setup)
* via capabilities.
*/
mrs x9, ID_AA64MMFR1_EL1
- and x9, x9, #0xf
+ and x9, x9, ID_AA64MMFR1_EL1_HAFDBS_MASK
cbz x9, 1f
orr tcr, tcr, #TCR_HA // hardware Access flag update
1:
diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
index 87927eb824cc..58500a964238 100644
--- a/arch/ia64/include/asm/acpi.h
+++ b/arch/ia64/include/asm/acpi.h
@@ -11,7 +11,7 @@
#ifdef __KERNEL__
-#include <acpi/pdc_intel.h>
+#include <acpi/proc_cap_intel.h>
#include <linux/init.h>
#include <linux/numa.h>
@@ -69,9 +69,9 @@ extern int __initdata nid_to_pxm_map[MAX_NUMNODES];
#endif
static inline bool arch_has_acpi_pdc(void) { return true; }
-static inline void arch_acpi_set_pdc_bits(u32 *buf)
+static inline void arch_acpi_set_proc_cap_bits(u32 *cap)
{
- buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP;
+ *cap |= ACPI_PROC_CAP_EST_CAPABILITY_SMP;
}
#ifdef CONFIG_ACPI_NUMA
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index f8c74ffeeefb..83d8609aec03 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -372,3 +372,4 @@
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index e71d5bf2cee0..465759f6b0ed 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -662,5 +662,3 @@ source "kernel/power/Kconfig"
source "drivers/acpi/Kconfig"
endmenu
-
-source "drivers/firmware/Kconfig"
diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index b1e5db51b61c..ef87bab46754 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -83,8 +83,8 @@ KBUILD_CFLAGS_KERNEL += -fPIE
LDFLAGS_vmlinux += -static -pie --no-dynamic-linker -z notext
endif
-cflags-y += -ffreestanding
cflags-y += $(call cc-option, -mno-check-zero-division)
+cflags-y += -fno-builtin-memcpy -fno-builtin-memmove -fno-builtin-memset
load-y = 0x9000000000200000
bootvars-y = VMLINUX_LOAD_ADDRESS=$(load-y)
diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 6b222f227342..93783fa24f6e 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
generic-y += dma-contiguous.h
-generic-y += export.h
generic-y += mcs_spinlock.h
generic-y += parport.h
generic-y += early_ioremap.h
diff --git a/arch/loongarch/include/asm/fpu.h b/arch/loongarch/include/asm/fpu.h
index b541f6248837..c2d8962fda00 100644
--- a/arch/loongarch/include/asm/fpu.h
+++ b/arch/loongarch/include/asm/fpu.h
@@ -173,16 +173,30 @@ static inline void restore_fp(struct task_struct *tsk)
_restore_fp(&tsk->thread.fpu);
}
-static inline union fpureg *get_fpu_regs(struct task_struct *tsk)
+static inline void save_fpu_regs(struct task_struct *tsk)
{
+ unsigned int euen;
+
if (tsk == current) {
preempt_disable();
- if (is_fpu_owner())
+
+ euen = csr_read32(LOONGARCH_CSR_EUEN);
+
+#ifdef CONFIG_CPU_HAS_LASX
+ if (euen & CSR_EUEN_LASXEN)
+ _save_lasx(&current->thread.fpu);
+ else
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+ if (euen & CSR_EUEN_LSXEN)
+ _save_lsx(&current->thread.fpu);
+ else
+#endif
+ if (euen & CSR_EUEN_FPEN)
_save_fp(&current->thread.fpu);
+
preempt_enable();
}
-
- return tsk->thread.fpu.fpr;
}
static inline int is_simd_owner(void)
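
The old get_fpu_regs() handed back a pointer and saved live state as a side effect; save_fpu_regs() splits those concerns, so callers must snapshot explicitly before reading thread.fpu, as the ptrace hunks later in this patch do. The implied pattern, as a sketch:

    static void read_task_fprs(struct task_struct *target, union fpureg *buf)
    {
            save_fpu_regs(target);  /* flushes live state only if target == current */
            memcpy(buf, target->thread.fpu.fpr,
                   sizeof(target->thread.fpu.fpr));
    }
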
diff --git a/arch/loongarch/include/asm/local.h b/arch/loongarch/include/asm/local.h
index 83e995b30e47..c49675852bdc 100644
--- a/arch/loongarch/include/asm/local.h
+++ b/arch/loongarch/include/asm/local.h
@@ -63,8 +63,8 @@ static inline long local_cmpxchg(local_t *l, long old, long new)
static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
{
- typeof(l->a.counter) *__old = (typeof(l->a.counter) *) old;
- return try_cmpxchg_local(&l->a.counter, __old, new);
+ return try_cmpxchg_local(&l->a.counter,
+ (typeof(l->a.counter) *) old, new);
}
#define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n)))
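
The refactor above is cosmetic, but the API it touches has a subtlety worth a reminder: on failure, try_cmpxchg-style primitives write the observed value back through the old pointer, which is what makes the canonical retry loop so compact. A sketch:

    static void local_inc_demo(local_t *l)
    {
            long old = local_read(l);

            /* on failure, 'old' is refreshed with the current value,
             * so the loop body never needs to re-read it */
            while (!local_try_cmpxchg(l, &old, old + 1))
                    ;
    }
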
diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h
index 35f0958163ac..f3ddaed9ef7f 100644
--- a/arch/loongarch/include/asm/ptrace.h
+++ b/arch/loongarch/include/asm/ptrace.h
@@ -162,7 +162,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long val
#define instruction_pointer(regs) ((regs)->csr_era)
#define profile_pc(regs) instruction_pointer(regs)
-extern void die(const char *, struct pt_regs *) __noreturn;
+extern void die(const char *str, struct pt_regs *regs);
static inline void die_if_kernel(const char *str, struct pt_regs *regs)
{
diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h
index 416b653bccb4..66ecb480c894 100644
--- a/arch/loongarch/include/asm/smp.h
+++ b/arch/loongarch/include/asm/smp.h
@@ -98,8 +98,6 @@ static inline void __cpu_die(unsigned int cpu)
{
loongson_cpu_die(cpu);
}
-
-extern void __noreturn play_dead(void);
#endif
#endif /* __ASM_SMP_H */
diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S
index f3df5f0a4509..501094a09f5d 100644
--- a/arch/loongarch/kernel/fpu.S
+++ b/arch/loongarch/kernel/fpu.S
@@ -6,12 +6,12 @@
*
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#include <linux/export.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/asm-extable.h>
#include <asm/asm-offsets.h>
#include <asm/errno.h>
-#include <asm/export.h>
#include <asm/fpregdef.h>
#include <asm/loongarch.h>
#include <asm/regdef.h>
diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c
index 021b59c248fa..fc55c4de2a11 100644
--- a/arch/loongarch/kernel/hw_breakpoint.c
+++ b/arch/loongarch/kernel/hw_breakpoint.c
@@ -207,8 +207,7 @@ static int hw_breakpoint_control(struct perf_event *bp,
write_wb_reg(CSR_CFG_CTRL, i, 0, CTRL_PLV_ENABLE);
} else {
ctrl = encode_ctrl_reg(info->ctrl);
- write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | CTRL_PLV_ENABLE |
- 1 << MWPnCFG3_LoadEn | 1 << MWPnCFG3_StoreEn);
+ write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | CTRL_PLV_ENABLE);
}
enable = csr_read64(LOONGARCH_CSR_CRMD);
csr_write64(CSR_CRMD_WE | enable, LOONGARCH_CSR_CRMD);
diff --git a/arch/loongarch/kernel/mcount.S b/arch/loongarch/kernel/mcount.S
index cb8e5803de4b..3015896016a0 100644
--- a/arch/loongarch/kernel/mcount.S
+++ b/arch/loongarch/kernel/mcount.S
@@ -5,7 +5,7 @@
* Copyright (C) 2022 Loongson Technology Corporation Limited
*/
-#include <asm/export.h>
+#include <linux/export.h>
#include <asm/ftrace.h>
#include <asm/regdef.h>
#include <asm/stackframe.h>
diff --git a/arch/loongarch/kernel/mcount_dyn.S b/arch/loongarch/kernel/mcount_dyn.S
index e16ab0b98e5a..482aa553aa2d 100644
--- a/arch/loongarch/kernel/mcount_dyn.S
+++ b/arch/loongarch/kernel/mcount_dyn.S
@@ -3,7 +3,6 @@
* Copyright (C) 2022 Loongson Technology Corporation Limited
*/
-#include <asm/export.h>
#include <asm/ftrace.h>
#include <asm/regdef.h>
#include <asm/stackframe.h>
diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c
index 2e04eb07abb6..4ee1e9d6a65f 100644
--- a/arch/loongarch/kernel/process.c
+++ b/arch/loongarch/kernel/process.c
@@ -61,13 +61,6 @@ EXPORT_SYMBOL(__stack_chk_guard);
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);
-#ifdef CONFIG_HOTPLUG_CPU
-void __noreturn arch_cpu_idle_dead(void)
-{
- play_dead();
-}
-#endif
-
asmlinkage void ret_from_fork(void);
asmlinkage void ret_from_kernel_thread(void);
diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c
index a0767c3a0f0a..f72adbf530c6 100644
--- a/arch/loongarch/kernel/ptrace.c
+++ b/arch/loongarch/kernel/ptrace.c
@@ -147,6 +147,8 @@ static int fpr_get(struct task_struct *target,
{
int r;
+ save_fpu_regs(target);
+
if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t))
r = gfpr_get(target, &to);
else
@@ -278,6 +280,8 @@ static int simd_get(struct task_struct *target,
{
const unsigned int wr_size = NUM_FPU_REGS * regset->size;
+ save_fpu_regs(target);
+
if (!tsk_used_math(target)) {
/* The task hasn't used FP or LSX, fill with 0xff */
copy_pad_fprs(target, regset, &to, 0);
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index 8ea1bbcf13a7..6667b0a90f81 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -317,7 +317,7 @@ void loongson_cpu_die(unsigned int cpu)
mb();
}
-void play_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
register uint64_t addr;
register void (*init_fn)(void);
diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
index 8fb5e7a77145..89699db45cec 100644
--- a/arch/loongarch/kernel/traps.c
+++ b/arch/loongarch/kernel/traps.c
@@ -383,16 +383,15 @@ void show_registers(struct pt_regs *regs)
static DEFINE_RAW_SPINLOCK(die_lock);
-void __noreturn die(const char *str, struct pt_regs *regs)
+void die(const char *str, struct pt_regs *regs)
{
+ int ret;
static int die_counter;
- int sig = SIGSEGV;
oops_enter();
- if (notify_die(DIE_OOPS, str, regs, 0, current->thread.trap_nr,
- SIGSEGV) == NOTIFY_STOP)
- sig = 0;
+ ret = notify_die(DIE_OOPS, str, regs, 0,
+ current->thread.trap_nr, SIGSEGV);
console_verbose();
raw_spin_lock_irq(&die_lock);
@@ -405,6 +404,9 @@ void __noreturn die(const char *str, struct pt_regs *regs)
oops_exit();
+ if (ret == NOTIFY_STOP)
+ return;
+
if (regs && kexec_should_crash(current))
crash_kexec(regs);
@@ -414,7 +416,7 @@ void __noreturn die(const char *str, struct pt_regs *regs)
if (panic_on_oops)
panic("Fatal exception");
- make_task_dead(sig);
+ make_task_dead(SIGSEGV);
}
static inline void setup_vint_size(unsigned int size)
diff --git a/arch/loongarch/lib/clear_user.S b/arch/loongarch/lib/clear_user.S
index 9dcf71719387..0790eadce166 100644
--- a/arch/loongarch/lib/clear_user.S
+++ b/arch/loongarch/lib/clear_user.S
@@ -3,12 +3,12 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#include <linux/export.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/asm-extable.h>
#include <asm/cpu.h>
-#include <asm/export.h>
#include <asm/regdef.h>
.irp to, 0, 1, 2, 3, 4, 5, 6, 7
diff --git a/arch/loongarch/lib/copy_user.S b/arch/loongarch/lib/copy_user.S
index fecd08cad702..bfe3d2793d00 100644
--- a/arch/loongarch/lib/copy_user.S
+++ b/arch/loongarch/lib/copy_user.S
@@ -3,12 +3,12 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#include <linux/export.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/asm-extable.h>
#include <asm/cpu.h>
-#include <asm/export.h>
#include <asm/regdef.h>
.irp to, 0, 1, 2, 3, 4, 5, 6, 7
diff --git a/arch/loongarch/lib/memcpy.S b/arch/loongarch/lib/memcpy.S
index 39ce6621c704..cc30b3b6252f 100644
--- a/arch/loongarch/lib/memcpy.S
+++ b/arch/loongarch/lib/memcpy.S
@@ -3,11 +3,11 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#include <linux/export.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/cpu.h>
-#include <asm/export.h>
#include <asm/regdef.h>
SYM_FUNC_START(memcpy)
diff --git a/arch/loongarch/lib/memmove.S b/arch/loongarch/lib/memmove.S
index 45b725ba7867..7dc76d1484b6 100644
--- a/arch/loongarch/lib/memmove.S
+++ b/arch/loongarch/lib/memmove.S
@@ -3,11 +3,11 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#include <linux/export.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/cpu.h>
-#include <asm/export.h>
#include <asm/regdef.h>
SYM_FUNC_START(memmove)
diff --git a/arch/loongarch/lib/memset.S b/arch/loongarch/lib/memset.S
index b39c6194e3ae..3f20f7996e8e 100644
--- a/arch/loongarch/lib/memset.S
+++ b/arch/loongarch/lib/memset.S
@@ -3,11 +3,11 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#include <linux/export.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/cpu.h>
-#include <asm/export.h>
#include <asm/regdef.h>
.macro fill_to_64 r0
diff --git a/arch/loongarch/lib/unaligned.S b/arch/loongarch/lib/unaligned.S
index 9177fd638f07..185f82d85810 100644
--- a/arch/loongarch/lib/unaligned.S
+++ b/arch/loongarch/lib/unaligned.S
@@ -9,7 +9,6 @@
#include <asm/asmmacro.h>
#include <asm/asm-extable.h>
#include <asm/errno.h>
-#include <asm/export.h>
#include <asm/regdef.h>
.L_fixup_handle_unaligned:
diff --git a/arch/loongarch/mm/page.S b/arch/loongarch/mm/page.S
index 4c874a7af0ad..7ad76551d313 100644
--- a/arch/loongarch/mm/page.S
+++ b/arch/loongarch/mm/page.S
@@ -2,9 +2,9 @@
/*
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>
-#include <asm/export.h>
#include <asm/page.h>
#include <asm/regdef.h>
diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S
index 4ad78703de6f..ca17dd3a1915 100644
--- a/arch/loongarch/mm/tlbex.S
+++ b/arch/loongarch/mm/tlbex.S
@@ -3,7 +3,6 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
#include <asm/asm.h>
-#include <asm/export.h>
#include <asm/loongarch.h>
#include <asm/page.h>
#include <asm/pgtable.h>
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index 4383ed851063..6deb8faa564b 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -591,7 +591,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -635,6 +634,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index ec0f9c9f9562..802c161827f4 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -548,7 +548,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -591,6 +590,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index 8656ae1f239e..2cb3d755873b 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -568,7 +568,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -612,6 +611,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 496fb6a415ea..b13552caa6b3 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -540,7 +540,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -583,6 +582,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 4add7ab9973b..f88356c45440 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -550,7 +550,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -593,6 +592,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 5845f1f71fd1..7c2ebb616fba 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -570,7 +570,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -614,6 +613,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index bbb251bab81a..d3b272910b38 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -656,7 +656,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -700,6 +699,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index 4f9cfc70c66d..4529bc4b843c 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -539,7 +539,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -582,6 +581,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 67c42b4822f0..30824032e4d5 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -540,7 +540,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -583,6 +582,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 85f19515200b..3911211410ed 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -557,7 +557,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -601,6 +600,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index b1b15acb5d5f..991730c50957 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -538,7 +538,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -580,6 +579,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 91d66c0f5ab6..e80d7509ab1d 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -538,7 +538,6 @@ CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_SHA3=m
CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_VMAC=m
CONFIG_CRYPTO_WP512=m
@@ -581,6 +580,7 @@ CONFIG_TEST_SCANF=m
CONFIG_TEST_BITMAP=m
CONFIG_TEST_UUID=m
CONFIG_TEST_XARRAY=m
+CONFIG_TEST_MAPLE_TREE=m
CONFIG_TEST_RHASHTABLE=m
CONFIG_TEST_IDA=m
CONFIG_TEST_BITOPS=m
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index 1b720299deb1..0dbf9c5c6fae 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
generated-y += syscall_table.h
-generic-y += export.h
generic-y += extable.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
diff --git a/arch/m68k/include/asm/div64.h b/arch/m68k/include/asm/div64.h
index 365f39f5e256..df1f6b450cc5 100644
--- a/arch/m68k/include/asm/div64.h
+++ b/arch/m68k/include/asm/div64.h
@@ -31,6 +31,9 @@
__rem; \
})
+/* defining this stops the unused helper function from being built */
+#define __div64_32 __div64_32
+
#endif /* CONFIG_CPU_HAS_NO_MULDIV64 */
#endif /* _M68K_DIV64_H */
diff --git a/arch/m68k/include/asm/string.h b/arch/m68k/include/asm/string.h
index f0f5021d6327..760cc13acdf4 100644
--- a/arch/m68k/include/asm/string.h
+++ b/arch/m68k/include/asm/string.h
@@ -41,6 +41,7 @@ static inline char *strncpy(char *dest, const char *src, size_t n)
#define __HAVE_ARCH_MEMMOVE
extern void *memmove(void *, const void *, __kernel_size_t);
+extern int memcmp(const void *, const void *, __kernel_size_t);
#define memcmp(d, s, n) __builtin_memcmp(d, s, n)
#define __HAVE_ARCH_MEMSET
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 4f504783371f..259ceb125367 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -451,3 +451,4 @@
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
diff --git a/arch/m68k/lib/divsi3.S b/arch/m68k/lib/divsi3.S
index 3a2143f51631..62787b4333e7 100644
--- a/arch/m68k/lib/divsi3.S
+++ b/arch/m68k/lib/divsi3.S
@@ -33,7 +33,7 @@ General Public License for more details. */
D. V. Henkel-Wallace (gumby@cygnus.com) Fete Bastille, 1992
*/
-#include <asm/export.h>
+#include <linux/export.h>
/* These are predefined by new versions of GNU cpp. */
diff --git a/arch/m68k/lib/modsi3.S b/arch/m68k/lib/modsi3.S
index 1c967649a4e0..1bcb742d0b76 100644
--- a/arch/m68k/lib/modsi3.S
+++ b/arch/m68k/lib/modsi3.S
@@ -33,7 +33,7 @@ General Public License for more details. */
D. V. Henkel-Wallace (gumby@cygnus.com) Fete Bastille, 1992
*/
-#include <asm/export.h>
+#include <linux/export.h>
/* These are predefined by new versions of GNU cpp. */
diff --git a/arch/m68k/lib/mulsi3.S b/arch/m68k/lib/mulsi3.S
index 855675e69a8a..c2853248249e 100644
--- a/arch/m68k/lib/mulsi3.S
+++ b/arch/m68k/lib/mulsi3.S
@@ -32,7 +32,7 @@ General Public License for more details. */
Some of this code comes from MINIX, via the folks at ericsson.
D. V. Henkel-Wallace (gumby@cygnus.com) Fete Bastille, 1992
*/
-#include <asm/export.h>
+#include <linux/export.h>
/* These are predefined by new versions of GNU cpp. */
#ifndef __USER_LABEL_PREFIX__
diff --git a/arch/m68k/lib/udivsi3.S b/arch/m68k/lib/udivsi3.S
index 78440ae513bf..39ad70596293 100644
--- a/arch/m68k/lib/udivsi3.S
+++ b/arch/m68k/lib/udivsi3.S
@@ -32,7 +32,7 @@ General Public License for more details. */
Some of this code comes from MINIX, via the folks at ericsson.
D. V. Henkel-Wallace (gumby@cygnus.com) Fete Bastille, 1992
*/
-#include <asm/export.h>
+#include <linux/export.h>
/* These are predefined by new versions of GNU cpp. */
#ifndef __USER_LABEL_PREFIX__
diff --git a/arch/m68k/lib/umodsi3.S b/arch/m68k/lib/umodsi3.S
index b6fd11f58948..6640eaa9eb03 100644
--- a/arch/m68k/lib/umodsi3.S
+++ b/arch/m68k/lib/umodsi3.S
@@ -32,7 +32,7 @@ General Public License for more details. */
Some of this code comes from MINIX, via the folks at ericsson.
D. V. Henkel-Wallace (gumby@cygnus.com) Fete Bastille, 1992
*/
-#include <asm/export.h>
+#include <linux/export.h>
/* These are predefined by new versions of GNU cpp. */
#ifndef __USER_LABEL_PREFIX__
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 858d22bf275c..a3798c2637fd 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -457,3 +457,4 @@
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
diff --git a/arch/mips/include/asm/local.h b/arch/mips/include/asm/local.h
index 5daf6fe8e3e9..e6ae3df0349d 100644
--- a/arch/mips/include/asm/local.h
+++ b/arch/mips/include/asm/local.h
@@ -101,8 +101,8 @@ static __inline__ long local_cmpxchg(local_t *l, long old, long new)
static __inline__ bool local_try_cmpxchg(local_t *l, long *old, long new)
{
- typeof(l->a.counter) *__old = (typeof(l->a.counter) *) old;
- return try_cmpxchg_local(&l->a.counter, __old, new);
+ return try_cmpxchg_local(&l->a.counter,
+ (typeof(l->a.counter) *) old, new);
}
#define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n)))
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 1976317d4e8b..152034b8e0a0 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -390,3 +390,4 @@
449 n32 futex_waitv sys_futex_waitv
450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node
451 n32 cachestat sys_cachestat
+452 n32 fchmodat2 sys_fchmodat2
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index cfda2511badf..cb5e757f6621 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -366,3 +366,4 @@
449 n64 futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 n64 cachestat sys_cachestat
+452 n64 fchmodat2 sys_fchmodat2
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 7692234c3768..1a646813afdc 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -439,3 +439,4 @@
449 o32 futex_waitv sys_futex_waitv
450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node
451 o32 cachestat sys_cachestat
+452 o32 fchmodat2 sys_fchmodat2
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index a0a9145b6dd4..e97c175b56f9 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -450,3 +450,4 @@
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 8c0b08b7a80e..20e50586e8a2 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -538,3 +538,4 @@
449 common futex_waitv sys_futex_waitv
450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index 0dc85556dec5..ec98e526167e 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -145,6 +145,7 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
static const struct mm_walk_ops subpage_walk_ops = {
.pmd_entry = subpage_walk_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
};
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index ea807aa0c31a..38c5be34c895 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -86,7 +86,7 @@ spufs_new_inode(struct super_block *sb, umode_t mode)
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
out:
return inode;
}
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 10e7a7ad175a..bea7b73e895d 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -580,15 +580,15 @@ config TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI
and Zifencei are supported in binutils from version 2.36 onwards.
To make life easier, and avoid forcing toolchains that default to a
newer ISA spec to version 2.2, relax the check to binutils >= 2.36.
- For clang < 17 or GCC < 11.1.0, for which this is not possible, this is
- dealt with in CONFIG_TOOLCHAIN_NEEDS_OLD_ISA_SPEC.
+ For clang < 17 or GCC < 11.3.0, where this is not possible or needs
+ special treatment, this is dealt with in TOOLCHAIN_NEEDS_OLD_ISA_SPEC.
config TOOLCHAIN_NEEDS_OLD_ISA_SPEC
def_bool y
depends on TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI
# https://github.com/llvm/llvm-project/commit/22e199e6afb1263c943c0c0d4498694e15bf8a16
- # https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=b03be74bad08c382da47e048007a78fa3fb4ef49
- depends on (CC_IS_CLANG && CLANG_VERSION < 170000) || (CC_IS_GCC && GCC_VERSION < 110100)
+ # https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=d29f5d6ab513c52fd872f532c492e35ae9fd6671
+ depends on (CC_IS_CLANG && CLANG_VERSION < 170000) || (CC_IS_GCC && GCC_VERSION < 110300)
help
Certain versions of clang and GCC do not support zicsr and zifencei via
-march. This option causes an older ISA spec compatible with these older
diff --git a/arch/riscv/include/asm/efi.h b/arch/riscv/include/asm/efi.h
index 29e9a0d84b16..8a6a128ec57f 100644
--- a/arch/riscv/include/asm/efi.h
+++ b/arch/riscv/include/asm/efi.h
@@ -21,12 +21,6 @@ extern void efi_init(void);
int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md);
int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md, bool);
-#define arch_efi_call_virt_setup() ({ \
- sync_kernel_mappings(efi_mm.pgd); \
- efi_virtmap_load(); \
- })
-#define arch_efi_call_virt_teardown() efi_virtmap_unload()
-
#define ARCH_EFI_IRQ_FLAGS_MASK (SR_IE | SR_SPIE)
/* Load initrd anywhere in system RAM */
@@ -46,8 +40,8 @@ static inline unsigned long efi_get_kimg_min_align(void)
#define EFI_KIMG_PREFERRED_ADDRESS efi_get_kimg_min_align()
-void efi_virtmap_load(void);
-void efi_virtmap_unload(void);
+void arch_efi_call_virt_setup(void);
+void arch_efi_call_virt_teardown(void);
unsigned long stext_offset(void);
diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
index 3d78930cab51..c5ee07b3df07 100644
--- a/arch/riscv/include/asm/vector.h
+++ b/arch/riscv/include/asm/vector.h
@@ -70,8 +70,9 @@ static __always_inline void __vstate_csr_save(struct __riscv_v_ext_state *dest)
"csrr %1, " __stringify(CSR_VTYPE) "\n\t"
"csrr %2, " __stringify(CSR_VL) "\n\t"
"csrr %3, " __stringify(CSR_VCSR) "\n\t"
+ "csrr %4, " __stringify(CSR_VLENB) "\n\t"
: "=r" (dest->vstart), "=r" (dest->vtype), "=r" (dest->vl),
- "=r" (dest->vcsr) : :);
+ "=r" (dest->vcsr), "=r" (dest->vlenb) : :);
}
static __always_inline void __vstate_csr_restore(struct __riscv_v_ext_state *src)
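
vlenb (the vector register width in bytes) is a user-readable CSR, so exposing it through the ptrace view mostly matters for remote debuggers and core dumps; a live process can also read it directly, assuming the V extension is present and the assembler knows the CSR name:

    static unsigned long read_vlenb(void)
    {
            unsigned long vlenb;

            __asm__ volatile("csrr %0, vlenb" : "=r"(vlenb));
            return vlenb;
    }
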
diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h
index e17c550986a6..283800130614 100644
--- a/arch/riscv/include/uapi/asm/ptrace.h
+++ b/arch/riscv/include/uapi/asm/ptrace.h
@@ -97,6 +97,7 @@ struct __riscv_v_ext_state {
unsigned long vl;
unsigned long vtype;
unsigned long vcsr;
+ unsigned long vlenb;
void *datap;
/*
* In signal handler, datap will be set a correct user stack offset
diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index 1d572cf3140f..487303e3ef22 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -25,9 +25,6 @@ enum riscv_regset {
#ifdef CONFIG_FPU
REGSET_F,
#endif
-#ifdef CONFIG_RISCV_ISA_V
- REGSET_V,
-#endif
};
static int riscv_gpr_get(struct task_struct *target,
@@ -84,61 +81,6 @@ static int riscv_fpr_set(struct task_struct *target,
}
#endif
-#ifdef CONFIG_RISCV_ISA_V
-static int riscv_vr_get(struct task_struct *target,
- const struct user_regset *regset,
- struct membuf to)
-{
- struct __riscv_v_ext_state *vstate = &target->thread.vstate;
-
- if (!riscv_v_vstate_query(task_pt_regs(target)))
- return -EINVAL;
-
- /*
- * Ensure the vector registers have been saved to the memory before
- * copying them to membuf.
- */
- if (target == current)
- riscv_v_vstate_save(current, task_pt_regs(current));
-
- /* Copy vector header from vstate. */
- membuf_write(&to, vstate, offsetof(struct __riscv_v_ext_state, datap));
- membuf_zero(&to, sizeof(vstate->datap));
-
- /* Copy all the vector registers from vstate. */
- return membuf_write(&to, vstate->datap, riscv_v_vsize);
-}
-
-static int riscv_vr_set(struct task_struct *target,
- const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- const void *kbuf, const void __user *ubuf)
-{
- int ret, size;
- struct __riscv_v_ext_state *vstate = &target->thread.vstate;
-
- if (!riscv_v_vstate_query(task_pt_regs(target)))
- return -EINVAL;
-
- /* Copy rest of the vstate except datap */
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vstate, 0,
- offsetof(struct __riscv_v_ext_state, datap));
- if (unlikely(ret))
- return ret;
-
- /* Skip copy datap. */
- size = sizeof(vstate->datap);
- count -= size;
- ubuf += size;
-
- /* Copy all the vector registers. */
- pos = 0;
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vstate->datap,
- 0, riscv_v_vsize);
- return ret;
-}
-#endif
-
static const struct user_regset riscv_user_regset[] = {
[REGSET_X] = {
.core_note_type = NT_PRSTATUS,
@@ -158,17 +100,6 @@ static const struct user_regset riscv_user_regset[] = {
.set = riscv_fpr_set,
},
#endif
-#ifdef CONFIG_RISCV_ISA_V
- [REGSET_V] = {
- .core_note_type = NT_RISCV_VECTOR,
- .align = 16,
- .n = ((32 * RISCV_MAX_VLENB) +
- sizeof(struct __riscv_v_ext_state)) / sizeof(__u32),
- .size = sizeof(__u32),
- .regset_get = riscv_vr_get,
- .set = riscv_vr_set,
- },
-#endif
};
static const struct user_regset_view riscv_user_native_view = {
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index ea3d61de065b..161d0b34c2cb 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -102,6 +102,7 @@ static const struct mm_walk_ops pageattr_ops = {
.pmd_entry = pageattr_pmd_entry,
.pte_entry = pageattr_pte_entry,
.pte_hole = pageattr_pte_hole,
+ .walk_lock = PGWALK_RDLOCK,
};
static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
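
This hunk and the powerpc subpage_prot one earlier come from the same series: struct mm_walk_ops grows a walk_lock field naming the mmap_lock state the walker requires (PGWALK_RDLOCK here, PGWALK_WRLOCK_VERIFY for subpage_prot). A new walker would now look like this (the callback is hypothetical):

    #include <linux/pagewalk.h>

    static int demo_pte_entry(pte_t *pte, unsigned long addr,
                              unsigned long next, struct mm_walk *walk)
    {
            return 0;       /* hypothetical no-op callback */
    }

    static const struct mm_walk_ops demo_walk_ops = {
            .pte_entry = demo_pte_entry,
            .walk_lock = PGWALK_RDLOCK,  /* caller holds mmap_lock for read */
    };
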
diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild
index 76e362277179..8e4d74f51115 100644
--- a/arch/s390/Kbuild
+++ b/arch/s390/Kbuild
@@ -3,7 +3,7 @@ obj-y += kernel/
obj-y += mm/
obj-$(CONFIG_KVM) += kvm/
obj-y += crypto/
-obj-$(CONFIG_S390_HYPFS_FS) += hypfs/
+obj-$(CONFIG_S390_HYPFS) += hypfs/
obj-$(CONFIG_APPLDATA_BASE) += appldata/
obj-y += net/
obj-$(CONFIG_PCI) += pci/
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 5b39918b7042..18bf754e1fad 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -174,6 +174,7 @@ config S390
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
+ select HAVE_FUNCTION_GRAPH_RETVAL
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
@@ -512,6 +513,17 @@ config KEXEC_SIG
verification for the corresponding kernel image type being
loaded in order for this to work.
+config CERT_STORE
+ bool "Get user certificates via DIAG320"
+ depends on KEYS
+ select CRYPTO_LIB_SHA256
+ help
+ Enable this option if you want to access user-provided secure boot
+ certificates via DIAG 0x320.
+
+ These certificates will be made available via the keyring named
+ 'cert_store'.
+
config KERNEL_NOBP
def_bool n
prompt "Enable modified branch prediction for the kernel by default"
@@ -743,9 +755,9 @@ config CRASH_DUMP
Crash dump kernels are loaded in the main kernel with kexec-tools
into a specially reserved region and then later executed after
a crash by kdump/kexec.
- Refer to <file:Documentation/s390/zfcpdump.rst> for more details on this.
+ Refer to <file:Documentation/arch/s390/zfcpdump.rst> for more details on this.
This option also enables s390 zfcpdump.
- See also <file:Documentation/s390/zfcpdump.rst>
+ See also <file:Documentation/arch/s390/zfcpdump.rst>
endmenu
@@ -867,13 +879,24 @@ config APPLDATA_NET_SUM
This can also be compiled as a module, which will be called
appldata_net_sum.o.
-config S390_HYPFS_FS
+config S390_HYPFS
def_bool y
+ prompt "s390 hypervisor information"
+ help
+ This provides several binary files under (debugfs)/s390_hypfs/ that
+ supply accounting information in an s390 hypervisor environment.
+
+config S390_HYPFS_FS
+ def_bool n
prompt "s390 hypervisor file system support"
select SYS_HYPERVISOR
+ depends on S390_HYPFS
help
This is a virtual file system intended to provide accounting
- information in an s390 hypervisor environment.
+ information in an s390 hypervisor environment. This file system
+ is deprecated and should not be used.
+
+ Say N if you are unsure.
source "arch/s390/kvm/Kconfig"
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index 5ed242897b0d..a53a36ee0731 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -119,7 +119,6 @@ export KBUILD_CFLAGS_DECOMPRESSOR
OBJCOPYFLAGS := -O binary
libs-y += arch/s390/lib/
-drivers-y += drivers/s390/
boot := arch/s390/boot
syscalls := arch/s390/kernel/syscalls
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index 64bd7ac3e35d..b9681cb22753 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -27,6 +27,7 @@ struct page *__bootdata_preserved(vmemmap);
unsigned long __bootdata_preserved(vmemmap_size);
unsigned long __bootdata_preserved(MODULES_VADDR);
unsigned long __bootdata_preserved(MODULES_END);
+unsigned long __bootdata_preserved(max_mappable);
unsigned long __bootdata(ident_map_size);
u64 __bootdata_preserved(stfle_fac_list[16]);
@@ -176,6 +177,7 @@ static unsigned long setup_kernel_memory_layout(void)
unsigned long asce_limit;
unsigned long rte_size;
unsigned long pages;
+ unsigned long vsize;
unsigned long vmax;
pages = ident_map_size / PAGE_SIZE;
@@ -183,19 +185,19 @@ static unsigned long setup_kernel_memory_layout(void)
vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page);
/* choose kernel address space layout: 4 or 3 levels. */
- vmemmap_start = round_up(ident_map_size, _REGION3_SIZE);
- if (IS_ENABLED(CONFIG_KASAN) ||
- vmalloc_size > _REGION2_SIZE ||
- vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN >
- _REGION2_SIZE) {
+ vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size +
+ MODULES_LEN + MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE;
+ vsize = size_add(vsize, vmalloc_size);
+ if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) {
asce_limit = _REGION1_SIZE;
rte_size = _REGION2_SIZE;
} else {
asce_limit = _REGION2_SIZE;
rte_size = _REGION3_SIZE;
}
+
/*
- * forcing modules and vmalloc area under the ultravisor
+ * Forcing modules and vmalloc area under the ultravisor
* secure storage limit, so that any vmalloc allocation
* we do could be used to back secure guest storage.
*/
@@ -204,7 +206,7 @@ static unsigned long setup_kernel_memory_layout(void)
/* force vmalloc and modules below kasan shadow */
vmax = min(vmax, KASAN_SHADOW_START);
#endif
- __memcpy_real_area = round_down(vmax - PAGE_SIZE, PAGE_SIZE);
+ __memcpy_real_area = round_down(vmax - MEMCPY_REAL_SIZE, PAGE_SIZE);
__abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE,
sizeof(struct lowcore));
MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE);
@@ -220,8 +222,9 @@ static unsigned long setup_kernel_memory_layout(void)
pages = SECTION_ALIGN_UP(pages);
/* keep vmemmap_start aligned to a top level region table entry */
vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size);
- /* vmemmap_start is the future VMEM_MAX_PHYS, make sure it is within MAX_PHYSMEM */
vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS);
+ /* maximum mappable address as seen by arch_get_mappable_range() */
+ max_mappable = vmemmap_start;
/* make sure identity map doesn't overlay with vmemmap */
ident_map_size = min(ident_map_size, vmemmap_start);
vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page);
@@ -286,8 +289,9 @@ void startup_kernel(void)
setup_lpp();
safe_addr = mem_safe_offset();
+
/*
- * reserve decompressor memory together with decompression heap, buffer and
+ * Reserve decompressor memory together with decompression heap, buffer and
* memory which might be occupied by uncompressed kernel at default 1Mb
* position (if KASLR is off or failed).
*/
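size_add() is the saturating helper from <linux/overflow.h>: on overflow it returns SIZE_MAX instead of wrapping, so the comparison against _REGION2_SIZE above cannot be tricked into choosing the smaller address space layout by an absurd vmalloc= setting. A hedged illustration of the property:

#include <linux/overflow.h>

static bool fits_three_level(unsigned long vsize, unsigned long vmalloc_size)
{
	/* wrap-to-small is impossible: overflow saturates at SIZE_MAX */
	return size_add(vsize, vmalloc_size) <= _REGION2_SIZE;
}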
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 3a6a9a88bd38..af2fbe48e16c 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -835,6 +835,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=300
# CONFIG_RCU_TRACE is not set
CONFIG_LATENCYTOP=y
CONFIG_BOOTTIME_TRACING=y
+CONFIG_FUNCTION_GRAPH_RETVAL=y
CONFIG_FPROBE=y
CONFIG_FUNCTION_PROFILER=y
CONFIG_STACK_TRACER=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index b13a5a0d99d2..3f263b767a4c 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -787,6 +787,7 @@ CONFIG_RCU_REF_SCALE_TEST=m
CONFIG_RCU_CPU_STALL_TIMEOUT=60
CONFIG_LATENCYTOP=y
CONFIG_BOOTTIME_TRACING=y
+CONFIG_FUNCTION_GRAPH_RETVAL=y
CONFIG_FPROBE=y
CONFIG_FUNCTION_PROFILER=y
CONFIG_STACK_TRACER=y
diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
index 38349150c96e..8b541e44151d 100644
--- a/arch/s390/crypto/paes_s390.c
+++ b/arch/s390/crypto/paes_s390.c
@@ -35,7 +35,7 @@
* and padding is also possible, the limits need to be generous.
*/
#define PAES_MIN_KEYSIZE 16
-#define PAES_MAX_KEYSIZE 320
+#define PAES_MAX_KEYSIZE MAXEP11AESKEYBLOBSIZE
static u8 *ctrblk;
static DEFINE_MUTEX(ctrblk_lock);
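Deriving PAES_MAX_KEYSIZE from MAXEP11AESKEYBLOBSIZE (raised from 320 to 336 in the uapi pkey.h hunk below) keeps the cipher's setkey bound in lockstep with the largest EP11 key blob. An illustrative compile-time guard, assuming <linux/build_bug.h> is available, in case a hard-coded bound ever reappears:

static_assert(PAES_MAX_KEYSIZE >= MAXEP11AESKEYBLOBSIZE,
	      "paes key buffer must hold a maximum-size EP11 AES key blob");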
diff --git a/arch/s390/hypfs/Makefile b/arch/s390/hypfs/Makefile
index 06f601509ce9..c34854d298f8 100644
--- a/arch/s390/hypfs/Makefile
+++ b/arch/s390/hypfs/Makefile
@@ -3,7 +3,12 @@
# Makefile for the linux hypfs filesystem routines.
#
-obj-$(CONFIG_S390_HYPFS_FS) += s390_hypfs.o
+obj-$(CONFIG_S390_HYPFS) += hypfs_dbfs.o
+obj-$(CONFIG_S390_HYPFS) += hypfs_diag.o
+obj-$(CONFIG_S390_HYPFS) += hypfs_diag0c.o
+obj-$(CONFIG_S390_HYPFS) += hypfs_sprp.o
+obj-$(CONFIG_S390_HYPFS) += hypfs_vm.o
-s390_hypfs-objs := inode.o hypfs_diag.o hypfs_vm.o hypfs_dbfs.o hypfs_sprp.o
-s390_hypfs-objs += hypfs_diag0c.o
+obj-$(CONFIG_S390_HYPFS_FS) += hypfs_diag_fs.o
+obj-$(CONFIG_S390_HYPFS_FS) += hypfs_vm_fs.o
+obj-$(CONFIG_S390_HYPFS_FS) += inode.o
diff --git a/arch/s390/hypfs/hypfs.h b/arch/s390/hypfs/hypfs.h
index 05f3f9aee5fc..65f4036fd541 100644
--- a/arch/s390/hypfs/hypfs.h
+++ b/arch/s390/hypfs/hypfs.h
@@ -46,6 +46,15 @@ void hypfs_diag0c_exit(void);
void hypfs_sprp_init(void);
void hypfs_sprp_exit(void);
+int __hypfs_fs_init(void);
+
+static inline int hypfs_fs_init(void)
+{
+ if (IS_ENABLED(CONFIG_S390_HYPFS_FS))
+ return __hypfs_fs_init();
+ return 0;
+}
+
/* debugfs interface */
struct hypfs_dbfs_file;
@@ -69,7 +78,6 @@ struct hypfs_dbfs_file {
struct dentry *dentry;
};
-extern void hypfs_dbfs_init(void);
extern void hypfs_dbfs_exit(void);
extern void hypfs_dbfs_create_file(struct hypfs_dbfs_file *df);
extern void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df);
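hypfs_fs_init() uses the standard IS_ENABLED() idiom: the wrapper compiles in every configuration, but when CONFIG_S390_HYPFS_FS=n the condition folds to 0, the call to __hypfs_fs_init() is dead-code eliminated, and no undefined reference remains at link time. The generic shape of the pattern:

/* in a header, visible regardless of CONFIG_FOO */
int __foo_init(void);

static inline int foo_init(void)
{
	if (IS_ENABLED(CONFIG_FOO))	/* constant 0 or 1 at compile time */
		return __foo_init();	/* eliminated when CONFIG_FOO=n */
	return 0;
}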
diff --git a/arch/s390/hypfs/hypfs_dbfs.c b/arch/s390/hypfs/hypfs_dbfs.c
index f4c7dbfaf8ee..4024599eb448 100644
--- a/arch/s390/hypfs/hypfs_dbfs.c
+++ b/arch/s390/hypfs/hypfs_dbfs.c
@@ -90,12 +90,33 @@ void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df)
debugfs_remove(df->dentry);
}
-void hypfs_dbfs_init(void)
+static int __init hypfs_dbfs_init(void)
{
- dbfs_dir = debugfs_create_dir("s390_hypfs", NULL);
-}
+ int rc = -ENODATA;
-void hypfs_dbfs_exit(void)
-{
+ dbfs_dir = debugfs_create_dir("s390_hypfs", NULL);
+ if (hypfs_diag_init())
+ goto fail_dbfs_exit;
+ if (hypfs_vm_init())
+ goto fail_hypfs_diag_exit;
+ hypfs_sprp_init();
+ if (hypfs_diag0c_init())
+ goto fail_hypfs_sprp_exit;
+ rc = hypfs_fs_init();
+ if (rc)
+ goto fail_hypfs_diag0c_exit;
+ return 0;
+
+fail_hypfs_diag0c_exit:
+ hypfs_diag0c_exit();
+fail_hypfs_sprp_exit:
+ hypfs_sprp_exit();
+ hypfs_vm_exit();
+fail_hypfs_diag_exit:
+ hypfs_diag_exit();
+ pr_err("Initialization of hypfs failed with rc=%i\n", rc);
+fail_dbfs_exit:
debugfs_remove(dbfs_dir);
+ return rc;
}
+device_initcall(hypfs_dbfs_init)
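The consolidated initcall follows the kernel's usual unwind idiom: each failure label rolls back exactly the steps that already succeeded, in reverse order. Distilled to its shape (step_* names hypothetical):

static int __init example_init(void)
{
	int rc;

	rc = step_a();
	if (rc)
		return rc;	/* nothing to undo yet */
	rc = step_b();
	if (rc)
		goto undo_a;
	rc = step_c();
	if (rc)
		goto undo_b;
	return 0;

undo_b:	/* labels name what they roll back */
	step_b_exit();
undo_a:
	step_a_exit();
	return rc;
}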
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index c3be533c4cd3..279b7bba4d43 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -18,188 +18,27 @@
#include <linux/mm.h>
#include <asm/diag.h>
#include <asm/ebcdic.h>
+#include "hypfs_diag.h"
#include "hypfs.h"
-#define TMP_SIZE 64 /* size of temporary buffers */
-
#define DBFS_D204_HDR_VERSION 0
-static char *diag224_cpu_names; /* diag 224 name table */
static enum diag204_sc diag204_store_sc; /* used subcode for store */
static enum diag204_format diag204_info_type; /* used diag 204 data format */
static void *diag204_buf; /* 4K aligned buffer for diag204 data */
-static void *diag204_buf_vmalloc; /* vmalloc pointer for diag204 data */
static int diag204_buf_pages; /* number of pages for diag204 data */
static struct dentry *dbfs_d204_file;
-/*
- * DIAG 204 member access functions.
- *
- * Since we have two different diag 204 data formats for old and new s390
- * machines, we do not access the structs directly, but use getter functions for
- * each struct member instead. This should make the code more readable.
- */
-
-/* Time information block */
-
-static inline int info_blk_hdr__size(enum diag204_format type)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return sizeof(struct diag204_info_blk_hdr);
- else /* DIAG204_INFO_EXT */
- return sizeof(struct diag204_x_info_blk_hdr);
-}
-
-static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_info_blk_hdr *)hdr)->npar;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_info_blk_hdr *)hdr)->npar;
-}
-
-static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_info_blk_hdr *)hdr)->flags;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_info_blk_hdr *)hdr)->flags;
-}
-
-/* Partition header */
-
-static inline int part_hdr__size(enum diag204_format type)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return sizeof(struct diag204_part_hdr);
- else /* DIAG204_INFO_EXT */
- return sizeof(struct diag204_x_part_hdr);
-}
-
-static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_part_hdr *)hdr)->cpus;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_part_hdr *)hdr)->rcpus;
-}
-
-static inline void part_hdr__part_name(enum diag204_format type, void *hdr,
- char *name)
-{
- if (type == DIAG204_INFO_SIMPLE)
- memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name,
- DIAG204_LPAR_NAME_LEN);
- else /* DIAG204_INFO_EXT */
- memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name,
- DIAG204_LPAR_NAME_LEN);
- EBCASC(name, DIAG204_LPAR_NAME_LEN);
- name[DIAG204_LPAR_NAME_LEN] = 0;
- strim(name);
-}
-
-/* CPU info block */
-
-static inline int cpu_info__size(enum diag204_format type)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return sizeof(struct diag204_cpu_info);
- else /* DIAG204_INFO_EXT */
- return sizeof(struct diag204_x_cpu_info);
-}
-
-static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_cpu_info *)hdr)->ctidx;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_cpu_info *)hdr)->ctidx;
-}
-
-static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr)
+enum diag204_format diag204_get_info_type(void)
{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_cpu_info *)hdr)->cpu_addr;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_cpu_info *)hdr)->cpu_addr;
+ return diag204_info_type;
}
-static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr)
+static void diag204_set_info_type(enum diag204_format type)
{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_cpu_info *)hdr)->acc_time;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_cpu_info *)hdr)->acc_time;
-}
-
-static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_cpu_info *)hdr)->lp_time;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_cpu_info *)hdr)->lp_time;
-}
-
-static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return 0; /* online_time not available in simple info */
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_cpu_info *)hdr)->online_time;
-}
-
-/* Physical header */
-
-static inline int phys_hdr__size(enum diag204_format type)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return sizeof(struct diag204_phys_hdr);
- else /* DIAG204_INFO_EXT */
- return sizeof(struct diag204_x_phys_hdr);
-}
-
-static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_phys_hdr *)hdr)->cpus;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_phys_hdr *)hdr)->cpus;
-}
-
-/* Physical CPU info block */
-
-static inline int phys_cpu__size(enum diag204_format type)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return sizeof(struct diag204_phys_cpu);
- else /* DIAG204_INFO_EXT */
- return sizeof(struct diag204_x_phys_cpu);
-}
-
-static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_phys_cpu *)hdr)->cpu_addr;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr;
-}
-
-static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_phys_cpu *)hdr)->mgm_time;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_phys_cpu *)hdr)->mgm_time;
-}
-
-static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
-{
- if (type == DIAG204_INFO_SIMPLE)
- return ((struct diag204_phys_cpu *)hdr)->ctidx;
- else /* DIAG204_INFO_EXT */
- return ((struct diag204_x_phys_cpu *)hdr)->ctidx;
+ diag204_info_type = type;
}
/* Diagnose 204 functions */
@@ -212,43 +51,11 @@ static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
static void diag204_free_buffer(void)
{
- if (!diag204_buf)
- return;
- if (diag204_buf_vmalloc) {
- vfree(diag204_buf_vmalloc);
- diag204_buf_vmalloc = NULL;
- } else {
- free_pages((unsigned long) diag204_buf, 0);
- }
+ vfree(diag204_buf);
diag204_buf = NULL;
}
-static void *page_align_ptr(void *ptr)
-{
- return (void *) PAGE_ALIGN((unsigned long) ptr);
-}
-
-static void *diag204_alloc_vbuf(int pages)
-{
- /* The buffer has to be page aligned! */
- diag204_buf_vmalloc = vmalloc(array_size(PAGE_SIZE, (pages + 1)));
- if (!diag204_buf_vmalloc)
- return ERR_PTR(-ENOMEM);
- diag204_buf = page_align_ptr(diag204_buf_vmalloc);
- diag204_buf_pages = pages;
- return diag204_buf;
-}
-
-static void *diag204_alloc_rbuf(void)
-{
- diag204_buf = (void*)__get_free_pages(GFP_KERNEL,0);
- if (!diag204_buf)
- return ERR_PTR(-ENOMEM);
- diag204_buf_pages = 1;
- return diag204_buf;
-}
-
-static void *diag204_get_buffer(enum diag204_format fmt, int *pages)
+void *diag204_get_buffer(enum diag204_format fmt, int *pages)
{
if (diag204_buf) {
*pages = diag204_buf_pages;
@@ -256,15 +63,19 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages)
}
if (fmt == DIAG204_INFO_SIMPLE) {
*pages = 1;
- return diag204_alloc_rbuf();
} else {/* DIAG204_INFO_EXT */
*pages = diag204((unsigned long)DIAG204_SUBC_RSI |
(unsigned long)DIAG204_INFO_EXT, 0, NULL);
if (*pages <= 0)
- return ERR_PTR(-ENOSYS);
- else
- return diag204_alloc_vbuf(*pages);
+ return ERR_PTR(-EOPNOTSUPP);
}
+ diag204_buf = __vmalloc_node(array_size(*pages, PAGE_SIZE),
+ PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
+ if (!diag204_buf)
+ return ERR_PTR(-ENOMEM);
+ diag204_buf_pages = *pages;
+ return diag204_buf;
}
/*
@@ -291,13 +102,13 @@ static int diag204_probe(void)
if (diag204((unsigned long)DIAG204_SUBC_STIB7 |
(unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
diag204_store_sc = DIAG204_SUBC_STIB7;
- diag204_info_type = DIAG204_INFO_EXT;
+ diag204_set_info_type(DIAG204_INFO_EXT);
goto out;
}
if (diag204((unsigned long)DIAG204_SUBC_STIB6 |
(unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
diag204_store_sc = DIAG204_SUBC_STIB6;
- diag204_info_type = DIAG204_INFO_EXT;
+ diag204_set_info_type(DIAG204_INFO_EXT);
goto out;
}
diag204_free_buffer();
@@ -313,10 +124,10 @@ static int diag204_probe(void)
if (diag204((unsigned long)DIAG204_SUBC_STIB4 |
(unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) {
diag204_store_sc = DIAG204_SUBC_STIB4;
- diag204_info_type = DIAG204_INFO_SIMPLE;
+ diag204_set_info_type(DIAG204_INFO_SIMPLE);
goto out;
} else {
- rc = -ENOSYS;
+ rc = -EOPNOTSUPP;
goto fail_store;
}
out:
@@ -327,58 +138,13 @@ fail_alloc:
return rc;
}
-static int diag204_do_store(void *buf, int pages)
+int diag204_store(void *buf, int pages)
{
int rc;
- rc = diag204((unsigned long) diag204_store_sc |
- (unsigned long) diag204_info_type, pages, buf);
- return rc < 0 ? -ENOSYS : 0;
-}
-
-static void *diag204_store(void)
-{
- void *buf;
- int pages, rc;
-
- buf = diag204_get_buffer(diag204_info_type, &pages);
- if (IS_ERR(buf))
- goto out;
- rc = diag204_do_store(buf, pages);
- if (rc)
- return ERR_PTR(rc);
-out:
- return buf;
-}
-
-/* Diagnose 224 functions */
-
-static int diag224_get_name_table(void)
-{
- /* memory must be below 2GB */
- diag224_cpu_names = (char *) __get_free_page(GFP_KERNEL | GFP_DMA);
- if (!diag224_cpu_names)
- return -ENOMEM;
- if (diag224(diag224_cpu_names)) {
- free_page((unsigned long) diag224_cpu_names);
- return -EOPNOTSUPP;
- }
- EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16);
- return 0;
-}
-
-static void diag224_delete_name_table(void)
-{
- free_page((unsigned long) diag224_cpu_names);
-}
-
-static int diag224_idx2name(int index, char *name)
-{
- memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN),
- DIAG204_CPU_NAME_LEN);
- name[DIAG204_CPU_NAME_LEN] = 0;
- strim(name);
- return 0;
+ rc = diag204((unsigned long)diag204_store_sc |
+ (unsigned long)diag204_get_info_type(), pages, buf);
+ return rc < 0 ? -EOPNOTSUPP : 0;
}
struct dbfs_d204_hdr {
@@ -403,8 +169,8 @@ static int dbfs_d204_create(void **data, void **data_free_ptr, size_t *size)
base = vzalloc(buf_size);
if (!base)
return -ENOMEM;
- d204 = page_align_ptr(base + sizeof(d204->hdr)) - sizeof(d204->hdr);
- rc = diag204_do_store(d204->buf, diag204_buf_pages);
+ d204 = PTR_ALIGN(base + sizeof(d204->hdr), PAGE_SIZE) - sizeof(d204->hdr);
+ rc = diag204_store(d204->buf, diag204_buf_pages);
if (rc) {
vfree(base);
return rc;
@@ -433,176 +199,21 @@ __init int hypfs_diag_init(void)
return -ENODATA;
}
- if (diag204_info_type == DIAG204_INFO_EXT)
+ if (diag204_get_info_type() == DIAG204_INFO_EXT)
hypfs_dbfs_create_file(&dbfs_file_d204);
- if (MACHINE_IS_LPAR) {
- rc = diag224_get_name_table();
- if (rc) {
- pr_err("The hardware system does not provide all "
- "functions required by hypfs\n");
- debugfs_remove(dbfs_d204_file);
- return rc;
- }
+ rc = hypfs_diag_fs_init();
+ if (rc) {
+ pr_err("The hardware system does not provide all functions required by hypfs\n");
+ debugfs_remove(dbfs_d204_file);
}
- return 0;
+ return rc;
}
void hypfs_diag_exit(void)
{
debugfs_remove(dbfs_d204_file);
- diag224_delete_name_table();
+ hypfs_diag_fs_exit();
diag204_free_buffer();
hypfs_dbfs_remove_file(&dbfs_file_d204);
}
-
-/*
- * Functions to create the directory structure
- * *******************************************
- */
-
-static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info)
-{
- struct dentry *cpu_dir;
- char buffer[TMP_SIZE];
- void *rc;
-
- snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_info_type,
- cpu_info));
- cpu_dir = hypfs_mkdir(cpus_dir, buffer);
- rc = hypfs_create_u64(cpu_dir, "mgmtime",
- cpu_info__acc_time(diag204_info_type, cpu_info) -
- cpu_info__lp_time(diag204_info_type, cpu_info));
- if (IS_ERR(rc))
- return PTR_ERR(rc);
- rc = hypfs_create_u64(cpu_dir, "cputime",
- cpu_info__lp_time(diag204_info_type, cpu_info));
- if (IS_ERR(rc))
- return PTR_ERR(rc);
- if (diag204_info_type == DIAG204_INFO_EXT) {
- rc = hypfs_create_u64(cpu_dir, "onlinetime",
- cpu_info__online_time(diag204_info_type,
- cpu_info));
- if (IS_ERR(rc))
- return PTR_ERR(rc);
- }
- diag224_idx2name(cpu_info__ctidx(diag204_info_type, cpu_info), buffer);
- rc = hypfs_create_str(cpu_dir, "type", buffer);
- return PTR_ERR_OR_ZERO(rc);
-}
-
-static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr)
-{
- struct dentry *cpus_dir;
- struct dentry *lpar_dir;
- char lpar_name[DIAG204_LPAR_NAME_LEN + 1];
- void *cpu_info;
- int i;
-
- part_hdr__part_name(diag204_info_type, part_hdr, lpar_name);
- lpar_name[DIAG204_LPAR_NAME_LEN] = 0;
- lpar_dir = hypfs_mkdir(systems_dir, lpar_name);
- if (IS_ERR(lpar_dir))
- return lpar_dir;
- cpus_dir = hypfs_mkdir(lpar_dir, "cpus");
- if (IS_ERR(cpus_dir))
- return cpus_dir;
- cpu_info = part_hdr + part_hdr__size(diag204_info_type);
- for (i = 0; i < part_hdr__rcpus(diag204_info_type, part_hdr); i++) {
- int rc;
- rc = hypfs_create_cpu_files(cpus_dir, cpu_info);
- if (rc)
- return ERR_PTR(rc);
- cpu_info += cpu_info__size(diag204_info_type);
- }
- return cpu_info;
-}
-
-static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info)
-{
- struct dentry *cpu_dir;
- char buffer[TMP_SIZE];
- void *rc;
-
- snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_info_type,
- cpu_info));
- cpu_dir = hypfs_mkdir(cpus_dir, buffer);
- if (IS_ERR(cpu_dir))
- return PTR_ERR(cpu_dir);
- rc = hypfs_create_u64(cpu_dir, "mgmtime",
- phys_cpu__mgm_time(diag204_info_type, cpu_info));
- if (IS_ERR(rc))
- return PTR_ERR(rc);
- diag224_idx2name(phys_cpu__ctidx(diag204_info_type, cpu_info), buffer);
- rc = hypfs_create_str(cpu_dir, "type", buffer);
- return PTR_ERR_OR_ZERO(rc);
-}
-
-static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr)
-{
- int i;
- void *cpu_info;
- struct dentry *cpus_dir;
-
- cpus_dir = hypfs_mkdir(parent_dir, "cpus");
- if (IS_ERR(cpus_dir))
- return cpus_dir;
- cpu_info = phys_hdr + phys_hdr__size(diag204_info_type);
- for (i = 0; i < phys_hdr__cpus(diag204_info_type, phys_hdr); i++) {
- int rc;
- rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info);
- if (rc)
- return ERR_PTR(rc);
- cpu_info += phys_cpu__size(diag204_info_type);
- }
- return cpu_info;
-}
-
-int hypfs_diag_create_files(struct dentry *root)
-{
- struct dentry *systems_dir, *hyp_dir;
- void *time_hdr, *part_hdr;
- int i, rc;
- void *buffer, *ptr;
-
- buffer = diag204_store();
- if (IS_ERR(buffer))
- return PTR_ERR(buffer);
-
- systems_dir = hypfs_mkdir(root, "systems");
- if (IS_ERR(systems_dir)) {
- rc = PTR_ERR(systems_dir);
- goto err_out;
- }
- time_hdr = (struct x_info_blk_hdr *)buffer;
- part_hdr = time_hdr + info_blk_hdr__size(diag204_info_type);
- for (i = 0; i < info_blk_hdr__npar(diag204_info_type, time_hdr); i++) {
- part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr);
- if (IS_ERR(part_hdr)) {
- rc = PTR_ERR(part_hdr);
- goto err_out;
- }
- }
- if (info_blk_hdr__flags(diag204_info_type, time_hdr) &
- DIAG204_LPAR_PHYS_FLG) {
- ptr = hypfs_create_phys_files(root, part_hdr);
- if (IS_ERR(ptr)) {
- rc = PTR_ERR(ptr);
- goto err_out;
- }
- }
- hyp_dir = hypfs_mkdir(root, "hyp");
- if (IS_ERR(hyp_dir)) {
- rc = PTR_ERR(hyp_dir);
- goto err_out;
- }
- ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor");
- if (IS_ERR(ptr)) {
- rc = PTR_ERR(ptr);
- goto err_out;
- }
- rc = 0;
-
-err_out:
- return rc;
-}
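The old allocator grabbed pages + 1 with vmalloc() and rounded the pointer up by hand, which also meant remembering the raw pointer for vfree(). The rewrite leans on __vmalloc_node(): its align argument guarantees the alignment directly, and passing __builtin_return_address(0) keeps the real caller visible in /proc/vmallocinfo. A hedged before/after sketch:

/* before: over-allocate one page, align manually, free the raw pointer */
raw = vmalloc(array_size(PAGE_SIZE, pages + 1));
buf = (void *)PAGE_ALIGN((unsigned long)raw);	/* vfree(raw), not vfree(buf) */

/* after: aligned allocation; the returned pointer is the one to free */
buf = __vmalloc_node(array_size(pages, PAGE_SIZE), PAGE_SIZE,
		     GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0));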
diff --git a/arch/s390/hypfs/hypfs_diag.h b/arch/s390/hypfs/hypfs_diag.h
new file mode 100644
index 000000000000..7090eff27fef
--- /dev/null
+++ b/arch/s390/hypfs/hypfs_diag.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Hypervisor filesystem for Linux on s390. Diag 204 and 224
+ * implementation.
+ *
+ * Copyright IBM Corp. 2006, 2008
+ * Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ */
+
+#ifndef _S390_HYPFS_DIAG_H_
+#define _S390_HYPFS_DIAG_H_
+
+#include <asm/diag.h>
+
+enum diag204_format diag204_get_info_type(void);
+void *diag204_get_buffer(enum diag204_format fmt, int *pages);
+int diag204_store(void *buf, int pages);
+
+int __hypfs_diag_fs_init(void);
+void __hypfs_diag_fs_exit(void);
+
+static inline int hypfs_diag_fs_init(void)
+{
+ if (IS_ENABLED(CONFIG_S390_HYPFS_FS))
+ return __hypfs_diag_fs_init();
+ return 0;
+}
+
+static inline void hypfs_diag_fs_exit(void)
+{
+ if (IS_ENABLED(CONFIG_S390_HYPFS_FS))
+ __hypfs_diag_fs_exit();
+}
+
+#endif /* _S390_HYPFS_DIAG_H_ */
diff --git a/arch/s390/hypfs/hypfs_diag_fs.c b/arch/s390/hypfs/hypfs_diag_fs.c
new file mode 100644
index 000000000000..00a6d370a280
--- /dev/null
+++ b/arch/s390/hypfs/hypfs_diag_fs.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hypervisor filesystem for Linux on s390. Diag 204 and 224
+ * implementation.
+ *
+ * Copyright IBM Corp. 2006, 2008
+ * Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ */
+
+#define KMSG_COMPONENT "hypfs"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <asm/diag.h>
+#include <asm/ebcdic.h>
+#include "hypfs_diag.h"
+#include "hypfs.h"
+
+#define TMP_SIZE 64 /* size of temporary buffers */
+
+static char *diag224_cpu_names; /* diag 224 name table */
+static int diag224_idx2name(int index, char *name);
+
+/*
+ * DIAG 204 member access functions.
+ *
+ * Since we have two different diag 204 data formats for old and new s390
+ * machines, we do not access the structs directly, but use getter functions for
+ * each struct member instead. This should make the code more readable.
+ */
+
+/* Time information block */
+
+static inline int info_blk_hdr__size(enum diag204_format type)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_info_blk_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_info_blk_hdr);
+}
+
+static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->npar;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->npar;
+}
+
+static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->flags;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->flags;
+}
+
+/* Partition header */
+
+static inline int part_hdr__size(enum diag204_format type)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_part_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_part_hdr);
+}
+
+static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_part_hdr *)hdr)->cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_part_hdr *)hdr)->rcpus;
+}
+
+static inline void part_hdr__part_name(enum diag204_format type, void *hdr,
+ char *name)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name,
+ DIAG204_LPAR_NAME_LEN);
+ else /* DIAG204_INFO_EXT */
+ memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name,
+ DIAG204_LPAR_NAME_LEN);
+ EBCASC(name, DIAG204_LPAR_NAME_LEN);
+ name[DIAG204_LPAR_NAME_LEN] = 0;
+ strim(name);
+}
+
+/* CPU info block */
+
+static inline int cpu_info__size(enum diag204_format type)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_cpu_info);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_cpu_info);
+}
+
+static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->ctidx;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->ctidx;
+}
+
+static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->cpu_addr;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->cpu_addr;
+}
+
+static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->acc_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->acc_time;
+}
+
+static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->lp_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->lp_time;
+}
+
+static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return 0; /* online_time not available in simple info */
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->online_time;
+}
+
+/* Physical header */
+
+static inline int phys_hdr__size(enum diag204_format type)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_phys_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_phys_hdr);
+}
+
+static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_hdr *)hdr)->cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_hdr *)hdr)->cpus;
+}
+
+/* Physical CPU info block */
+
+static inline int phys_cpu__size(enum diag204_format type)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_phys_cpu);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_phys_cpu);
+}
+
+static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->cpu_addr;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr;
+}
+
+static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->mgm_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->mgm_time;
+}
+
+static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
+{
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->ctidx;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->ctidx;
+}
+
+/*
+ * Functions to create the directory structure
+ * *******************************************
+ */
+
+static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info)
+{
+ struct dentry *cpu_dir;
+ char buffer[TMP_SIZE];
+ void *rc;
+
+ snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_get_info_type(),
+ cpu_info));
+ cpu_dir = hypfs_mkdir(cpus_dir, buffer);
+ rc = hypfs_create_u64(cpu_dir, "mgmtime",
+ cpu_info__acc_time(diag204_get_info_type(), cpu_info) -
+ cpu_info__lp_time(diag204_get_info_type(), cpu_info));
+ if (IS_ERR(rc))
+ return PTR_ERR(rc);
+ rc = hypfs_create_u64(cpu_dir, "cputime",
+ cpu_info__lp_time(diag204_get_info_type(), cpu_info));
+ if (IS_ERR(rc))
+ return PTR_ERR(rc);
+ if (diag204_get_info_type() == DIAG204_INFO_EXT) {
+ rc = hypfs_create_u64(cpu_dir, "onlinetime",
+ cpu_info__online_time(diag204_get_info_type(),
+ cpu_info));
+ if (IS_ERR(rc))
+ return PTR_ERR(rc);
+ }
+ diag224_idx2name(cpu_info__ctidx(diag204_get_info_type(), cpu_info), buffer);
+ rc = hypfs_create_str(cpu_dir, "type", buffer);
+ return PTR_ERR_OR_ZERO(rc);
+}
+
+static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr)
+{
+ struct dentry *cpus_dir;
+ struct dentry *lpar_dir;
+ char lpar_name[DIAG204_LPAR_NAME_LEN + 1];
+ void *cpu_info;
+ int i;
+
+ part_hdr__part_name(diag204_get_info_type(), part_hdr, lpar_name);
+ lpar_name[DIAG204_LPAR_NAME_LEN] = 0;
+ lpar_dir = hypfs_mkdir(systems_dir, lpar_name);
+ if (IS_ERR(lpar_dir))
+ return lpar_dir;
+ cpus_dir = hypfs_mkdir(lpar_dir, "cpus");
+ if (IS_ERR(cpus_dir))
+ return cpus_dir;
+ cpu_info = part_hdr + part_hdr__size(diag204_get_info_type());
+ for (i = 0; i < part_hdr__rcpus(diag204_get_info_type(), part_hdr); i++) {
+ int rc;
+
+ rc = hypfs_create_cpu_files(cpus_dir, cpu_info);
+ if (rc)
+ return ERR_PTR(rc);
+ cpu_info += cpu_info__size(diag204_get_info_type());
+ }
+ return cpu_info;
+}
+
+static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info)
+{
+ struct dentry *cpu_dir;
+ char buffer[TMP_SIZE];
+ void *rc;
+
+ snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_get_info_type(),
+ cpu_info));
+ cpu_dir = hypfs_mkdir(cpus_dir, buffer);
+ if (IS_ERR(cpu_dir))
+ return PTR_ERR(cpu_dir);
+ rc = hypfs_create_u64(cpu_dir, "mgmtime",
+ phys_cpu__mgm_time(diag204_get_info_type(), cpu_info));
+ if (IS_ERR(rc))
+ return PTR_ERR(rc);
+ diag224_idx2name(phys_cpu__ctidx(diag204_get_info_type(), cpu_info), buffer);
+ rc = hypfs_create_str(cpu_dir, "type", buffer);
+ return PTR_ERR_OR_ZERO(rc);
+}
+
+static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr)
+{
+ int i;
+ void *cpu_info;
+ struct dentry *cpus_dir;
+
+ cpus_dir = hypfs_mkdir(parent_dir, "cpus");
+ if (IS_ERR(cpus_dir))
+ return cpus_dir;
+ cpu_info = phys_hdr + phys_hdr__size(diag204_get_info_type());
+ for (i = 0; i < phys_hdr__cpus(diag204_get_info_type(), phys_hdr); i++) {
+ int rc;
+
+ rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info);
+ if (rc)
+ return ERR_PTR(rc);
+ cpu_info += phys_cpu__size(diag204_get_info_type());
+ }
+ return cpu_info;
+}
+
+int hypfs_diag_create_files(struct dentry *root)
+{
+ struct dentry *systems_dir, *hyp_dir;
+ void *time_hdr, *part_hdr;
+ void *buffer, *ptr;
+ int i, rc, pages;
+
+ buffer = diag204_get_buffer(diag204_get_info_type(), &pages);
+ if (IS_ERR(buffer))
+ return PTR_ERR(buffer);
+ rc = diag204_store(buffer, pages);
+ if (rc)
+ return rc;
+
+ systems_dir = hypfs_mkdir(root, "systems");
+ if (IS_ERR(systems_dir)) {
+ rc = PTR_ERR(systems_dir);
+ goto err_out;
+ }
+ time_hdr = (struct x_info_blk_hdr *)buffer;
+ part_hdr = time_hdr + info_blk_hdr__size(diag204_get_info_type());
+ for (i = 0; i < info_blk_hdr__npar(diag204_get_info_type(), time_hdr); i++) {
+ part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr);
+ if (IS_ERR(part_hdr)) {
+ rc = PTR_ERR(part_hdr);
+ goto err_out;
+ }
+ }
+ if (info_blk_hdr__flags(diag204_get_info_type(), time_hdr) &
+ DIAG204_LPAR_PHYS_FLG) {
+ ptr = hypfs_create_phys_files(root, part_hdr);
+ if (IS_ERR(ptr)) {
+ rc = PTR_ERR(ptr);
+ goto err_out;
+ }
+ }
+ hyp_dir = hypfs_mkdir(root, "hyp");
+ if (IS_ERR(hyp_dir)) {
+ rc = PTR_ERR(hyp_dir);
+ goto err_out;
+ }
+ ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor");
+ if (IS_ERR(ptr)) {
+ rc = PTR_ERR(ptr);
+ goto err_out;
+ }
+ rc = 0;
+
+err_out:
+ return rc;
+}
+
+/* Diagnose 224 functions */
+
+static int diag224_idx2name(int index, char *name)
+{
+ memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN),
+ DIAG204_CPU_NAME_LEN);
+ name[DIAG204_CPU_NAME_LEN] = 0;
+ strim(name);
+ return 0;
+}
+
+static int diag224_get_name_table(void)
+{
+ /* memory must be below 2GB */
+ diag224_cpu_names = (char *)__get_free_page(GFP_KERNEL | GFP_DMA);
+ if (!diag224_cpu_names)
+ return -ENOMEM;
+ if (diag224(diag224_cpu_names)) {
+ free_page((unsigned long)diag224_cpu_names);
+ return -EOPNOTSUPP;
+ }
+ EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16);
+ return 0;
+}
+
+static void diag224_delete_name_table(void)
+{
+ free_page((unsigned long)diag224_cpu_names);
+}
+
+int __init __hypfs_diag_fs_init(void)
+{
+ if (MACHINE_IS_LPAR)
+ return diag224_get_name_table();
+ return 0;
+}
+
+void __hypfs_diag_fs_exit(void)
+{
+ diag224_delete_name_table();
+}
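The DIAG 224 name table begins with a 16-byte header entry whose first byte carries the entry count (as the EBCASC() length computation above implies), so the real names start at offset 16; that is why both the conversion and the lookup use index + 1. The offset arithmetic, as a hedged sketch:

/* entry 0 is the header; name entries are DIAG204_CPU_NAME_LEN (16) bytes each */
static const char *diag224_name_at(const char *table, int index)
{
	return table + (index + 1) * DIAG204_CPU_NAME_LEN;	/* skip the header */
}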
diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c
index a3d881ca0a98..3db40ad853e0 100644
--- a/arch/s390/hypfs/hypfs_vm.c
+++ b/arch/s390/hypfs/hypfs_vm.c
@@ -14,47 +14,15 @@
#include <asm/diag.h>
#include <asm/ebcdic.h>
#include <asm/timex.h>
+#include "hypfs_vm.h"
#include "hypfs.h"
-#define NAME_LEN 8
#define DBFS_D2FC_HDR_VERSION 0
static char local_guest[] = " ";
static char all_guests[] = "* ";
static char *all_groups = all_guests;
-static char *guest_query;
-
-struct diag2fc_data {
- __u32 version;
- __u32 flags;
- __u64 used_cpu;
- __u64 el_time;
- __u64 mem_min_kb;
- __u64 mem_max_kb;
- __u64 mem_share_kb;
- __u64 mem_used_kb;
- __u32 pcpus;
- __u32 lcpus;
- __u32 vcpus;
- __u32 ocpus;
- __u32 cpu_max;
- __u32 cpu_shares;
- __u32 cpu_use_samp;
- __u32 cpu_delay_samp;
- __u32 page_wait_samp;
- __u32 idle_samp;
- __u32 other_samp;
- __u32 total_samp;
- char guest_name[NAME_LEN];
-};
-
-struct diag2fc_parm_list {
- char userid[NAME_LEN];
- char aci_grp[NAME_LEN];
- __u64 addr;
- __u32 size;
- __u32 fmt;
-};
+char *diag2fc_guest_query;
static int diag2fc(int size, char* query, void *addr)
{
@@ -62,10 +30,10 @@ static int diag2fc(int size, char* query, void *addr)
unsigned long rc;
struct diag2fc_parm_list parm_list;
- memcpy(parm_list.userid, query, NAME_LEN);
- ASCEBC(parm_list.userid, NAME_LEN);
- memcpy(parm_list.aci_grp, all_groups, NAME_LEN);
- ASCEBC(parm_list.aci_grp, NAME_LEN);
+ memcpy(parm_list.userid, query, DIAG2FC_NAME_LEN);
+ ASCEBC(parm_list.userid, DIAG2FC_NAME_LEN);
+ memcpy(parm_list.aci_grp, all_groups, DIAG2FC_NAME_LEN);
+ ASCEBC(parm_list.aci_grp, DIAG2FC_NAME_LEN);
parm_list.addr = (unsigned long)addr;
parm_list.size = size;
parm_list.fmt = 0x02;
@@ -87,7 +55,7 @@ static int diag2fc(int size, char* query, void *addr)
/*
* Allocate buffer for "query" and store diag 2fc at "offset"
*/
-static void *diag2fc_store(char *query, unsigned int *count, int offset)
+void *diag2fc_store(char *query, unsigned int *count, int offset)
{
void *data;
int size;
@@ -108,132 +76,11 @@ static void *diag2fc_store(char *query, unsigned int *count, int offset)
return data;
}
-static void diag2fc_free(const void *data)
+void diag2fc_free(const void *data)
{
vfree(data);
}
-#define ATTRIBUTE(dir, name, member) \
-do { \
- void *rc; \
- rc = hypfs_create_u64(dir, name, member); \
- if (IS_ERR(rc)) \
- return PTR_ERR(rc); \
-} while(0)
-
-static int hypfs_vm_create_guest(struct dentry *systems_dir,
- struct diag2fc_data *data)
-{
- char guest_name[NAME_LEN + 1] = {};
- struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir;
- int dedicated_flag, capped_value;
-
- capped_value = (data->flags & 0x00000006) >> 1;
- dedicated_flag = (data->flags & 0x00000008) >> 3;
-
- /* guest dir */
- memcpy(guest_name, data->guest_name, NAME_LEN);
- EBCASC(guest_name, NAME_LEN);
- strim(guest_name);
- guest_dir = hypfs_mkdir(systems_dir, guest_name);
- if (IS_ERR(guest_dir))
- return PTR_ERR(guest_dir);
- ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time);
-
- /* logical cpu information */
- cpus_dir = hypfs_mkdir(guest_dir, "cpus");
- if (IS_ERR(cpus_dir))
- return PTR_ERR(cpus_dir);
- ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu);
- ATTRIBUTE(cpus_dir, "capped", capped_value);
- ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag);
- ATTRIBUTE(cpus_dir, "count", data->vcpus);
- /*
- * Note: The "weight_min" attribute got the wrong name.
- * The value represents the number of non-stopped (operating)
- * CPUS.
- */
- ATTRIBUTE(cpus_dir, "weight_min", data->ocpus);
- ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max);
- ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares);
-
- /* memory information */
- mem_dir = hypfs_mkdir(guest_dir, "mem");
- if (IS_ERR(mem_dir))
- return PTR_ERR(mem_dir);
- ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb);
- ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb);
- ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb);
- ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb);
-
- /* samples */
- samples_dir = hypfs_mkdir(guest_dir, "samples");
- if (IS_ERR(samples_dir))
- return PTR_ERR(samples_dir);
- ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp);
- ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp);
- ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp);
- ATTRIBUTE(samples_dir, "idle", data->idle_samp);
- ATTRIBUTE(samples_dir, "other", data->other_samp);
- ATTRIBUTE(samples_dir, "total", data->total_samp);
- return 0;
-}
-
-int hypfs_vm_create_files(struct dentry *root)
-{
- struct dentry *dir, *file;
- struct diag2fc_data *data;
- unsigned int count = 0;
- int rc, i;
-
- data = diag2fc_store(guest_query, &count, 0);
- if (IS_ERR(data))
- return PTR_ERR(data);
-
- /* Hypervisor Info */
- dir = hypfs_mkdir(root, "hyp");
- if (IS_ERR(dir)) {
- rc = PTR_ERR(dir);
- goto failed;
- }
- file = hypfs_create_str(dir, "type", "z/VM Hypervisor");
- if (IS_ERR(file)) {
- rc = PTR_ERR(file);
- goto failed;
- }
-
- /* physical cpus */
- dir = hypfs_mkdir(root, "cpus");
- if (IS_ERR(dir)) {
- rc = PTR_ERR(dir);
- goto failed;
- }
- file = hypfs_create_u64(dir, "count", data->lcpus);
- if (IS_ERR(file)) {
- rc = PTR_ERR(file);
- goto failed;
- }
-
- /* guests */
- dir = hypfs_mkdir(root, "systems");
- if (IS_ERR(dir)) {
- rc = PTR_ERR(dir);
- goto failed;
- }
-
- for (i = 0; i < count; i++) {
- rc = hypfs_vm_create_guest(dir, &(data[i]));
- if (rc)
- goto failed;
- }
- diag2fc_free(data);
- return 0;
-
-failed:
- diag2fc_free(data);
- return rc;
-}
-
struct dbfs_d2fc_hdr {
u64 len; /* Length of d2fc buffer without header */
u16 version; /* Version of header */
@@ -252,7 +99,7 @@ static int dbfs_diag2fc_create(void **data, void **data_free_ptr, size_t *size)
struct dbfs_d2fc *d2fc;
unsigned int count;
- d2fc = diag2fc_store(guest_query, &count, sizeof(d2fc->hdr));
+ d2fc = diag2fc_store(diag2fc_guest_query, &count, sizeof(d2fc->hdr));
if (IS_ERR(d2fc))
return PTR_ERR(d2fc);
store_tod_clock_ext(&d2fc->hdr.tod_ext);
@@ -277,9 +124,9 @@ int hypfs_vm_init(void)
if (!MACHINE_IS_VM)
return 0;
if (diag2fc(0, all_guests, NULL) > 0)
- guest_query = all_guests;
+ diag2fc_guest_query = all_guests;
else if (diag2fc(0, local_guest, NULL) > 0)
- guest_query = local_guest;
+ diag2fc_guest_query = local_guest;
else
return -EACCES;
hypfs_dbfs_create_file(&dbfs_file_2fc);
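hypfs_vm_init() probes with a size of 0: DIAG 2FC then only reports how large the answer would be, letting the code pick the widest query that works ("* " for all guests, falling back to the local guest) before allocating anything. diag2fc_store() plays the same trick to size its buffer; roughly, and hedged (error handling and the retry loop elided):

size = diag2fc(0, query, NULL);			/* size 0: just report bytes needed */
data = vmalloc(size + offset);			/* leave room for a caller header */
diag2fc(size, query, data + offset);		/* fill for real */
*count = size / sizeof(struct diag2fc_data);	/* records delivered */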
diff --git a/arch/s390/hypfs/hypfs_vm.h b/arch/s390/hypfs/hypfs_vm.h
new file mode 100644
index 000000000000..fe2e5851addd
--- /dev/null
+++ b/arch/s390/hypfs/hypfs_vm.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Hypervisor filesystem for Linux on s390. z/VM implementation.
+ *
+ * Copyright IBM Corp. 2006
+ * Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ */
+
+#ifndef _S390_HYPFS_VM_H_
+#define _S390_HYPFS_VM_H_
+
+#define DIAG2FC_NAME_LEN 8
+
+struct diag2fc_data {
+ __u32 version;
+ __u32 flags;
+ __u64 used_cpu;
+ __u64 el_time;
+ __u64 mem_min_kb;
+ __u64 mem_max_kb;
+ __u64 mem_share_kb;
+ __u64 mem_used_kb;
+ __u32 pcpus;
+ __u32 lcpus;
+ __u32 vcpus;
+ __u32 ocpus;
+ __u32 cpu_max;
+ __u32 cpu_shares;
+ __u32 cpu_use_samp;
+ __u32 cpu_delay_samp;
+ __u32 page_wait_samp;
+ __u32 idle_samp;
+ __u32 other_samp;
+ __u32 total_samp;
+ char guest_name[DIAG2FC_NAME_LEN];
+};
+
+struct diag2fc_parm_list {
+ char userid[DIAG2FC_NAME_LEN];
+ char aci_grp[DIAG2FC_NAME_LEN];
+ __u64 addr;
+ __u32 size;
+ __u32 fmt;
+};
+
+void *diag2fc_store(char *query, unsigned int *count, int offset);
+void diag2fc_free(const void *data);
+extern char *diag2fc_guest_query;
+
+#endif /* _S390_HYPFS_VM_H_ */
diff --git a/arch/s390/hypfs/hypfs_vm_fs.c b/arch/s390/hypfs/hypfs_vm_fs.c
new file mode 100644
index 000000000000..6011289afa8c
--- /dev/null
+++ b/arch/s390/hypfs/hypfs_vm_fs.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hypervisor filesystem for Linux on s390. z/VM implementation.
+ *
+ * Copyright IBM Corp. 2006
+ * Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <asm/extable.h>
+#include <asm/diag.h>
+#include <asm/ebcdic.h>
+#include <asm/timex.h>
+#include "hypfs_vm.h"
+#include "hypfs.h"
+
+#define ATTRIBUTE(dir, name, member) \
+do { \
+ void *rc; \
+ rc = hypfs_create_u64(dir, name, member); \
+ if (IS_ERR(rc)) \
+ return PTR_ERR(rc); \
+} while (0)
+
+static int hypfs_vm_create_guest(struct dentry *systems_dir,
+ struct diag2fc_data *data)
+{
+ char guest_name[DIAG2FC_NAME_LEN + 1] = {};
+ struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir;
+ int dedicated_flag, capped_value;
+
+ capped_value = (data->flags & 0x00000006) >> 1;
+ dedicated_flag = (data->flags & 0x00000008) >> 3;
+
+ /* guest dir */
+ memcpy(guest_name, data->guest_name, DIAG2FC_NAME_LEN);
+ EBCASC(guest_name, DIAG2FC_NAME_LEN);
+ strim(guest_name);
+ guest_dir = hypfs_mkdir(systems_dir, guest_name);
+ if (IS_ERR(guest_dir))
+ return PTR_ERR(guest_dir);
+ ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time);
+
+ /* logical cpu information */
+ cpus_dir = hypfs_mkdir(guest_dir, "cpus");
+ if (IS_ERR(cpus_dir))
+ return PTR_ERR(cpus_dir);
+ ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu);
+ ATTRIBUTE(cpus_dir, "capped", capped_value);
+ ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag);
+ ATTRIBUTE(cpus_dir, "count", data->vcpus);
+ /*
+ * Note: The "weight_min" attribute got the wrong name.
+ * The value represents the number of non-stopped (operating)
+ * CPUs.
+ */
+ ATTRIBUTE(cpus_dir, "weight_min", data->ocpus);
+ ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max);
+ ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares);
+
+ /* memory information */
+ mem_dir = hypfs_mkdir(guest_dir, "mem");
+ if (IS_ERR(mem_dir))
+ return PTR_ERR(mem_dir);
+ ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb);
+ ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb);
+ ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb);
+ ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb);
+
+ /* samples */
+ samples_dir = hypfs_mkdir(guest_dir, "samples");
+ if (IS_ERR(samples_dir))
+ return PTR_ERR(samples_dir);
+ ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp);
+ ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp);
+ ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp);
+ ATTRIBUTE(samples_dir, "idle", data->idle_samp);
+ ATTRIBUTE(samples_dir, "other", data->other_samp);
+ ATTRIBUTE(samples_dir, "total", data->total_samp);
+ return 0;
+}
+
+int hypfs_vm_create_files(struct dentry *root)
+{
+ struct dentry *dir, *file;
+ struct diag2fc_data *data;
+ unsigned int count = 0;
+ int rc, i;
+
+ data = diag2fc_store(diag2fc_guest_query, &count, 0);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ /* Hypervisor Info */
+ dir = hypfs_mkdir(root, "hyp");
+ if (IS_ERR(dir)) {
+ rc = PTR_ERR(dir);
+ goto failed;
+ }
+ file = hypfs_create_str(dir, "type", "z/VM Hypervisor");
+ if (IS_ERR(file)) {
+ rc = PTR_ERR(file);
+ goto failed;
+ }
+
+ /* physical cpus */
+ dir = hypfs_mkdir(root, "cpus");
+ if (IS_ERR(dir)) {
+ rc = PTR_ERR(dir);
+ goto failed;
+ }
+ file = hypfs_create_u64(dir, "count", data->lcpus);
+ if (IS_ERR(file)) {
+ rc = PTR_ERR(file);
+ goto failed;
+ }
+
+ /* guests */
+ dir = hypfs_mkdir(root, "systems");
+ if (IS_ERR(dir)) {
+ rc = PTR_ERR(dir);
+ goto failed;
+ }
+
+ for (i = 0; i < count; i++) {
+ rc = hypfs_vm_create_guest(dir, &data[i]);
+ if (rc)
+ goto failed;
+ }
+ diag2fc_free(data);
+ return 0;
+
+failed:
+ diag2fc_free(data);
+ return rc;
+}
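ATTRIBUTE() is a statement macro rather than a helper function precisely so it can return from the enclosing function on error, which is what lets the attribute lists above read like infallible declarations. One use expands to roughly:

/* expansion of: ATTRIBUTE(cpus_dir, "count", data->vcpus); */
do {
	void *rc;
	rc = hypfs_create_u64(cpus_dir, "count", data->vcpus);
	if (IS_ERR(rc))
		return PTR_ERR(rc);	/* returns from hypfs_vm_create_guest() */
} while (0);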
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index ee919bfc8186..ada83149932f 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -53,7 +53,7 @@ static void hypfs_update_update(struct super_block *sb)
struct inode *inode = d_inode(sb_info->update_file);
sb_info->last_update = ktime_get_seconds();
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
/* directory tree removal functions */
@@ -101,7 +101,7 @@ static struct inode *hypfs_make_inode(struct super_block *sb, umode_t mode)
ret->i_mode = mode;
ret->i_uid = hypfs_info->uid;
ret->i_gid = hypfs_info->gid;
- ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret);
+ ret->i_atime = ret->i_mtime = inode_set_ctime_current(ret);
if (S_ISDIR(mode))
set_nlink(ret, 2);
}
@@ -460,45 +460,18 @@ static const struct super_operations hypfs_s_ops = {
.show_options = hypfs_show_options,
};
-static int __init hypfs_init(void)
+int __init __hypfs_fs_init(void)
{
int rc;
- hypfs_dbfs_init();
-
- if (hypfs_diag_init()) {
- rc = -ENODATA;
- goto fail_dbfs_exit;
- }
- if (hypfs_vm_init()) {
- rc = -ENODATA;
- goto fail_hypfs_diag_exit;
- }
- hypfs_sprp_init();
- if (hypfs_diag0c_init()) {
- rc = -ENODATA;
- goto fail_hypfs_sprp_exit;
- }
rc = sysfs_create_mount_point(hypervisor_kobj, "s390");
if (rc)
- goto fail_hypfs_diag0c_exit;
+ return rc;
rc = register_filesystem(&hypfs_type);
if (rc)
- goto fail_filesystem;
+ goto fail;
return 0;
-
-fail_filesystem:
+fail:
sysfs_remove_mount_point(hypervisor_kobj, "s390");
-fail_hypfs_diag0c_exit:
- hypfs_diag0c_exit();
-fail_hypfs_sprp_exit:
- hypfs_sprp_exit();
- hypfs_vm_exit();
-fail_hypfs_diag_exit:
- hypfs_diag_exit();
- pr_err("Initialization of hypfs failed with rc=%i\n", rc);
-fail_dbfs_exit:
- hypfs_dbfs_exit();
return rc;
}
-device_initcall(hypfs_init)
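inode_set_ctime_current() stamps i_ctime and hands back the timespec64 it used, so the conversion preserves the old one-liner shape: atime and mtime get the identical timestamp without a second clock read. The idiom, as a hedged sketch:

static void touch_all_times(struct inode *inode)
{
	/* ctime is set as a side effect; the same value feeds a/mtime */
	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}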
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 1a18d7b82f86..4b904110d27c 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -5,6 +5,5 @@ generated-y += syscall_table.h
generated-y += unistd_nr.h
generic-y += asm-offsets.h
-generic-y += export.h
generic-y += kvm_types.h
generic-y += mcs_spinlock.h
diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h
index ac665b9670c5..ccd4e148b5ed 100644
--- a/arch/s390/include/asm/debug.h
+++ b/arch/s390/include/asm/debug.h
@@ -222,7 +222,7 @@ static inline debug_entry_t *debug_text_event(debug_info_t *id, int level,
/*
* IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are
- * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details!
+ * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details!
*/
extern debug_entry_t *
__debug_sprintf_event(debug_info_t *id, int level, char *string, ...)
@@ -350,7 +350,7 @@ static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level,
/*
* IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are
- * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details!
+ * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details!
*/
extern debug_entry_t *
__debug_sprintf_exception(debug_info_t *id, int level, char *string, ...)
diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index 902e0330dd91..bed804137537 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -36,6 +36,7 @@ enum diag_stat_enum {
DIAG_STAT_X304,
DIAG_STAT_X308,
DIAG_STAT_X318,
+ DIAG_STAT_X320,
DIAG_STAT_X500,
NR_DIAG_STAT
};
@@ -108,6 +109,8 @@ enum diag204_sc {
DIAG204_SUBC_STIB7 = 7
};
+#define DIAG204_SUBCODE_MASK 0xffff
+
/* The two available diag 204 data formats */
enum diag204_format {
DIAG204_INFO_SIMPLE = 0,
diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
index e5c5cb1207e2..5a82b08f03cd 100644
--- a/arch/s390/include/asm/ftrace.h
+++ b/arch/s390/include/asm/ftrace.h
@@ -54,6 +54,23 @@ static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *
return NULL;
}
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+struct fgraph_ret_regs {
+ unsigned long gpr2;
+ unsigned long fp;
+};
+
+static __always_inline unsigned long fgraph_ret_regs_return_value(struct fgraph_ret_regs *ret_regs)
+{
+ return ret_regs->gpr2;
+}
+
+static __always_inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs *ret_regs)
+{
+ return ret_regs->fp;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
static __always_inline unsigned long
ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs)
{
diff --git a/arch/s390/include/asm/kfence.h b/arch/s390/include/asm/kfence.h
index d55ba878378b..e47fd8cbe701 100644
--- a/arch/s390/include/asm/kfence.h
+++ b/arch/s390/include/asm/kfence.h
@@ -35,7 +35,7 @@ static __always_inline void kfence_split_mapping(void)
static inline bool kfence_protect_page(unsigned long addr, bool protect)
{
- __kernel_map_pages(virt_to_page(addr), 1, !protect);
+ __kernel_map_pages(virt_to_page((void *)addr), 1, !protect);
return true;
}
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 2bbc3d54959d..91bfecb91321 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -1028,6 +1028,9 @@ static inline int sie64a(struct kvm_s390_sie_block *sie_block, u64 *rsa)
extern char sie_exit;
+bool kvm_s390_pv_is_protected(struct kvm *kvm);
+bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu);
+
extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
diff --git a/arch/s390/include/asm/maccess.h b/arch/s390/include/asm/maccess.h
index cfec3141fdba..50225940d971 100644
--- a/arch/s390/include/asm/maccess.h
+++ b/arch/s390/include/asm/maccess.h
@@ -4,6 +4,9 @@
#include <linux/types.h>
+#define MEMCPY_REAL_SIZE PAGE_SIZE
+#define MEMCPY_REAL_MASK PAGE_MASK
+
struct iov_iter;
extern unsigned long __memcpy_real_area;
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index a9c138fcd2ad..cfec0743314e 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -191,8 +191,16 @@ int arch_make_page_accessible(struct page *page);
#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys))
#define page_to_phys(page) pfn_to_phys(page_to_pfn(page))
-#define pfn_to_virt(pfn) __va(pfn_to_phys(pfn))
-#define virt_to_pfn(kaddr) (phys_to_pfn(__pa(kaddr)))
+static inline void *pfn_to_virt(unsigned long pfn)
+{
+ return __va(pfn_to_phys(pfn));
+}
+
+static inline unsigned long virt_to_pfn(const void *kaddr)
+{
+ return phys_to_pfn(__pa(kaddr));
+}
+
#define pfn_to_kaddr(pfn) pfn_to_virt(pfn)
#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr))
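Converting pfn_to_virt()/virt_to_pfn() from macros to static inlines gives them real prototypes, so type mismatches now warn instead of compiling silently; the kfence_protect_page() hunk above adds the (void *) cast that the stricter chain demands. What the macros used to hide:

unsigned long addr = 0x10000;

virt_to_pfn(addr);		/* old macro: accepted an integer silently */
virt_to_pfn((void *)addr);	/* inline version: pointer type enforced */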
diff --git a/arch/s390/include/asm/pfault.h b/arch/s390/include/asm/pfault.h
new file mode 100644
index 000000000000..a1bee4a1e470
--- /dev/null
+++ b/arch/s390/include/asm/pfault.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 1999, 2023
+ */
+#ifndef _ASM_S390_PFAULT_H
+#define _ASM_S390_PFAULT_H
+
+#include <linux/errno.h>
+
+int __pfault_init(void);
+void __pfault_fini(void);
+
+static inline int pfault_init(void)
+{
+ if (IS_ENABLED(CONFIG_PFAULT))
+ return __pfault_init();
+ return -EOPNOTSUPP;
+}
+
+static inline void pfault_fini(void)
+{
+ if (IS_ENABLED(CONFIG_PFAULT))
+ __pfault_fini();
+}
+
+#endif /* _ASM_S390_PFAULT_H */
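The new header replaces the old setup.h fallback (`#define pfault_init() ({-1;})`) with typed inlines, and the disabled case now reports -EOPNOTSUPP so callers get a meaningful errno. Hedged caller-side sketch (messages illustrative):

rc = pfault_init();
if (rc == -EOPNOTSUPP)
	pr_info("pfault support not compiled in\n");
else if (rc)
	pr_warn("pfault initialization failed: %d\n", rc);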
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index c55f3c3365af..30909fe27c24 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -89,8 +89,6 @@ extern unsigned long __bootdata_preserved(VMALLOC_END);
extern struct page *__bootdata_preserved(vmemmap);
extern unsigned long __bootdata_preserved(vmemmap_size);
-#define VMEM_MAX_PHYS ((unsigned long) vmemmap)
-
extern unsigned long __bootdata_preserved(MODULES_VADDR);
extern unsigned long __bootdata_preserved(MODULES_END);
#define MODULES_VADDR MODULES_VADDR
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index dac7da88f61f..5742d23bba13 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -86,6 +86,7 @@ struct sclp_info {
unsigned char has_kss : 1;
unsigned char has_gisaf : 1;
unsigned char has_diag318 : 1;
+ unsigned char has_diag320 : 1;
unsigned char has_sipl : 1;
unsigned char has_sipl_eckd : 1;
unsigned char has_dirq : 1;
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index f191255c60db..b30fe91166e3 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -74,6 +74,7 @@ extern unsigned int zlib_dfltcc_support;
extern int noexec_disabled;
extern unsigned long ident_map_size;
+extern unsigned long max_mappable;
/* The Write Back bit position in the physaddr is given by the SLPC PCI */
extern unsigned long mio_wb_bit_mask;
@@ -117,14 +118,6 @@ extern unsigned int console_irq;
#define SET_CONSOLE_VT220 do { console_mode = 4; } while (0)
#define SET_CONSOLE_HVC do { console_mode = 5; } while (0)
-#ifdef CONFIG_PFAULT
-extern int pfault_init(void);
-extern void pfault_fini(void);
-#else /* CONFIG_PFAULT */
-#define pfault_init() ({-1;})
-#define pfault_fini() do { } while (0)
-#endif /* CONFIG_PFAULT */
-
#ifdef CONFIG_VMCP
void vmcp_cma_reserve(void);
#else
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index d6bb2f4f78d1..d2cd42bb2c26 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -463,6 +463,7 @@ static inline int is_prot_virt_host(void)
return prot_virt_host;
}
+int uv_pin_shared(unsigned long paddr);
int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
int uv_destroy_owned_page(unsigned long paddr);
@@ -475,6 +476,11 @@ void setup_uv(void);
#define is_prot_virt_host() 0
static inline void setup_uv(void) {}
+static inline int uv_pin_shared(unsigned long paddr)
+{
+ return 0;
+}
+
static inline int uv_destroy_owned_page(unsigned long paddr)
{
return 0;
diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h
index 5faf0a1d2c16..5ad76471e73f 100644
--- a/arch/s390/include/uapi/asm/pkey.h
+++ b/arch/s390/include/uapi/asm/pkey.h
@@ -26,7 +26,7 @@
#define MAXCLRKEYSIZE 32 /* a clear key value may be up to 32 bytes */
#define MAXAESCIPHERKEYSIZE 136 /* our aes cipher keys have always 136 bytes */
#define MINEP11AESKEYBLOBSIZE 256 /* min EP11 AES key blob size */
-#define MAXEP11AESKEYBLOBSIZE 320 /* max EP11 AES key blob size */
+#define MAXEP11AESKEYBLOBSIZE 336 /* max EP11 AES key blob size */
/* Minimum size of a key blob */
#define MINKEYBLOBSIZE SECKEYBLOBSIZE
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 6b2a051e1f8a..0df2b88cc0da 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -37,9 +37,9 @@ CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls
obj-y := head64.o traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o
obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o cpufeature.o
-obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o
+obj-y += sysinfo.o lgr.o os_info.o
obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
-obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
+obj-y += entry.o reipl.o kdebugfs.o alternative.o
obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o
@@ -63,12 +63,13 @@ obj-$(CONFIG_RETHOOK) += rethook.o
obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o
obj-$(CONFIG_FUNCTION_TRACER) += mcount.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
+obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o
obj-$(CONFIG_UPROBES) += uprobes.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o
obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o
-
+obj-$(CONFIG_CERT_STORE) += cert_store.o
obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 81cf72088041..fa5f6885c74a 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -12,6 +12,7 @@
#include <linux/sched.h>
#include <linux/purgatory.h>
#include <linux/pgtable.h>
+#include <linux/ftrace.h>
#include <asm/idle.h>
#include <asm/gmap.h>
#include <asm/stacktrace.h>
@@ -177,5 +178,13 @@ int main(void)
DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size));
DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line));
DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size));
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /* function graph return value tracing */
+ OFFSET(__FGRAPH_RET_GPR2, fgraph_ret_regs, gpr2);
+ OFFSET(__FGRAPH_RET_FP, fgraph_ret_regs, fp);
+ DEFINE(__FGRAPH_RET_SIZE, sizeof(struct fgraph_ret_regs));
+#endif
+ OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs);
+ DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs));
return 0;
}
diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c
new file mode 100644
index 000000000000..3986a044eb36
--- /dev/null
+++ b/arch/s390/kernel/cert_store.c
@@ -0,0 +1,811 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DIAG 0x320 support and certificate store handling
+ *
+ * Copyright IBM Corp. 2023
+ * Author(s): Anastasia Eskova <anastasia.eskova@ibm.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/key-type.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/kobject.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <crypto/sha2.h>
+#include <keys/user-type.h>
+#include <asm/debug.h>
+#include <asm/diag.h>
+#include <asm/ebcdic.h>
+#include <asm/sclp.h>
+
+#define DIAG_MAX_RETRIES 10
+
+#define VCE_FLAGS_VALID_MASK 0x80
+
+#define ISM_LEN_DWORDS 4
+#define VCSSB_LEN_BYTES 128
+#define VCSSB_LEN_NO_CERTS 4
+#define VCB_LEN_NO_CERTS 64
+#define VC_NAME_LEN_BYTES 64
+
+#define CERT_STORE_KEY_TYPE_NAME "cert_store_key"
+#define CERT_STORE_KEYRING_NAME "cert_store"
+
+static debug_info_t *cert_store_dbf;
+static debug_info_t *cert_store_hexdump;
+
+#define pr_dbf_msg(fmt, ...) \
+ debug_sprintf_event(cert_store_dbf, 3, fmt "\n", ## __VA_ARGS__)
+
+enum diag320_subcode {
+ DIAG320_SUBCODES = 0,
+ DIAG320_STORAGE = 1,
+ DIAG320_CERT_BLOCK = 2,
+};
+
+enum diag320_rc {
+ DIAG320_RC_OK = 0x0001,
+ DIAG320_RC_CS_NOMATCH = 0x0306,
+};
+
+/* Verification Certificates Store Support Block (VCSSB). */
+struct vcssb {
+ u32 vcssb_length;
+ u8 pad_0x04[3];
+ u8 version;
+ u8 pad_0x08[8];
+ u32 cs_token;
+ u8 pad_0x14[12];
+ u16 total_vc_index_count;
+ u16 max_vc_index_count;
+ u8 pad_0x24[28];
+ u32 max_vce_length;
+ u32 max_vcxe_length;
+ u8 pad_0x48[8];
+ u32 max_single_vcb_length;
+ u32 total_vcb_length;
+ u32 max_single_vcxb_length;
+ u32 total_vcxb_length;
+ u8 pad_0x60[32];
+} __packed __aligned(8);
+
+/* Verification Certificate Entry (VCE) Header. */
+struct vce_header {
+ u32 vce_length;
+ u8 flags;
+ u8 key_type;
+ u16 vc_index;
+ u8 vc_name[VC_NAME_LEN_BYTES]; /* EBCDIC */
+ u8 vc_format;
+ u8 pad_0x49;
+ u16 key_id_length;
+ u8 pad_0x4c;
+ u8 vc_hash_type;
+ u16 vc_hash_length;
+ u8 pad_0x50[4];
+ u32 vc_length;
+ u8 pad_0x58[8];
+ u16 vc_hash_offset;
+ u16 vc_offset;
+ u8 pad_0x64[28];
+} __packed __aligned(4);
+
+/* Verification Certificate Block (VCB) Header. */
+struct vcb_header {
+ u32 vcb_input_length;
+ u8 pad_0x04[4];
+ u16 first_vc_index;
+ u16 last_vc_index;
+ u32 pad_0x0c;
+ u32 cs_token;
+ u8 pad_0x14[12];
+ u32 vcb_output_length;
+ u8 pad_0x24[3];
+ u8 version;
+ u16 stored_vc_count;
+ u16 remaining_vc_count;
+ u8 pad_0x2c[20];
+} __packed __aligned(4);
+
+/* Verification Certificate Block (VCB). */
+struct vcb {
+ struct vcb_header vcb_hdr;
+ u8 vcb_buf[];
+} __packed __aligned(4);
+
+/* Verification Certificate Entry (VCE). */
+struct vce {
+ struct vce_header vce_hdr;
+ u8 cert_data_buf[];
+} __packed __aligned(4);
+
+static void cert_store_key_describe(const struct key *key, struct seq_file *m)
+{
+ char ascii[VC_NAME_LEN_BYTES + 1];
+
+ /*
+ * The first 64 bytes of the key description are the key name in EBCDIC
+ * CP 500. Convert it to ASCII for display in /proc/keys.
+ */
+ strscpy(ascii, key->description, sizeof(ascii));
+ EBCASC_500(ascii, VC_NAME_LEN_BYTES);
+ seq_puts(m, ascii);
+
+ seq_puts(m, &key->description[VC_NAME_LEN_BYTES]);
+ if (key_is_positive(key))
+ seq_printf(m, ": %u", key->datalen);
+}
+
+/*
+ * The certificate store key type takes over the properties of the
+ * user key type but cannot be updated.
+ */
+static struct key_type key_type_cert_store_key = {
+ .name = CERT_STORE_KEY_TYPE_NAME,
+ .preparse = user_preparse,
+ .free_preparse = user_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .revoke = user_revoke,
+ .destroy = user_destroy,
+ .describe = cert_store_key_describe,
+ .read = user_read,
+};
+
+/* Logging functions. */
+static void pr_dbf_vcb(const struct vcb *b)
+{
+ pr_dbf_msg("VCB Header:");
+ pr_dbf_msg("vcb_input_length: %d", b->vcb_hdr.vcb_input_length);
+ pr_dbf_msg("first_vc_index: %d", b->vcb_hdr.first_vc_index);
+ pr_dbf_msg("last_vc_index: %d", b->vcb_hdr.last_vc_index);
+ pr_dbf_msg("cs_token: %d", b->vcb_hdr.cs_token);
+ pr_dbf_msg("vcb_output_length: %d", b->vcb_hdr.vcb_output_length);
+ pr_dbf_msg("version: %d", b->vcb_hdr.version);
+ pr_dbf_msg("stored_vc_count: %d", b->vcb_hdr.stored_vc_count);
+ pr_dbf_msg("remaining_vc_count: %d", b->vcb_hdr.remaining_vc_count);
+}
+
+static void pr_dbf_vce(const struct vce *e)
+{
+ unsigned char vc_name[VC_NAME_LEN_BYTES + 1];
+ char log_string[VC_NAME_LEN_BYTES + 40];
+
+ pr_dbf_msg("VCE Header:");
+ pr_dbf_msg("vce_hdr.vce_length: %d", e->vce_hdr.vce_length);
+ pr_dbf_msg("vce_hdr.flags: %d", e->vce_hdr.flags);
+ pr_dbf_msg("vce_hdr.key_type: %d", e->vce_hdr.key_type);
+ pr_dbf_msg("vce_hdr.vc_index: %d", e->vce_hdr.vc_index);
+ pr_dbf_msg("vce_hdr.vc_format: %d", e->vce_hdr.vc_format);
+ pr_dbf_msg("vce_hdr.key_id_length: %d", e->vce_hdr.key_id_length);
+ pr_dbf_msg("vce_hdr.vc_hash_type: %d", e->vce_hdr.vc_hash_type);
+ pr_dbf_msg("vce_hdr.vc_hash_length: %d", e->vce_hdr.vc_hash_length);
+ pr_dbf_msg("vce_hdr.vc_hash_offset: %d", e->vce_hdr.vc_hash_offset);
+ pr_dbf_msg("vce_hdr.vc_length: %d", e->vce_hdr.vc_length);
+ pr_dbf_msg("vce_hdr.vc_offset: %d", e->vce_hdr.vc_offset);
+
+ /* Certificate name in ASCII. */
+ memcpy(vc_name, e->vce_hdr.vc_name, VC_NAME_LEN_BYTES);
+ EBCASC_500(vc_name, VC_NAME_LEN_BYTES);
+ vc_name[VC_NAME_LEN_BYTES] = '\0';
+
+ snprintf(log_string, sizeof(log_string),
+ "index: %d vce_hdr.vc_name (ASCII): %s",
+ e->vce_hdr.vc_index, vc_name);
+ debug_text_event(cert_store_hexdump, 3, log_string);
+
+ /* Certificate data. */
+ debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data start");
+ debug_event(cert_store_hexdump, 3, (u8 *)e->cert_data_buf, 128);
+ debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data end");
+ debug_event(cert_store_hexdump, 3,
+ (u8 *)e->cert_data_buf + e->vce_hdr.vce_length - 128, 128);
+}
+
+static void pr_dbf_vcssb(const struct vcssb *s)
+{
+ debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode1");
+ debug_event(cert_store_hexdump, 3, (u8 *)s, VCSSB_LEN_BYTES);
+
+ pr_dbf_msg("VCSSB:");
+ pr_dbf_msg("vcssb_length: %u", s->vcssb_length);
+ pr_dbf_msg("version: %u", s->version);
+ pr_dbf_msg("cs_token: %u", s->cs_token);
+ pr_dbf_msg("total_vc_index_count: %u", s->total_vc_index_count);
+ pr_dbf_msg("max_vc_index_count: %u", s->max_vc_index_count);
+ pr_dbf_msg("max_vce_length: %u", s->max_vce_length);
+ pr_dbf_msg("max_vcxe_length: %u", s->max_vce_length);
+ pr_dbf_msg("max_single_vcb_length: %u", s->max_single_vcb_length);
+ pr_dbf_msg("total_vcb_length: %u", s->total_vcb_length);
+ pr_dbf_msg("max_single_vcxb_length: %u", s->max_single_vcxb_length);
+ pr_dbf_msg("total_vcxb_length: %u", s->total_vcxb_length);
+}
+
+static int __diag320(unsigned long subcode, void *addr)
+{
+ union register_pair rp = { .even = (unsigned long)addr, };
+
+ asm volatile(
+ " diag %[rp],%[subcode],0x320\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b, 0b)
+ : [rp] "+d" (rp.pair)
+ : [subcode] "d" (subcode)
+ : "cc", "memory");
+
+ return rp.odd;
+}
+
+static int diag320(unsigned long subcode, void *addr)
+{
+ diag_stat_inc(DIAG_STAT_X320);
+
+ return __diag320(subcode, addr);
+}
+
+/*
+ * Calculate the SHA256 hash of the VCE certificate and compare it to the
+ * hash stored in the VCE. Return -EINVAL if the hashes don't match.
+ */
+static int check_certificate_hash(const struct vce *vce)
+{
+ u8 hash[SHA256_DIGEST_SIZE];
+ u16 vc_hash_length;
+ u8 *vce_hash;
+
+ vce_hash = (u8 *)vce + vce->vce_hdr.vc_hash_offset;
+ vc_hash_length = vce->vce_hdr.vc_hash_length;
+ sha256((u8 *)vce + vce->vce_hdr.vc_offset, vce->vce_hdr.vc_length, hash);
+ if (memcmp(vce_hash, hash, vc_hash_length) == 0)
+ return 0;
+
+ pr_dbf_msg("SHA256 hash of received certificate does not match");
+ debug_text_event(cert_store_hexdump, 3, "VCE hash:");
+ debug_event(cert_store_hexdump, 3, vce_hash, SHA256_DIGEST_SIZE);
+ debug_text_event(cert_store_hexdump, 3, "Calculated hash:");
+ debug_event(cert_store_hexdump, 3, hash, SHA256_DIGEST_SIZE);
+
+ return -EINVAL;
+}
+
+static int check_certificate_valid(const struct vce *vce)
+{
+ if (!(vce->vce_hdr.flags & VCE_FLAGS_VALID_MASK)) {
+ pr_dbf_msg("Certificate entry is invalid");
+ return -EINVAL;
+ }
+ if (vce->vce_hdr.vc_format != 1) {
+ pr_dbf_msg("Certificate format is not supported");
+ return -EINVAL;
+ }
+ if (vce->vce_hdr.vc_hash_type != 1) {
+ pr_dbf_msg("Hash type is not supported");
+ return -EINVAL;
+ }
+
+ return check_certificate_hash(vce);
+}
+
+static struct key *get_user_session_keyring(void)
+{
+ key_ref_t us_keyring_ref;
+
+ us_keyring_ref = lookup_user_key(KEY_SPEC_USER_SESSION_KEYRING,
+ KEY_LOOKUP_CREATE, KEY_NEED_LINK);
+ if (IS_ERR(us_keyring_ref)) {
+ pr_dbf_msg("Couldn't get user session keyring: %ld",
+ PTR_ERR(us_keyring_ref));
+ return ERR_PTR(-ENOKEY);
+ }
+ key_ref_put(us_keyring_ref);
+ return key_ref_to_ptr(us_keyring_ref);
+}
+
+/* Invalidate all keys from cert_store keyring. */
+static int invalidate_keyring_keys(struct key *keyring)
+{
+ unsigned long num_keys, key_index;
+ size_t keyring_payload_len;
+ key_serial_t *key_array;
+ struct key *current_key;
+ int rc;
+
+ keyring_payload_len = key_type_keyring.read(keyring, NULL, 0);
+ num_keys = keyring_payload_len / sizeof(key_serial_t);
+ key_array = kcalloc(num_keys, sizeof(key_serial_t), GFP_KERNEL);
+ if (!key_array)
+ return -ENOMEM;
+
+ rc = key_type_keyring.read(keyring, (char *)key_array, keyring_payload_len);
+ if (rc != keyring_payload_len) {
+ pr_dbf_msg("Couldn't read keyring payload");
+ goto out;
+ }
+
+ for (key_index = 0; key_index < num_keys; key_index++) {
+ current_key = key_lookup(key_array[key_index]);
+ pr_dbf_msg("Invalidating key %08x", current_key->serial);
+
+ key_invalidate(current_key);
+ key_put(current_key);
+ rc = key_unlink(keyring, current_key);
+ if (rc) {
+ pr_dbf_msg("Couldn't unlink key %08x: %d", current_key->serial, rc);
+ break;
+ }
+ }
+out:
+ kfree(key_array);
+ return rc;
+}
+
+static struct key *find_cs_keyring(void)
+{
+ key_ref_t cs_keyring_ref;
+ struct key *cs_keyring;
+
+ cs_keyring_ref = keyring_search(make_key_ref(get_user_session_keyring(), true),
+ &key_type_keyring, CERT_STORE_KEYRING_NAME,
+ false);
+ if (!IS_ERR(cs_keyring_ref)) {
+ cs_keyring = key_ref_to_ptr(cs_keyring_ref);
+ key_ref_put(cs_keyring_ref);
+ goto found;
+ }
+ /* Search default locations: thread, process, session keyrings */
+ cs_keyring = request_key(&key_type_keyring, CERT_STORE_KEYRING_NAME, NULL);
+ if (IS_ERR(cs_keyring))
+ return NULL;
+ key_put(cs_keyring);
+found:
+ return cs_keyring;
+}
+
+static void cleanup_cs_keys(void)
+{
+ struct key *cs_keyring;
+
+ cs_keyring = find_cs_keyring();
+ if (!cs_keyring)
+ return;
+
+ pr_dbf_msg("Found cert_store keyring. Purging...");
+ /*
+ * Remove cert_store_key_type in case invalidation
+ * of old cert_store keys failed (= severe error).
+ */
+ if (invalidate_keyring_keys(cs_keyring))
+ unregister_key_type(&key_type_cert_store_key);
+
+ keyring_clear(cs_keyring);
+ key_invalidate(cs_keyring);
+ key_put(cs_keyring);
+ key_unlink(get_user_session_keyring(), cs_keyring);
+}
+
+static struct key *create_cs_keyring(void)
+{
+ static struct key *cs_keyring;
+
+ /* Cleanup previous cs_keyring and all associated keys if any. */
+ cleanup_cs_keys();
+ cs_keyring = keyring_alloc(CERT_STORE_KEYRING_NAME, GLOBAL_ROOT_UID,
+ GLOBAL_ROOT_GID, current_cred(),
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_SET_KEEP,
+ NULL, get_user_session_keyring());
+ if (IS_ERR(cs_keyring)) {
+ pr_dbf_msg("Can't allocate cert_store keyring");
+ return NULL;
+ }
+
+ pr_dbf_msg("Successfully allocated cert_store keyring: %08x", cs_keyring->serial);
+
+ /*
+ * In case a previous clean-up ran into an
+ * error and unregistered key type.
+ */
+ register_key_type(&key_type_cert_store_key);
+
+ return cs_keyring;
+}
+
+/*
+ * Allocate memory and create the key description in the format
+ * [key name in EBCDIC]:[VCE index]:[CS token].
+ * Return a pointer to the key description or NULL if the memory
+ * allocation fails. The caller must free the memory.
+ */
+static char *get_key_description(struct vcssb *vcssb, const struct vce *vce)
+{
+ size_t len, name_len;
+ u32 cs_token;
+ char *desc;
+
+ cs_token = vcssb->cs_token;
+ /* Description string contains "%64s:%04u:%08u\0". */
+ name_len = sizeof(vce->vce_hdr.vc_name);
+ len = name_len + 1 + 4 + 1 + 8 + 1;
+ desc = kmalloc(len, GFP_KERNEL);
+ if (!desc)
+ return NULL;
+
+ memcpy(desc, vce->vce_hdr.vc_name, name_len);
+ sprintf(desc + name_len, ":%04u:%08u", vce->vce_hdr.vc_index, cs_token);
+
+ return desc;
+}
+
+/*
+ * Create a key of type "cert_store_key" using the data from VCE for key
+ * payload and key description. Link the key to "cert_store" keyring.
+ */
+static int create_key_from_vce(struct vcssb *vcssb, struct vce *vce,
+ struct key *keyring)
+{
+ key_ref_t newkey;
+ char *desc;
+ int rc;
+
+ desc = get_key_description(vcssb, vce);
+ if (!desc)
+ return -ENOMEM;
+
+ newkey = key_create_or_update(
+ make_key_ref(keyring, true), CERT_STORE_KEY_TYPE_NAME,
+ desc, (u8 *)vce + vce->vce_hdr.vc_offset,
+ vce->vce_hdr.vc_length,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA);
+
+ rc = PTR_ERR_OR_ZERO(newkey);
+ if (rc) {
+ pr_dbf_msg("Couldn't create a key from Certificate Entry (%d)", rc);
+ rc = -ENOKEY;
+ goto out;
+ }
+
+ key_ref_put(newkey);
+out:
+ kfree(desc);
+ return rc;
+}
+
+/* Get Verification Certificates Store Support Block (VCSSB) with DIAG320 subcode 1. */
+static int get_vcssb(struct vcssb *vcssb)
+{
+ int diag320_rc;
+
+ memset(vcssb, 0, sizeof(*vcssb));
+ vcssb->vcssb_length = VCSSB_LEN_BYTES;
+ diag320_rc = diag320(DIAG320_STORAGE, vcssb);
+ pr_dbf_vcssb(vcssb);
+
+ if (diag320_rc != DIAG320_RC_OK) {
+ pr_dbf_msg("Diag 320 Subcode 1 returned bad RC: %04x", diag320_rc);
+ return -EIO;
+ }
+ if (vcssb->vcssb_length == VCSSB_LEN_NO_CERTS) {
+ pr_dbf_msg("No certificates available for current configuration");
+ return -ENOKEY;
+ }
+
+ return 0;
+}
+
+static u32 get_4k_mult_vcb_size(struct vcssb *vcssb)
+{
+ return round_up(vcssb->max_single_vcb_length, PAGE_SIZE);
+}
+
+/* Fill input fields of single-entry VCB that will be read by LPAR. */
+static void fill_vcb_input(struct vcssb *vcssb, struct vcb *vcb, u16 index)
+{
+ memset(vcb, 0, sizeof(*vcb));
+ vcb->vcb_hdr.vcb_input_length = get_4k_mult_vcb_size(vcssb);
+ vcb->vcb_hdr.cs_token = vcssb->cs_token;
+
+ /* Request single entry. */
+ vcb->vcb_hdr.first_vc_index = index;
+ vcb->vcb_hdr.last_vc_index = index;
+}
+
+static void extract_vce_from_sevcb(struct vcb *vcb, struct vce *vce)
+{
+ struct vce *extracted_vce;
+
+ extracted_vce = (struct vce *)vcb->vcb_buf;
+ memcpy(vce, vcb->vcb_buf, extracted_vce->vce_hdr.vce_length);
+ pr_dbf_vce(vce);
+}
+
+static int get_sevcb(struct vcssb *vcssb, u16 index, struct vcb *vcb)
+{
+ int rc, diag320_rc;
+
+ fill_vcb_input(vcssb, vcb, index);
+
+ diag320_rc = diag320(DIAG320_CERT_BLOCK, vcb);
+ pr_dbf_msg("Diag 320 Subcode2 RC %2x", diag320_rc);
+ pr_dbf_vcb(vcb);
+
+ switch (diag320_rc) {
+ case DIAG320_RC_OK:
+ rc = 0;
+ if (vcb->vcb_hdr.vcb_output_length == VCB_LEN_NO_CERTS) {
+ pr_dbf_msg("No certificate entry for index %u", index);
+ rc = -ENOKEY;
+ } else if (vcb->vcb_hdr.remaining_vc_count != 0) {
+ /* Retry on insufficient space. */
+ pr_dbf_msg("Couldn't get all requested certificates");
+ rc = -EAGAIN;
+ }
+ break;
+ case DIAG320_RC_CS_NOMATCH:
+ pr_dbf_msg("Certificate Store token mismatch");
+ rc = -EAGAIN;
+ break;
+ default:
+ pr_dbf_msg("Diag 320 Subcode2 returned bad rc (0x%4x)", diag320_rc);
+ rc = -EINVAL;
+ break;
+ }
+
+ return rc;
+}
+
+/*
+ * Allocate memory for a single-entry VCB, get the VCB via a DIAG320
+ * subcode 2 call, extract the VCE and create a key from its certificate.
+ */
+static int create_key_from_sevcb(struct vcssb *vcssb, u16 index,
+ struct key *keyring)
+{
+ struct vcb *vcb;
+ struct vce *vce;
+ int rc;
+
+ rc = -ENOMEM;
+ vcb = vmalloc(get_4k_mult_vcb_size(vcssb));
+ vce = vmalloc(vcssb->max_single_vcb_length - sizeof(vcb->vcb_hdr));
+ if (!vcb || !vce)
+ goto out;
+
+ rc = get_sevcb(vcssb, index, vcb);
+ if (rc)
+ goto out;
+
+ extract_vce_from_sevcb(vcb, vce);
+ rc = check_certificate_valid(vce);
+ if (rc)
+ goto out;
+
+ rc = create_key_from_vce(vcssb, vce, keyring);
+ if (rc)
+ goto out;
+
+ pr_dbf_msg("Successfully created key from Certificate Entry %d", index);
+out:
+ vfree(vce);
+ vfree(vcb);
+ return rc;
+}
+
+/*
+ * Request a single-entry VCB for each VCE available for the partition.
+ * Create a key from it and link it to the cert_store keyring. If no keys
+ * could be created (i.e. all VCEs were invalid), return -ENOKEY.
+ */
+static int add_certificates_to_keyring(struct vcssb *vcssb, struct key *keyring)
+{
+ int rc, index, count, added;
+
+ count = 0;
+ added = 0;
+ /* Certificate Store entry indices start at 1 and have no gaps. */
+ for (index = 1; index < vcssb->total_vc_index_count + 1; index++) {
+ pr_dbf_msg("Creating key from VCE %u", index);
+ rc = create_key_from_sevcb(vcssb, index, keyring);
+ count++;
+
+ if (rc == -EAGAIN)
+ return rc;
+
+ if (rc)
+ pr_dbf_msg("Creating key from VCE %u failed (%d)", index, rc);
+ else
+ added++;
+ }
+
+ if (added == 0) {
+ pr_dbf_msg("Processed %d entries. No keys created", count);
+ return -ENOKEY;
+ }
+
+ pr_info("Added %d of %d keys to cert_store keyring", added, count);
+
+ /*
+ * Do not allow linking more keys to the certificate store keyring
+ * after all the VCEs have been processed.
+ */
+ rc = keyring_restrict(make_key_ref(keyring, true), NULL, NULL);
+ if (rc)
+ pr_dbf_msg("Failed to set restriction to cert_store keyring (%d)", rc);
+
+ return 0;
+}
+
+/*
+ * Check which DIAG320 subcodes are installed.
+ * Return -ENOENT if subcodes 1 or 2 are not available.
+ */
+static int query_diag320_subcodes(void)
+{
+ unsigned long ism[ISM_LEN_DWORDS];
+ int rc;
+
+ rc = diag320(0, ism);
+ if (rc != DIAG320_RC_OK) {
+ pr_dbf_msg("DIAG320 subcode query returned %04x", rc);
+ return -ENOENT;
+ }
+
+ debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode 0");
+ debug_event(cert_store_hexdump, 3, ism, sizeof(ism));
+
+ if (!test_bit_inv(1, ism) || !test_bit_inv(2, ism)) {
+ pr_dbf_msg("Not all required DIAG320 subcodes are installed");
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+/*
+ * Check whether the Certificate Store is supported by the firmware and
+ * whether DIAG320 subcodes 1 and 2 are installed. Create the cert_store
+ * keyring and link all certificates available to the current partition
+ * to it as keys of type "cert_store_key". On refresh or error, invalidate
+ * the cert_store keyring and destroy all keys of type "cert_store_key".
+ */
+static int fill_cs_keyring(void)
+{
+ struct key *cs_keyring;
+ struct vcssb *vcssb;
+ int rc;
+
+ rc = -ENOMEM;
+ vcssb = kmalloc(VCSSB_LEN_BYTES, GFP_KERNEL);
+ if (!vcssb)
+ goto cleanup_keys;
+
+ rc = -ENOENT;
+ if (!sclp.has_diag320) {
+ pr_dbf_msg("Certificate Store is not supported");
+ goto cleanup_keys;
+ }
+
+ rc = query_diag320_subcodes();
+ if (rc)
+ goto cleanup_keys;
+
+ rc = get_vcssb(vcssb);
+ if (rc)
+ goto cleanup_keys;
+
+ rc = -ENOMEM;
+ cs_keyring = create_cs_keyring();
+ if (!cs_keyring)
+ goto cleanup_keys;
+
+ rc = add_certificates_to_keyring(vcssb, cs_keyring);
+ if (rc)
+ goto cleanup_cs_keyring;
+
+ goto out;
+
+cleanup_cs_keyring:
+ key_put(cs_keyring);
+cleanup_keys:
+ cleanup_cs_keys();
+out:
+ kfree(vcssb);
+ return rc;
+}
+
+static DEFINE_MUTEX(cs_refresh_lock);
+static int cs_status_val = -1;
+
+static ssize_t cs_status_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ if (cs_status_val == -1)
+ return sysfs_emit(buf, "uninitialized\n");
+ else if (cs_status_val == 0)
+ return sysfs_emit(buf, "ok\n");
+
+ return sysfs_emit(buf, "failed (%d)\n", cs_status_val);
+}
+
+static struct kobj_attribute cs_status_attr = __ATTR_RO(cs_status);
+
+static ssize_t refresh_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int rc, retries;
+
+ pr_dbf_msg("Refresh certificate store information requested");
+ rc = mutex_lock_interruptible(&cs_refresh_lock);
+ if (rc)
+ return rc;
+
+ for (retries = 0; retries < DIAG_MAX_RETRIES; retries++) {
+ /* Request certificates from certificate store. */
+ rc = fill_cs_keyring();
+ if (rc)
+ pr_dbf_msg("Failed to refresh certificate store information (%d)", rc);
+ if (rc != -EAGAIN)
+ break;
+ }
+ cs_status_val = rc;
+ mutex_unlock(&cs_refresh_lock);
+
+ return rc ?: count;
+}
+
+static struct kobj_attribute refresh_attr = __ATTR_WO(refresh);
+
+static const struct attribute *cert_store_attrs[] __initconst = {
+ &cs_status_attr.attr,
+ &refresh_attr.attr,
+ NULL,
+};
+
+static struct kobject *cert_store_kobj;
+
+static int __init cert_store_init(void)
+{
+ int rc = -ENOMEM;
+
+ cert_store_dbf = debug_register("cert_store_msg", 10, 1, 64);
+ if (!cert_store_dbf)
+ goto cleanup_dbf;
+
+ cert_store_hexdump = debug_register("cert_store_hexdump", 3, 1, 128);
+ if (!cert_store_hexdump)
+ goto cleanup_dbf;
+
+ debug_register_view(cert_store_hexdump, &debug_hex_ascii_view);
+ debug_register_view(cert_store_dbf, &debug_sprintf_view);
+
+ /* Create directory /sys/firmware/cert_store. */
+ cert_store_kobj = kobject_create_and_add("cert_store", firmware_kobj);
+ if (!cert_store_kobj)
+ goto cleanup_dbf;
+
+ rc = sysfs_create_files(cert_store_kobj, cert_store_attrs);
+ if (rc)
+ goto cleanup_kobj;
+
+ register_key_type(&key_type_cert_store_key);
+
+ return rc;
+
+cleanup_kobj:
+ kobject_put(cert_store_kobj);
+cleanup_dbf:
+ debug_unregister(cert_store_dbf);
+ debug_unregister(cert_store_hexdump);
+
+ return rc;
+}
+device_initcall(cert_store_init);
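The module exposes two files under /sys/firmware/cert_store: a write to refresh runs fill_cs_keyring() with up to DIAG_MAX_RETRIES attempts on -EAGAIN, and cs_status reports the result of the last refresh. A userspace sketch exercising the interface (standard C; a hypothetical test tool, not part of this patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char status[32] = { 0 };
		int fd;

		fd = open("/sys/firmware/cert_store/refresh", O_WRONLY);
		if (fd < 0 || write(fd, "1\n", 2) < 0)
			perror("refresh");
		if (fd >= 0)
			close(fd);

		fd = open("/sys/firmware/cert_store/cs_status", O_RDONLY);
		if (fd >= 0 && read(fd, status, sizeof(status) - 1) > 0)
			printf("cert_store status: %s", status);
		if (fd >= 0)
			close(fd);
		return 0;
	}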
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index 82079f2d8583..f9f06cd8fcee 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -11,6 +11,7 @@
#include <linux/cpu.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/vmalloc.h>
#include <asm/asm-extable.h>
#include <asm/diag.h>
#include <asm/trace/diag.h>
@@ -50,6 +51,7 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = {
[DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" },
[DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" },
[DIAG_STAT_X318] = { .code = 0x318, .name = "CP Name and Version Codes" },
+ [DIAG_STAT_X320] = { .code = 0x320, .name = "Certificate Store" },
[DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" },
};
@@ -167,8 +169,29 @@ static inline int __diag204(unsigned long *subcode, unsigned long size, void *ad
return rp.odd;
}
+/**
+ * diag204() - Issue diagnose 204 call.
+ * @subcode: Subcode of diagnose 204 to be executed.
+ * @size: Size of the area in pages which @addr points to, if given.
+ * @addr: Vmalloc'ed memory area where the result is written to.
+ *
+ * Execute diagnose 204 with the given subcode and write the result to the
+ * memory area specified with @addr. For subcodes which do not write a
+ * result to memory both @size and @addr must be zero. If @addr is
+ * specified it must be page aligned and must have been allocated with
+ * vmalloc(). Conversion to real / physical addresses will be handled by
+ * this function if required.
+ */
int diag204(unsigned long subcode, unsigned long size, void *addr)
{
+ if (addr) {
+ if (WARN_ON_ONCE(!is_vmalloc_addr(addr)))
+ return -1;
+ if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE)))
+ return -1;
+ }
+ if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4)
+ addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr));
diag_stat_inc(DIAG_STAT_X204);
size = __diag204(&subcode, size, addr);
if (subcode)
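The new checks encode the caller contract from the kernel-doc: a result buffer must come from vmalloc() and be page aligned so that the DIAG204_SUBC_STIB4 path can translate it with vmalloc_to_pfn(). The sthyi.c hunk further below switches to exactly this allocation pattern; a conforming sketch (diag204_alloc is a hypothetical helper):

	static void *diag204_alloc(unsigned long pages)
	{
		/* Page-aligned vmalloc memory, as diag204() now requires. */
		return __vmalloc_node(array_size(pages, PAGE_SIZE), PAGE_SIZE,
				      GFP_KERNEL, NUMA_NO_NODE,
				      __builtin_return_address(0));
	}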
@@ -200,7 +223,7 @@ int diag210(struct diag210 *addr)
EXPORT_SYMBOL(diag210);
/*
- * Diagnose 210: Get information about a virtual device
+ * Diagnose 8C: Access 3270 Display Device Information
*/
int diag8c(struct diag8c *addr, struct ccw_dev_id *devno)
{
diff --git a/arch/s390/kernel/ebcdic.c b/arch/s390/kernel/ebcdic.c
index 7f8246c9be08..0e51fa537262 100644
--- a/arch/s390/kernel/ebcdic.c
+++ b/arch/s390/kernel/ebcdic.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * ECBDIC -> ASCII, ASCII -> ECBDIC,
+ * EBCDIC -> ASCII, ASCII -> EBCDIC,
* upper to lower case (EBCDIC) conversion tables.
*
* S390 version
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index a660f4b6d654..49a11f6dd7ae 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -8,6 +8,7 @@
* Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com),
*/
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/asm-extable.h>
@@ -26,7 +27,6 @@
#include <asm/vx-insn.h>
#include <asm/setup.h>
#include <asm/nmi.h>
-#include <asm/export.h>
#include <asm/nospec-insn.h>
_LPP_OFFSET = __LC_LPP
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 85a00d97a314..05e51666db03 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -266,7 +266,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \
struct kobj_attribute *attr, \
const char *buf, size_t len) \
{ \
- strncpy(_value, buf, sizeof(_value) - 1); \
+ strscpy(_value, buf, sizeof(_value)); \
strim(_value); \
return len; \
} \
@@ -557,15 +557,12 @@ static struct kobj_attribute sys_ipl_ccw_loadparm_attr =
__ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL);
static struct attribute *ipl_fcp_attrs[] = {
- &sys_ipl_type_attr.attr,
&sys_ipl_device_attr.attr,
&sys_ipl_fcp_wwpn_attr.attr,
&sys_ipl_fcp_lun_attr.attr,
&sys_ipl_fcp_bootprog_attr.attr,
&sys_ipl_fcp_br_lba_attr.attr,
&sys_ipl_ccw_loadparm_attr.attr,
- &sys_ipl_secure_attr.attr,
- &sys_ipl_has_secure_attr.attr,
NULL,
};
@@ -575,14 +572,11 @@ static struct attribute_group ipl_fcp_attr_group = {
};
static struct attribute *ipl_nvme_attrs[] = {
- &sys_ipl_type_attr.attr,
&sys_ipl_nvme_fid_attr.attr,
&sys_ipl_nvme_nsid_attr.attr,
&sys_ipl_nvme_bootprog_attr.attr,
&sys_ipl_nvme_br_lba_attr.attr,
&sys_ipl_ccw_loadparm_attr.attr,
- &sys_ipl_secure_attr.attr,
- &sys_ipl_has_secure_attr.attr,
NULL,
};
@@ -592,13 +586,10 @@ static struct attribute_group ipl_nvme_attr_group = {
};
static struct attribute *ipl_eckd_attrs[] = {
- &sys_ipl_type_attr.attr,
&sys_ipl_eckd_bootprog_attr.attr,
&sys_ipl_eckd_br_chr_attr.attr,
&sys_ipl_ccw_loadparm_attr.attr,
&sys_ipl_device_attr.attr,
- &sys_ipl_secure_attr.attr,
- &sys_ipl_has_secure_attr.attr,
NULL,
};
@@ -610,21 +601,15 @@ static struct attribute_group ipl_eckd_attr_group = {
/* CCW ipl device attributes */
static struct attribute *ipl_ccw_attrs_vm[] = {
- &sys_ipl_type_attr.attr,
&sys_ipl_device_attr.attr,
&sys_ipl_ccw_loadparm_attr.attr,
&sys_ipl_vm_parm_attr.attr,
- &sys_ipl_secure_attr.attr,
- &sys_ipl_has_secure_attr.attr,
NULL,
};
static struct attribute *ipl_ccw_attrs_lpar[] = {
- &sys_ipl_type_attr.attr,
&sys_ipl_device_attr.attr,
&sys_ipl_ccw_loadparm_attr.attr,
- &sys_ipl_secure_attr.attr,
- &sys_ipl_has_secure_attr.attr,
NULL,
};
@@ -636,15 +621,15 @@ static struct attribute_group ipl_ccw_attr_group_lpar = {
.attrs = ipl_ccw_attrs_lpar
};
-/* UNKNOWN ipl device attributes */
-
-static struct attribute *ipl_unknown_attrs[] = {
+static struct attribute *ipl_common_attrs[] = {
&sys_ipl_type_attr.attr,
+ &sys_ipl_secure_attr.attr,
+ &sys_ipl_has_secure_attr.attr,
NULL,
};
-static struct attribute_group ipl_unknown_attr_group = {
- .attrs = ipl_unknown_attrs,
+static struct attribute_group ipl_common_attr_group = {
+ .attrs = ipl_common_attrs,
};
static struct kset *ipl_kset;
@@ -668,6 +653,9 @@ static int __init ipl_init(void)
rc = -ENOMEM;
goto out;
}
+ rc = sysfs_create_group(&ipl_kset->kobj, &ipl_common_attr_group);
+ if (rc)
+ goto out;
switch (ipl_info.type) {
case IPL_TYPE_CCW:
if (MACHINE_IS_VM)
@@ -689,8 +677,6 @@ static int __init ipl_init(void)
rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group);
break;
default:
- rc = sysfs_create_group(&ipl_kset->kobj,
- &ipl_unknown_attr_group);
break;
}
out:
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 6d9276c096a6..12a2bd4fc88c 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -13,6 +13,7 @@
#include <linux/reboot.h>
#include <linux/ftrace.h>
#include <linux/debug_locks.h>
+#include <asm/pfault.h>
#include <asm/cio.h>
#include <asm/setup.h>
#include <asm/smp.h>
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index 2df94d32140c..8d207b82d9fe 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -188,7 +188,7 @@ static int kexec_file_add_ipl_report(struct kimage *image,
data->memsz = ALIGN(data->memsz, PAGE_SIZE);
buf.mem = data->memsz;
- ptr = (void *)ipl_cert_list_addr;
+ ptr = __va(ipl_cert_list_addr);
end = ptr + ipl_cert_list_size;
ncerts = 0;
while (ptr < end) {
@@ -200,7 +200,7 @@ static int kexec_file_add_ipl_report(struct kimage *image,
addr = data->memsz + data->report->size;
addr += ncerts * sizeof(struct ipl_rb_certificate_entry);
- ptr = (void *)ipl_cert_list_addr;
+ ptr = __va(ipl_cert_list_addr);
while (ptr < end) {
len = *(unsigned int *)ptr;
ptr += sizeof(len);
diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S
index dbece2803c50..ae4d4fd9afcd 100644
--- a/arch/s390/kernel/mcount.S
+++ b/arch/s390/kernel/mcount.S
@@ -9,15 +9,20 @@
#include <asm/ftrace.h>
#include <asm/nospec-insn.h>
#include <asm/ptrace.h>
-#include <asm/export.h>
+#define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE)
+#define STACK_PTREGS (STACK_FRAME_OVERHEAD)
+#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS)
+#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW)
+
+#define STACK_FRAME_SIZE_FREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_SIZE)
+#define STACK_FREGS (STACK_FRAME_OVERHEAD)
+#define STACK_FREGS_PTREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_PT_REGS)
+#define STACK_FREGS_PTREGS_GPRS (STACK_FREGS_PTREGS + __PT_GPRS)
+#define STACK_FREGS_PTREGS_PSW (STACK_FREGS_PTREGS + __PT_PSW)
+#define STACK_FREGS_PTREGS_ORIG_GPR2 (STACK_FREGS_PTREGS + __PT_ORIG_GPR2)
+#define STACK_FREGS_PTREGS_FLAGS (STACK_FREGS_PTREGS + __PT_FLAGS)
-#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE)
-#define STACK_PTREGS (STACK_FRAME_OVERHEAD)
-#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS)
-#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW)
-#define STACK_PTREGS_ORIG_GPR2 (STACK_PTREGS + __PT_ORIG_GPR2)
-#define STACK_PTREGS_FLAGS (STACK_PTREGS + __PT_FLAGS)
/* packed stack: allocate just enough for r14, r15 and backchain */
#define TRACED_FUNC_FRAME_SIZE 24
@@ -53,23 +58,23 @@ SYM_CODE_END(ftrace_stub_direct_tramp)
stg %r1,__SF_BACKCHAIN(%r15)
stg %r0,(__SF_GPRS+8*8)(%r15)
stg %r15,(__SF_GPRS+9*8)(%r15)
- # allocate pt_regs and stack frame for ftrace_trace_function
- aghi %r15,-STACK_FRAME_SIZE
- stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15)
- xc STACK_PTREGS_ORIG_GPR2(8,%r15),STACK_PTREGS_ORIG_GPR2(%r15)
+ # allocate ftrace_regs and stack frame for ftrace_trace_function
+ aghi %r15,-STACK_FRAME_SIZE_FREGS
+ stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15)
+ xc STACK_FREGS_PTREGS_ORIG_GPR2(8,%r15),STACK_FREGS_PTREGS_ORIG_GPR2(%r15)
.if \allregs == 1
- stg %r14,(STACK_PTREGS_PSW)(%r15)
- mvghi STACK_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS
+ stg %r14,(STACK_FREGS_PTREGS_PSW)(%r15)
+ mvghi STACK_FREGS_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS
.else
- xc STACK_PTREGS_FLAGS(8,%r15),STACK_PTREGS_FLAGS(%r15)
+ xc STACK_FREGS_PTREGS_FLAGS(8,%r15),STACK_FREGS_PTREGS_FLAGS(%r15)
.endif
lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address
aghi %r1,-TRACED_FUNC_FRAME_SIZE
stg %r1,__SF_BACKCHAIN(%r15)
- stg %r0,(STACK_PTREGS_PSW+8)(%r15)
- stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15)
+ stg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15)
+ stmg %r2,%r14,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15)
.endm
SYM_CODE_START(ftrace_regs_caller)
@@ -96,30 +101,30 @@ SYM_CODE_START(ftrace_common)
lg %r1,0(%r1)
#endif
lgr %r3,%r14
- la %r5,STACK_PTREGS(%r15)
+ la %r5,STACK_FREGS(%r15)
BASR_EX %r14,%r1
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
# The j instruction gets runtime patched to a nop instruction.
# See ftrace_enable_ftrace_graph_caller.
SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL)
j .Lftrace_graph_caller_end
- lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15)
- lg %r4,(STACK_PTREGS_PSW+8)(%r15)
+ lmg %r2,%r3,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15)
+ lg %r4,(STACK_FREGS_PTREGS_PSW+8)(%r15)
brasl %r14,prepare_ftrace_return
- stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15)
+ stg %r2,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15)
.Lftrace_graph_caller_end:
#endif
- lg %r0,(STACK_PTREGS_PSW+8)(%r15)
+ lg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15)
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
- ltg %r1,STACK_PTREGS_ORIG_GPR2(%r15)
+ ltg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15)
locgrz %r1,%r0
#else
- lg %r1,STACK_PTREGS_ORIG_GPR2(%r15)
+ lg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15)
ltgr %r1,%r1
jnz 0f
lgr %r1,%r0
#endif
-0: lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15)
+0: lmg %r2,%r15,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15)
BR_EX %r1
SYM_CODE_END(ftrace_common)
@@ -128,10 +133,14 @@ SYM_CODE_END(ftrace_common)
SYM_FUNC_START(return_to_handler)
stmg %r2,%r5,32(%r15)
lgr %r1,%r15
- aghi %r15,-STACK_FRAME_OVERHEAD
+ aghi %r15,-(STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE)
stg %r1,__SF_BACKCHAIN(%r15)
+ la %r3,STACK_FRAME_OVERHEAD(%r15)
+ stg %r1,__FGRAPH_RET_FP(%r3)
+ stg %r2,__FGRAPH_RET_GPR2(%r3)
+ lgr %r2,%r3
brasl %r14,ftrace_return_to_handler
- aghi %r15,STACK_FRAME_OVERHEAD
+ aghi %r15,STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE
lgr %r14,%r2
lmg %r2,%r5,32(%r15)
BR_EX %r14
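return_to_handler now materializes a struct fgraph_ret_regs on the stack, using the __FGRAPH_RET_GPR2/__FGRAPH_RET_FP offsets generated by the asm-offsets.c hunk above, and passes its address to ftrace_return_to_handler() in %r2. Roughly equivalent C, illustrative only (the function name is hypothetical):

	unsigned long return_to_handler_sketch(unsigned long gpr2, unsigned long old_sp)
	{
		struct fgraph_ret_regs ret_regs = {
			.gpr2 = gpr2,	/* traced function's return value */
			.fp = old_sp,	/* caller's stack pointer */
		};

		/* Hands back the original return address to branch to. */
		return ftrace_return_to_handler(&ret_regs);
	}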
@@ -160,11 +169,11 @@ SYM_CODE_END(ftrace_shared_hotpatch_trampoline_exrl)
SYM_CODE_START(arch_rethook_trampoline)
stg %r14,(__SF_GPRS+8*8)(%r15)
- lay %r15,-STACK_FRAME_SIZE(%r15)
+ lay %r15,-STACK_FRAME_SIZE_PTREGS(%r15)
stmg %r0,%r14,STACK_PTREGS_GPRS(%r15)
# store original stack pointer in backchain and pt_regs
- lay %r7,STACK_FRAME_SIZE(%r15)
+ lay %r7,STACK_FRAME_SIZE_PTREGS(%r15)
stg %r7,__SF_BACKCHAIN(%r15)
stg %r7,STACK_PTREGS_GPRS+(15*8)(%r15)
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 00d76448319d..c744104e4a9c 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -146,6 +146,7 @@ static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31;
static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31;
int __bootdata(noexec_disabled);
+unsigned long __bootdata_preserved(max_mappable);
unsigned long __bootdata(ident_map_size);
struct physmem_info __bootdata(physmem_info);
@@ -874,7 +875,7 @@ static void __init log_component_list(void)
pr_info("Linux is running with Secure-IPL enabled\n");
else
pr_info("Linux is running with Secure-IPL disabled\n");
- ptr = (void *) early_ipl_comp_list_addr;
+ ptr = __va(early_ipl_comp_list_addr);
end = (void *) ptr + early_ipl_comp_list_size;
pr_info("The IPL report contains the following components:\n");
while (ptr < end) {
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index f9a2b755f510..a4edb7ea66ea 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -37,6 +37,7 @@
#include <linux/crash_dump.h>
#include <linux/kprobes.h>
#include <asm/asm-offsets.h>
+#include <asm/pfault.h>
#include <asm/diag.h>
#include <asm/switch_to.h>
#include <asm/facility.h>
@@ -252,8 +253,9 @@ static void pcpu_free_lowcore(struct pcpu *pcpu)
static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
{
- struct lowcore *lc = lowcore_ptr[cpu];
+ struct lowcore *lc, *abs_lc;
+ lc = lowcore_ptr[cpu];
cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask);
cpumask_set_cpu(cpu, mm_cpumask(&init_mm));
lc->cpu_nr = cpu;
@@ -266,7 +268,9 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
lc->machine_flags = S390_lowcore.machine_flags;
lc->user_timer = lc->system_timer =
lc->steal_timer = lc->avg_steal_timer = 0;
- __ctl_store(lc->cregs_save_area, 0, 15);
+ abs_lc = get_abs_lowcore();
+ memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area));
+ put_abs_lowcore(abs_lc);
lc->cregs_save_area[1] = lc->kernel_asce;
lc->cregs_save_area[7] = lc->user_asce;
save_access_regs((unsigned int *) lc->access_regs_save_area);
@@ -606,8 +610,8 @@ void smp_ctl_set_clear_bit(int cr, int bit, bool set)
ctlreg = (ctlreg & parms.andval) | parms.orval;
abs_lc->cregs_save_area[cr] = ctlreg;
put_abs_lowcore(abs_lc);
- spin_unlock(&ctl_lock);
on_each_cpu(smp_ctl_bit_callback, &parms, 1);
+ spin_unlock(&ctl_lock);
}
EXPORT_SYMBOL(smp_ctl_set_clear_bit);
@@ -927,12 +931,18 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
rc = pcpu_alloc_lowcore(pcpu, cpu);
if (rc)
return rc;
+ /*
+ * Make sure global control register contents do not change
+ * until the new CPU has initialized its control registers.
+ */
+ spin_lock(&ctl_lock);
pcpu_prepare_secondary(pcpu, cpu);
pcpu_attach_task(pcpu, tidle);
pcpu_start_fn(pcpu, smp_start_secondary, NULL);
/* Wait until cpu puts itself in the online & active maps */
while (!cpu_online(cpu))
cpu_relax();
+ spin_unlock(&ctl_lock);
return 0;
}
diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c
index 2ea7f208f0e7..30bb20461db4 100644
--- a/arch/s390/kernel/sthyi.c
+++ b/arch/s390/kernel/sthyi.c
@@ -317,7 +317,9 @@ static void fill_diag(struct sthyi_sctns *sctns)
if (pages <= 0)
return;
- diag204_buf = vmalloc(array_size(pages, PAGE_SIZE));
+ diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE),
+ PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
if (!diag204_buf)
return;
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index a6935af2235c..0122cc156952 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -454,3 +454,4 @@
449 common futex_waitv sys_futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2 sys_fchmodat2
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 66f0eb1c872b..b771f1b4cdd1 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -88,7 +88,7 @@ fail:
* Requests the Ultravisor to pin the page in the shared state. This will
* cause an intercept when the guest attempts to unshare the pinned page.
*/
-static int uv_pin_shared(unsigned long paddr)
+int uv_pin_shared(unsigned long paddr)
{
struct uv_cb_cfs uvcb = {
.header.cmd = UVC_CMD_PIN_PAGE_SHARED,
@@ -100,6 +100,7 @@ static int uv_pin_shared(unsigned long paddr)
return -EINVAL;
return 0;
}
+EXPORT_SYMBOL_GPL(uv_pin_shared);
/*
* Requests the Ultravisor to destroy a guest page and make it
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 0261d42c7d01..a7ea80cfa445 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -270,18 +270,6 @@ static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu)
return vcpu->arch.pv.handle;
}
-static inline bool kvm_s390_pv_is_protected(struct kvm *kvm)
-{
- lockdep_assert_held(&kvm->lock);
- return !!kvm_s390_pv_get_handle(kvm);
-}
-
-static inline bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
-{
- lockdep_assert_held(&vcpu->mutex);
- return !!kvm_s390_pv_cpu_get_handle(vcpu);
-}
-
/* implemented in interrupt.c */
int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index bf1fdc7bf89e..8d3f39a8a11e 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -18,6 +18,20 @@
#include <linux/mmu_notifier.h>
#include "kvm-s390.h"
+bool kvm_s390_pv_is_protected(struct kvm *kvm)
+{
+ lockdep_assert_held(&kvm->lock);
+ return !!kvm_s390_pv_get_handle(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_s390_pv_is_protected);
+
+bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
+{
+ lockdep_assert_held(&vcpu->mutex);
+ return !!kvm_s390_pv_cpu_get_handle(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected);
+
/**
* struct pv_vm_to_be_destroyed - Represents a protected VM that needs to
* be destroyed
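Moving kvm_s390_pv_is_protected() and kvm_s390_pv_cpu_is_protected() out of line and exporting them makes the checks usable outside the private kvm-s390 header, while the lockdep assertions preserve the locking contract. A module-side sketch under that contract (the caller is hypothetical):

	static bool sketch_vm_is_protected(struct kvm *kvm)
	{
		bool prot;

		mutex_lock(&kvm->lock);	/* required by lockdep_assert_held() */
		prot = kvm_s390_pv_is_protected(kvm);
		mutex_unlock(&kvm->lock);
		return prot;
	}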
diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S
index 5a9a55de2e10..08f60a42b9a6 100644
--- a/arch/s390/lib/mem.S
+++ b/arch/s390/lib/mem.S
@@ -5,8 +5,8 @@
* Copyright IBM Corp. 2012
*/
+#include <linux/export.h>
#include <linux/linkage.h>
-#include <asm/export.h>
#include <asm/nospec-insn.h>
GEN_BR_THUNK %r14
diff --git a/arch/s390/lib/tishift.S b/arch/s390/lib/tishift.S
index de33cf02cfd2..96214f51f49b 100644
--- a/arch/s390/lib/tishift.S
+++ b/arch/s390/lib/tishift.S
@@ -1,8 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/nospec-insn.h>
-#include <asm/export.h>
.section .noinstr.text, "ax"
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index d90db06a8af5..352ff520fd94 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
obj-$(CONFIG_PGSTE) += gmap.o
+obj-$(CONFIG_PFAULT) += pfault.o
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index 5300c6867d5e..f47515313226 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -90,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter,
} else
free_page((unsigned long) npa);
}
- diag10_range(virt_to_pfn(addr), 1);
+ diag10_range(virt_to_pfn((void *)addr), 1);
pa->pages[pa->index++] = addr;
(*counter)++;
spin_unlock(&cmm_lock);
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index ba5f80268878..afa5db750d92 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -297,7 +297,7 @@ static int pt_dump_init(void)
address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore;
address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE;
address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area;
- address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + PAGE_SIZE;
+ address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + MEMCPY_REAL_SIZE;
address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap;
address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size;
address_markers[VMALLOC_NR].start_address = VMALLOC_START;
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index 1bc42ce26599..e41869f5cc95 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -640,10 +640,13 @@ void segment_warning(int rc, char *seg_name)
pr_err("There is not enough memory to load or query "
"DCSS %s\n", seg_name);
break;
- case -ERANGE:
- pr_err("DCSS %s exceeds the kernel mapping range (%lu) "
- "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS);
+ case -ERANGE: {
+ struct range mhp_range = arch_get_mappable_range();
+
+ pr_err("DCSS %s exceeds the kernel mapping range (%llu) "
+ "and cannot be loaded\n", seg_name, mhp_range.end + 1);
break;
+ }
default:
break;
}
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 2f123429a291..b5e1bea9194c 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -43,8 +43,6 @@
#include "../kernel/entry.h"
#define __FAIL_ADDR_MASK -4096L
-#define __SUBCODE_MASK 0x0600
-#define __PF_RES_FIELD 0x8000000000000000ULL
/*
* Allocate private vm_fault_reason from top. Please make sure it won't
@@ -583,232 +581,6 @@ void do_dat_exception(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(do_dat_exception);
-#ifdef CONFIG_PFAULT
-/*
- * 'pfault' pseudo page faults routines.
- */
-static int pfault_disable;
-
-static int __init nopfault(char *str)
-{
- pfault_disable = 1;
- return 1;
-}
-
-__setup("nopfault", nopfault);
-
-struct pfault_refbk {
- u16 refdiagc;
- u16 reffcode;
- u16 refdwlen;
- u16 refversn;
- u64 refgaddr;
- u64 refselmk;
- u64 refcmpmk;
- u64 reserved;
-} __attribute__ ((packed, aligned(8)));
-
-static struct pfault_refbk pfault_init_refbk = {
- .refdiagc = 0x258,
- .reffcode = 0,
- .refdwlen = 5,
- .refversn = 2,
- .refgaddr = __LC_LPP,
- .refselmk = 1ULL << 48,
- .refcmpmk = 1ULL << 48,
- .reserved = __PF_RES_FIELD
-};
-
-int pfault_init(void)
-{
- int rc;
-
- if (pfault_disable)
- return -1;
- diag_stat_inc(DIAG_STAT_X258);
- asm volatile(
- " diag %1,%0,0x258\n"
- "0: j 2f\n"
- "1: la %0,8\n"
- "2:\n"
- EX_TABLE(0b,1b)
- : "=d" (rc)
- : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc");
- return rc;
-}
-
-static struct pfault_refbk pfault_fini_refbk = {
- .refdiagc = 0x258,
- .reffcode = 1,
- .refdwlen = 5,
- .refversn = 2,
-};
-
-void pfault_fini(void)
-{
-
- if (pfault_disable)
- return;
- diag_stat_inc(DIAG_STAT_X258);
- asm volatile(
- " diag %0,0,0x258\n"
- "0: nopr %%r7\n"
- EX_TABLE(0b,0b)
- : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc");
-}
-
-static DEFINE_SPINLOCK(pfault_lock);
-static LIST_HEAD(pfault_list);
-
-#define PF_COMPLETE 0x0080
-
-/*
- * The mechanism of our pfault code: if Linux is running as guest, runs a user
- * space process and the user space process accesses a page that the host has
- * paged out we get a pfault interrupt.
- *
- * This allows us, within the guest, to schedule a different process. Without
- * this mechanism the host would have to suspend the whole virtual cpu until
- * the page has been paged in.
- *
- * So when we get such an interrupt then we set the state of the current task
- * to uninterruptible and also set the need_resched flag. Both happens within
- * interrupt context(!). If we later on want to return to user space we
- * recognize the need_resched flag and then call schedule(). It's not very
- * obvious how this works...
- *
- * Of course we have a lot of additional fun with the completion interrupt (->
- * host signals that a page of a process has been paged in and the process can
- * continue to run). This interrupt can arrive on any cpu and, since we have
- * virtual cpus, actually appear before the interrupt that signals that a page
- * is missing.
- */
-static void pfault_interrupt(struct ext_code ext_code,
- unsigned int param32, unsigned long param64)
-{
- struct task_struct *tsk;
- __u16 subcode;
- pid_t pid;
-
- /*
- * Get the external interruption subcode & pfault initial/completion
- * signal bit. VM stores this in the 'cpu address' field associated
- * with the external interrupt.
- */
- subcode = ext_code.subcode;
- if ((subcode & 0xff00) != __SUBCODE_MASK)
- return;
- inc_irq_stat(IRQEXT_PFL);
- /* Get the token (= pid of the affected task). */
- pid = param64 & LPP_PID_MASK;
- rcu_read_lock();
- tsk = find_task_by_pid_ns(pid, &init_pid_ns);
- if (tsk)
- get_task_struct(tsk);
- rcu_read_unlock();
- if (!tsk)
- return;
- spin_lock(&pfault_lock);
- if (subcode & PF_COMPLETE) {
- /* signal bit is set -> a page has been swapped in by VM */
- if (tsk->thread.pfault_wait == 1) {
- /* Initial interrupt was faster than the completion
- * interrupt. pfault_wait is valid. Set pfault_wait
- * back to zero and wake up the process. This can
- * safely be done because the task is still sleeping
- * and can't produce new pfaults. */
- tsk->thread.pfault_wait = 0;
- list_del(&tsk->thread.list);
- wake_up_process(tsk);
- put_task_struct(tsk);
- } else {
- /* Completion interrupt was faster than initial
- * interrupt. Set pfault_wait to -1 so the initial
- * interrupt doesn't put the task to sleep.
- * If the task is not running, ignore the completion
- * interrupt since it must be a leftover of a PFAULT
- * CANCEL operation which didn't remove all pending
- * completion interrupts. */
- if (task_is_running(tsk))
- tsk->thread.pfault_wait = -1;
- }
- } else {
- /* signal bit not set -> a real page is missing. */
- if (WARN_ON_ONCE(tsk != current))
- goto out;
- if (tsk->thread.pfault_wait == 1) {
- /* Already on the list with a reference: put to sleep */
- goto block;
- } else if (tsk->thread.pfault_wait == -1) {
- /* Completion interrupt was faster than the initial
- * interrupt (pfault_wait == -1). Set pfault_wait
- * back to zero and exit. */
- tsk->thread.pfault_wait = 0;
- } else {
- /* Initial interrupt arrived before completion
- * interrupt. Let the task sleep.
- * An extra task reference is needed since a different
- * cpu may set the task state to TASK_RUNNING again
- * before the scheduler is reached. */
- get_task_struct(tsk);
- tsk->thread.pfault_wait = 1;
- list_add(&tsk->thread.list, &pfault_list);
-block:
- /* Since this must be a userspace fault, there
- * is no kernel task state to trample. Rely on the
- * return to userspace schedule() to block. */
- __set_current_state(TASK_UNINTERRUPTIBLE);
- set_tsk_need_resched(tsk);
- set_preempt_need_resched();
- }
- }
-out:
- spin_unlock(&pfault_lock);
- put_task_struct(tsk);
-}
-
-static int pfault_cpu_dead(unsigned int cpu)
-{
- struct thread_struct *thread, *next;
- struct task_struct *tsk;
-
- spin_lock_irq(&pfault_lock);
- list_for_each_entry_safe(thread, next, &pfault_list, list) {
- thread->pfault_wait = 0;
- list_del(&thread->list);
- tsk = container_of(thread, struct task_struct, thread);
- wake_up_process(tsk);
- put_task_struct(tsk);
- }
- spin_unlock_irq(&pfault_lock);
- return 0;
-}
-
-static int __init pfault_irq_init(void)
-{
- int rc;
-
- rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
- if (rc)
- goto out_extint;
- rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
- if (rc)
- goto out_pfault;
- irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
- cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
- NULL, pfault_cpu_dead);
- return 0;
-
-out_pfault:
- unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
-out_extint:
- pfault_disable = 1;
- return rc;
-}
-early_initcall(pfault_irq_init);
-
-#endif /* CONFIG_PFAULT */
-
#if IS_ENABLED(CONFIG_PGSTE)
void do_secure_storage_access(struct pt_regs *regs)
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 9c8af31be970..906a7bfc2a78 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2514,6 +2514,7 @@ static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
static const struct mm_walk_ops thp_split_walk_ops = {
.pmd_entry = thp_split_walk_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
};
static inline void thp_split_mm(struct mm_struct *mm)
@@ -2565,6 +2566,7 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
static const struct mm_walk_ops zap_zero_walk_ops = {
.pmd_entry = __zap_zero_pages,
+ .walk_lock = PGWALK_WRLOCK,
};
/*
@@ -2655,6 +2657,7 @@ static const struct mm_walk_ops enable_skey_walk_ops = {
.hugetlb_entry = __s390_enable_skey_hugetlb,
.pte_entry = __s390_enable_skey_pte,
.pmd_entry = __s390_enable_skey_pmd,
+ .walk_lock = PGWALK_WRLOCK,
};
int s390_enable_skey(void)
@@ -2692,6 +2695,7 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
static const struct mm_walk_ops reset_cmma_walk_ops = {
.pte_entry = __s390_reset_cmma,
+ .walk_lock = PGWALK_WRLOCK,
};
void s390_reset_cmma(struct mm_struct *mm)
@@ -2728,6 +2732,7 @@ static int s390_gather_pages(pte_t *ptep, unsigned long addr,
static const struct mm_walk_ops gather_pages_ops = {
.pte_entry = s390_gather_pages,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
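
The five gmap.c hunks above all do the same thing: they add the new walk_lock member to each mm_walk_ops instance, telling the pagewalk core which mmap-lock discipline the walker expects (read, write, or write-verify) instead of leaving that implicit at the call sites. Below is a minimal sketch of a read-only walker under the same API, assuming the upstream pagewalk.h definitions of struct mm_walk_ops, PGWALK_RDLOCK and walk->private; the walker name and counter are invented for illustration:

/* Hedged sketch: a walker that only reads page tables declares
 * PGWALK_RDLOCK, so the pagewalk core can take/assert the mmap
 * read lock on its behalf.
 */
#include <linux/pagewalk.h>

static int count_present_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

static const struct mm_walk_ops count_present_ops = {
	.pte_entry = count_present_pte,
	.walk_lock = PGWALK_RDLOCK,	/* no page tables are modified */
};

If memory serves, the write-side variants differ in that PGWALK_WRLOCK takes the write lock while PGWALK_WRLOCK_VERIFY only asserts that the caller already holds it, which matches the split between the gmap walkers above that modify page tables and s390_gather_pages(), which only reads them.
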
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index cbe1df1e9c18..c805b3e2592b 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -86,11 +86,12 @@ size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count)
void *chunk;
pte_t pte;
+ BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE);
while (count) {
- phys = src & PAGE_MASK;
- offset = src & ~PAGE_MASK;
+ phys = src & MEMCPY_REAL_MASK;
+ offset = src & ~MEMCPY_REAL_MASK;
chunk = (void *)(__memcpy_real_area + offset);
- len = min(count, PAGE_SIZE - offset);
+ len = min(count, MEMCPY_REAL_SIZE - offset);
pte = mk_pte_phys(phys, PAGE_KERNEL_RO);
mutex_lock(&memcpy_real_mutex);
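
The memcpy_real_iter() hunk above swaps the page-sized chunking constants for dedicated MEMCPY_REAL_SIZE/MEMCPY_REAL_MASK macros (with a BUILD_BUG_ON pinning them to PAGE_SIZE for now), so the mapping-window geometry is named rather than implied. The per-chunk arithmetic itself is unchanged; here is a standalone model of it, with REAL_SIZE/REAL_MASK as invented userspace stand-ins for the two macros:

/* Hedged sketch of the chunking arithmetic, with userspace stand-ins
 * for MEMCPY_REAL_SIZE/MEMCPY_REAL_MASK (here one 4 KiB page).
 */
#include <stdio.h>

#define REAL_SIZE 4096UL
#define REAL_MASK (~(REAL_SIZE - 1))

int main(void)
{
	unsigned long src = 0x12345678, count = 10000, len;

	while (count) {
		unsigned long phys = src & REAL_MASK;	/* window to map */
		unsigned long offset = src & ~REAL_MASK;	/* intra-window */

		len = count < REAL_SIZE - offset ? count : REAL_SIZE - offset;
		printf("map %#lx, copy %lu bytes at offset %lu\n",
		       phys, len, offset);
		src += len;
		count -= len;
	}
	return 0;
}
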
diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c
new file mode 100644
index 000000000000..1aac13bb8f53
--- /dev/null
+++ b/arch/s390/mm/pfault.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 1999, 2023
+ */
+
+#include <linux/cpuhotplug.h>
+#include <linux/sched/task.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <asm/asm-extable.h>
+#include <asm/pfault.h>
+#include <asm/diag.h>
+
+#define __SUBCODE_MASK 0x0600
+#define __PF_RES_FIELD 0x8000000000000000UL
+
+/*
+ * 'pfault' pseudo page fault routines.
+ */
+static int pfault_disable;
+
+static int __init nopfault(char *str)
+{
+ pfault_disable = 1;
+ return 1;
+}
+early_param("nopfault", nopfault);
+
+struct pfault_refbk {
+ u16 refdiagc;
+ u16 reffcode;
+ u16 refdwlen;
+ u16 refversn;
+ u64 refgaddr;
+ u64 refselmk;
+ u64 refcmpmk;
+ u64 reserved;
+};
+
+static struct pfault_refbk pfault_init_refbk = {
+ .refdiagc = 0x258,
+ .reffcode = 0,
+ .refdwlen = 5,
+ .refversn = 2,
+ .refgaddr = __LC_LPP,
+ .refselmk = 1UL << 48,
+ .refcmpmk = 1UL << 48,
+ .reserved = __PF_RES_FIELD
+};
+
+int __pfault_init(void)
+{
+ int rc = -EOPNOTSUPP;
+
+ if (pfault_disable)
+ return rc;
+ diag_stat_inc(DIAG_STAT_X258);
+ asm volatile(
+ " diag %[refbk],%[rc],0x258\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b, 0b)
+ : [rc] "+d" (rc)
+ : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk)
+ : "cc");
+ return rc;
+}
+
+static struct pfault_refbk pfault_fini_refbk = {
+ .refdiagc = 0x258,
+ .reffcode = 1,
+ .refdwlen = 5,
+ .refversn = 2,
+};
+
+void __pfault_fini(void)
+{
+ if (pfault_disable)
+ return;
+ diag_stat_inc(DIAG_STAT_X258);
+ asm volatile(
+ " diag %[refbk],0,0x258\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b, 0b)
+ :
+ : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk)
+ : "cc");
+}
+
+static DEFINE_SPINLOCK(pfault_lock);
+static LIST_HEAD(pfault_list);
+
+#define PF_COMPLETE 0x0080
+
+/*
+ * The mechanism of our pfault code: if Linux is running as a guest, runs a
+ * user space process, and that process accesses a page that the host has
+ * paged out, we get a pfault interrupt.
+ *
+ * This allows us, within the guest, to schedule a different process. Without
+ * this mechanism the host would have to suspend the whole virtual cpu until
+ * the page has been paged in.
+ *
+ * So when we get such an interrupt we set the state of the current task
+ * to uninterruptible and also set the need_resched flag. Both happen within
+ * interrupt context(!). When we later return to user space we recognize the
+ * need_resched flag and call schedule(). It's not very obvious how this
+ * works...
+ *
+ * Of course we have a lot of additional fun with the completion interrupt (->
+ * host signals that a page of a process has been paged in and the process can
+ * continue to run). This interrupt can arrive on any cpu and, since we have
+ * virtual cpus, may actually appear before the interrupt that signals that a
+ * page is missing.
+ */
+static void pfault_interrupt(struct ext_code ext_code,
+ unsigned int param32, unsigned long param64)
+{
+ struct task_struct *tsk;
+ __u16 subcode;
+ pid_t pid;
+
+ /*
+ * Get the external interruption subcode & pfault initial/completion
+ * signal bit. VM stores this in the 'cpu address' field associated
+ * with the external interrupt.
+ */
+ subcode = ext_code.subcode;
+ if ((subcode & 0xff00) != __SUBCODE_MASK)
+ return;
+ inc_irq_stat(IRQEXT_PFL);
+ /* Get the token (= pid of the affected task). */
+ pid = param64 & LPP_PID_MASK;
+ rcu_read_lock();
+ tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (tsk)
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ if (!tsk)
+ return;
+ spin_lock(&pfault_lock);
+ if (subcode & PF_COMPLETE) {
+ /* signal bit is set -> a page has been swapped in by VM */
+ if (tsk->thread.pfault_wait == 1) {
+ /*
+ * Initial interrupt was faster than the completion
+ * interrupt. pfault_wait is valid. Set pfault_wait
+ * back to zero and wake up the process. This can
+ * safely be done because the task is still sleeping
+ * and can't produce new pfaults.
+ */
+ tsk->thread.pfault_wait = 0;
+ list_del(&tsk->thread.list);
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ } else {
+ /*
+ * Completion interrupt was faster than initial
+ * interrupt. Set pfault_wait to -1 so the initial
+ * interrupt doesn't put the task to sleep.
+ * If the task is not running, ignore the completion
+ * interrupt since it must be a leftover of a PFAULT
+ * CANCEL operation which didn't remove all pending
+ * completion interrupts.
+ */
+ if (task_is_running(tsk))
+ tsk->thread.pfault_wait = -1;
+ }
+ } else {
+ /* signal bit not set -> a real page is missing. */
+ if (WARN_ON_ONCE(tsk != current))
+ goto out;
+ if (tsk->thread.pfault_wait == 1) {
+ /* Already on the list with a reference: put to sleep */
+ goto block;
+ } else if (tsk->thread.pfault_wait == -1) {
+ /*
+ * Completion interrupt was faster than the initial
+ * interrupt (pfault_wait == -1). Set pfault_wait
+ * back to zero and exit.
+ */
+ tsk->thread.pfault_wait = 0;
+ } else {
+ /*
+ * Initial interrupt arrived before completion
+ * interrupt. Let the task sleep.
+ * An extra task reference is needed since a different
+ * cpu may set the task state to TASK_RUNNING again
+ * before the scheduler is reached.
+ */
+ get_task_struct(tsk);
+ tsk->thread.pfault_wait = 1;
+ list_add(&tsk->thread.list, &pfault_list);
+block:
+ /*
+ * Since this must be a userspace fault, there
+ * is no kernel task state to trample. Rely on the
+ * return to userspace schedule() to block.
+ */
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ set_tsk_need_resched(tsk);
+ set_preempt_need_resched();
+ }
+ }
+out:
+ spin_unlock(&pfault_lock);
+ put_task_struct(tsk);
+}
+
+static int pfault_cpu_dead(unsigned int cpu)
+{
+ struct thread_struct *thread, *next;
+ struct task_struct *tsk;
+
+ spin_lock_irq(&pfault_lock);
+ list_for_each_entry_safe(thread, next, &pfault_list, list) {
+ thread->pfault_wait = 0;
+ list_del(&thread->list);
+ tsk = container_of(thread, struct task_struct, thread);
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ }
+ spin_unlock_irq(&pfault_lock);
+ return 0;
+}
+
+static int __init pfault_irq_init(void)
+{
+ int rc;
+
+ rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
+ if (rc)
+ goto out_extint;
+ rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
+ if (rc)
+ goto out_pfault;
+ irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
+ cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
+ NULL, pfault_cpu_dead);
+ return 0;
+
+out_pfault:
+ unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
+out_extint:
+ pfault_disable = 1;
+ return rc;
+}
+early_initcall(pfault_irq_init);
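
The handler in this new file is easiest to read as a three-state machine on thread.pfault_wait: 0 (idle), 1 (on pfault_list, waiting for the completion interrupt) and -1 (the completion interrupt overtook the initial one). A standalone sketch of just the transitions, with the locking, refcounting and scheduler calls stripped out (initial_irq()/completion_irq() are invented names):

/* Hedged sketch: the pfault_wait transitions, detached from the kernel. */
#include <stdio.h>

static int pfault_wait; /* 0: idle, 1: sleeping, -1: completion came first */

static void initial_irq(void)	/* a real page is missing */
{
	if (pfault_wait == -1)
		pfault_wait = 0;	/* completion already seen: don't sleep */
	else
		pfault_wait = 1;	/* sleep until the completion arrives */
}

static void completion_irq(void) /* host paged the page back in */
{
	if (pfault_wait == 1)
		pfault_wait = 0;	/* wake the sleeper */
	else
		pfault_wait = -1;	/* completion won the race */
}

int main(void)
{
	completion_irq();	/* out-of-order delivery ... */
	initial_irq();		/* ... is absorbed: */
	printf("pfault_wait = %d\n", pfault_wait);	/* 0 */
	return 0;
}
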
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 24a66670f5c3..e44243b9c0a4 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -36,7 +36,7 @@ static void vmem_free_pages(unsigned long addr, int order)
{
/* We don't expect boot memory to be removed ever. */
if (!slab_is_available() ||
- WARN_ON_ONCE(PageReserved(virt_to_page(addr))))
+ WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr))))
return;
free_pages(addr, order);
}
@@ -531,7 +531,7 @@ struct range arch_get_mappable_range(void)
struct range mhp_range;
mhp_range.start = 0;
- mhp_range.end = VMEM_MAX_PHYS - 1;
+ mhp_range.end = max_mappable - 1;
return mhp_range;
}
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index ee367798e388..ee90a91ed888 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -666,9 +666,4 @@ static struct miscdevice clp_misc_device = {
.fops = &clp_misc_fops,
};
-static int __init clp_misc_init(void)
-{
- return misc_register(&clp_misc_device);
-}
-
-device_initcall(clp_misc_init);
+builtin_misc_device(clp_misc_device);
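
The pci_clp.c hunk replaces the hand-rolled initcall with the builtin_misc_device() helper. If memory serves, that macro is defined in include/linux/miscdevice.h roughly as below, expanding via builtin_driver() into exactly the static init function plus device_initcall() pair that the hunk deletes; treat the expansion as an assumption and verify it against your tree:

/* Assumed definition, modeled on include/linux/miscdevice.h: */
#define builtin_misc_device(__misc_device) \
	builtin_driver(__misc_device, misc_register)
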
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 97377e8c5025..e90d585c4d3e 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -454,3 +454,4 @@
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index faa835f3c54a..4ed06c71c43f 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -497,3 +497,4 @@
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 5026e7b9adfe..ff4bda95b9c7 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -554,7 +554,7 @@ struct mconsole_output {
static DEFINE_SPINLOCK(client_lock);
static LIST_HEAD(clients);
-static char console_buf[MCONSOLE_MAX_DATA];
+static char console_buf[MCONSOLE_MAX_DATA] __nonstring;
static void console_write(struct console *console, const char *string,
unsigned int len)
@@ -567,7 +567,7 @@ static void console_write(struct console *console, const char *string,
while (len > 0) {
n = min((size_t) len, ARRAY_SIZE(console_buf));
- strncpy(console_buf, string, n);
+ memcpy(console_buf, string, n);
string += n;
len -= n;
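
console_buf is a bounded staging buffer whose contents are never treated as a C string (hence the new __nonstring annotation), so strncpy()'s NUL-termination and padding semantics added nothing and only invited fortified-string warnings; memcpy() of exactly n bytes states the real intent. A minimal userspace illustration of the pattern, with the length tracked out of band:

/* Hedged sketch: copying into a fixed, non-NUL-terminated buffer. */
#include <string.h>
#include <stdio.h>

int main(void)
{
	char buf[8];
	const char *msg = "console message";
	size_t n = strlen(msg) < sizeof(buf) ? strlen(msg) : sizeof(buf);

	memcpy(buf, msg, n);		/* exactly n bytes, no NUL appended */
	fwrite(buf, 1, n, stdout);	/* length is carried separately */
	putchar('\n');
	return 0;
}
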
diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c
index c650e428432b..c719e1ec4645 100644
--- a/arch/um/drivers/vector_user.c
+++ b/arch/um/drivers/vector_user.c
@@ -141,7 +141,7 @@ static int create_tap_fd(char *iface)
}
memset(&ifr, 0, sizeof(ifr));
ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
- strncpy((char *)&ifr.ifr_name, iface, sizeof(ifr.ifr_name) - 1);
+ strscpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
err = ioctl(fd, TUNSETIFF, (void *) &ifr);
if (err != 0) {
@@ -171,7 +171,7 @@ static int create_raw_fd(char *iface, int flags, int proto)
goto raw_fd_cleanup;
}
memset(&ifr, 0, sizeof(ifr));
- strncpy((char *)&ifr.ifr_name, iface, sizeof(ifr.ifr_name) - 1);
+ strscpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
if (ioctl(fd, SIOCGIFINDEX, (void *) &ifr) < 0) {
err = -errno;
goto raw_fd_cleanup;
diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h
index 0347a190429c..981e11d8e025 100644
--- a/arch/um/include/shared/user.h
+++ b/arch/um/include/shared/user.h
@@ -50,7 +50,6 @@ static inline int printk(const char *fmt, ...)
#endif
extern int in_aton(char *str);
-extern size_t strlcpy(char *, const char *, size_t);
extern size_t strlcat(char *, const char *, size_t);
extern size_t strscpy(char *, const char *, size_t);
diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c
index 7a1abb829930..288c422bfa96 100644
--- a/arch/um/os-Linux/umid.c
+++ b/arch/um/os-Linux/umid.c
@@ -40,7 +40,7 @@ static int __init make_uml_dir(void)
__func__);
goto err;
}
- strlcpy(dir, home, sizeof(dir));
+ strscpy(dir, home, sizeof(dir));
uml_dir++;
}
strlcat(dir, uml_dir, sizeof(dir));
@@ -243,7 +243,7 @@ int __init set_umid(char *name)
if (strlen(name) > UMID_LEN - 1)
return -E2BIG;
- strlcpy(umid, name, sizeof(umid));
+ strscpy(umid, name, sizeof(umid));
return 0;
}
@@ -262,7 +262,7 @@ static int __init make_umid(void)
make_uml_dir();
if (*umid == '\0') {
- strlcpy(tmp, uml_dir, sizeof(tmp));
+ strscpy(tmp, uml_dir, sizeof(tmp));
strlcat(tmp, "XXXXXX", sizeof(tmp));
fd = mkstemp(tmp);
if (fd < 0) {
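
The UML hunks above retire strlcpy() in favour of strscpy(). The practical differences: strscpy() returns the number of characters copied, or -E2BIG on truncation, and never reads the source beyond what fits, whereas strlcpy() returns strlen(src) and therefore always walks the whole source string. A userspace model of the strscpy() contract (my_strscpy() is an invented stand-in, not the kernel implementation):

/* Hedged userspace model of the kernel's strscpy() return contract. */
#include <stdio.h>

#define E2BIG 7

static long my_strscpy(char *dst, const char *src, unsigned long size)
{
	unsigned long i;

	if (!size)
		return -E2BIG;
	for (i = 0; i < size - 1 && src[i]; i++)
		dst[i] = src[i];
	dst[i] = '\0';
	return src[i] ? -E2BIG : (long)i;	/* -E2BIG signals truncation */
}

int main(void)
{
	char name[8];

	printf("%ld\n", my_strscpy(name, "eth0", sizeof(name)));	/* 4 */
	printf("%ld\n", my_strscpy(name, "verylongname", sizeof(name))); /* -7 */
	return 0;
}
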
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e36261b4ea14..8d9e4b362572 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1308,44 +1308,8 @@ config X86_REBOOTFIXUPS
Say N otherwise.
config MICROCODE
- bool "CPU microcode loading support"
- default y
+ def_bool y
depends on CPU_SUP_AMD || CPU_SUP_INTEL
- help
- If you say Y here, you will be able to update the microcode on
- Intel and AMD processors. The Intel support is for the IA32 family,
- e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The
- AMD support is for families 0x10 and later. You will obviously need
- the actual microcode binary data itself which is not shipped with
- the Linux kernel.
-
- The preferred method to load microcode from a detached initrd is described
- in Documentation/arch/x86/microcode.rst. For that you need to enable
- CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the
- initrd for microcode blobs.
-
- In addition, you can build the microcode into the kernel. For that you
- need to add the vendor-supplied microcode to the CONFIG_EXTRA_FIRMWARE
- config option.
-
-config MICROCODE_INTEL
- bool "Intel microcode loading support"
- depends on CPU_SUP_INTEL && MICROCODE
- default MICROCODE
- help
- This options enables microcode patch loading support for Intel
- processors.
-
- For the current Intel microcode data package go to
- <https://downloadcenter.intel.com> and search for
- 'Linux Processor Microcode Data File'.
-
-config MICROCODE_AMD
- bool "AMD microcode loading support"
- depends on CPU_SUP_AMD && MICROCODE
- help
- If you select this option, microcode patch loading support for AMD
- processors will be enabled.
config MICROCODE_LATE_LOADING
bool "Late microcode loading (DANGEROUS)"
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 40d2ff503079..71fc531b95b4 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -74,6 +74,11 @@ LDFLAGS_vmlinux += -z noexecstack
ifeq ($(CONFIG_LD_IS_BFD),y)
LDFLAGS_vmlinux += $(call ld-option,--no-warn-rwx-segments)
endif
+ifeq ($(CONFIG_EFI_STUB),y)
+# ensure that the static EFI stub library will be pulled in, even if it is
+# never referenced explicitly from the startup code
+LDFLAGS_vmlinux += -u efi_pe_entry
+endif
LDFLAGS_vmlinux += -T
hostprogs := mkpiggy
diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S
index 4ca70bf93dc0..f4e22ef774ab 100644
--- a/arch/x86/boot/compressed/efi_mixed.S
+++ b/arch/x86/boot/compressed/efi_mixed.S
@@ -26,8 +26,8 @@
* When booting in 64-bit mode on 32-bit EFI firmware, startup_64_mixed_mode()
* is the first thing that runs after switching to long mode. Depending on
* whether the EFI handover protocol or the compat entry point was used to
- * enter the kernel, it will either branch to the 64-bit EFI handover
- * entrypoint at offset 0x390 in the image, or to the 64-bit EFI PE/COFF
+ * enter the kernel, it will either branch to the common 64-bit EFI stub
+ * entrypoint efi_stub_entry() directly, or via the 64-bit EFI PE/COFF
* entrypoint efi_pe_entry(). In the former case, the bootloader must provide a
* struct bootparams pointer as the third argument, so the presence of such a
* pointer is used to disambiguate.
@@ -37,21 +37,23 @@
* | efi32_pe_entry |---->| | | +-----------+--+
* +------------------+ | | +------+----------------+ |
* | startup_32 |---->| startup_64_mixed_mode | |
- * +------------------+ | | +------+----------------+ V
- * | efi32_stub_entry |---->| | | +------------------+
- * +------------------+ +------------+ +---->| efi64_stub_entry |
- * +-------------+----+
- * +------------+ +----------+ |
- * | startup_64 |<----| efi_main |<--------------+
- * +------------+ +----------+
+ * +------------------+ | | +------+----------------+ |
+ * | efi32_stub_entry |---->| | | |
+ * +------------------+ +------------+ | |
+ * V |
+ * +------------+ +----------------+ |
+ * | startup_64 |<----| efi_stub_entry |<--------+
+ * +------------+ +----------------+
*/
SYM_FUNC_START(startup_64_mixed_mode)
lea efi32_boot_args(%rip), %rdx
mov 0(%rdx), %edi
mov 4(%rdx), %esi
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
mov 8(%rdx), %edx // saved bootparams pointer
test %edx, %edx
- jnz efi64_stub_entry
+ jnz efi_stub_entry
+#endif
/*
* efi_pe_entry uses MS calling convention, which requires 32 bytes of
* shadow space on the stack even if all arguments are passed in
@@ -138,6 +140,28 @@ SYM_FUNC_START(__efi64_thunk)
SYM_FUNC_END(__efi64_thunk)
.code32
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+SYM_FUNC_START(efi32_stub_entry)
+ call 1f
+1: popl %ecx
+
+ /* Clear BSS */
+ xorl %eax, %eax
+ leal (_bss - 1b)(%ecx), %edi
+ leal (_ebss - 1b)(%ecx), %ecx
+ subl %edi, %ecx
+ shrl $2, %ecx
+ cld
+ rep stosl
+
+ add $0x4, %esp /* Discard return address */
+ popl %ecx
+ popl %edx
+ popl %esi
+ jmp efi32_entry
+SYM_FUNC_END(efi32_stub_entry)
+#endif
+
/*
* EFI service pointer must be in %edi.
*
@@ -218,7 +242,7 @@ SYM_FUNC_END(efi_enter32)
* stub may still exit and return to the firmware using the Exit() EFI boot
* service.]
*/
-SYM_FUNC_START(efi32_entry)
+SYM_FUNC_START_LOCAL(efi32_entry)
call 1f
1: pop %ebx
@@ -245,10 +269,6 @@ SYM_FUNC_START(efi32_entry)
jmp startup_32
SYM_FUNC_END(efi32_entry)
-#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime)
-#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol)
-#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base)
-
/*
* efi_status_t efi32_pe_entry(efi_handle_t image_handle,
* efi_system_table_32_t *sys_table)
@@ -256,8 +276,6 @@ SYM_FUNC_END(efi32_entry)
SYM_FUNC_START(efi32_pe_entry)
pushl %ebp
movl %esp, %ebp
- pushl %eax // dummy push to allocate loaded_image
-
pushl %ebx // save callee-save registers
pushl %edi
@@ -266,48 +284,8 @@ SYM_FUNC_START(efi32_pe_entry)
movl $0x80000003, %eax // EFI_UNSUPPORTED
jnz 2f
- call 1f
-1: pop %ebx
-
- /* Get the loaded image protocol pointer from the image handle */
- leal -4(%ebp), %eax
- pushl %eax // &loaded_image
- leal (loaded_image_proto - 1b)(%ebx), %eax
- pushl %eax // pass the GUID address
- pushl 8(%ebp) // pass the image handle
-
- /*
- * Note the alignment of the stack frame.
- * sys_table
- * handle <-- 16-byte aligned on entry by ABI
- * return address
- * frame pointer
- * loaded_image <-- local variable
- * saved %ebx <-- 16-byte aligned here
- * saved %edi
- * &loaded_image
- * &loaded_image_proto
- * handle <-- 16-byte aligned for call to handle_protocol
- */
-
- movl 12(%ebp), %eax // sys_table
- movl ST32_boottime(%eax), %eax // sys_table->boottime
- call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol
- addl $12, %esp // restore argument space
- testl %eax, %eax
- jnz 2f
-
movl 8(%ebp), %ecx // image_handle
movl 12(%ebp), %edx // sys_table
- movl -4(%ebp), %esi // loaded_image
- movl LI32_image_base(%esi), %esi // loaded_image->image_base
- leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32
- /*
- * We need to set the image_offset variable here since startup_32() will
- * use it before we get to the 64-bit efi_pe_entry() in C code.
- */
- subl %esi, %ebp // calculate image_offset
- movl %ebp, (image_offset - 1b)(%ebx) // save image_offset
xorl %esi, %esi
jmp efi32_entry // pass %ecx, %edx, %esi
// no other registers remain live
@@ -318,14 +296,13 @@ SYM_FUNC_START(efi32_pe_entry)
RET
SYM_FUNC_END(efi32_pe_entry)
- .section ".rodata"
- /* EFI loaded image protocol GUID */
- .balign 4
-SYM_DATA_START_LOCAL(loaded_image_proto)
- .long 0x5b1b31a1
- .word 0x9562, 0x11d2
- .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b
-SYM_DATA_END(loaded_image_proto)
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+ .org efi32_stub_entry + 0x200
+ .code64
+SYM_FUNC_START_NOALIGN(efi64_stub_entry)
+ jmp efi_handover_entry
+SYM_FUNC_END(efi64_stub_entry)
+#endif
.data
.balign 8
diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c
index 5313c5cb2b80..19a8251de506 100644
--- a/arch/x86/boot/compressed/error.c
+++ b/arch/x86/boot/compressed/error.c
@@ -7,7 +7,7 @@
#include "misc.h"
#include "error.h"
-void warn(char *m)
+void warn(const char *m)
{
error_putstr("\n\n");
error_putstr(m);
diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h
index 86fe33b93715..31f9e080d61a 100644
--- a/arch/x86/boot/compressed/error.h
+++ b/arch/x86/boot/compressed/error.h
@@ -4,7 +4,7 @@
#include <linux/compiler.h>
-void warn(char *m);
+void warn(const char *m);
void error(char *m) __noreturn;
void panic(const char *fmt, ...) __noreturn __cold;
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 987ae727cf9f..1cfe9802a42f 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -84,19 +84,6 @@ SYM_FUNC_START(startup_32)
#ifdef CONFIG_RELOCATABLE
leal startup_32@GOTOFF(%edx), %ebx
-
-#ifdef CONFIG_EFI_STUB
-/*
- * If we were loaded via the EFI LoadImage service, startup_32() will be at an
- * offset to the start of the space allocated for the image. efi_pe_entry() will
- * set up image_offset to tell us where the image actually starts, so that we
- * can use the full available buffer.
- * image_offset = startup_32 - image_base
- * Otherwise image_offset will be zero and has no effect on the calculations.
- */
- subl image_offset@GOTOFF(%edx), %ebx
-#endif
-
movl BP_kernel_alignment(%esi), %eax
decl %eax
addl %eax, %ebx
@@ -150,17 +137,6 @@ SYM_FUNC_START(startup_32)
jmp *%eax
SYM_FUNC_END(startup_32)
-#ifdef CONFIG_EFI_STUB
-SYM_FUNC_START(efi32_stub_entry)
- add $0x4, %esp
- movl 8(%esp), %esi /* save boot_params pointer */
- call efi_main
- /* efi_main returns the possibly relocated address of startup_32 */
- jmp *%eax
-SYM_FUNC_END(efi32_stub_entry)
-SYM_FUNC_ALIAS(efi_stub_entry, efi32_stub_entry)
-#endif
-
.text
SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
@@ -179,13 +155,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
*/
/* push arguments for extract_kernel: */
- pushl output_len@GOTOFF(%ebx) /* decompressed length, end of relocs */
pushl %ebp /* output address */
- pushl input_len@GOTOFF(%ebx) /* input_len */
- leal input_data@GOTOFF(%ebx), %eax
- pushl %eax /* input_data */
- leal boot_heap@GOTOFF(%ebx), %eax
- pushl %eax /* heap area */
pushl %esi /* real mode pointer */
call extract_kernel /* returns kernel entry point in %eax */
addl $24, %esp
@@ -213,8 +183,6 @@ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
*/
.bss
.balign 4
-boot_heap:
- .fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 03c4328a88cb..bf4a10a5794f 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -146,19 +146,6 @@ SYM_FUNC_START(startup_32)
#ifdef CONFIG_RELOCATABLE
movl %ebp, %ebx
-
-#ifdef CONFIG_EFI_STUB
-/*
- * If we were loaded via the EFI LoadImage service, startup_32 will be at an
- * offset to the start of the space allocated for the image. efi_pe_entry will
- * set up image_offset to tell us where the image actually starts, so that we
- * can use the full available buffer.
- * image_offset = startup_32 - image_base
- * Otherwise image_offset will be zero and has no effect on the calculations.
- */
- subl rva(image_offset)(%ebp), %ebx
-#endif
-
movl BP_kernel_alignment(%esi), %eax
decl %eax
addl %eax, %ebx
@@ -294,17 +281,6 @@ SYM_FUNC_START(startup_32)
lret
SYM_FUNC_END(startup_32)
-#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL)
- .org 0x190
-SYM_FUNC_START(efi32_stub_entry)
- add $0x4, %esp /* Discard return address */
- popl %ecx
- popl %edx
- popl %esi
- jmp efi32_entry
-SYM_FUNC_END(efi32_stub_entry)
-#endif
-
.code64
.org 0x200
SYM_CODE_START(startup_64)
@@ -346,20 +322,6 @@ SYM_CODE_START(startup_64)
/* Start with the delta to where the kernel will run at. */
#ifdef CONFIG_RELOCATABLE
leaq startup_32(%rip) /* - $startup_32 */, %rbp
-
-#ifdef CONFIG_EFI_STUB
-/*
- * If we were loaded via the EFI LoadImage service, startup_32 will be at an
- * offset to the start of the space allocated for the image. efi_pe_entry will
- * set up image_offset to tell us where the image actually starts, so that we
- * can use the full available buffer.
- * image_offset = startup_32 - image_base
- * Otherwise image_offset will be zero and has no effect on the calculations.
- */
- movl image_offset(%rip), %eax
- subq %rax, %rbp
-#endif
-
movl BP_kernel_alignment(%rsi), %eax
decl %eax
addq %rax, %rbp
@@ -398,10 +360,6 @@ SYM_CODE_START(startup_64)
* For the trampoline, we need the top page table to reside in lower
* memory as we don't have a way to load 64-bit values into CR3 in
* 32-bit mode.
- *
- * We go though the trampoline even if we don't have to: if we're
- * already in a desired paging mode. This way the trampoline code gets
- * tested on every boot.
*/
/* Make sure we have GDT with 32-bit code segment */
@@ -416,10 +374,14 @@ SYM_CODE_START(startup_64)
lretq
.Lon_kernel_cs:
+ /*
+ * RSI holds a pointer to a boot_params structure provided by the
+ * loader, and this needs to be preserved across C function calls. So
+ * move it into a callee saved register.
+ */
+ movq %rsi, %r15
- pushq %rsi
call load_stage1_idt
- popq %rsi
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
@@ -430,63 +392,24 @@ SYM_CODE_START(startup_64)
* CPUID instructions being issued, so go ahead and do that now via
* sev_enable(), which will also handle the rest of the SEV-related
* detection/setup to ensure that has been done in advance of any dependent
- * code.
+ * code. Pass the boot_params pointer as the first argument.
*/
- pushq %rsi
- movq %rsi, %rdi /* real mode address */
+ movq %r15, %rdi
call sev_enable
- popq %rsi
#endif
/*
- * paging_prepare() sets up the trampoline and checks if we need to
- * enable 5-level paging.
+ * configure_5level_paging() updates the number of paging levels using
+ * a trampoline in 32-bit addressable memory if the current number does
+ * not match the desired number.
*
- * paging_prepare() returns a two-quadword structure which lands
- * into RDX:RAX:
- * - Address of the trampoline is returned in RAX.
- * - Non zero RDX means trampoline needs to enable 5-level
- * paging.
- *
- * RSI holds real mode data and needs to be preserved across
- * this function call.
- */
- pushq %rsi
- movq %rsi, %rdi /* real mode address */
- call paging_prepare
- popq %rsi
-
- /* Save the trampoline address in RCX */
- movq %rax, %rcx
-
- /*
- * Load the address of trampoline_return() into RDI.
- * It will be used by the trampoline to return to the main code.
+ * Pass the boot_params pointer as the first argument. The second
+ * argument is the relocated address of the page table to use instead
+ * of the page table in trampoline memory (if required).
*/
- leaq trampoline_return(%rip), %rdi
-
- /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
- pushq $__KERNEL32_CS
- leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax
- pushq %rax
- lretq
-trampoline_return:
- /* Restore the stack, the 32-bit trampoline uses its own stack */
- leaq rva(boot_stack_end)(%rbx), %rsp
-
- /*
- * cleanup_trampoline() would restore trampoline memory.
- *
- * RDI is address of the page table to use instead of page table
- * in trampoline memory (if required).
- *
- * RSI holds real mode data and needs to be preserved across
- * this function call.
- */
- pushq %rsi
- leaq rva(top_pgtable)(%rbx), %rdi
- call cleanup_trampoline
- popq %rsi
+ movq %r15, %rdi
+ leaq rva(top_pgtable)(%rbx), %rsi
+ call configure_5level_paging
/* Zero EFLAGS */
pushq $0
@@ -496,7 +419,6 @@ trampoline_return:
* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
- pushq %rsi
leaq (_bss-8)(%rip), %rsi
leaq rva(_bss-8)(%rbx), %rdi
movl $(_bss - startup_32), %ecx
@@ -504,7 +426,6 @@ trampoline_return:
std
rep movsq
cld
- popq %rsi
/*
* The GDT may get overwritten either during the copy we just did or
@@ -523,21 +444,6 @@ trampoline_return:
jmp *%rax
SYM_CODE_END(startup_64)
-#ifdef CONFIG_EFI_STUB
-#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
- .org 0x390
-#endif
-SYM_FUNC_START(efi64_stub_entry)
- and $~0xf, %rsp /* realign the stack */
- movq %rdx, %rbx /* save boot_params pointer */
- call efi_main
- movq %rbx,%rsi
- leaq rva(startup_64)(%rax), %rax
- jmp *%rax
-SYM_FUNC_END(efi64_stub_entry)
-SYM_FUNC_ALIAS(efi_stub_entry, efi64_stub_entry)
-#endif
-
.text
SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
@@ -551,128 +457,122 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
shrq $3, %rcx
rep stosq
- pushq %rsi
call load_stage2_idt
/* Pass boot_params to initialize_identity_maps() */
- movq (%rsp), %rdi
+ movq %r15, %rdi
call initialize_identity_maps
- popq %rsi
/*
* Do the extraction, and jump to the new kernel..
*/
- pushq %rsi /* Save the real mode argument */
- movq %rsi, %rdi /* real mode address */
- leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
- leaq input_data(%rip), %rdx /* input_data */
- movl input_len(%rip), %ecx /* input_len */
- movq %rbp, %r8 /* output target address */
- movl output_len(%rip), %r9d /* decompressed length, end of relocs */
+ /* pass struct boot_params pointer and output target address */
+ movq %r15, %rdi
+ movq %rbp, %rsi
call extract_kernel /* returns kernel entry point in %rax */
- popq %rsi
/*
* Jump to the decompressed kernel.
*/
+ movq %r15, %rsi
jmp *%rax
SYM_FUNC_END(.Lrelocated)
- .code32
/*
- * This is the 32-bit trampoline that will be copied over to low memory.
+ * This is the 32-bit trampoline that will be copied over to low memory. It
+ * will be called using the ordinary 64-bit calling convention from code
+ * running in 64-bit mode.
*
- * RDI contains the return address (might be above 4G).
- * ECX contains the base address of the trampoline memory.
- * Non zero RDX means trampoline needs to enable 5-level paging.
+ * Return address is at the top of the stack (might be above 4G).
+ * The first argument (EDI) contains the address of the temporary PGD level
+ * page table in 32-bit addressable memory which will be programmed into
+ * register CR3.
*/
+ .section ".rodata", "a", @progbits
SYM_CODE_START(trampoline_32bit_src)
- /* Set up data and stack segments */
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
- movl %eax, %ss
+ /*
+ * Preserve callee save 64-bit registers on the stack: this is
+ * necessary because the architecture does not guarantee that GPRs will
+ * retain their full 64-bit values across a 32-bit mode switch.
+ */
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbp
+ pushq %rbx
+
+ /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */
+ movq %rsp, %rbx
+ shrq $32, %rbx
+
+ /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
+ pushq $__KERNEL32_CS
+ leaq 0f(%rip), %rax
+ pushq %rax
+ lretq
- /* Set up new stack */
- leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp
+ /*
+ * The 32-bit code below will do a far jump back to long mode and end
+ * up here after reconfiguring the number of paging levels. First, the
+ * stack pointer needs to be restored to its full 64-bit value before
+ * the callee save register contents can be popped from the stack.
+ */
+.Lret:
+ shlq $32, %rbx
+ orq %rbx, %rsp
+
+ /* Restore the preserved 64-bit registers */
+ popq %rbx
+ popq %rbp
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ retq
+ .code32
+0:
/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
- /* Check what paging mode we want to be in after the trampoline */
- testl %edx, %edx
- jz 1f
-
- /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */
- movl %cr4, %eax
- testl $X86_CR4_LA57, %eax
- jnz 3f
- jmp 2f
-1:
- /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */
- movl %cr4, %eax
- testl $X86_CR4_LA57, %eax
- jz 3f
-2:
/* Point CR3 to the trampoline's new top level page table */
- leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
- movl %eax, %cr3
-3:
+ movl %edi, %cr3
+
+ /* Set EFER.LME=1 as a precaution in case hypervisor pulls the rug */
- pushl %ecx
- pushl %edx
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_LME, %eax
/* Avoid writing EFER if no change was made (for TDX guest) */
jc 1f
wrmsr
-1: popl %edx
- popl %ecx
-
-#ifdef CONFIG_X86_MCE
- /*
- * Preserve CR4.MCE if the kernel will enable #MC support.
- * Clearing MCE may fault in some environments (that also force #MC
- * support). Any machine check that occurs before #MC support is fully
- * configured will crash the system regardless of the CR4.MCE value set
- * here.
- */
- movl %cr4, %eax
- andl $X86_CR4_MCE, %eax
-#else
- movl $0, %eax
-#endif
-
- /* Enable PAE and LA57 (if required) paging modes */
- orl $X86_CR4_PAE, %eax
- testl %edx, %edx
- jz 1f
- orl $X86_CR4_LA57, %eax
1:
+ /* Toggle CR4.LA57 */
+ movl %cr4, %eax
+ btcl $X86_CR4_LA57_BIT, %eax
movl %eax, %cr4
- /* Calculate address of paging_enabled() once we are executing in the trampoline */
- leal .Lpaging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
-
- /* Prepare the stack for far return to Long Mode */
- pushl $__KERNEL_CS
- pushl %eax
-
/* Enable paging again. */
movl %cr0, %eax
btsl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
- lret
+ /*
+ * Return to the 64-bit calling code using LJMP rather than LRET, to
+ * avoid the need for a 32-bit addressable stack. The destination
+ * address will be adjusted after the template code is copied into a
+ * 32-bit addressable buffer.
+ */
+.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src)
SYM_CODE_END(trampoline_32bit_src)
- .code64
-SYM_FUNC_START_LOCAL_NOALIGN(.Lpaging_enabled)
- /* Return from the trampoline */
- jmp *%rdi
-SYM_FUNC_END(.Lpaging_enabled)
+/*
+ * This symbol is placed right after trampoline_32bit_src() so its address can
+ * be used to infer the size of the trampoline code.
+ */
+SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src)
/*
* The trampoline code has a size limit.
@@ -681,7 +581,7 @@ SYM_FUNC_END(.Lpaging_enabled)
*/
.org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
- .code32
+ .text
SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)
/* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
1:
@@ -726,8 +626,6 @@ SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
*/
.bss
.balign 4
-SYM_DATA_LOCAL(boot_heap, .fill BOOT_HEAP_SIZE, 1, 0)
-
SYM_DATA_START_LOCAL(boot_stack)
.fill BOOT_STACK_SIZE, 1, 0
.balign 16
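
One subtle point in the rewritten trampoline: before the far return drops the CPU into compatibility mode, the top half of RSP is parked in %rbx, because the upper 32 bits of GPRs are not architecturally guaranteed to survive a 32-bit mode switch. The reassembly on the .Lret path is ordinary shift/or arithmetic, as this standalone model shows:

/* Hedged sketch of the split/rejoin done around the 32-bit window. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t rsp = 0x0000123456789ab0ULL;
	uint64_t hi = rsp >> 32;	/* parked in %rbx before the switch */
	uint32_t lo = (uint32_t)rsp;	/* survives 32-bit mode on its own */

	uint64_t restored = (hi << 32) | lo;

	printf("%d\n", restored == rsp);	/* 1 */
	return 0;
}
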
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 94b7abcf624b..f711f2a85862 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -330,6 +330,33 @@ static size_t parse_elf(void *output)
return ehdr.e_entry - LOAD_PHYSICAL_ADDR;
}
+const unsigned long kernel_total_size = VO__end - VO__text;
+
+static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4);
+
+extern unsigned char input_data[];
+extern unsigned int input_len, output_len;
+
+unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
+ void (*error)(char *x))
+{
+ unsigned long entry;
+
+ if (!free_mem_ptr) {
+ free_mem_ptr = (unsigned long)boot_heap;
+ free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap);
+ }
+
+ if (__decompress(input_data, input_len, NULL, NULL, outbuf, output_len,
+ NULL, error) < 0)
+ return ULONG_MAX;
+
+ entry = parse_elf(outbuf);
+ handle_relocations(outbuf, output_len, virt_addr);
+
+ return entry;
+}
+
/*
* The compressed kernel image (ZO), has been moved so that its position
* is against the end of the buffer used to hold the uncompressed kernel
@@ -347,14 +374,10 @@ static size_t parse_elf(void *output)
* |-------uncompressed kernel image---------|
*
*/
-asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
- unsigned char *input_data,
- unsigned long input_len,
- unsigned char *output,
- unsigned long output_len)
+asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
{
- const unsigned long kernel_total_size = VO__end - VO__text;
unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
+ memptr heap = (memptr)boot_heap;
unsigned long needed_size;
size_t entry_offset;
@@ -412,7 +435,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
* entries. This ensures the full mapped area is usable RAM
* and doesn't include any reserved areas.
*/
- needed_size = max(output_len, kernel_total_size);
+ needed_size = max_t(unsigned long, output_len, kernel_total_size);
#ifdef CONFIG_X86_64
needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN);
#endif
@@ -443,7 +466,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#ifdef CONFIG_X86_64
if (heap > 0x3fffffffffffUL)
error("Destination address too large");
- if (virt_addr + max(output_len, kernel_total_size) > KERNEL_IMAGE_SIZE)
+ if (virt_addr + needed_size > KERNEL_IMAGE_SIZE)
error("Destination virtual address is beyond the kernel mapping area");
#else
if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
@@ -461,10 +484,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
accept_memory(__pa(output), __pa(output) + needed_size);
}
- __decompress(input_data, input_len, NULL, NULL, output, output_len,
- NULL, error);
- entry_offset = parse_elf(output);
- handle_relocations(output, output_len, virt_addr);
+ entry_offset = decompress_kernel(output, virt_addr, error);
debug_putstr("done.\nBooting the kernel (entry_offset: 0x");
debug_puthex(entry_offset);
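
Why max() became max_t() in the hunk above: with output_len now declared as unsigned int in misc.c while kernel_total_size stays unsigned long, the type-checking max() macro would reject the mixed operand types, so both are pinned to unsigned long. A minimal model of the max_t() idiom (simplified; the kernel version also guards against double evaluation, which this sketch does not):

/* Hedged model of max_t(): compare after casting both sides to one type. */
#include <stdio.h>

#define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
	unsigned int output_len = 123456;
	unsigned long total = 7654321;

	printf("%lu\n", max_t(unsigned long, output_len, total)); /* 7654321 */
	return 0;
}
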
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 964fe903a1cd..cc70d3fb9049 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -179,9 +179,7 @@ static inline int count_immovable_mem_regions(void) { return 0; }
#endif
/* ident_map_64.c */
-#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
-#endif
extern void kernel_add_identity_map(unsigned long start, unsigned long end);
/* Used by PAGE_KERN* macros: */
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
index cc9b2529a086..6d595abe06b3 100644
--- a/arch/x86/boot/compressed/pgtable.h
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -3,18 +3,16 @@
#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE)
-#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0
-
#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0x80
-
-#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE
+#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0
#ifndef __ASSEMBLER__
extern unsigned long *trampoline_32bit;
-extern void trampoline_32bit_src(void *return_ptr);
+extern void trampoline_32bit_src(void *trampoline, bool enable_5lvl);
+
+extern const u16 trampoline_ljmp_imm_offset;
#endif /* __ASSEMBLER__ */
#endif /* BOOT_COMPRESSED_PAGETABLE_H */
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 2ac12ff4111b..7939eb6e6ce9 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -16,11 +16,6 @@ unsigned int __section(".data") pgdir_shift = 39;
unsigned int __section(".data") ptrs_per_p4d = 1;
#endif
-struct paging_config {
- unsigned long trampoline_start;
- unsigned long l5_required;
-};
-
/* Buffer to preserve trampoline memory */
static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
@@ -29,7 +24,7 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
* purposes.
*
* Avoid putting the pointer into .bss as it will be cleared between
- * paging_prepare() and extract_kernel().
+ * configure_5level_paging() and extract_kernel().
*/
unsigned long *trampoline_32bit __section(".data");
@@ -106,12 +101,13 @@ static unsigned long find_trampoline_placement(void)
return bios_start - TRAMPOLINE_32BIT_SIZE;
}
-struct paging_config paging_prepare(void *rmode)
+asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
{
- struct paging_config paging_config = {};
+ void (*toggle_la57)(void *cr3);
+ bool l5_required = false;
/* Initialize boot_params. Required for cmdline_find_option_bool(). */
- boot_params = rmode;
+ boot_params = bp;
/*
* Check if LA57 is desired and supported.
@@ -129,12 +125,22 @@ struct paging_config paging_prepare(void *rmode)
!cmdline_find_option_bool("no5lvl") &&
native_cpuid_eax(0) >= 7 &&
(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
- paging_config.l5_required = 1;
+ l5_required = true;
+
+ /* Initialize variables for 5-level paging */
+ __pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
}
- paging_config.trampoline_start = find_trampoline_placement();
+ /*
+ * The trampoline will not be used if the paging mode is already set to
+ * the desired one.
+ */
+ if (l5_required == !!(native_read_cr4() & X86_CR4_LA57))
+ return;
- trampoline_32bit = (unsigned long *)paging_config.trampoline_start;
+ trampoline_32bit = (unsigned long *)find_trampoline_placement();
/* Preserve trampoline memory */
memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE);
@@ -143,32 +149,32 @@ struct paging_config paging_prepare(void *rmode)
memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE);
/* Copy trampoline code in place */
- memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
+ toggle_la57 = memcpy(trampoline_32bit +
+ TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
&trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE);
/*
+ * Avoid the need for a stack in the 32-bit trampoline code, by using
+ * LJMP rather than LRET to return back to long mode. LJMP takes an
+ * immediate absolute address, which needs to be adjusted based on the
+ * placement of the trampoline.
+ */
+ *(u32 *)((u8 *)toggle_la57 + trampoline_ljmp_imm_offset) +=
+ (unsigned long)toggle_la57;
+
+ /*
* The code below prepares page table in trampoline memory.
*
* The new page table will be used by trampoline code for switching
* from 4- to 5-level paging or vice versa.
- *
- * If switching is not required, the page table is unused: trampoline
- * code wouldn't touch CR3.
- */
-
- /*
- * We are not going to use the page table in trampoline memory if we
- * are already in the desired paging mode.
*/
- if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57))
- goto out;
- if (paging_config.l5_required) {
+ if (l5_required) {
/*
* For 4- to 5-level paging transition, set up current CR3 as
* the first and the only entry in a new top-level page table.
*/
- trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC;
+ *trampoline_32bit = __native_read_cr3() | _PAGE_TABLE_NOENC;
} else {
unsigned long src;
@@ -181,38 +187,17 @@ struct paging_config paging_prepare(void *rmode)
* may be above 4G.
*/
src = *(unsigned long *)__native_read_cr3() & PAGE_MASK;
- memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long),
- (void *)src, PAGE_SIZE);
+ memcpy(trampoline_32bit, (void *)src, PAGE_SIZE);
}
-out:
- return paging_config;
-}
-
-void cleanup_trampoline(void *pgtable)
-{
- void *trampoline_pgtable;
-
- trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long);
+ toggle_la57(trampoline_32bit);
/*
- * Move the top level page table out of trampoline memory,
- * if it's there.
+ * Move the top level page table out of trampoline memory.
*/
- if ((void *)__native_read_cr3() == trampoline_pgtable) {
- memcpy(pgtable, trampoline_pgtable, PAGE_SIZE);
- native_write_cr3((unsigned long)pgtable);
- }
+ memcpy(pgtable, trampoline_32bit, PAGE_SIZE);
+ native_write_cr3((unsigned long)pgtable);
/* Restore trampoline memory */
memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE);
-
- /* Initialize variables for 5-level paging */
-#ifdef CONFIG_X86_5LEVEL
- if (__read_cr4() & X86_CR4_LA57) {
- __pgtable_l5_enabled = 1;
- pgdir_shift = 48;
- ptrs_per_p4d = 512;
- }
-#endif
}
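
The trick behind trampoline_ljmp_imm_offset: the LJMP at the end of trampoline_32bit_src is assembled with an operand that is merely the target's offset within the template, and configure_5level_paging() rebases it by adding the runtime placement address after the memcpy(). The same rebase in miniature (tmpl/buf and the operand layout are invented for the sketch):

/* Hedged sketch of rebasing an absolute operand inside copied code. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	/* template: operand at byte 1 holds "offset of target in template" */
	uint8_t tmpl[16] = { 0 };
	uint32_t target_off = 12;

	memcpy(&tmpl[1], &target_off, sizeof(target_off));

	uint8_t buf[16];		/* runtime placement of the copy */
	memcpy(buf, tmpl, sizeof(tmpl));

	uint32_t imm;
	memcpy(&imm, &buf[1], sizeof(imm));
	imm += (uint32_t)(uintptr_t)buf;	/* rebase: offset -> absolute */
	memcpy(&buf[1], &imm, sizeof(imm));

	printf("%d\n", imm == (uint32_t)(uintptr_t)buf + target_off); /* 1 */
	return 0;
}
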
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index c3e343bd4760..dc8c876fbd8f 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -365,22 +365,27 @@ static void enforce_vmpl0(void)
* by the guest kernel. As and when a new feature is implemented in the
* guest kernel, a corresponding bit should be added to the mask.
*/
-#define SNP_FEATURES_PRESENT (0)
+#define SNP_FEATURES_PRESENT MSR_AMD64_SNP_DEBUG_SWAP
+
+u64 snp_get_unsupported_features(u64 status)
+{
+ if (!(status & MSR_AMD64_SEV_SNP_ENABLED))
+ return 0;
+
+ return status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT;
+}
void snp_check_features(void)
{
u64 unsupported;
- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
- return;
-
/*
* Terminate the boot if hypervisor has enabled any feature lacking
* guest side implementation. Pass on the unsupported features mask through
* EXIT_INFO_2 of the GHCB protocol so that those features can be reported
* as part of the guest boot failure.
*/
- unsupported = sev_status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT;
+ unsupported = snp_get_unsupported_features(sev_status);
if (unsupported) {
if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb()))
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
@@ -390,35 +395,22 @@ void snp_check_features(void)
}
}
-void sev_enable(struct boot_params *bp)
+/*
+ * sev_check_cpu_support - Check for SEV support in the CPU capabilities
+ *
+ * Returns < 0 if SEV is not supported, otherwise the position of the
+ * encryption bit in the page table descriptors.
+ */
+static int sev_check_cpu_support(void)
{
unsigned int eax, ebx, ecx, edx;
- struct msr m;
- bool snp;
-
- /*
- * bp->cc_blob_address should only be set by boot/compressed kernel.
- * Initialize it to 0 to ensure that uninitialized values from
- * buggy bootloaders aren't propagated.
- */
- if (bp)
- bp->cc_blob_address = 0;
-
- /*
- * Do an initial SEV capability check before snp_init() which
- * loads the CPUID page and the same checks afterwards are done
- * without the hypervisor and are trustworthy.
- *
- * If the HV fakes SEV support, the guest will crash'n'burn
- * which is good enough.
- */
/* Check for the SME/SEV support leaf */
eax = 0x80000000;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (eax < 0x8000001f)
- return;
+ return -ENODEV;
/*
* Check for the SME/SEV feature:
@@ -433,6 +425,35 @@ void sev_enable(struct boot_params *bp)
native_cpuid(&eax, &ebx, &ecx, &edx);
/* Check whether SEV is supported */
if (!(eax & BIT(1)))
+ return -ENODEV;
+
+ return ebx & 0x3f;
+}
+
+void sev_enable(struct boot_params *bp)
+{
+ struct msr m;
+ int bitpos;
+ bool snp;
+
+ /*
+ * bp->cc_blob_address should only be set by boot/compressed kernel.
+ * Initialize it to 0 to ensure that uninitialized values from
+ * buggy bootloaders aren't propagated.
+ */
+ if (bp)
+ bp->cc_blob_address = 0;
+
+ /*
+ * Do an initial SEV capability check before snp_init() which
+ * loads the CPUID page and the same checks afterwards are done
+ * without the hypervisor and are trustworthy.
+ *
+ * If the HV fakes SEV support, the guest will crash'n'burn
+ * which is good enough.
+ */
+
+ if (sev_check_cpu_support() < 0)
return;
/*
@@ -443,26 +464,8 @@ void sev_enable(struct boot_params *bp)
/* Now repeat the checks with the SNP CPUID table. */
- /* Recheck the SME/SEV support leaf */
- eax = 0x80000000;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- if (eax < 0x8000001f)
- return;
-
- /*
- * Recheck for the SME/SEV feature:
- * CPUID Fn8000_001F[EAX]
- * - Bit 0 - Secure Memory Encryption support
- * - Bit 1 - Secure Encrypted Virtualization support
- * CPUID Fn8000_001F[EBX]
- * - Bits 5:0 - Pagetable bit position used to indicate encryption
- */
- eax = 0x8000001f;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- /* Check whether SEV is supported */
- if (!(eax & BIT(1))) {
+ bitpos = sev_check_cpu_support();
+ if (bitpos < 0) {
if (snp)
error("SEV-SNP support indicated by CC blob, but not CPUID.");
return;
@@ -494,7 +497,24 @@ void sev_enable(struct boot_params *bp)
if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
error("SEV-SNP supported indicated by CC blob, but not SEV status MSR.");
- sme_me_mask = BIT_ULL(ebx & 0x3f);
+ sme_me_mask = BIT_ULL(bitpos);
+}
+
+/*
+ * sev_get_status - Retrieve the SEV status mask
+ *
+ * Returns 0 if the CPU is not SEV capable, otherwise the value of the
+ * AMD64_SEV MSR.
+ */
+u64 sev_get_status(void)
+{
+ struct msr m;
+
+ if (sev_check_cpu_support() < 0)
+ return 0;
+
+ boot_rdmsr(MSR_AMD64_SEV, &m);
+ return m.q;
}
/* Search for Confidential Computing blob in the EFI config table. */
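
sev_check_cpu_support() folds the previously duplicated CPUID probing into one helper that returns either -ENODEV or the encryption-bit position from CPUID Fn8000_001F[EBX] bits 5:0; sev_enable() then derives the mask with a single shift. The arithmetic of that last step, standalone (the EBX value here is made up):

/* Hedged sketch: CPUID-reported C-bit position to encryption mask. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ebx = 0x33;		/* pretend CPUID Fn8000_001F[EBX] */
	int bitpos = ebx & 0x3f;	/* bits 5:0: C-bit position */
	uint64_t sme_me_mask = 1ULL << bitpos;

	printf("C-bit %d, mask %#llx\n", bitpos,
	       (unsigned long long)sme_me_mask);
	return 0;
}
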
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 75a343f10e58..1b411bbf3cb0 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -33,7 +33,6 @@ CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
CONFIG_NR_CPUS=8
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
-CONFIG_MICROCODE_AMD=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 0902518e9b93..409e9182bd29 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -31,7 +31,6 @@ CONFIG_SMP=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
-CONFIG_MICROCODE_AMD=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
CONFIG_NUMA=y
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index bc0a3c941b35..2d0b1bd866ea 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -456,3 +456,4 @@
449 i386 futex_waitv sys_futex_waitv
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
451 i386 cachestat sys_cachestat
+452 i386 fchmodat2 sys_fchmodat2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 227538b0ce80..814768249eae 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -373,6 +373,7 @@
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 371014802191..6911c5399d02 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -156,8 +156,8 @@ perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
* count to the generic event atomically:
*/
prev_raw_count = local64_read(&hwc->prev_count);
- if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
- new_raw_count) != prev_raw_count)
+ if (!local64_try_cmpxchg(&hwc->prev_count,
+ &prev_raw_count, new_raw_count))
return 0;
/*
@@ -247,11 +247,33 @@ int forward_event_to_ibs(struct perf_event *event)
return -ENOENT;
}
+/*
+ * Grouping of IBS events is not possible since IBS can have only
+ * one event active at any point in time.
+ */
+static int validate_group(struct perf_event *event)
+{
+ struct perf_event *sibling;
+
+ if (event->group_leader == event)
+ return 0;
+
+ if (event->group_leader->pmu == event->pmu)
+ return -EINVAL;
+
+ for_each_sibling_event(sibling, event->group_leader) {
+ if (sibling->pmu == event->pmu)
+ return -EINVAL;
+ }
+ return 0;
+}
+
static int perf_ibs_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs;
u64 max_cnt, config;
+ int ret;
perf_ibs = get_ibs_pmu(event->attr.type);
if (!perf_ibs)
@@ -265,6 +287,10 @@ static int perf_ibs_init(struct perf_event *event)
if (config & ~perf_ibs->config_mask)
return -EINVAL;
+ ret = validate_group(event);
+ if (ret)
+ return ret;
+
if (hwc->sample_period) {
if (config & perf_ibs->cnt_mask)
/* raw max_cnt may not be set */
@@ -702,38 +728,63 @@ static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
return op_data2->data_src_lo;
}
-static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
- union ibs_op_data3 *op_data3,
- struct perf_sample_data *data)
+#define L(x) (PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
+#define LN(x) PERF_MEM_S(LVLNUM, x)
+#define REM PERF_MEM_S(REMOTE, REMOTE)
+#define HOPS(x) PERF_MEM_S(HOPS, x)
+
+static u64 g_data_src[8] = {
+ [IBS_DATA_SRC_LOC_CACHE] = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0),
+ [IBS_DATA_SRC_DRAM] = L(LOC_RAM) | LN(RAM),
+ [IBS_DATA_SRC_REM_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
+ [IBS_DATA_SRC_IO] = L(IO) | LN(IO),
+};
+
+#define RMT_NODE_BITS (1 << IBS_DATA_SRC_DRAM)
+#define RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
+
+static u64 g_zen4_data_src[32] = {
+ [IBS_DATA_SRC_EXT_LOC_CACHE] = L(L3) | LN(L3),
+ [IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0),
+ [IBS_DATA_SRC_EXT_DRAM] = L(LOC_RAM) | LN(RAM),
+ [IBS_DATA_SRC_EXT_FAR_CCX_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
+ [IBS_DATA_SRC_EXT_PMEM] = LN(PMEM),
+ [IBS_DATA_SRC_EXT_IO] = L(IO) | LN(IO),
+ [IBS_DATA_SRC_EXT_EXT_MEM] = LN(CXL),
+};
+
+#define ZEN4_RMT_NODE_BITS ((1 << IBS_DATA_SRC_EXT_DRAM) | \
+ (1 << IBS_DATA_SRC_EXT_PMEM) | \
+ (1 << IBS_DATA_SRC_EXT_EXT_MEM))
+#define ZEN4_RMT_NODE_APPLICABLE(x) (ZEN4_RMT_NODE_BITS & (1 << x))
+
+static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
+ union ibs_op_data3 *op_data3,
+ struct perf_sample_data *data)
{
union perf_mem_data_src *data_src = &data->data_src;
u8 ibs_data_src = perf_ibs_data_src(op_data2);
data_src->mem_lvl = 0;
+ data_src->mem_lvl_num = 0;
/*
* DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
* memory accesses. So, check DcUcMemAcc bit early.
*/
- if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
- data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
- return;
- }
+ if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
+ return L(UNC) | LN(UNC);
/* L1 Hit */
- if (op_data3->dc_miss == 0) {
- data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
- return;
- }
+ if (op_data3->dc_miss == 0)
+ return L(L1) | LN(L1);
/* L2 Hit */
if (op_data3->l2_miss == 0) {
/* Erratum #1293 */
if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
- !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
- data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
- return;
- }
+ !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc))
+ return L(L2) | LN(L2);
}
/*
@@ -743,82 +794,36 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
if (data_src->mem_op != PERF_MEM_OP_LOAD)
goto check_mab;
- /* L3 Hit */
if (ibs_caps & IBS_CAPS_ZEN4) {
- if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
- return;
- }
- } else {
- if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 |
- PERF_MEM_LVL_HIT;
- return;
- }
- }
+ u64 val = g_zen4_data_src[ibs_data_src];
- /* A peer cache in a near CCX */
- if (ibs_caps & IBS_CAPS_ZEN4 &&
- ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
- return;
- }
+ if (!val)
+ goto check_mab;
- /* A peer cache in a far CCX */
- if (ibs_caps & IBS_CAPS_ZEN4) {
- if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
- return;
+ /* HOPS_1 because IBS doesn't provide remote socket detail */
+ if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) {
+ if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
+ val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
+ else
+ val |= REM | HOPS(1);
}
- } else {
- if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
- return;
- }
- }
- /* DRAM */
- if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) {
- if (op_data2->rmt_node == 0)
- data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
- else
- data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
- return;
- }
+ return val;
+ } else {
+ u64 val = g_data_src[ibs_data_src];
- /* PMEM */
- if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) {
- data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM;
- if (op_data2->rmt_node) {
- data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
- /* IBS doesn't provide Remote socket detail */
- data_src->mem_hops = PERF_MEM_HOPS_1;
- }
- return;
- }
+ if (!val)
+ goto check_mab;
- /* Extension Memory */
- if (ibs_caps & IBS_CAPS_ZEN4 &&
- ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) {
- data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL;
- if (op_data2->rmt_node) {
- data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
- /* IBS doesn't provide Remote socket detail */
- data_src->mem_hops = PERF_MEM_HOPS_1;
+ /* HOPS_1 because IBS doesn't provide remote socket detail */
+ if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) {
+ if (ibs_data_src == IBS_DATA_SRC_DRAM)
+ val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
+ else
+ val |= REM | HOPS(1);
}
- return;
- }
- /* IO */
- if (ibs_data_src == IBS_DATA_SRC_EXT_IO) {
- data_src->mem_lvl = PERF_MEM_LVL_IO;
- data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
- if (op_data2->rmt_node) {
- data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
- /* IBS doesn't provide Remote socket detail */
- data_src->mem_hops = PERF_MEM_HOPS_1;
- }
- return;
+ return val;
}
check_mab:
@@ -829,12 +834,11 @@ check_mab:
* DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
* MAB only when IBS fails to provide DataSrc.
*/
- if (op_data3->dc_miss_no_mab_alloc) {
- data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
- return;
- }
+ if (op_data3->dc_miss_no_mab_alloc)
+ return L(LFB) | LN(LFB);
- data_src->mem_lvl = PERF_MEM_LVL_NA;
+ /* Don't set HIT with NA */
+ return PERF_MEM_S(LVL, NA) | LN(NA);
}
static bool perf_ibs_cache_hit_st_valid(void)
@@ -924,7 +928,9 @@ static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
union ibs_op_data2 *op_data2,
union ibs_op_data3 *op_data3)
{
- perf_ibs_get_mem_lvl(op_data2, op_data3, data);
+ union perf_mem_data_src *data_src = &data->data_src;
+
+ data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data);
perf_ibs_get_mem_snoop(op_data2, data);
perf_ibs_get_tlb_lvl(op_data3, data);
perf_ibs_get_mem_lock(op_data3, data);
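The refactor above replaces a long if/else ladder with table lookups (g_zen4_data_src / g_data_src) plus a bitmask test for the few sources where the remote-node bit is meaningful. The sketch below is a minimal, self-contained illustration of that pattern; all names and flag values are made up for the demo and are not the kernel's perf_mem encoding.

    #include <stdint.h>
    #include <stdio.h>

    #define SRC_LOC_CACHE 2
    #define SRC_DRAM      3
    #define SRC_IO        5
    #define NUM_SRC       8

    #define F_L3     (1u << 0)
    #define F_RAM    (1u << 1)
    #define F_IO     (1u << 2)
    #define F_REMOTE (1u << 8)
    #define F_HOPS_1 (1u << 9)

    /* Precomputed flag word per data-source code, like g_zen4_data_src[]. */
    static const uint32_t src_table[NUM_SRC] = {
        [SRC_LOC_CACHE] = F_L3,
        [SRC_DRAM]      = F_RAM,
        [SRC_IO]        = F_IO,
    };

    /* Sources for which a set remote-node bit is meaningful. */
    #define RMT_APPLICABLE ((1u << SRC_DRAM) | (1u << SRC_IO))

    static uint32_t decode(unsigned int src, int rmt_node)
    {
        uint32_t val = src_table[src];

        if (!val)
            return 0; /* unknown source: let the caller fall back */
        if (rmt_node && (RMT_APPLICABLE & (1u << src)))
            val |= F_REMOTE | F_HOPS_1; /* one hop: no finer detail known */
        return val;
    }

    int main(void)
    {
        printf("local DRAM:  %#x\n", decode(SRC_DRAM, 0));
        printf("remote DRAM: %#x\n", decode(SRC_DRAM, 1));
        return 0;
    }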
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 9d248703cbdd..185f902e5f28 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -129,13 +129,11 @@ u64 x86_perf_event_update(struct perf_event *event)
* exchange a new raw count - then add that new-prev delta
* count to the generic event atomically:
*/
-again:
prev_raw_count = local64_read(&hwc->prev_count);
- rdpmcl(hwc->event_base_rdpmc, new_raw_count);
-
- if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
- new_raw_count) != prev_raw_count)
- goto again;
+ do {
+ rdpmcl(hwc->event_base_rdpmc, new_raw_count);
+ } while (!local64_try_cmpxchg(&hwc->prev_count,
+ &prev_raw_count, new_raw_count));
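This conversion (repeated below for cstate.c and msr.c, and mirrored in local.h) replaces the read/cmpxchg/goto retry with try_cmpxchg, which refreshes the expected value on failure. A userspace analogue of the same loop shape using only standard C11 atomics:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint64_t prev_count;

    static uint64_t read_counter(void)
    {
        return 42; /* stands in for rdpmcl()/msr_read_counter() */
    }

    static uint64_t update(void)
    {
        uint64_t prev = atomic_load(&prev_count);
        uint64_t cur;

        do {
            cur = read_counter();
            /* On failure, "prev" is refreshed with the current value,
             * so no explicit re-read or goto is needed. */
        } while (!atomic_compare_exchange_weak(&prev_count, &prev, cur));

        return cur - prev; /* delta since the last successful update */
    }

    int main(void)
    {
        printf("delta = %llu\n", (unsigned long long)update());
        return 0;
    }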
/*
* Now we have the new raw value and have updated the prev
@@ -2168,7 +2166,6 @@ static int __init init_hw_perf_events(void)
hybrid_pmu->pmu = pmu;
hybrid_pmu->pmu.type = -1;
hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
- hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS;
hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 2a284ba951b7..fa355d3658a6 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2129,6 +2129,17 @@ static struct extra_reg intel_grt_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+EVENT_ATTR_STR(topdown-retiring, td_retiring_cmt, "event=0x72,umask=0x0");
+EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec_cmt, "event=0x73,umask=0x0");
+
+static struct attribute *cmt_events_attrs[] = {
+ EVENT_PTR(td_fe_bound_tnt),
+ EVENT_PTR(td_retiring_cmt),
+ EVENT_PTR(td_bad_spec_cmt),
+ EVENT_PTR(td_be_bound_tnt),
+ NULL
+};
+
static struct extra_reg intel_cmt_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff3ffffffffffull, RSP_0),
@@ -4847,6 +4858,8 @@ PMU_FORMAT_ATTR(ldlat, "config1:0-15");
PMU_FORMAT_ATTR(frontend, "config1:0-23");
+PMU_FORMAT_ATTR(snoop_rsp, "config1:0-63");
+
static struct attribute *intel_arch3_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
@@ -4877,6 +4890,13 @@ static struct attribute *slm_format_attr[] = {
NULL
};
+static struct attribute *cmt_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ &format_attr_snoop_rsp.attr,
+ NULL
+};
+
static struct attribute *skl_format_attr[] = {
&format_attr_frontend.attr,
NULL,
@@ -5656,7 +5676,6 @@ static struct attribute *adl_hybrid_extra_attr[] = {
NULL
};
-PMU_FORMAT_ATTR_SHOW(snoop_rsp, "config1:0-63");
FORMAT_ATTR_HYBRID(snoop_rsp, hybrid_small);
static struct attribute *mtl_hybrid_extra_attr_rtm[] = {
@@ -6174,7 +6193,7 @@ __init int intel_pmu_init(void)
name = "Tremont";
break;
- case INTEL_FAM6_ALDERLAKE_N:
+ case INTEL_FAM6_ATOM_GRACEMONT:
x86_pmu.mid_ack = true;
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -6204,6 +6223,37 @@ __init int intel_pmu_init(void)
name = "gracemont";
break;
+ case INTEL_FAM6_ATOM_CRESTMONT:
+ case INTEL_FAM6_ATOM_CRESTMONT_X:
+ x86_pmu.mid_ack = true;
+ memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+ hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_cmt_extra_regs;
+
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+
+ intel_pmu_pebs_data_source_cmt();
+ x86_pmu.pebs_latency_data = mtl_latency_data_small;
+ x86_pmu.get_event_constraints = cmt_get_event_constraints;
+ x86_pmu.limit_period = spr_limit_period;
+ td_attr = cmt_events_attrs;
+ mem_attr = grt_mem_attrs;
+ extra_attr = cmt_format_attr;
+ pr_cont("Crestmont events, ");
+ name = "crestmont";
+ break;
+
case INTEL_FAM6_WESTMERE:
case INTEL_FAM6_WESTMERE_EP:
case INTEL_FAM6_WESTMERE_EX:
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 835862c548cc..96fffb2d521d 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -365,13 +365,11 @@ static void cstate_pmu_event_update(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
u64 prev_raw_count, new_raw_count;
-again:
prev_raw_count = local64_read(&hwc->prev_count);
- new_raw_count = cstate_pmu_read_counter(event);
-
- if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
- new_raw_count) != prev_raw_count)
- goto again;
+ do {
+ new_raw_count = cstate_pmu_read_counter(event);
+ } while (!local64_try_cmpxchg(&hwc->prev_count,
+ &prev_raw_count, new_raw_count));
local64_add(new_raw_count - prev_raw_count, &event->count);
}
@@ -671,6 +669,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &glm_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
@@ -686,7 +685,6 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates),
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index df88576d6b2a..eb8dd8b8a1e8 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -144,7 +144,7 @@ void __init intel_pmu_pebs_data_source_adl(void)
__intel_pmu_pebs_data_source_grt(data_source);
}
-static void __init intel_pmu_pebs_data_source_cmt(u64 *data_source)
+static void __init __intel_pmu_pebs_data_source_cmt(u64 *data_source)
{
data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD);
data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
@@ -164,7 +164,12 @@ void __init intel_pmu_pebs_data_source_mtl(void)
data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
- intel_pmu_pebs_data_source_cmt(data_source);
+ __intel_pmu_pebs_data_source_cmt(data_source);
+}
+
+void __init intel_pmu_pebs_data_source_cmt(void)
+{
+ __intel_pmu_pebs_data_source_cmt(pebs_data_source);
}
static u64 precise_store_data(u64 status)
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index bc226603ef3e..69043e02e8a7 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1858,7 +1858,6 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init),
@@ -1867,6 +1866,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init),
{},
};
MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match);
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index d49e90dc04a4..4d349986f76a 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1502,7 +1502,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
pci_dev_put(ubox_dev);
- return err ? pcibios_err_to_errno(err) : 0;
+ return pcibios_err_to_errno(err);
}
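Dropping the ternary is safe because pcibios_err_to_errno() maps a success code of 0 back to 0, making the "err ? ... : 0" wrapper redundant. A hypothetical stand-in demonstrating the property the caller now relies on (demo codes, not the real PCIBIOS table):

    #include <errno.h>
    #include <stdio.h>

    #define DEMO_SUCCESS   0
    #define DEMO_NOT_FOUND 1

    /* Hypothetical stand-in for pcibios_err_to_errno(). */
    static int demo_err_to_errno(int err)
    {
        switch (err) {
        case DEMO_SUCCESS:
            return 0; /* success maps to 0: the property relied on above */
        case DEMO_NOT_FOUND:
            return -ENODEV;
        default:
            return -err;
        }
    }

    int main(void)
    {
        /* "err ? demo_err_to_errno(err) : 0" is identical to this: */
        printf("%d %d\n", demo_err_to_errno(DEMO_SUCCESS),
               demo_err_to_errno(DEMO_NOT_FOUND));
        return 0;
    }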
int snbep_uncore_pci_init(void)
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 0feaaa571303..9e237b30f017 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -106,7 +106,7 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_ROCKETLAKE:
case INTEL_FAM6_ALDERLAKE:
case INTEL_FAM6_ALDERLAKE_L:
- case INTEL_FAM6_ALDERLAKE_N:
+ case INTEL_FAM6_ATOM_GRACEMONT:
case INTEL_FAM6_RAPTORLAKE:
case INTEL_FAM6_RAPTORLAKE_P:
case INTEL_FAM6_RAPTORLAKE_S:
@@ -244,12 +244,10 @@ static void msr_event_update(struct perf_event *event)
s64 delta;
/* Careful, an NMI might modify the previous event value: */
-again:
prev = local64_read(&event->hw.prev_count);
- now = msr_read_counter(event);
-
- if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
- goto again;
+ do {
+ now = msr_read_counter(event);
+ } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, now));
delta = now - prev;
if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index d6de4487348c..c8ba2be7585d 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1606,6 +1606,8 @@ void intel_pmu_pebs_data_source_grt(void);
void intel_pmu_pebs_data_source_mtl(void);
+void intel_pmu_pebs_data_source_cmt(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 52e6e7ed4f78..1579429846cc 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -804,7 +804,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr),
X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &model_spr),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &model_skl),
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 2888c0ee4df0..c8a7fc23f63c 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -6,7 +6,7 @@
* Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
* Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
*/
-#include <acpi/pdc_intel.h>
+#include <acpi/proc_cap_intel.h>
#include <asm/numa.h>
#include <asm/fixmap.h>
@@ -102,23 +102,31 @@ static inline bool arch_has_acpi_pdc(void)
c->x86_vendor == X86_VENDOR_CENTAUR);
}
-static inline void arch_acpi_set_pdc_bits(u32 *buf)
+static inline void arch_acpi_set_proc_cap_bits(u32 *cap)
{
struct cpuinfo_x86 *c = &cpu_data(0);
- buf[2] |= ACPI_PDC_C_CAPABILITY_SMP;
+ *cap |= ACPI_PROC_CAP_C_CAPABILITY_SMP;
+
+ /* Enable coordination with firmware's _TSD info */
+ *cap |= ACPI_PROC_CAP_SMP_T_SWCOORD;
if (cpu_has(c, X86_FEATURE_EST))
- buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
+ *cap |= ACPI_PROC_CAP_EST_CAPABILITY_SWSMP;
if (cpu_has(c, X86_FEATURE_ACPI))
- buf[2] |= ACPI_PDC_T_FFH;
+ *cap |= ACPI_PROC_CAP_T_FFH;
+
+ if (cpu_has(c, X86_FEATURE_HWP))
+ *cap |= ACPI_PROC_CAP_COLLAB_PROC_PERF;
/*
- * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
+ * If mwait/monitor is unsupported, C_C1_FFH and
+ * C2/C3_FFH will be disabled.
*/
- if (!cpu_has(c, X86_FEATURE_MWAIT))
- buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
+ if (!cpu_has(c, X86_FEATURE_MWAIT) ||
+ boot_option_idle_override == IDLE_NOMWAIT)
+ *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH);
}
static inline bool acpi_has_cpu_in_madt(void)
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 9191280d9ea3..4ae14339cb8c 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -62,4 +62,12 @@
# define BOOT_STACK_SIZE 0x1000
#endif
+#ifndef __ASSEMBLY__
+extern unsigned int output_len;
+extern const unsigned long kernel_total_size;
+
+unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
+ void (*error)(char *x));
+#endif
+
#endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index b8f1dc0761e4..9931e4c7d73f 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -71,6 +71,12 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
}
#define mul_u32_u32 mul_u32_u32
+/*
+ * __div64_32() is never called on x86, so prevent the
+ * generic definition from getting built.
+ */
+#define __div64_32
+
#else
# include <asm-generic/div64.h>
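The bare #define works because the generic implementation is guarded by #ifndef __div64_32; once the arch claims the name, the fallback is never compiled. A one-file sketch of this asm-generic override convention, with hypothetical names:

    #include <stdio.h>

    /* "arch" side: claim the name, so the generic fallback is skipped.
     * In the kernel, #define __div64_32 plays exactly this role. */
    #define my_op my_op
    static inline int my_op(int x)
    {
        return x * 2; /* arch-optimized version */
    }

    /* "asm-generic" side: only provide a fallback if nobody claimed it. */
    #ifndef my_op
    static inline int my_op(int x)
    {
        return x + 1; /* generic version, never compiled here */
    }
    #endif

    int main(void)
    {
        printf("%d\n", my_op(21)); /* 42: the arch version won */
        return 0;
    }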
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8b4be7cecdb8..b0994ae3bc23 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,6 +90,8 @@ static inline void efi_fpu_end(void)
}
#ifdef CONFIG_X86_32
+#define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1)
+
#define arch_efi_call_virt_setup() \
({ \
efi_fpu_begin(); \
@@ -103,8 +105,7 @@ static inline void efi_fpu_end(void)
})
#else /* !CONFIG_X86_32 */
-
-#define EFI_LOADER_SIGNATURE "EL64"
+#define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT
extern asmlinkage u64 __efi_call(void *fp, ...);
@@ -218,6 +219,8 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size,
#ifdef CONFIG_EFI_MIXED
+#define EFI_ALLOC_LIMIT (efi_is_64bit() ? ULONG_MAX : U32_MAX)
+
#define ARCH_HAS_EFISTUB_WRAPPERS
static inline bool efi_is_64bit(void)
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index b3af2d45bbbb..5fcd85fd64fd 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -98,8 +98,6 @@
#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */
#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */
-#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */
-
#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */
#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */
@@ -112,21 +110,24 @@
#define INTEL_FAM6_GRANITERAPIDS_X 0xAD
#define INTEL_FAM6_GRANITERAPIDS_D 0xAE
+/* "Hybrid" Processors (P-Core/E-Core) */
+
+#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */
+
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
-#define INTEL_FAM6_ALDERLAKE_N 0xBE
-#define INTEL_FAM6_RAPTORLAKE 0xB7
+#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */
#define INTEL_FAM6_RAPTORLAKE_P 0xBA
#define INTEL_FAM6_RAPTORLAKE_S 0xBF
#define INTEL_FAM6_METEORLAKE 0xAC
#define INTEL_FAM6_METEORLAKE_L 0xAA
-#define INTEL_FAM6_LUNARLAKE_M 0xBD
-
#define INTEL_FAM6_ARROWLAKE 0xC6
+#define INTEL_FAM6_LUNARLAKE_M 0xBD
+
/* "Small Core" Processors (Atom/E-Core) */
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
@@ -154,9 +155,10 @@
#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */
#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */
-#define INTEL_FAM6_SIERRAFOREST_X 0xAF
+#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */
-#define INTEL_FAM6_GRANDRIDGE 0xB6
+#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */
+#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */
/* Xeon Phi */
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 56d4ef604b91..635132a12778 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -127,8 +127,8 @@ static inline long local_cmpxchg(local_t *l, long old, long new)
static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
{
- typeof(l->a.counter) *__old = (typeof(l->a.counter) *) old;
- return try_cmpxchg_local(&l->a.counter, __old, new);
+ return try_cmpxchg_local(&l->a.counter,
+ (typeof(l->a.counter) *) old, new);
}
/* Always has a lock prefix */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 7f97a8a97e24..473b16d73b47 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -50,8 +50,8 @@ void __init sme_enable(struct boot_params *bp);
int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
-void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages,
- bool enc);
+void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr,
+ unsigned long size, bool enc);
void __init mem_encrypt_free_decrypted_mem(void);
@@ -85,7 +85,7 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0;
static inline int __init
early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
static inline void __init
-early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) {}
+early_set_mem_enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc) {}
static inline void mem_encrypt_free_decrypted_mem(void) { }
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 66dbba181bd9..bbbe9d744977 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -2,138 +2,77 @@
#ifndef _ASM_X86_MICROCODE_H
#define _ASM_X86_MICROCODE_H
-#include <asm/cpu.h>
-#include <linux/earlycpio.h>
-#include <linux/initrd.h>
-#include <asm/microcode_amd.h>
-
-struct ucode_patch {
- struct list_head plist;
- void *data; /* Intel uses only this one */
- unsigned int size;
- u32 patch_id;
- u16 equiv_cpu;
-};
-
-extern struct list_head microcode_cache;
-
struct cpu_signature {
unsigned int sig;
unsigned int pf;
unsigned int rev;
};
-struct device;
-
-enum ucode_state {
- UCODE_OK = 0,
- UCODE_NEW,
- UCODE_UPDATED,
- UCODE_NFOUND,
- UCODE_ERROR,
-};
-
-struct microcode_ops {
- enum ucode_state (*request_microcode_fw) (int cpu, struct device *);
-
- void (*microcode_fini_cpu) (int cpu);
-
- /*
- * The generic 'microcode_core' part guarantees that
- * the callbacks below run on a target cpu when they
- * are being called.
- * See also the "Synchronization" section in microcode_core.c.
- */
- enum ucode_state (*apply_microcode) (int cpu);
- int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
-};
-
struct ucode_cpu_info {
struct cpu_signature cpu_sig;
void *mc;
};
-extern struct ucode_cpu_info ucode_cpu_info[];
-struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa);
-
-#ifdef CONFIG_MICROCODE_INTEL
-extern struct microcode_ops * __init init_intel_microcode(void);
-#else
-static inline struct microcode_ops * __init init_intel_microcode(void)
-{
- return NULL;
-}
-#endif /* CONFIG_MICROCODE_INTEL */
-#ifdef CONFIG_MICROCODE_AMD
-extern struct microcode_ops * __init init_amd_microcode(void);
-extern void __exit exit_amd_microcode(void);
+#ifdef CONFIG_MICROCODE
+void load_ucode_bsp(void);
+void load_ucode_ap(void);
+void microcode_bsp_resume(void);
#else
-static inline struct microcode_ops * __init init_amd_microcode(void)
-{
- return NULL;
-}
-static inline void __exit exit_amd_microcode(void) {}
+static inline void load_ucode_bsp(void) { }
+static inline void load_ucode_ap(void) { }
+static inline void microcode_bsp_resume(void) { }
#endif
-#define MAX_UCODE_COUNT 128
+#ifdef CONFIG_CPU_SUP_INTEL
+/* Intel specific microcode defines. Public for IFS */
+struct microcode_header_intel {
+ unsigned int hdrver;
+ unsigned int rev;
+ unsigned int date;
+ unsigned int sig;
+ unsigned int cksum;
+ unsigned int ldrver;
+ unsigned int pf;
+ unsigned int datasize;
+ unsigned int totalsize;
+ unsigned int metasize;
+ unsigned int reserved[2];
+};
-#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
-#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
-#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
-#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
-#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
-#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
-#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+struct microcode_intel {
+ struct microcode_header_intel hdr;
+ unsigned int bits[];
+};
-#define CPUID_IS(a, b, c, ebx, ecx, edx) \
- (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
+#define DEFAULT_UCODE_DATASIZE (2000)
+#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
+#define MC_HEADER_TYPE_MICROCODE 1
+#define MC_HEADER_TYPE_IFS 2
-/*
- * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
- * x86_cpuid_vendor() gets vendor id for BSP.
- *
- * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
- * coding, we still use x86_cpuid_vendor() to get vendor id for AP.
- *
- * x86_cpuid_vendor() gets vendor information directly from CPUID.
- */
-static inline int x86_cpuid_vendor(void)
+static inline int intel_microcode_get_datasize(struct microcode_header_intel *hdr)
{
- u32 eax = 0x00000000;
- u32 ebx, ecx = 0, edx;
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
- return X86_VENDOR_INTEL;
-
- if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
- return X86_VENDOR_AMD;
-
- return X86_VENDOR_UNKNOWN;
+ return hdr->datasize ? : DEFAULT_UCODE_DATASIZE;
}
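The "?:" here is GNU C's two-operand conditional: datasize ?: DEFAULT_UCODE_DATASIZE evaluates datasize once and falls back to the default only when it is zero. A minimal demo (needs GCC or Clang extensions, as the kernel does):

    #include <stdio.h>

    #define DEFAULT_UCODE_DATASIZE 2000

    static unsigned int get_datasize(unsigned int datasize)
    {
        /* Equivalent to: datasize ? datasize : DEFAULT_UCODE_DATASIZE,
         * but "datasize" is evaluated only once. */
        return datasize ?: DEFAULT_UCODE_DATASIZE;
    }

    int main(void)
    {
        printf("%u %u\n", get_datasize(0), get_datasize(3072)); /* 2000 3072 */
        return 0;
    }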
-static inline unsigned int x86_cpuid_family(void)
+static inline u32 intel_get_microcode_revision(void)
{
- u32 eax = 0x00000001;
- u32 ebx, ecx = 0, edx;
+ u32 rev, dummy;
+
+ native_wrmsrl(MSR_IA32_UCODE_REV, 0);
- native_cpuid(&eax, &ebx, &ecx, &edx);
+ /* As documented in the SDM: Do a CPUID 1 here */
+ native_cpuid_eax(1);
- return x86_family(eax);
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev);
+
+ return rev;
}
-#ifdef CONFIG_MICROCODE
-extern void __init load_ucode_bsp(void);
-extern void load_ucode_ap(void);
-void reload_early_microcode(unsigned int cpu);
-extern bool initrd_gone;
-void microcode_bsp_resume(void);
-#else
-static inline void __init load_ucode_bsp(void) { }
-static inline void load_ucode_ap(void) { }
-static inline void reload_early_microcode(unsigned int cpu) { }
-static inline void microcode_bsp_resume(void) { }
-#endif
+void show_ucode_info_early(void);
+
+#else /* CONFIG_CPU_SUP_INTEL */
+static inline void show_ucode_info_early(void) { }
+#endif /* !CONFIG_CPU_SUP_INTEL */
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
deleted file mode 100644
index 9675c621c1ca..000000000000
--- a/arch/x86/include/asm/microcode_amd.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_MICROCODE_AMD_H
-#define _ASM_X86_MICROCODE_AMD_H
-
-#include <asm/microcode.h>
-
-#define UCODE_MAGIC 0x00414d44
-#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
-#define UCODE_UCODE_TYPE 0x00000001
-
-#define SECTION_HDR_SIZE 8
-#define CONTAINER_HDR_SZ 12
-
-struct equiv_cpu_entry {
- u32 installed_cpu;
- u32 fixed_errata_mask;
- u32 fixed_errata_compare;
- u16 equiv_cpu;
- u16 res;
-} __attribute__((packed));
-
-struct microcode_header_amd {
- u32 data_code;
- u32 patch_id;
- u16 mc_patch_data_id;
- u8 mc_patch_data_len;
- u8 init_flag;
- u32 mc_patch_data_checksum;
- u32 nb_dev_id;
- u32 sb_dev_id;
- u16 processor_rev_id;
- u8 nb_rev_id;
- u8 sb_rev_id;
- u8 bios_api_rev;
- u8 reserved1[3];
- u32 match_reg[8];
-} __attribute__((packed));
-
-struct microcode_amd {
- struct microcode_header_amd hdr;
- unsigned int mpb[];
-};
-
-#define PATCH_MAX_SIZE (3 * PAGE_SIZE)
-
-#ifdef CONFIG_MICROCODE_AMD
-extern void __init load_ucode_amd_bsp(unsigned int family);
-extern void load_ucode_amd_ap(unsigned int family);
-extern int __init save_microcode_in_initrd_amd(unsigned int family);
-void reload_ucode_amd(unsigned int cpu);
-extern void amd_check_microcode(void);
-#else
-static inline void __init load_ucode_amd_bsp(unsigned int family) {}
-static inline void load_ucode_amd_ap(unsigned int family) {}
-static inline int __init
-save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
-static inline void reload_ucode_amd(unsigned int cpu) {}
-static inline void amd_check_microcode(void) {}
-#endif
-#endif /* _ASM_X86_MICROCODE_AMD_H */
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
deleted file mode 100644
index f1fa979e05bf..000000000000
--- a/arch/x86/include/asm/microcode_intel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_MICROCODE_INTEL_H
-#define _ASM_X86_MICROCODE_INTEL_H
-
-#include <asm/microcode.h>
-
-struct microcode_header_intel {
- unsigned int hdrver;
- unsigned int rev;
- unsigned int date;
- unsigned int sig;
- unsigned int cksum;
- unsigned int ldrver;
- unsigned int pf;
- unsigned int datasize;
- unsigned int totalsize;
- unsigned int metasize;
- unsigned int reserved[2];
-};
-
-struct microcode_intel {
- struct microcode_header_intel hdr;
- unsigned int bits[];
-};
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
- unsigned int sig;
- unsigned int pf;
- unsigned int cksum;
-};
-
-struct extended_sigtable {
- unsigned int count;
- unsigned int cksum;
- unsigned int reserved[3];
- struct extended_signature sigs[];
-};
-
-#define DEFAULT_UCODE_DATASIZE (2000)
-#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
-#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
-#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
-#define MC_HEADER_TYPE_MICROCODE 1
-#define MC_HEADER_TYPE_IFS 2
-
-#define get_totalsize(mc) \
- (((struct microcode_intel *)mc)->hdr.datasize ? \
- ((struct microcode_intel *)mc)->hdr.totalsize : \
- DEFAULT_UCODE_TOTALSIZE)
-
-#define get_datasize(mc) \
- (((struct microcode_intel *)mc)->hdr.datasize ? \
- ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
-static inline u32 intel_get_microcode_revision(void)
-{
- u32 rev, dummy;
-
- native_wrmsrl(MSR_IA32_UCODE_REV, 0);
-
- /* As documented in the SDM: Do a CPUID 1 here */
- native_cpuid_eax(1);
-
- /* get the current revision from MSR 0x8B */
- native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev);
-
- return rev;
-}
-
-#ifdef CONFIG_MICROCODE_INTEL
-extern void __init load_ucode_intel_bsp(void);
-extern void load_ucode_intel_ap(void);
-extern void show_ucode_info_early(void);
-extern int __init save_microcode_in_initrd_intel(void);
-void reload_ucode_intel(void);
-#else
-static inline __init void load_ucode_intel_bsp(void) {}
-static inline void load_ucode_intel_ap(void) {}
-static inline void show_ucode_info_early(void) {}
-static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; }
-static inline void reload_ucode_intel(void) {}
-#endif
-
-#endif /* _ASM_X86_MICROCODE_INTEL_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index b49778664d2b..6c8ff12140ae 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -739,6 +739,7 @@ static __always_inline unsigned long arch_local_irq_save(void)
".popsection")
extern void default_banner(void);
+void native_pv_lock_init(void) __init;
#else /* __ASSEMBLY__ */
@@ -778,6 +779,12 @@ extern void default_banner(void);
#endif /* __ASSEMBLY__ */
#else /* CONFIG_PARAVIRT */
# define default_banner x86_init_noop
+
+#ifndef __ASSEMBLY__
+static inline void native_pv_lock_init(void)
+{
+}
+#endif
#endif /* !CONFIG_PARAVIRT */
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fd750247ca89..861e53e201e9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -678,11 +678,13 @@ extern u32 amd_get_nodes_per_socket(void);
extern u32 amd_get_highest_perf(void);
extern bool cpu_has_ibpb_brtype_microcode(void);
extern void amd_clear_divider(void);
+extern void amd_check_microcode(void);
#else
static inline u32 amd_get_nodes_per_socket(void) { return 0; }
static inline u32 amd_get_highest_perf(void) { return 0; }
static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; }
static inline void amd_clear_divider(void) { }
+static inline void amd_check_microcode(void) { }
#endif
extern unsigned long arch_align_stack(unsigned long sp);
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index d87451df480b..cde8357bb226 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -74,8 +74,6 @@ static inline bool vcpu_is_preempted(long cpu)
*/
DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key);
-void native_pv_lock_init(void) __init;
-
/*
* Shortcut for the queued_spin_lock_slowpath() function that allows
* virt to hijack it.
@@ -103,10 +101,7 @@ static inline bool virt_spin_lock(struct qspinlock *lock)
return true;
}
-#else
-static inline void native_pv_lock_init(void)
-{
-}
+
#endif /* CONFIG_PARAVIRT */
#include <asm-generic/qspinlock.h>
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 42b17cf10b10..85b6e3609cb9 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -4,6 +4,8 @@
#include <asm/ibt.h>
+void __lockfunc __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked);
+
/*
* For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
* registers. For i386, however, only 1 32-bit register needs to be saved
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 66c806784c52..5b4a1ce3d368 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -164,6 +164,7 @@ static __always_inline void sev_es_nmi_complete(void)
__sev_es_nmi_complete();
}
extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
+extern void sev_enable(struct boot_params *bp);
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs)
{
@@ -210,12 +211,15 @@ bool snp_init(struct boot_params *bp);
void __init __noreturn snp_abort(void);
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
+u64 snp_get_unsupported_features(u64 status);
+u64 sev_get_status(void);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
static inline void sev_es_nmi_complete(void) { }
static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
+static inline void sev_enable(struct boot_params *bp) { }
static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; }
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; }
static inline void setup_ghcb(void) { }
@@ -235,6 +239,8 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in
}
static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
+static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
+static inline u64 sev_get_status(void) { return 0; }
#endif
#endif
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index caf41c4869a0..3235ba1e5b06 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -136,10 +136,11 @@ static inline int topology_max_smt_threads(void)
return __max_smt_threads;
}
+#include <linux/cpu_smt.h>
+
int topology_update_package_map(unsigned int apicid, unsigned int cpu);
int topology_update_die_map(unsigned int dieid, unsigned int cpu);
int topology_phys_to_logical_pkg(unsigned int pkg);
-bool topology_smt_supported(void);
extern struct cpumask __cpu_primary_thread_mask;
#define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask)
@@ -162,7 +163,6 @@ static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
static inline int topology_max_die_per_package(void) { return 1; }
static inline int topology_max_smt_threads(void) { return 1; }
static inline bool topology_is_primary_thread(unsigned int cpu) { return true; }
-static inline bool topology_smt_supported(void) { return false; }
#endif /* !CONFIG_SMP */
static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 1b6455f881f9..6989b824fd32 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -10,6 +10,7 @@
* Copyright (c) Russ Anderson <rja@sgi.com>
*/
+#include <linux/efi.h>
#include <linux/rtc.h>
/*
@@ -115,7 +116,8 @@ struct uv_arch_type_entry {
struct uv_systab {
char signature[4]; /* must be UV_SYSTAB_SIG */
u32 revision; /* distinguish different firmware revs */
- u64 function; /* BIOS runtime callback function ptr */
+ u64 (__efiapi *function)(enum uv_bios_cmd, ...);
+ /* BIOS runtime callback function ptr */
u32 size; /* systab size (starting with _VERSION_UV4) */
struct {
u32 type:8; /* type of entry */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index fa9ec20783fa..85e63d58c074 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -295,7 +295,10 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
/* VIRT <-> MACHINE conversion */
#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
-#define virt_to_pfn(v) (PFN_DOWN(__pa(v)))
+static inline unsigned long virt_to_pfn(const void *v)
+{
+ return PFN_DOWN(__pa(v));
+}
#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
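Turning virt_to_pfn() into a static inline gives the argument a checked pointer type (including const), so integer misuse now warns instead of compiling silently. A standalone illustration; the page shift and address math are placeholders, not the kernel's:

    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_PAGE_SHIFT 12 /* placeholder */

    /* Macro version: any integer-ish argument slips through unchecked. */
    #define virt_to_pfn_macro(v) ((uintptr_t)(v) >> DEMO_PAGE_SHIFT)

    /* Inline version: the compiler now enforces a (const-friendly) pointer. */
    static inline unsigned long demo_virt_to_pfn(const void *v)
    {
        return (uintptr_t)v >> DEMO_PAGE_SHIFT;
    }

    int main(void)
    {
        static const char buf[64];

        printf("pfn = %lu\n", demo_virt_to_pfn(buf));
        /* demo_virt_to_pfn(12345) would now draw an int-to-pointer
         * diagnostic, while virt_to_pfn_macro(12345) compiled silently. */
        return 0;
    }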
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 099d58d02a26..a5ead6a6d233 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1527,6 +1527,7 @@ static noinline void __init int3_selftest(void)
static __initdata int __alt_reloc_selftest_addr;
+extern void __init __alt_reloc_selftest(void *arg);
__visible noinline void __init __alt_reloc_selftest(void *arg)
{
WARN_ON(arg != &__alt_reloc_selftest_addr);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 035a3db5330b..356de955e78d 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -24,6 +24,8 @@
#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8
#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8
+#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a
+#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507
#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
@@ -39,6 +41,7 @@
#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4
#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4
#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc
+#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4
#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4
/* Protect the PCI config register pairs used for SMN. */
@@ -56,6 +59,8 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) },
{}
};
@@ -85,6 +90,8 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
{}
};
@@ -106,6 +113,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
{}
};
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 2a6509e8c840..9bfd6e397384 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -301,6 +301,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
local_irq_restore(flags);
}
+#ifdef CONFIG_SMP
/* must come after the send_IPI functions above for inlining */
static int convert_apicid_to_cpu(int apic_id)
{
@@ -329,3 +330,4 @@ int safe_smp_processor_id(void)
return cpuid >= 0 ? cpuid : 0;
}
#endif
+#endif
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d9384d5b4b8e..b524dee1cbbb 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -294,8 +294,7 @@ static void __init early_get_apic_socketid_shift(void)
static void __init uv_stringify(int len, char *to, char *from)
{
- /* Relies on 'to' being NULL chars so result will be NULL terminated */
- strncpy(to, from, len-1);
+ strscpy(to, from, len);
/* Trim trailing spaces */
(void)strim(to);
@@ -1013,7 +1012,7 @@ static void __init calc_mmioh_map(enum mmioh_arch index,
/* One (UV2) mapping */
if (index == UV2_MMIOH) {
- strncpy(id, "MMIOH", sizeof(id));
+ strscpy(id, "MMIOH", sizeof(id));
max_io = max_pnode;
mapped = 0;
goto map_exit;
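strscpy() is preferred over strncpy() here because it always NUL-terminates and signals truncation. A rough userspace model of that contract, assuming the usual semantics (copy at most size-1 characters, return the copied length or -E2BIG); the kernel's implementation is considerably more optimized:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    /* Portable model of strscpy(): always NUL-terminate (for size > 0),
     * report truncation with -E2BIG, else return the copied length. */
    static long demo_strscpy(char *dst, const char *src, size_t size)
    {
        size_t len;

        if (size == 0)
            return -E2BIG;

        len = strnlen(src, size);
        if (len == size) {          /* src does not fit: truncate */
            len = size - 1;
            memcpy(dst, src, len);
            dst[len] = '\0';
            return -E2BIG;
        }
        memcpy(dst, src, len + 1);  /* fits, including the NUL */
        return (long)len;
    }

    int main(void)
    {
        char id[6];

        printf("%ld '%s'\n", demo_strscpy(id, "MMIOH", sizeof(id)), id);   /* 5 'MMIOH' */
        printf("%ld '%s'\n", demo_strscpy(id, "MMIOH66", sizeof(id)), id); /* -7 'MMIOH' */
        return 0;
    }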
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e3a65e9fc750..41b573f34a10 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -59,7 +59,6 @@
#include <asm/cacheinfo.h>
#include <asm/memtype.h>
#include <asm/microcode.h>
-#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/uv/uv.h>
@@ -2300,8 +2299,7 @@ void store_cpu_caps(struct cpuinfo_x86 *curr_info)
* @prev_info: CPU capabilities stored before an update.
*
* The microcode loader calls this upon late microcode load to recheck features,
- * only when microcode has been updated. Caller holds microcode_mutex and CPU
- * hotplug lock.
+ * only when microcode has been updated. Caller holds the CPU hotplug lock.

*
* Return: None
*/
@@ -2343,7 +2341,7 @@ void __init arch_cpu_finalize_init(void)
* identify_boot_cpu() initialized SMT support information, let the
* core code know.
*/
- cpu_smt_check_topology();
+ cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings);
if (!IS_ENABLED(CONFIG_SMP)) {
pr_info("CPU: ");
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1c4639588ff9..be4045628fd3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -20,7 +20,7 @@
#include <asm/bugs.h>
#include <asm/cpu.h>
#include <asm/intel-family.h>
-#include <asm/microcode_intel.h>
+#include <asm/microcode.h>
#include <asm/hwcap2.h>
#include <asm/elf.h>
#include <asm/cpu_device_id.h>
@@ -184,180 +184,6 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
return false;
}
-int intel_cpu_collect_info(struct ucode_cpu_info *uci)
-{
- unsigned int val[2];
- unsigned int family, model;
- struct cpu_signature csig = { 0 };
- unsigned int eax, ebx, ecx, edx;
-
- memset(uci, 0, sizeof(*uci));
-
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- csig.sig = eax;
-
- family = x86_family(eax);
- model = x86_model(eax);
-
- if (model >= 5 || family > 6) {
- /* get processor flags from MSR 0x17 */
- native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig.pf = 1 << ((val[1] >> 18) & 7);
- }
-
- csig.rev = intel_get_microcode_revision();
-
- uci->cpu_sig = csig;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(intel_cpu_collect_info);
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int intel_find_matching_signature(void *mc, unsigned int csig, int cpf)
-{
- struct microcode_header_intel *mc_hdr = mc;
- struct extended_sigtable *ext_hdr;
- struct extended_signature *ext_sig;
- int i;
-
- if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
- return 1;
-
- /* Look for ext. headers: */
- if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
- return 0;
-
- ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
- ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
-
- for (i = 0; i < ext_hdr->count; i++) {
- if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
- return 1;
- ext_sig++;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(intel_find_matching_signature);
-
-/**
- * intel_microcode_sanity_check() - Sanity check microcode file.
- * @mc: Pointer to the microcode file contents.
- * @print_err: Display failure reason if true, silent if false.
- * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file.
- * Validate if the microcode header type matches with the type
- * specified here.
- *
- * Validate certain header fields and verify if computed checksum matches
- * with the one specified in the header.
- *
- * Return: 0 if the file passes all the checks, -EINVAL if any of the checks
- * fail.
- */
-int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type)
-{
- unsigned long total_size, data_size, ext_table_size;
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header = NULL;
- u32 sum, orig_sum, ext_sigcount = 0, i;
- struct extended_signature *ext_sig;
-
- total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
-
- if (data_size + MC_HEADER_SIZE > total_size) {
- if (print_err)
- pr_err("Error: bad microcode data file size.\n");
- return -EINVAL;
- }
-
- if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) {
- if (print_err)
- pr_err("Error: invalid/unknown microcode update format. Header type %d\n",
- mc_header->hdrver);
- return -EINVAL;
- }
-
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- u32 ext_table_sum = 0;
- u32 *ext_tablep;
-
- if (ext_table_size < EXT_HEADER_SIZE ||
- ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- if (print_err)
- pr_err("Error: truncated extended signature table.\n");
- return -EINVAL;
- }
-
- ext_header = mc + MC_HEADER_SIZE + data_size;
- if (ext_table_size != exttable_size(ext_header)) {
- if (print_err)
- pr_err("Error: extended signature table size mismatch.\n");
- return -EFAULT;
- }
-
- ext_sigcount = ext_header->count;
-
- /*
- * Check extended table checksum: the sum of all dwords that
- * comprise a valid table must be 0.
- */
- ext_tablep = (u32 *)ext_header;
-
- i = ext_table_size / sizeof(u32);
- while (i--)
- ext_table_sum += ext_tablep[i];
-
- if (ext_table_sum) {
- if (print_err)
- pr_warn("Bad extended signature table checksum, aborting.\n");
- return -EINVAL;
- }
- }
-
- /*
- * Calculate the checksum of update data and header. The checksum of
- * valid update data and header including the extended signature table
- * must be 0.
- */
- orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
- while (i--)
- orig_sum += ((u32 *)mc)[i];
-
- if (orig_sum) {
- if (print_err)
- pr_err("Bad microcode data checksum, aborting.\n");
- return -EINVAL;
- }
-
- if (!ext_table_size)
- return 0;
-
- /*
- * Check extended signature checksum: 0 => valid.
- */
- for (i = 0; i < ext_sigcount; i++) {
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
- EXT_SIGNATURE_SIZE * i;
-
- sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
- (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
- if (sum) {
- if (print_err)
- pr_err("Bad extended signature checksum, aborting.\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(intel_microcode_sanity_check);
-
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index 3b8476158236..e4c3ba91321c 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -206,7 +206,7 @@ static int intel_epb_offline(unsigned int cpu)
static const struct x86_cpu_id intel_epb_normal[] = {
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,
ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N,
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT,
ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,
ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 89e2aab5d34d..6f35f724cc14 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -843,6 +843,26 @@ static noinstr bool quirk_skylake_repmov(void)
}
/*
+ * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
+ * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
+ *
+ * However, the context is still valid, so save the "cs" register for later use.
+ *
+ * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
+ *
+ * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
+ */
+static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 1)
+ return;
+ if (!(m->status & MCI_STATUS_POISON))
+ return;
+
+ m->cs = regs->cs;
+}
+
+/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
@@ -861,6 +881,9 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo
if (mce_flags.snb_ifu_quirk)
quirk_sandybridge_ifu(i, m, regs);
+ if (mce_flags.zen_ifu_quirk)
+ quirk_zen_ifu(i, m, regs);
+
m->bank = i;
if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
mce_read_aux(m, i);
@@ -1608,6 +1631,13 @@ static void __start_timer(struct timer_list *t, unsigned long interval)
local_irq_restore(flags);
}
+static void mc_poll_banks_default(void)
+{
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+}
+
+void (*mc_poll_banks)(void) = mc_poll_banks_default;
+
static void mce_timer_fn(struct timer_list *t)
{
struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
@@ -1618,7 +1648,7 @@ static void mce_timer_fn(struct timer_list *t)
iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info))) {
- machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+ mc_poll_banks();
if (mce_intel_cmci_poll()) {
iv = mce_adjust_timer(iv);
@@ -1842,6 +1872,9 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;
+ if (c->x86 >= 0x17 && c->x86 <= 0x1A)
+ mce_flags.zen_ifu_quirk = 1;
+
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 95275a5e57e0..f5323551c1a9 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -56,6 +56,13 @@ static DEFINE_PER_CPU(int, cmci_backoff_cnt);
*/
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+/*
+ * On systems that support CMCI but have it disabled, polling for MCEs can
+ * cause the same event to be reported multiple times because IA32_MCi_STATUS
+ * is shared by all CPUs in the same package.
+ */
+static DEFINE_SPINLOCK(cmci_poll_lock);
+
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
#define CMCI_STORM_INTERVAL (HZ)
@@ -426,12 +433,22 @@ void cmci_disable_bank(int bank)
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
+/* Bank polling function when CMCI is disabled. */
+static void cmci_mc_poll_banks(void)
+{
+ spin_lock(&cmci_poll_lock);
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+ spin_unlock(&cmci_poll_lock);
+}
+
void intel_init_cmci(void)
{
int banks;
- if (!cmci_supported(&banks))
+ if (!cmci_supported(&banks)) {
+ mc_poll_banks = cmci_mc_poll_banks;
return;
+ }
mce_threshold_vector = intel_threshold_interrupt;
cmci_discover(banks);
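mc_poll_banks starts as the default poller and is overridden exactly once, at CMCI init, when the hardware signaling path is unavailable; the MCE timer then always calls through the pointer. A compact sketch of this default-plus-override hook pattern with illustrative names (build with -pthread):

    #include <pthread.h>
    #include <stdio.h>

    static void poll_default(void)
    {
        puts("plain poll");
    }

    /* Callers always go through this pointer, like mc_poll_banks above. */
    static void (*poll_banks)(void) = poll_default;

    static pthread_mutex_t poll_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Variant that serializes pollers sharing one status resource. */
    static void poll_serialized(void)
    {
        pthread_mutex_lock(&poll_lock);
        puts("serialized poll");
        pthread_mutex_unlock(&poll_lock);
    }

    static void init(int cmci_supported)
    {
        if (!cmci_supported)
            poll_banks = poll_serialized; /* override once, at init */
    }

    int main(void)
    {
        init(0);
        poll_banks(); /* the "timer" path never checks which variant runs */
        return 0;
    }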
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index d2412ce2d312..bcf1b3c66c9c 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -157,6 +157,9 @@ struct mce_vendor_flags {
*/
smca : 1,
+ /* Zen IFU quirk */
+ zen_ifu_quirk : 1,
+
/* AMD-style error thresholding banks present. */
amd_threshold : 1,
@@ -172,7 +175,7 @@ struct mce_vendor_flags {
/* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
skx_repmov_quirk : 1,
- __reserved_0 : 56;
+ __reserved_0 : 55;
};
extern struct mce_vendor_flags mce_flags;
@@ -274,4 +277,5 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
return 0;
}
+extern void (*mc_poll_banks)(void);
#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
index 34098d48c48f..193d98b33a0a 100644
--- a/arch/x86/kernel/cpu/microcode/Makefile
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
microcode-y := core.o
obj-$(CONFIG_MICROCODE) += microcode.o
-microcode-$(CONFIG_MICROCODE_INTEL) += intel.o
-microcode-$(CONFIG_MICROCODE_AMD) += amd.o
+microcode-$(CONFIG_CPU_SUP_INTEL) += intel.o
+microcode-$(CONFIG_CPU_SUP_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 87208e46f7ed..bbd1dc38ea03 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -29,13 +29,53 @@
#include <linux/kernel.h>
#include <linux/pci.h>
-#include <asm/microcode_amd.h>
#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/cpu.h>
#include <asm/msr.h>
+#include "internal.h"
+
+#define UCODE_MAGIC 0x00414d44
+#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
+#define UCODE_UCODE_TYPE 0x00000001
+
+#define SECTION_HDR_SIZE 8
+#define CONTAINER_HDR_SZ 12
+
+struct equiv_cpu_entry {
+ u32 installed_cpu;
+ u32 fixed_errata_mask;
+ u32 fixed_errata_compare;
+ u16 equiv_cpu;
+ u16 res;
+} __packed;
+
+struct microcode_header_amd {
+ u32 data_code;
+ u32 patch_id;
+ u16 mc_patch_data_id;
+ u8 mc_patch_data_len;
+ u8 init_flag;
+ u32 mc_patch_data_checksum;
+ u32 nb_dev_id;
+ u32 sb_dev_id;
+ u16 processor_rev_id;
+ u8 nb_rev_id;
+ u8 sb_rev_id;
+ u8 bios_api_rev;
+ u8 reserved1[3];
+ u32 match_reg[8];
+} __packed;
+
+struct microcode_amd {
+ struct microcode_header_amd hdr;
+ unsigned int mpb[];
+};
+
+#define PATCH_MAX_SIZE (3 * PAGE_SIZE)
+
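These structures describe AMD's on-disk container format, hence __packed and the move into the only translation unit that parses them. As a hedged sketch of handling such packed headers safely; the magic value and fields below are illustrative, not AMD's real layout:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define DEMO_MAGIC 0x00414d44u /* illustrative, not AMD's container check */

    struct demo_header {
        uint32_t magic;
        uint32_t type;
        uint32_t size;
    } __attribute__((packed));

    static int parse(const uint8_t *buf, size_t len)
    {
        struct demo_header hdr;

        if (len < sizeof(hdr))
            return -1;                  /* never trust the blob's length */
        memcpy(&hdr, buf, sizeof(hdr)); /* copy out: no misaligned access */
        if (hdr.magic != DEMO_MAGIC)
            return -1;
        printf("type %u, payload %u bytes\n", hdr.type, hdr.size);
        return 0;
    }

    int main(void)
    {
        const uint8_t blob[12] = { 0x44, 0x4d, 0x41, 0x00,  /* magic (LE) */
                                   1, 0, 0, 0,              /* type */
                                   16, 0, 0, 0 };           /* size */

        return parse(blob, sizeof(blob));
    }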
static struct equiv_cpu_table {
unsigned int num_entries;
struct equiv_cpu_entry *entry;
@@ -56,9 +96,6 @@ struct cont_desc {
static u32 ucode_new_rev;
-/* One blob per node. */
-static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE];
-
/*
* Microcode patch container file is prepended to the initrd in cpio
* format. See Documentation/arch/x86/microcode.rst
@@ -415,20 +452,17 @@ static int __apply_microcode_amd(struct microcode_amd *mc)
*
* Returns true if container found (sets @desc), false otherwise.
*/
-static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch)
+static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size)
{
struct cont_desc desc = { 0 };
- u8 (*patch)[PATCH_MAX_SIZE];
struct microcode_amd *mc;
u32 rev, dummy, *new_rev;
bool ret = false;
#ifdef CONFIG_X86_32
new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
- patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
#else
new_rev = &ucode_new_rev;
- patch = &amd_ucode_patch[0];
#endif
desc.cpuid_1_eax = cpuid_1_eax;
@@ -452,9 +486,6 @@ static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, boo
if (!__apply_microcode_amd(mc)) {
*new_rev = mc->hdr.patch_id;
ret = true;
-
- if (save_patch)
- memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE));
}
return ret;
@@ -507,7 +538,7 @@ static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data
*ret = cp;
}
-void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
+static void apply_ucode_from_containers(unsigned int cpuid_1_eax)
{
struct cpio_data cp = { };
@@ -515,42 +546,12 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
if (!(cp.data && cp.size))
return;
- early_apply_microcode(cpuid_1_eax, cp.data, cp.size, true);
+ early_apply_microcode(cpuid_1_eax, cp.data, cp.size);
}
-void load_ucode_amd_ap(unsigned int cpuid_1_eax)
+void load_ucode_amd_early(unsigned int cpuid_1_eax)
{
- struct microcode_amd *mc;
- struct cpio_data cp;
- u32 *new_rev, rev, dummy;
-
- if (IS_ENABLED(CONFIG_X86_32)) {
- mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
- new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
- } else {
- mc = (struct microcode_amd *)amd_ucode_patch;
- new_rev = &ucode_new_rev;
- }
-
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
- /*
- * Check whether a new patch has been saved already. Also, allow application of
- * the same revision in order to pick up SMT-thread-specific configuration even
- * if the sibling SMT thread already has an up-to-date revision.
- */
- if (*new_rev && rev <= mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc)) {
- *new_rev = mc->hdr.patch_id;
- return;
- }
- }
-
- find_blobs_in_containers(cpuid_1_eax, &cp);
- if (!(cp.data && cp.size))
- return;
-
- early_apply_microcode(cpuid_1_eax, cp.data, cp.size, false);
+ return apply_ucode_from_containers(cpuid_1_eax);
}
static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
@@ -578,23 +579,6 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
return 0;
}
-void reload_ucode_amd(unsigned int cpu)
-{
- u32 rev, dummy __always_unused;
- struct microcode_amd *mc;
-
- mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)];
-
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
- if (rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc)) {
- ucode_new_rev = mc->hdr.patch_id;
- pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
- }
- }
-}
-
/*
* a small, trivial cache of per-family ucode patches
*/
@@ -655,6 +639,28 @@ static struct ucode_patch *find_patch(unsigned int cpu)
return cache_find_patch(equiv_id);
}
+void reload_ucode_amd(unsigned int cpu)
+{
+ u32 rev, dummy __always_unused;
+ struct microcode_amd *mc;
+ struct ucode_patch *p;
+
+ p = find_patch(cpu);
+ if (!p)
+ return;
+
+ mc = p->data;
+
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ if (rev < mc->hdr.patch_id) {
+ if (!__apply_microcode_amd(mc)) {
+ ucode_new_rev = mc->hdr.patch_id;
+ pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
+ }
+ }
+}
+
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -875,9 +881,6 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz
continue;
ret = UCODE_NEW;
-
- memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE);
- memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, p->size, PATCH_MAX_SIZE));
}
return ret;
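
The reworked reload path above drops the fixed per-node amd_ucode_patch buffer and instead looks the patch up in the generic cache via find_patch(). A minimal sketch of the revision gate it applies, with types trimmed down to illustrative stand-ins:

    /* Sketch only; 'cached_patch' stands in for the kernel's ucode_patch. */
    struct cached_patch { unsigned int patch_id; };

    static int should_apply(unsigned int cpu_rev, const struct cached_patch *p)
    {
            if (!p)                         /* nothing cached for this CPU */
                    return 0;
            return cpu_rev < p->patch_id;   /* only upgrade, never downgrade */
    }
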
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 3afcf3de0dd4..6cc7a2c181da 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -31,15 +31,14 @@
#include <linux/fs.h>
#include <linux/mm.h>
-#include <asm/microcode_intel.h>
#include <asm/cpu_device_id.h>
-#include <asm/microcode_amd.h>
#include <asm/perf_event.h>
-#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/cmdline.h>
#include <asm/setup.h>
+#include "internal.h"
+
#define DRIVER_VERSION "2.2"
static struct microcode_ops *microcode_ops;
@@ -54,15 +53,12 @@ LIST_HEAD(microcode_cache);
*
* All non cpu-hotplug-callback call sites use:
*
- * - microcode_mutex to synchronize with each other;
* - cpus_read_lock/unlock() to synchronize with
* the cpu-hotplug-callback call sites.
*
* We guarantee that only a single cpu is being
* updated at any particular moment of time.
*/
-static DEFINE_MUTEX(microcode_mutex);
-
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
struct cpu_info_ctx {
@@ -172,7 +168,7 @@ void __init load_ucode_bsp(void)
if (intel)
load_ucode_intel_bsp();
else
- load_ucode_amd_bsp(cpuid_1_eax);
+ load_ucode_amd_early(cpuid_1_eax);
}
static bool check_loader_disabled_ap(void)
@@ -200,7 +196,7 @@ void load_ucode_ap(void)
break;
case X86_VENDOR_AMD:
if (x86_family(cpuid_1_eax) >= 0x10)
- load_ucode_amd_ap(cpuid_1_eax);
+ load_ucode_amd_early(cpuid_1_eax);
break;
default:
break;
@@ -298,7 +294,7 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
#endif
}
-void reload_early_microcode(unsigned int cpu)
+static void reload_early_microcode(unsigned int cpu)
{
int vendor, family;
@@ -488,10 +484,7 @@ static ssize_t reload_store(struct device *dev,
if (tmp_ret != UCODE_NEW)
goto put;
- mutex_lock(&microcode_mutex);
ret = microcode_reload_late();
- mutex_unlock(&microcode_mutex);
-
put:
cpus_read_unlock();
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 467cf37ea90a..94dd6af9c963 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -10,15 +10,7 @@
* Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
* H Peter Anvin" <hpa@zytor.com>
*/
-
-/*
- * This needs to be before all headers so that pr_debug in printk.h doesn't turn
- * printk calls into no_printk().
- *
- *#define DEBUG
- */
#define pr_fmt(fmt) "microcode: " fmt
-
#include <linux/earlycpio.h>
#include <linux/firmware.h>
#include <linux/uaccess.h>
@@ -30,13 +22,14 @@
#include <linux/uio.h>
#include <linux/mm.h>
-#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
#include <asm/msr.h>
+#include "internal.h"
+
static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
/* Current microcode patch used in early patching on the APs. */
@@ -45,6 +38,208 @@ static struct microcode_intel *intel_ucode_patch;
/* last level cache size per core */
static int llc_size_per_core;
+/* The microcode format is extended starting with Prescott processors */
+struct extended_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
+};
+
+struct extended_sigtable {
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
+ struct extended_signature sigs[];
+};
+
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
+
+static inline unsigned int get_totalsize(struct microcode_header_intel *hdr)
+{
+ return hdr->datasize ? hdr->totalsize : DEFAULT_UCODE_TOTALSIZE;
+}
+
+static inline unsigned int exttable_size(struct extended_sigtable *et)
+{
+ return et->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE;
+}
+
+int intel_cpu_collect_info(struct ucode_cpu_info *uci)
+{
+ unsigned int val[2];
+ unsigned int family, model;
+ struct cpu_signature csig = { 0 };
+ unsigned int eax, ebx, ecx, edx;
+
+ memset(uci, 0, sizeof(*uci));
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ csig.sig = eax;
+
+ family = x86_family(eax);
+ model = x86_model(eax);
+
+ if (model >= 5 || family > 6) {
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig.pf = 1 << ((val[1] >> 18) & 7);
+ }
+
+ csig.rev = intel_get_microcode_revision();
+
+ uci->cpu_sig = csig;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_cpu_collect_info);
+
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+int intel_find_matching_signature(void *mc, unsigned int csig, int cpf)
+{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_sigtable *ext_hdr;
+ struct extended_signature *ext_sig;
+ int i;
+
+ if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+ return 1;
+
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return 0;
+
+ ext_hdr = mc + intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+ return 1;
+ ext_sig++;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_find_matching_signature);
+
+/**
+ * intel_microcode_sanity_check() - Sanity check microcode file.
+ * @mc: Pointer to the microcode file contents.
+ * @print_err: Display failure reason if true, silent if false.
+ * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file.
+ * Validate if the microcode header type matches with the type
+ * specified here.
+ *
+ * Validate certain header fields and verify if computed checksum matches
+ * with the one specified in the header.
+ *
+ * Return: 0 if the file passes all the checks, -EINVAL if any of the checks
+ * fail.
+ */
+int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type)
+{
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
+
+ total_size = get_totalsize(mc_header);
+ data_size = intel_microcode_get_datasize(mc_header);
+
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("Error: bad microcode data file size.\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) {
+ if (print_err)
+ pr_err("Error: invalid/unknown microcode update format. Header type %d\n",
+ mc_header->hdrver);
+ return -EINVAL;
+ }
+
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
+
+ if (ext_table_size < EXT_HEADER_SIZE ||
+ ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("Error: truncated extended signature table.\n");
+ return -EINVAL;
+ }
+
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("Error: extended signature table size mismatch.\n");
+ return -EFAULT;
+ }
+
+ ext_sigcount = ext_header->count;
+
+ /*
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
+ */
+ ext_tablep = (u32 *)ext_header;
+
+ i = ext_table_size / sizeof(u32);
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("Bad extended signature table checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+ while (i--)
+ orig_sum += ((u32 *)mc)[i];
+
+ if (orig_sum) {
+ if (print_err)
+ pr_err("Bad microcode data checksum, aborting.\n");
+ return -EINVAL;
+ }
+
+ if (!ext_table_size)
+ return 0;
+
+ /*
+ * Check extended signature checksum: 0 => valid.
+ */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("Bad extended signature checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_microcode_sanity_check);
+
/*
* Returns 1 if update has been found, 0 otherwise.
*/
@@ -202,86 +397,6 @@ next:
return patch;
}
-static void show_saved_mc(void)
-{
-#ifdef DEBUG
- int i = 0, j;
- unsigned int sig, pf, rev, total_size, data_size, date;
- struct ucode_cpu_info uci;
- struct ucode_patch *p;
-
- if (list_empty(&microcode_cache)) {
- pr_debug("no microcode data saved.\n");
- return;
- }
-
- intel_cpu_collect_info(&uci);
-
- sig = uci.cpu_sig.sig;
- pf = uci.cpu_sig.pf;
- rev = uci.cpu_sig.rev;
- pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
-
- list_for_each_entry(p, &microcode_cache, plist) {
- struct microcode_header_intel *mc_saved_header;
- struct extended_sigtable *ext_header;
- struct extended_signature *ext_sig;
- int ext_sigcount;
-
- mc_saved_header = (struct microcode_header_intel *)p->data;
-
- sig = mc_saved_header->sig;
- pf = mc_saved_header->pf;
- rev = mc_saved_header->rev;
- date = mc_saved_header->date;
-
- total_size = get_totalsize(mc_saved_header);
- data_size = get_datasize(mc_saved_header);
-
- pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n",
- i++, sig, pf, rev, total_size,
- date & 0xffff,
- date >> 24,
- (date >> 16) & 0xff);
-
- /* Look for ext. headers: */
- if (total_size <= data_size + MC_HEADER_SIZE)
- continue;
-
- ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE;
- ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
-
- for (j = 0; j < ext_sigcount; j++) {
- sig = ext_sig->sig;
- pf = ext_sig->pf;
-
- pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
- j, sig, pf);
-
- ext_sig++;
- }
- }
-#endif
-}
-
-/*
- * Save this microcode patch. It will be loaded early when a CPU is
- * hot-added or resumes.
- */
-static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size)
-{
- /* Synchronization during CPU hotplug. */
- static DEFINE_MUTEX(x86_cpu_microcode_mutex);
-
- mutex_lock(&x86_cpu_microcode_mutex);
-
- save_microcode_patch(uci, mc, size);
- show_saved_mc();
-
- mutex_unlock(&x86_cpu_microcode_mutex);
-}
-
static bool load_builtin_intel_microcode(struct cpio_data *cp)
{
unsigned int eax = 1, ebx, ecx = 0, edx;
@@ -428,9 +543,6 @@ int __init save_microcode_in_initrd_intel(void)
intel_cpu_collect_info(&uci);
scan_microcode(cp.data, cp.size, &uci, true);
-
- show_saved_mc();
-
return 0;
}
@@ -701,12 +813,8 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
vfree(uci->mc);
uci->mc = (struct microcode_intel *)new_mc;
- /*
- * If early loading microcode is supported, save this mc into
- * permanent memory. So it will be loaded early when a CPU is hot added
- * or resumes.
- */
- save_mc_for_early(uci, new_mc, new_mc_size);
+ /* Save for CPU hotplug */
+ save_microcode_patch(uci, new_mc, new_mc_size);
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
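
The sanity checker added above relies on a simple invariant of the Intel microcode format: the 32-bit sum of every dword in a valid header-plus-data region (and, separately, in a valid extended signature table) is zero, so verification is a single pass. A small standalone C illustration of that check, using a made-up four-dword image:

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t dword_sum(const uint32_t *p, size_t ndwords)
    {
            uint32_t sum = 0;
            size_t i;

            for (i = 0; i < ndwords; i++)
                    sum += p[i];
            return sum;                     /* 0 => checksum OK */
    }

    int main(void)
    {
            uint32_t img[4] = { 0x11, 0x22, 0x33, 0 };

            img[3] = -(img[0] + img[1] + img[2]);   /* checksum fix-up word */
            printf("sum=%u\n", (unsigned)dword_sum(img, 4));  /* prints 0 */
            return 0;
    }
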
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
new file mode 100644
index 000000000000..bf883aa71233
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_MICROCODE_INTERNAL_H
+#define _X86_MICROCODE_INTERNAL_H
+
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+
+#include <asm/cpu.h>
+#include <asm/microcode.h>
+
+struct ucode_patch {
+ struct list_head plist;
+ void *data; /* Intel uses only this one */
+ unsigned int size;
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+extern struct list_head microcode_cache;
+
+struct device;
+
+enum ucode_state {
+ UCODE_OK = 0,
+ UCODE_NEW,
+ UCODE_UPDATED,
+ UCODE_NFOUND,
+ UCODE_ERROR,
+};
+
+struct microcode_ops {
+ enum ucode_state (*request_microcode_fw)(int cpu, struct device *dev);
+
+ void (*microcode_fini_cpu)(int cpu);
+
+ /*
+ * The generic 'microcode_core' part guarantees that
+ * the callbacks below run on a target cpu when they
+ * are being called.
+ * See also the "Synchronization" section in microcode_core.c.
+ */
+ enum ucode_state (*apply_microcode)(int cpu);
+ int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
+};
+
+extern struct ucode_cpu_info ucode_cpu_info[];
+struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa);
+
+#define MAX_UCODE_COUNT 128
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx) \
+ (!(((ebx) ^ (a)) | ((edx) ^ (b)) | ((ecx) ^ (c))))
+
+/*
+ * In the early microcode-loading phase on the BSP, boot_cpu_data is not set
+ * up yet, so x86_cpuid_vendor() is used to get the vendor ID for the BSP.
+ *
+ * In the 32-bit AP case, accessing boot_cpu_data needs a linear address. To
+ * simplify the code, x86_cpuid_vendor() is used for the APs as well.
+ *
+ * x86_cpuid_vendor() gets the vendor information directly from CPUID.
+ */
+static inline int x86_cpuid_vendor(void)
+{
+ u32 eax = 0x00000000;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+ return X86_VENDOR_INTEL;
+
+ if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+ return X86_VENDOR_AMD;
+
+ return X86_VENDOR_UNKNOWN;
+}
+
+static inline unsigned int x86_cpuid_family(void)
+{
+ u32 eax = 0x00000001;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ return x86_family(eax);
+}
+
+extern bool initrd_gone;
+
+#ifdef CONFIG_CPU_SUP_AMD
+void load_ucode_amd_bsp(unsigned int family);
+void load_ucode_amd_ap(unsigned int family);
+void load_ucode_amd_early(unsigned int cpuid_1_eax);
+int save_microcode_in_initrd_amd(unsigned int family);
+void reload_ucode_amd(unsigned int cpu);
+struct microcode_ops *init_amd_microcode(void);
+void exit_amd_microcode(void);
+#else /* CONFIG_CPU_SUP_AMD */
+static inline void load_ucode_amd_bsp(unsigned int family) { }
+static inline void load_ucode_amd_ap(unsigned int family) { }
+static inline void load_ucode_amd_early(unsigned int family) { }
+static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
+static inline void reload_ucode_amd(unsigned int cpu) { }
+static inline struct microcode_ops *init_amd_microcode(void) { return NULL; }
+static inline void exit_amd_microcode(void) { }
+#endif /* !CONFIG_CPU_SUP_AMD */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+void load_ucode_intel_bsp(void);
+void load_ucode_intel_ap(void);
+int save_microcode_in_initrd_intel(void);
+void reload_ucode_intel(void);
+struct microcode_ops *init_intel_microcode(void);
+#else /* CONFIG_CPU_SUP_INTEL */
+static inline void load_ucode_intel_bsp(void) { }
+static inline void load_ucode_intel_ap(void) { }
+static inline int save_microcode_in_initrd_intel(void) { return -EINVAL; }
+static inline void reload_ucode_intel(void) { }
+static inline struct microcode_ops *init_intel_microcode(void) { return NULL; }
+#endif /* !CONFIG_CPU_SUP_INTEL */
+
+#endif /* _X86_MICROCODE_INTERNAL_H */
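
The QCHAR/CPUID_IS machinery above matches the vendor string that CPUID leaf 0 returns in the EBX, EDX, ECX registers. A userspace sketch of the same decoding, assuming GCC or Clang on x86 so that the compiler's <cpuid.h> helper is available:

    #include <cpuid.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;
            char vendor[13];

            if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
                    return 1;
            /* The 12-byte vendor string comes back in EBX, EDX, ECX order */
            memcpy(vendor, &ebx, 4);
            memcpy(vendor + 4, &edx, 4);
            memcpy(vendor + 8, &ecx, 4);
            vendor[12] = '\0';
            printf("%s\n", vendor);  /* "GenuineIntel" or "AuthenticAMD" */
            return 0;
    }
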
diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
index af5cbdd9bd29..f6d856bd50bc 100644
--- a/arch/x86/kernel/fpu/context.h
+++ b/arch/x86/kernel/fpu/context.h
@@ -19,8 +19,7 @@
* FPU state for a task MUST let the rest of the kernel know that the
* FPU registers are no longer valid for this task.
*
- * Either one of these invalidation functions is enough. Invalidate
- * a resource you control: CPU if using the CPU for something else
+ * Invalidate a resource you control: CPU if using the CPU for something else
* (with preemption disabled), FPU for the current task, or a task that
* is prevented from running by the current task.
*/
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1015af1ae562..98e507cc7d34 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -679,7 +679,7 @@ static void fpu_reset_fpregs(void)
struct fpu *fpu = &current->thread.fpu;
fpregs_lock();
- fpu__drop(fpu);
+ __fpu_invalidate_fpregs_state(fpu);
/*
* This does not change the actual hardware registers. It just
* resets the memory image and sets TIF_NEED_FPU_LOAD so a
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 0bab497c9436..1afbc4866b10 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -882,6 +882,13 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
goto out_disable;
}
+ /*
+ * CPU capabilities initialization runs before FPU init. So
+ * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
+ * functional, set the feature bit so depending code works.
+ */
+ setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
+
print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
fpu_kernel_cfg.max_features,
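
For reference, the feature bit forced on above corresponds to CPUID leaf 1, ECX bit 27, which reports whether the OS has set CR4.OSXSAVE. A userspace analogue that reads the same bit (again assuming GCC/Clang on x86):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* CPUID leaf 1: ECX bit 27 = OSXSAVE (CR4.OSXSAVE is set) */
            if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                    printf("OSXSAVE: %s\n", (ecx & (1u << 27)) ? "yes" : "no");
            return 0;
    }
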
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index c5b9289837dc..ea6995920b7a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -51,7 +51,9 @@ SYM_CODE_START_NOALIGN(startup_64)
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
- * %rsi holds a physical pointer to real_mode_data.
+ * %RSI holds the physical address of the boot_params structure
+ * provided by the bootloader. Preserve it in %R15 so C function calls
+ * will not clobber it.
*
* We come here either directly from a 64bit bootloader, or from
* arch/x86/boot/compressed/head_64.S.
@@ -62,6 +64,7 @@ SYM_CODE_START_NOALIGN(startup_64)
* compiled to run at we first fixup the physical addresses in our page
* tables and then reload them.
*/
+ mov %rsi, %r15
/* Set up the stack for verify_cpu() */
leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp
@@ -75,9 +78,7 @@ SYM_CODE_START_NOALIGN(startup_64)
shrq $32, %rdx
wrmsr
- pushq %rsi
call startup_64_setup_env
- popq %rsi
/* Now switch to __KERNEL_CS so IRET works reliably */
pushq $__KERNEL_CS
@@ -93,12 +94,10 @@ SYM_CODE_START_NOALIGN(startup_64)
* Activate SEV/SME memory encryption if supported/enabled. This needs to
* be done now, since this also includes setup of the SEV-SNP CPUID table,
* which needs to be done before any CPUID instructions are executed in
- * subsequent code.
+ * subsequent code. Pass the boot_params pointer as the first argument.
*/
- movq %rsi, %rdi
- pushq %rsi
+ movq %r15, %rdi
call sme_enable
- popq %rsi
#endif
/* Sanitize CPU configuration */
@@ -111,9 +110,8 @@ SYM_CODE_START_NOALIGN(startup_64)
* programmed into CR3.
*/
leaq _text(%rip), %rdi
- pushq %rsi
+ movq %r15, %rsi
call __startup_64
- popq %rsi
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax
@@ -127,8 +125,6 @@ SYM_CODE_START(secondary_startup_64)
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table.
*
- * %rsi holds a physical pointer to real_mode_data.
- *
* We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses).
*
@@ -153,6 +149,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
+ /* Clear %R15 which holds the boot_params pointer on the boot CPU */
+ xorq %r15, %r15
+
/*
* Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3.
@@ -199,13 +198,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
* hypervisor could lie about the C-bit position to perform a ROP
* attack on the guest by writing to the unencrypted stack and wait for
* the next RET instruction.
- * %rsi carries pointer to realmode data and is callee-clobbered. Save
- * and restore it.
*/
- pushq %rsi
movq %rax, %rdi
call sev_verify_cbit
- popq %rsi
/*
* Switch to new page-table
@@ -365,9 +360,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
wrmsr
/* Setup and Load IDT */
- pushq %rsi
call early_setup_idt
- popq %rsi
/* Check if nx is implemented */
movl $0x80000001, %eax
@@ -403,9 +396,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
pushq $0
popfq
- /* rsi is pointer to real mode structure with interesting info.
- pass it to C */
- movq %rsi, %rdi
+ /* Pass the boot_params pointer as first argument */
+ movq %r15, %rdi
.Ljump_to_C_code:
/*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index c8eb1ac5125a..1648aa0204d9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -421,7 +421,7 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc)
* the IO_APIC has been initialized.
*/
hc->cpu = boot_cpu_data.cpu_index;
- strncpy(hc->name, "hpet", sizeof(hc->name));
+ strscpy(hc->name, "hpet", sizeof(hc->name));
hpet_init_clockevent(hc, 50);
hc->evt.tick_resume = hpet_clkevt_legacy_resume;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 1cceac5984da..526d4da3dcd4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -966,10 +966,8 @@ static void __init kvm_init_platform(void)
* Ensure that _bss_decrypted section is marked as decrypted in the
* shared pages list.
*/
- nr_pages = DIV_ROUND_UP(__end_bss_decrypted - __start_bss_decrypted,
- PAGE_SIZE);
early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
- nr_pages, 0);
+ __end_bss_decrypted - __start_bss_decrypted, 0);
/*
* If not booted using EFI, enable Live migration support.
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ac10b46c5832..975f98d5eee5 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -75,10 +75,16 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
void __init native_pv_lock_init(void)
{
- if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
+ !boot_cpu_has(X86_FEATURE_HYPERVISOR))
static_branch_disable(&virt_spin_lock_key);
}
+static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ tlb_remove_page(tlb, table);
+}
+
unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr,
unsigned int len)
{
@@ -295,8 +301,7 @@ struct paravirt_patch_template pv_ops = {
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table =
- (void (*)(struct mmu_gather *, void *))tlb_remove_page,
+ .mmu.tlb_remove_table = native_tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
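
The paravirt hunk above replaces a function-pointer cast with a typed wrapper: calling tlb_remove_page() through a pointer of a different prototype is undefined behavior in C and trips kCFI's signature checks. A minimal illustration of the pattern, with all names hypothetical:

    struct gather;

    static void remove_page(struct gather *g, void *page)
    {
            (void)g;
            (void)page;     /* real code would free the page here */
    }

    typedef void (*remove_table_fn)(struct gather *g, void *table);

    /* Wrong: fp = (remove_table_fn)differently_typed_fn; - UB, kCFI trap */
    /* Right: a thin wrapper with exactly the expected signature: */
    static void remove_table(struct gather *g, void *table)
    {
            remove_page(g, table);
    }
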
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 1ee7bed453de..d380c9399480 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -1575,6 +1575,9 @@ static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
long val, *reg = vc_insn_get_rm(ctxt);
enum es_result ret;
+ if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
+ return ES_VMM_ERROR;
+
if (!reg)
return ES_DECODE_FAILED;
@@ -1612,6 +1615,9 @@ static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
long *reg = vc_insn_get_rm(ctxt);
+ if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
+ return ES_VMM_ERROR;
+
if (!reg)
return ES_DECODE_FAILED;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e1aa2cd7734b..d40ed3a7dc23 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -327,14 +327,6 @@ static void notrace start_secondary(void *unused)
}
/**
- * topology_smt_supported - Check whether SMT is supported by the CPUs
- */
-bool topology_smt_supported(void)
-{
- return smp_num_siblings > 1;
-}
-
-/**
* topology_phys_to_logical_pkg - Map a physical package id to a logical
* @phys_pkg: The physical package id to map
*
@@ -632,14 +624,9 @@ static void __init build_sched_topology(void)
};
#endif
#ifdef CONFIG_SCHED_CLUSTER
- /*
- * For now, skip the cluster domain on Hybrid.
- */
- if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
- x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
- };
- }
+ x86_topology[i++] = (struct sched_domain_topology_level){
+ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
+ };
#endif
#ifdef CONFIG_SCHED_MC
x86_topology[i++] = (struct sched_domain_topology_level){
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3425c6a943e4..15f97c0abc9d 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1258,7 +1258,7 @@ static void __init check_system_tsc_reliable(void)
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
- nr_online_nodes <= 2)
+ nr_online_nodes <= 4)
tsc_disable_clocksource_watchdog();
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8192452d1d2d..ffa25e962343 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -20,7 +20,6 @@
#include <asm/tlb.h>
#include <asm/proto.h>
#include <asm/dma.h> /* for MAX_DMA_PFN */
-#include <asm/microcode.h>
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
@@ -273,7 +272,7 @@ static void __init probe_page_size_mask(void)
static const struct x86_cpu_id invlpg_miss_ids[] = {
INTEL_MATCH(INTEL_FAM6_ALDERLAKE ),
INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ),
- INTEL_MATCH(INTEL_FAM6_ALDERLAKE_N ),
+ INTEL_MATCH(INTEL_FAM6_ATOM_GRACEMONT ),
INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ),
INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P),
INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S),
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 54bbd5163e8d..6faea41e99b6 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -288,11 +288,10 @@ static bool amd_enc_cache_flush_required(void)
return !cpu_feature_enabled(X86_FEATURE_SME_COHERENT);
}
-static void enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
+static void enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc)
{
#ifdef CONFIG_PARAVIRT
- unsigned long sz = npages << PAGE_SHIFT;
- unsigned long vaddr_end = vaddr + sz;
+ unsigned long vaddr_end = vaddr + size;
while (vaddr < vaddr_end) {
int psize, pmask, level;
@@ -342,7 +341,7 @@ static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool e
snp_set_memory_private(vaddr, npages);
if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
- enc_dec_hypercall(vaddr, npages, enc);
+ enc_dec_hypercall(vaddr, npages << PAGE_SHIFT, enc);
return true;
}
@@ -466,7 +465,7 @@ static int __init early_set_memory_enc_dec(unsigned long vaddr,
ret = 0;
- early_set_mem_enc_dec_hypercall(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc);
+ early_set_mem_enc_dec_hypercall(start, size, enc);
out:
__flush_tlb_all();
return ret;
@@ -482,9 +481,9 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
return early_set_memory_enc_dec(vaddr, size, true);
}
-void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
+void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc)
{
- enc_dec_hypercall(vaddr, npages, enc);
+ enc_dec_hypercall(vaddr, size, enc);
}
void __init sme_early_init(void)
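
enc_dec_hypercall() now takes a size in bytes rather than a page count, so the one caller that still works in pages converts with npages << PAGE_SHIFT. The trivial equivalence, spelled out as a sketch:

    /* Illustrative; PAGE_SHIFT is 12 for the 4 KiB pages used here. */
    #define PAGE_SHIFT_SKETCH 12

    static unsigned long npages_to_bytes(unsigned long npages)
    {
            return npages << PAGE_SHIFT_SKETCH;     /* npages * 4096 */
    }
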
diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c
index c69f8471e6d0..4ef20b49eb5e 100644
--- a/arch/x86/platform/efi/memmap.c
+++ b/arch/x86/platform/efi/memmap.c
@@ -82,7 +82,7 @@ int __init efi_memmap_alloc(unsigned int num_entries,
/**
* efi_memmap_install - Install a new EFI memory map in efi.memmap
- * @ctx: map allocation parameters (address, size, flags)
+ * @data: efi memmap installation parameters
*
* Unlike efi_memmap_init_*(), this function does not allow the caller
* to switch from early to late mappings. It simply uses the existing
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index a60af0230e27..a6ab43f69b7d 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -202,21 +202,17 @@ static int param_set_action(const char *val, const struct kernel_param *kp)
{
int i;
int n = ARRAY_SIZE(valid_acts);
- char arg[ACTION_LEN], *p;
+ char arg[ACTION_LEN];
/* (remove possible '\n') */
- strncpy(arg, val, ACTION_LEN - 1);
- arg[ACTION_LEN - 1] = '\0';
- p = strchr(arg, '\n');
- if (p)
- *p = '\0';
+ strscpy(arg, val, strnchrnul(val, sizeof(arg)-1, '\n') - val + 1);
for (i = 0; i < n; i++)
if (!strcmp(arg, valid_acts[i].action))
break;
if (i < n) {
- strcpy(uv_nmi_action, arg);
+ strscpy(uv_nmi_action, arg, sizeof(uv_nmi_action));
pr_info("UV: New NMI action:%s\n", uv_nmi_action);
return 0;
}
@@ -959,7 +955,7 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
/* Unexpected return, revert action to "dump" */
if (master)
- strncpy(uv_nmi_action, "dump", strlen(uv_nmi_action));
+ strscpy(uv_nmi_action, "dump", sizeof(uv_nmi_action));
}
/* Pause as all CPU's enter the NMI handler */
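
The strncpy-to-strscpy conversions in this series rely on strscpy() always NUL-terminating the destination and reporting truncation, which strncpy() does not guarantee. A rough userspace model of those semantics (the real kernel helper returns -E2BIG on truncation; this sketch returns -1):

    #include <stdio.h>
    #include <string.h>

    static long my_strscpy(char *dst, const char *src, size_t size)
    {
            size_t len;

            if (!size)
                    return -1;
            len = strnlen(src, size);
            if (len == size) {              /* would not fit with the NUL */
                    memcpy(dst, src, size - 1);
                    dst[size - 1] = '\0';
                    return -1;              /* truncated */
            }
            memcpy(dst, src, len);
            dst[len] = '\0';
            return (long)len;               /* bytes copied, NUL excluded */
    }

    int main(void)
    {
            char buf[5];

            printf("%ld %s\n", my_strscpy(buf, "hpet", sizeof(buf)), buf);
            return 0;
    }
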
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index 7558139920f8..aea47e793963 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -14,6 +14,7 @@
#include <crypto/sha2.h>
#include <asm/purgatory.h>
+#include "../boot/compressed/error.h"
#include "../boot/string.h"
u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(".kexec-purgatory");
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 93b658248d01..27fc170838e9 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -79,7 +79,7 @@
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
#include <asm/acpi.h>
-#include <acpi/pdc_intel.h>
+#include <acpi/proc_cap_intel.h>
#include <acpi/processor.h>
#include <xen/interface/platform.h>
#endif
@@ -288,17 +288,17 @@ static bool __init xen_check_mwait(void)
native_cpuid(&ax, &bx, &cx, &dx);
- /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
+ /* Ask the Hypervisor whether to clear ACPI_PROC_CAP_C_C2C3_FFH. If so,
* don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
*/
buf[0] = ACPI_PDC_REVISION_ID;
buf[1] = 1;
- buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
+ buf[2] = (ACPI_PROC_CAP_C_CAPABILITY_SMP | ACPI_PROC_CAP_EST_CAPABILITY_SWSMP);
set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
if ((HYPERVISOR_platform_op(&op) == 0) &&
- (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
+ (buf[2] & (ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH))) {
cpuid_leaf5_ecx_val = cx;
cpuid_leaf5_edx_val = dx;
}
@@ -523,7 +523,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
BUG_ON(size > PAGE_SIZE);
BUG_ON(va & ~PAGE_MASK);
- pfn = virt_to_pfn(va);
+ pfn = virt_to_pfn((void *)va);
mfn = pfn_to_mfn(pfn);
pte = pfn_pte(pfn, PAGE_KERNEL_RO);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index e0a975165de7..eb3bac0b22a1 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2202,13 +2202,13 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
mcs = __xen_mc_entry(0);
if (in_frames)
- in_frames[i] = virt_to_mfn(vaddr);
+ in_frames[i] = virt_to_mfn((void *)vaddr);
MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
- __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+ __set_phys_to_machine(virt_to_pfn((void *)vaddr), INVALID_P2M_ENTRY);
if (out_frames)
- out_frames[i] = virt_to_pfn(vaddr);
+ out_frames[i] = virt_to_pfn((void *)vaddr);
}
xen_mc_issue(0);
}
@@ -2250,7 +2250,7 @@ static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
MULTI_update_va_mapping(mcs.mc, vaddr,
mfn_pte(mfn, PAGE_KERNEL), flags);
- set_phys_to_machine(virt_to_pfn(vaddr), mfn);
+ set_phys_to_machine(virt_to_pfn((void *)vaddr), mfn);
}
xen_mc_issue(0);
@@ -2310,12 +2310,6 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
int success;
unsigned long vstart = (unsigned long)phys_to_virt(pstart);
- /*
- * Currently an auto-translated guest will not perform I/O, nor will
- * it require PAE page directories below 4GB. Therefore any calls to
- * this function are redundant and can be ignored.
- */
-
if (unlikely(order > MAX_CONTIG_ORDER))
return -ENOMEM;
@@ -2327,7 +2321,7 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
xen_zap_pfn_range(vstart, order, in_frames, NULL);
/* 2. Get a new contiguous memory extent. */
- out_frame = virt_to_pfn(vstart);
+ out_frame = virt_to_pfn((void *)vstart);
success = xen_exchange_memory(1UL << order, 0, in_frames,
1, order, &out_frame,
address_bits);
@@ -2360,7 +2354,7 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
spin_lock_irqsave(&xen_reservation_lock, flags);
/* 1. Find start MFN of contiguous extent. */
- in_frame = virt_to_mfn(vstart);
+ in_frame = virt_to_mfn((void *)vstart);
/* 2. Zap current PTEs. */
xen_zap_pfn_range(vstart, order, NULL, out_frames);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8b5cf7bb1f47..50c998b844fb 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -340,7 +340,7 @@ static void __init xen_do_set_identity_and_remap_chunk(
WARN_ON(size == 0);
- mfn_save = virt_to_mfn(buf);
+ mfn_save = virt_to_mfn((void *)buf);
for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
ident_pfn_iter < ident_end_pfn;
@@ -503,7 +503,7 @@ void __init xen_remap_memory(void)
unsigned long pfn_s = ~0UL;
unsigned long len = 0;
- mfn_save = virt_to_mfn(buf);
+ mfn_save = virt_to_mfn((void *)buf);
while (xen_remap_mfn != INVALID_P2M_ENTRY) {
/* Map the remap information */
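
The (void *) casts added throughout the Xen code follow from virt_to_pfn()/virt_to_mfn() becoming properly typed functions taking a pointer instead of macros accepting any integer. A sketch of why the typed prototype is preferable (the shift value assumed for x86's 4 KiB pages):

    /* Illustrative only: with a typed prototype the compiler rejects a
     * bare unsigned long, so callers carrying addresses in integers
     * must cast explicitly at the call site. */
    static unsigned long virt_to_pfn_sketch(const void *vaddr)
    {
            return (unsigned long)vaddr >> 12;      /* PAGE_SHIFT on x86 */
    }

    /* unsigned long va = ...;  virt_to_pfn_sketch((void *)va); */
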
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 2b69c3c035b6..fc1a4f3c81d9 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -422,3 +422,4 @@
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2
diff --git a/block/bdev.c b/block/bdev.c
index 979e28a46b98..f3b13aa1b7d4 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -206,23 +206,6 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
}
EXPORT_SYMBOL(sync_blockdev_range);
-/*
- * Write out and wait upon all dirty data associated with this
- * device. Filesystem data as well as the underlying block
- * device. Takes the superblock lock.
- */
-int fsync_bdev(struct block_device *bdev)
-{
- struct super_block *sb = get_super(bdev);
- if (sb) {
- int res = sync_filesystem(sb);
- drop_super(sb);
- return res;
- }
- return sync_blockdev(bdev);
-}
-EXPORT_SYMBOL(fsync_bdev);
-
/**
* freeze_bdev - lock a filesystem and force it into a consistent state
* @bdev: blockdevice to lock
@@ -248,9 +231,9 @@ int freeze_bdev(struct block_device *bdev)
if (!sb)
goto sync;
if (sb->s_op->freeze_super)
- error = sb->s_op->freeze_super(sb);
+ error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
else
- error = freeze_super(sb);
+ error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
deactivate_super(sb);
if (error) {
@@ -291,9 +274,9 @@ int thaw_bdev(struct block_device *bdev)
goto out;
if (sb->s_op->thaw_super)
- error = sb->s_op->thaw_super(sb);
+ error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
else
- error = thaw_super(sb);
+ error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
if (error)
bdev->bd_fsfreeze_count++;
else
@@ -960,26 +943,38 @@ out_path_put:
}
EXPORT_SYMBOL(lookup_bdev);
-int __invalidate_device(struct block_device *bdev, bool kill_dirty)
+/**
+ * bdev_mark_dead - mark a block device as dead
+ * @bdev: block device to operate on
+ * @surprise: indicate a surprise removal
+ *
+ * Tell the file system that this device or media is dead. If @surprise is set
+ * to %true the device or media is already gone, if not we are preparing for an
+ * orderly removal.
+ *
+ * This calls into the file system, which then typically syncs out all dirty data
+ * and writes back inodes and then invalidates any cached data in the inodes on
+ * the file system. In addition we also invalidate the block device mapping.
+ */
+void bdev_mark_dead(struct block_device *bdev, bool surprise)
{
- struct super_block *sb = get_super(bdev);
- int res = 0;
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
+ bdev->bd_holder_ops->mark_dead(bdev, surprise);
+ else
+ sync_blockdev(bdev);
+ mutex_unlock(&bdev->bd_holder_lock);
- if (sb) {
- /*
- * no need to lock the super, get_super holds the
- * read mutex so the filesystem cannot go away
- * under us (->put_super runs with the write lock
- * hold).
- */
- shrink_dcache_sb(sb);
- res = invalidate_inodes(sb, kill_dirty);
- drop_super(sb);
- }
invalidate_bdev(bdev);
- return res;
}
-EXPORT_SYMBOL(__invalidate_device);
+#ifdef CONFIG_DASD_MODULE
+/*
+ * Drivers should not use this directly, but the DASD driver has historically
+ * had a shutdown-to-offline mode that doesn't actually remove the gendisk
+ * yet otherwise looks a lot like a safe device removal.
+ */
+EXPORT_SYMBOL_GPL(bdev_mark_dead);
+#endif
void sync_bdevs(bool wait)
{
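
bdev_mark_dead() above is an instance of an optional-callback dispatch: take the lock that stabilizes the ops pointer, call the holder's hook if one is registered, otherwise fall back to a plain sync, then invalidate unconditionally. A stripped-down sketch of that shape, with all types illustrative stand-ins for the block layer's:

    struct holder_ops {
            void (*mark_dead)(void *bdev, int surprise);
    };

    static void mark_dead_sketch(void *bdev, const struct holder_ops *ops,
                                 int surprise, void (*sync_fallback)(void *))
    {
            /* the lock protecting 'ops' would be held here */
            if (ops && ops->mark_dead)
                    ops->mark_dead(bdev, surprise);   /* filesystem hook */
            else
                    sync_fallback(bdev);              /* plain writeback */
            /* ... then invalidate cached data unconditionally */
    }
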
diff --git a/block/disk-events.c b/block/disk-events.c
index 0cfac464e6d1..422db8292d09 100644
--- a/block/disk-events.c
+++ b/block/disk-events.c
@@ -281,9 +281,7 @@ bool disk_check_media_change(struct gendisk *disk)
if (!(events & DISK_EVENT_MEDIA_CHANGE))
return false;
- if (__invalidate_device(disk->part0, true))
- pr_warn("VFS: busy inodes on changed media %s\n",
- disk->disk_name);
+ bdev_mark_dead(disk->part0, true);
set_bit(GD_NEED_PART_SCAN, &disk->state);
return true;
}
@@ -294,25 +292,16 @@ EXPORT_SYMBOL(disk_check_media_change);
* @disk: the disk which will raise the event
* @events: the events to raise
*
- * Generate uevents for the disk. If DISK_EVENT_MEDIA_CHANGE is present,
- * attempt to free all dentries and inodes and invalidates all block
+ * Should be called when the media in @disk changes. Generates a uevent,
+ * attempts to free all dentries and inodes, and invalidates all block
+ * device page cache entries.
- *
- * Returns %true if DISK_EVENT_MEDIA_CHANGE was raised, or %false if not.
*/
-bool disk_force_media_change(struct gendisk *disk, unsigned int events)
+void disk_force_media_change(struct gendisk *disk)
{
- disk_event_uevent(disk, events);
-
- if (!(events & DISK_EVENT_MEDIA_CHANGE))
- return false;
-
+ disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE);
inc_diskseq(disk);
- if (__invalidate_device(disk->part0, true))
- pr_warn("VFS: busy inodes on changed media %s\n",
- disk->disk_name);
+ bdev_mark_dead(disk->part0, true);
set_bit(GD_NEED_PART_SCAN, &disk->state);
- return true;
}
EXPORT_SYMBOL_GPL(disk_force_media_change);
diff --git a/block/genhd.c b/block/genhd.c
index 3d287b32d50d..cc32a0c704eb 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -554,7 +554,7 @@ out_exit_elevator:
}
EXPORT_SYMBOL(device_add_disk);
-static void blk_report_disk_dead(struct gendisk *disk)
+static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
{
struct block_device *bdev;
unsigned long idx;
@@ -565,10 +565,7 @@ static void blk_report_disk_dead(struct gendisk *disk)
continue;
rcu_read_unlock();
- mutex_lock(&bdev->bd_holder_lock);
- if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
- bdev->bd_holder_ops->mark_dead(bdev);
- mutex_unlock(&bdev->bd_holder_lock);
+ bdev_mark_dead(bdev, surprise);
put_device(&bdev->bd_device);
rcu_read_lock();
@@ -576,14 +573,7 @@ static void blk_report_disk_dead(struct gendisk *disk)
rcu_read_unlock();
}
-/**
- * blk_mark_disk_dead - mark a disk as dead
- * @disk: disk to mark as dead
- *
- * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O
- * to this disk.
- */
-void blk_mark_disk_dead(struct gendisk *disk)
+static void __blk_mark_disk_dead(struct gendisk *disk)
{
/*
* Fail any new I/O.
@@ -603,8 +593,19 @@ void blk_mark_disk_dead(struct gendisk *disk)
* Prevent new I/O from crossing bio_queue_enter().
*/
blk_queue_start_drain(disk->queue);
+}
- blk_report_disk_dead(disk);
+/**
+ * blk_mark_disk_dead - mark a disk as dead
+ * @disk: disk to mark as dead
+ *
+ * Mark a disk as dead (e.g. surprise removed) and don't accept any new I/O
+ * to this disk.
+ */
+void blk_mark_disk_dead(struct gendisk *disk)
+{
+ __blk_mark_disk_dead(disk);
+ blk_report_disk_dead(disk, true);
}
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
@@ -641,18 +642,20 @@ void del_gendisk(struct gendisk *disk)
disk_del_events(disk);
/*
- * Prevent new openers by unlinked the bdev inode, and write out
- * dirty data before marking the disk dead and stopping all I/O.
+ * Prevent new openers by unlinking the bdev inode.
*/
mutex_lock(&disk->open_mutex);
- xa_for_each(&disk->part_tbl, idx, part) {
+ xa_for_each(&disk->part_tbl, idx, part)
remove_inode_hash(part->bd_inode);
- fsync_bdev(part);
- __invalidate_device(part, true);
- }
mutex_unlock(&disk->open_mutex);
- blk_mark_disk_dead(disk);
+ /*
+ * Tell the file system to write back all dirty data and shut down if
+ * it hasn't been notified earlier.
+ */
+ if (!test_bit(GD_DEAD, &disk->state))
+ blk_report_disk_dead(disk, false);
+ __blk_mark_disk_dead(disk);
/*
* Drop all partitions now that the disk is marked dead.
diff --git a/block/ioctl.c b/block/ioctl.c
index 3be11941fb2d..648670ddb164 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -364,7 +364,14 @@ static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd,
{
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- fsync_bdev(bdev);
+
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync)
+ bdev->bd_holder_ops->sync(bdev);
+ else
+ sync_blockdev(bdev);
+ mutex_unlock(&bdev->bd_holder_lock);
+
invalidate_bdev(bdev);
return 0;
}
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 13a7341299a9..e137a87f4db0 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -281,10 +281,7 @@ static void delete_partition(struct block_device *part)
* looked up any more even when openers still hold references.
*/
remove_inode_hash(part->bd_inode);
-
- fsync_bdev(part);
- __invalidate_device(part, true);
-
+ bdev_mark_dead(part, false);
drop_partition(part);
}
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 06b15b9f661c..10efb56d8b48 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -1241,6 +1241,8 @@ int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
return -ENOMEM;
}
+ rsgl->sgl.need_unpin =
+ iov_iter_extract_will_pin(&msg->msg_iter);
rsgl->sgl.sgt.sgl = rsgl->sgl.sgl;
rsgl->sgl.sgt.nents = 0;
rsgl->sgl.sgt.orig_nents = 0;
@@ -1255,8 +1257,6 @@ int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
}
sg_mark_end(rsgl->sgl.sgt.sgl + rsgl->sgl.sgt.nents - 1);
- rsgl->sgl.need_unpin =
- iov_iter_extract_will_pin(&msg->msg_iter);
/* chain the new scatterlist with previous one */
if (areq->last_rsgl)
diff --git a/drivers/Makefile b/drivers/Makefile
index 7241d80a7b29..a7459e77df37 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -195,3 +195,5 @@ obj-$(CONFIG_PECI) += peci/
obj-$(CONFIG_HTE) += hte/
obj-$(CONFIG_DRM_ACCEL) += accel/
obj-$(CONFIG_CDX_BUS) += cdx/
+
+obj-$(CONFIG_S390) += s390/
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 00dd309b6682..cee82b473dc5 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -581,7 +581,7 @@ config ACPI_VIOT
config ACPI_PRMT
bool "Platform Runtime Mechanism Support"
- depends on EFI && (X86_64 || ARM64)
+ depends on EFI_RUNTIME_WRAPPERS && (X86_64 || ARM64)
default y
help
Platform Runtime Mechanism (PRM) is a firmware interface exposing a
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 3fc5a0d54f6e..eaa09bf52f17 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -50,7 +50,6 @@ acpi-$(CONFIG_PCI) += acpi_lpss.o
acpi-y += acpi_apd.o
acpi-y += acpi_platform.o
acpi-y += acpi_pnp.o
-acpi-$(CONFIG_ARM_AMBA) += acpi_amba.o
acpi-y += power.o
acpi-y += event.o
acpi-y += evged.o
diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c
index 1ace70b831cd..225dc6818751 100644
--- a/drivers/acpi/ac.c
+++ b/drivers/acpi/ac.c
@@ -34,7 +34,7 @@ MODULE_LICENSE("GPL");
static int acpi_ac_add(struct acpi_device *device);
static void acpi_ac_remove(struct acpi_device *device);
-static void acpi_ac_notify(struct acpi_device *device, u32 event);
+static void acpi_ac_notify(acpi_handle handle, u32 event, void *data);
static const struct acpi_device_id ac_device_ids[] = {
{"ACPI0003", 0},
@@ -54,11 +54,9 @@ static struct acpi_driver acpi_ac_driver = {
.name = "ac",
.class = ACPI_AC_CLASS,
.ids = ac_device_ids,
- .flags = ACPI_DRIVER_ALL_NOTIFY_EVENTS,
.ops = {
.add = acpi_ac_add,
.remove = acpi_ac_remove,
- .notify = acpi_ac_notify,
},
.drv.pm = &acpi_ac_pm,
};
@@ -128,8 +126,9 @@ static enum power_supply_property ac_props[] = {
};
/* Driver Model */
-static void acpi_ac_notify(struct acpi_device *device, u32 event)
+static void acpi_ac_notify(acpi_handle handle, u32 event, void *data)
{
+ struct acpi_device *device = data;
struct acpi_ac *ac = acpi_driver_data(device);
if (!ac)
@@ -235,7 +234,7 @@ static int acpi_ac_add(struct acpi_device *device)
result = acpi_ac_get_state(ac);
if (result)
- goto end;
+ goto err_release_ac;
psy_cfg.drv_data = ac;
@@ -248,7 +247,7 @@ static int acpi_ac_add(struct acpi_device *device)
&ac->charger_desc, &psy_cfg);
if (IS_ERR(ac->charger)) {
result = PTR_ERR(ac->charger);
- goto end;
+ goto err_release_ac;
}
pr_info("%s [%s] (%s)\n", acpi_device_name(device),
@@ -256,9 +255,19 @@ static int acpi_ac_add(struct acpi_device *device)
ac->battery_nb.notifier_call = acpi_ac_battery_notify;
register_acpi_notifier(&ac->battery_nb);
-end:
+
+ result = acpi_dev_install_notify_handler(device, ACPI_ALL_NOTIFY,
+ acpi_ac_notify);
if (result)
- kfree(ac);
+ goto err_unregister;
+
+ return 0;
+
+err_unregister:
+ power_supply_unregister(ac->charger);
+ unregister_acpi_notifier(&ac->battery_nb);
+err_release_ac:
+ kfree(ac);
return result;
}
@@ -297,6 +306,8 @@ static void acpi_ac_remove(struct acpi_device *device)
ac = acpi_driver_data(device);
+ acpi_dev_remove_notify_handler(device, ACPI_ALL_NOTIFY,
+ acpi_ac_notify);
power_supply_unregister(ac->charger);
unregister_acpi_notifier(&ac->battery_nb);
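
The ac.c conversion above shows the pattern repeated across the ACPI drivers in this series: install the notify handler with acpi_dev_install_notify_handler() as the last step of .add() so that a failure only has to unwind steps that already succeeded, and remove it first in .remove(), mirroring probe in reverse. A condensed sketch; example_setup(), example_teardown() and example_notify() are hypothetical stand-ins for a driver's real helpers:

    static void example_notify(acpi_handle handle, u32 event, void *data)
    {
            /* handle the event; 'data' is the struct acpi_device */
    }

    static int example_add(struct acpi_device *device)
    {
            int ret;

            ret = example_setup(device);            /* hypothetical */
            if (ret)
                    return ret;

            /* install last, so a failure only unwinds what succeeded */
            ret = acpi_dev_install_notify_handler(device, ACPI_ALL_NOTIFY,
                                                  example_notify);
            if (ret)
                    example_teardown(device);       /* hypothetical unwind */
            return ret;
    }
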
diff --git a/drivers/acpi/acpi_cmos_rtc.c b/drivers/acpi/acpi_cmos_rtc.c
index 4cf4aef7ce0c..9b55d1593d16 100644
--- a/drivers/acpi/acpi_cmos_rtc.c
+++ b/drivers/acpi/acpi_cmos_rtc.c
@@ -51,12 +51,11 @@ acpi_cmos_rtc_space_handler(u32 function, acpi_physical_address address,
return AE_OK;
}
-static int acpi_install_cmos_rtc_space_handler(struct acpi_device *adev,
- const struct acpi_device_id *id)
+int acpi_install_cmos_rtc_space_handler(acpi_handle handle)
{
acpi_status status;
- status = acpi_install_address_space_handler(adev->handle,
+ status = acpi_install_address_space_handler(handle,
ACPI_ADR_SPACE_CMOS,
&acpi_cmos_rtc_space_handler,
NULL, NULL);
@@ -67,18 +66,30 @@ static int acpi_install_cmos_rtc_space_handler(struct acpi_device *adev,
return 1;
}
+EXPORT_SYMBOL_GPL(acpi_install_cmos_rtc_space_handler);
-static void acpi_remove_cmos_rtc_space_handler(struct acpi_device *adev)
+void acpi_remove_cmos_rtc_space_handler(acpi_handle handle)
{
- if (ACPI_FAILURE(acpi_remove_address_space_handler(adev->handle,
+ if (ACPI_FAILURE(acpi_remove_address_space_handler(handle,
ACPI_ADR_SPACE_CMOS, &acpi_cmos_rtc_space_handler)))
pr_err("Error removing CMOS-RTC region handler\n");
}
+EXPORT_SYMBOL_GPL(acpi_remove_cmos_rtc_space_handler);
+
+static int acpi_cmos_rtc_attach_handler(struct acpi_device *adev, const struct acpi_device_id *id)
+{
+ return acpi_install_cmos_rtc_space_handler(adev->handle);
+}
+
+static void acpi_cmos_rtc_detach_handler(struct acpi_device *adev)
+{
+ acpi_remove_cmos_rtc_space_handler(adev->handle);
+}
static struct acpi_scan_handler cmos_rtc_handler = {
.ids = acpi_cmos_rtc_ids,
- .attach = acpi_install_cmos_rtc_space_handler,
- .detach = acpi_remove_cmos_rtc_space_handler,
+ .attach = acpi_cmos_rtc_attach_handler,
+ .detach = acpi_cmos_rtc_detach_handler,
};
void __init acpi_cmos_rtc_init(void)
diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index e648158368a7..e120a96e1eae 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -172,7 +172,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
fru_text = "";
sec_type = (guid_t *)gdata->section_type;
if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
- struct cper_sec_mem_err *mem = (void *)(gdata + 1);
+ struct cper_sec_mem_err *mem = acpi_hest_get_payload(gdata);
if (gdata->error_data_length >= sizeof(*mem))
trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index f9aa02cac6d1..c711db8a9c33 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -9,9 +9,11 @@
* Copyright (C) 2013, Intel Corporation
* Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#define pr_fmt(fmt) "ACPI: " fmt
#include <linux/acpi.h>
#include <linux/device.h>
+#include <linux/dmi.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
@@ -21,6 +23,8 @@
#include <asm/cpu.h>
+#include <xen/xen.h>
+
#include "internal.h"
DEFINE_PER_CPU(struct acpi_processor *, processors);
@@ -508,54 +512,110 @@ static void acpi_processor_remove(struct acpi_device *device)
}
#endif /* CONFIG_ACPI_HOTPLUG_CPU */
-#ifdef CONFIG_X86
-static bool acpi_hwp_native_thermal_lvt_set;
-static acpi_status __init acpi_hwp_native_thermal_lvt_osc(acpi_handle handle,
- u32 lvl,
- void *context,
- void **rv)
+#ifdef CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC
+bool __init processor_physically_present(acpi_handle handle)
+{
+ int cpuid, type;
+ u32 acpi_id;
+ acpi_status status;
+ acpi_object_type acpi_type;
+ unsigned long long tmp;
+ union acpi_object object = {};
+ struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
+
+ status = acpi_get_type(handle, &acpi_type);
+ if (ACPI_FAILURE(status))
+ return false;
+
+ switch (acpi_type) {
+ case ACPI_TYPE_PROCESSOR:
+ status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
+ if (ACPI_FAILURE(status))
+ return false;
+ acpi_id = object.processor.proc_id;
+ break;
+ case ACPI_TYPE_DEVICE:
+ status = acpi_evaluate_integer(handle, METHOD_NAME__UID,
+ NULL, &tmp);
+ if (ACPI_FAILURE(status))
+ return false;
+ acpi_id = tmp;
+ break;
+ default:
+ return false;
+ }
+
+ if (xen_initial_domain())
+ /*
+ * When running as a Xen dom0 the number of processors Linux
+ * sees can be different from the real number of processors on
+ * the system, and we still need to execute _PDC or _OSC for
+ * all of them.
+ */
+ return xen_processor_present(acpi_id);
+
+ type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
+ cpuid = acpi_get_cpuid(handle, type, acpi_id);
+
+ return !invalid_logical_cpuid(cpuid);
+}
+
+/* vendor specific UUID indicating an Intel platform */
+static u8 sb_uuid_str[] = "4077A616-290C-47BE-9EBD-D87058713953";
+
+static acpi_status __init acpi_processor_osc(acpi_handle handle, u32 lvl,
+ void *context, void **rv)
{
- u8 sb_uuid_str[] = "4077A616-290C-47BE-9EBD-D87058713953";
- u32 capbuf[2];
+ u32 capbuf[2] = {};
struct acpi_osc_context osc_context = {
.uuid_str = sb_uuid_str,
.rev = 1,
.cap.length = 8,
.cap.pointer = capbuf,
};
+ acpi_status status;
- if (acpi_hwp_native_thermal_lvt_set)
- return AE_CTRL_TERMINATE;
+ if (!processor_physically_present(handle))
+ return AE_OK;
- capbuf[0] = 0x0000;
- capbuf[1] = 0x1000; /* set bit 12 */
+ arch_acpi_set_proc_cap_bits(&capbuf[OSC_SUPPORT_DWORD]);
- if (ACPI_SUCCESS(acpi_run_osc(handle, &osc_context))) {
- if (osc_context.ret.pointer && osc_context.ret.length > 1) {
- u32 *capbuf_ret = osc_context.ret.pointer;
+ status = acpi_run_osc(handle, &osc_context);
+ if (ACPI_FAILURE(status))
+ return status;
- if (capbuf_ret[1] & 0x1000) {
- acpi_handle_info(handle,
- "_OSC native thermal LVT Acked\n");
- acpi_hwp_native_thermal_lvt_set = true;
- }
- }
- kfree(osc_context.ret.pointer);
- }
+ kfree(osc_context.ret.pointer);
return AE_OK;
}
-void __init acpi_early_processor_osc(void)
+static bool __init acpi_early_processor_osc(void)
{
- if (boot_cpu_has(X86_FEATURE_HWP)) {
- acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
- ACPI_UINT32_MAX,
- acpi_hwp_native_thermal_lvt_osc,
- NULL, NULL, NULL);
- acpi_get_devices(ACPI_PROCESSOR_DEVICE_HID,
- acpi_hwp_native_thermal_lvt_osc,
- NULL, NULL);
+ acpi_status status;
+
+ acpi_proc_quirk_mwait_check();
+
+ status = acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
+ ACPI_UINT32_MAX, acpi_processor_osc, NULL,
+ NULL, NULL);
+ if (ACPI_FAILURE(status))
+ return false;
+
+ status = acpi_get_devices(ACPI_PROCESSOR_DEVICE_HID, acpi_processor_osc,
+ NULL, NULL);
+ if (ACPI_FAILURE(status))
+ return false;
+
+ return true;
+}
+
+void __init acpi_early_processor_control_setup(void)
+{
+ if (acpi_early_processor_osc()) {
+ pr_info("_OSC evaluated successfully for all CPUs\n");
+ } else {
+ pr_info("_OSC evaluation for CPUs failed, trying _PDC\n");
+ acpi_early_processor_set_pdc();
}
}
#endif
diff --git a/drivers/acpi/acpi_tad.c b/drivers/acpi/acpi_tad.c
index e9b8e8305e23..33c3b16af556 100644
--- a/drivers/acpi/acpi_tad.c
+++ b/drivers/acpi/acpi_tad.c
@@ -557,6 +557,7 @@ static int acpi_tad_disable_timer(struct device *dev, u32 timer_id)
static int acpi_tad_remove(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
+ acpi_handle handle = ACPI_HANDLE(dev);
struct acpi_tad_driver_data *dd = dev_get_drvdata(dev);
device_init_wakeup(dev, false);
@@ -577,6 +578,7 @@ static int acpi_tad_remove(struct platform_device *pdev)
pm_runtime_put_sync(dev);
pm_runtime_disable(dev);
+ acpi_remove_cmos_rtc_space_handler(handle);
return 0;
}
@@ -589,6 +591,11 @@ static int acpi_tad_probe(struct platform_device *pdev)
unsigned long long caps;
int ret;
+ ret = acpi_install_cmos_rtc_space_handler(handle);
+ if (ret < 0) {
+ dev_info(dev, "Unable to install space handler\n");
+ return -ENODEV;
+ }
/*
* Initialization failure messages are mostly about firmware issues, so
* print them at the "info" level.
@@ -596,22 +603,27 @@ static int acpi_tad_probe(struct platform_device *pdev)
status = acpi_evaluate_integer(handle, "_GCP", NULL, &caps);
if (ACPI_FAILURE(status)) {
dev_info(dev, "Unable to get capabilities\n");
- return -ENODEV;
+ ret = -ENODEV;
+ goto remove_handler;
}
if (!(caps & ACPI_TAD_AC_WAKE)) {
dev_info(dev, "Unsupported capabilities\n");
- return -ENODEV;
+ ret = -ENODEV;
+ goto remove_handler;
}
if (!acpi_has_method(handle, "_PRW")) {
dev_info(dev, "Missing _PRW\n");
- return -ENODEV;
+ ret = -ENODEV;
+ goto remove_handler;
}
dd = devm_kzalloc(dev, sizeof(*dd), GFP_KERNEL);
- if (!dd)
- return -ENOMEM;
+ if (!dd) {
+ ret = -ENOMEM;
+ goto remove_handler;
+ }
dd->capabilities = caps;
dev_set_drvdata(dev, dd);
@@ -653,6 +665,11 @@ static int acpi_tad_probe(struct platform_device *pdev)
fail:
acpi_tad_remove(pdev);
+	/* Don't fall through: the CMOS RTC space handler was already removed by acpi_tad_remove() */
+ return ret;
+
+remove_handler:
+ acpi_remove_cmos_rtc_space_handler(handle);
return ret;
}
diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c
index 62f4364e4460..948e31f7ce6e 100644
--- a/drivers/acpi/acpi_video.c
+++ b/drivers/acpi/acpi_video.c
@@ -77,7 +77,7 @@ static DEFINE_MUTEX(video_list_lock);
static LIST_HEAD(video_bus_head);
static int acpi_video_bus_add(struct acpi_device *device);
static void acpi_video_bus_remove(struct acpi_device *device);
-static void acpi_video_bus_notify(struct acpi_device *device, u32 event);
+static void acpi_video_bus_notify(acpi_handle handle, u32 event, void *data);
/*
* Indices in the _BCL method response: the first two items are special,
@@ -104,7 +104,6 @@ static struct acpi_driver acpi_video_bus = {
.ops = {
.add = acpi_video_bus_add,
.remove = acpi_video_bus_remove,
- .notify = acpi_video_bus_notify,
},
};
@@ -1527,8 +1526,9 @@ static int acpi_video_bus_stop_devices(struct acpi_video_bus *video)
acpi_osi_is_win8() ? 0 : 1);
}
-static void acpi_video_bus_notify(struct acpi_device *device, u32 event)
+static void acpi_video_bus_notify(acpi_handle handle, u32 event, void *data)
{
+ struct acpi_device *device = data;
struct acpi_video_bus *video = acpi_driver_data(device);
struct input_dev *input;
int keycode = 0;
@@ -2027,6 +2027,12 @@ static int acpi_video_bus_add(struct acpi_device *device)
if (error)
goto err_put_video;
+ /*
+ * HP ZBook Fury 16 G10 requires ACPI video's child devices have _PS0
+ * evaluated to have functional panel brightness control.
+ */
+ acpi_device_fix_up_power_extended(device);
+
pr_info("%s [%s] (multi-head: %s rom: %s post: %s)\n",
ACPI_VIDEO_DEVICE_NAME, acpi_device_bid(device),
video->flags.multihead ? "yes" : "no",
@@ -2053,8 +2059,19 @@ static int acpi_video_bus_add(struct acpi_device *device)
acpi_video_bus_add_notify_handler(video);
+ error = acpi_dev_install_notify_handler(device, ACPI_DEVICE_NOTIFY,
+ acpi_video_bus_notify);
+ if (error)
+ goto err_remove;
+
return 0;
+err_remove:
+ mutex_lock(&video_list_lock);
+ list_del(&video->entry);
+ mutex_unlock(&video_list_lock);
+ acpi_video_bus_remove_notify_handler(video);
+ acpi_video_bus_unregister_backlight(video);
err_put_video:
acpi_video_bus_put_devices(video);
kfree(video->attached_array);
@@ -2075,6 +2092,9 @@ static void acpi_video_bus_remove(struct acpi_device *device)
video = acpi_driver_data(device);
+ acpi_dev_remove_notify_handler(device, ACPI_DEVICE_NOTIFY,
+ acpi_video_bus_notify);
+
mutex_lock(&video_list_lock);
list_del(&video->entry);
mutex_unlock(&video_list_lock);
diff --git a/drivers/acpi/acpica/acdebug.h b/drivers/acpi/acpica/acdebug.h
index 22f1f7a9e5a3..911875c5a5f1 100644
--- a/drivers/acpi/acpica/acdebug.h
+++ b/drivers/acpi/acpica/acdebug.h
@@ -287,4 +287,6 @@ struct acpi_namespace_node *acpi_db_local_ns_lookup(char *name);
void acpi_db_uint32_to_hex_string(u32 value, char *buffer);
+void acpi_db_generate_interrupt(char *gsiv_arg);
+
#endif /* __ACDEBUG_H__ */
diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
index 778241173ed4..f4c90fc99be2 100644
--- a/drivers/acpi/acpica/acglobal.h
+++ b/drivers/acpi/acpica/acglobal.h
@@ -129,6 +129,7 @@ ACPI_GLOBAL(acpi_table_handler, acpi_gbl_table_handler);
ACPI_GLOBAL(void *, acpi_gbl_table_handler_context);
ACPI_GLOBAL(acpi_interface_handler, acpi_gbl_interface_handler);
ACPI_GLOBAL(struct acpi_sci_handler_info *, acpi_gbl_sci_handler_list);
+ACPI_GLOBAL(struct acpi_ged_handler_info *, acpi_gbl_ged_handler_list);
/* Owner ID support */
diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h
index 12d4a024f029..82563b44af35 100644
--- a/drivers/acpi/acpica/aclocal.h
+++ b/drivers/acpi/acpica/aclocal.h
@@ -543,6 +543,14 @@ struct acpi_field_info {
u32 pkg_length;
};
+/* Information about the interrupt ID and _EVT of a GED device */
+
+struct acpi_ged_handler_info {
+ struct acpi_ged_handler_info *next;
+ u32 int_id; /* The interrupt ID that triggers the execution of the evt_method */
+ struct acpi_namespace_node *evt_method; /* The _EVT method to be executed when an interrupt with ID = int_id is received */
+};
+
/*****************************************************************************
*
* Generic "state" object for stacks
@@ -560,25 +568,28 @@ struct acpi_field_info {
u8 descriptor_type; /* To differentiate various internal objs */\
u8 flags; \
u16 value; \
- u16 state;
+ u16 state
/* There are 2 bytes available here until the next natural alignment boundary */
struct acpi_common_state {
-ACPI_STATE_COMMON};
+ ACPI_STATE_COMMON;
+};
/*
* Update state - used to traverse complex objects such as packages
*/
struct acpi_update_state {
- ACPI_STATE_COMMON union acpi_operand_object *object;
+ ACPI_STATE_COMMON;
+ union acpi_operand_object *object;
};
/*
* Pkg state - used to traverse nested package structures
*/
struct acpi_pkg_state {
- ACPI_STATE_COMMON u32 index;
+ ACPI_STATE_COMMON;
+ u32 index;
union acpi_operand_object *source_object;
union acpi_operand_object *dest_object;
struct acpi_walk_state *walk_state;
@@ -591,7 +602,8 @@ struct acpi_pkg_state {
* Allows nesting of these constructs
*/
struct acpi_control_state {
- ACPI_STATE_COMMON u16 opcode;
+ ACPI_STATE_COMMON;
+ u16 opcode;
union acpi_parse_object *predicate_op;
u8 *aml_predicate_start; /* Start of if/while predicate */
u8 *package_end; /* End of if/while block */
@@ -602,11 +614,13 @@ struct acpi_control_state {
* Scope state - current scope during namespace lookups
*/
struct acpi_scope_state {
- ACPI_STATE_COMMON struct acpi_namespace_node *node;
+ ACPI_STATE_COMMON;
+ struct acpi_namespace_node *node;
};
struct acpi_pscope_state {
- ACPI_STATE_COMMON u32 arg_count; /* Number of fixed arguments */
+ ACPI_STATE_COMMON;
+ u32 arg_count; /* Number of fixed arguments */
union acpi_parse_object *op; /* Current op being parsed */
u8 *arg_end; /* Current argument end */
u8 *pkg_end; /* Current package end */
@@ -618,7 +632,8 @@ struct acpi_pscope_state {
* states are created when there are nested control methods executing.
*/
struct acpi_thread_state {
- ACPI_STATE_COMMON u8 current_sync_level; /* Mutex Sync (nested acquire) level */
+ ACPI_STATE_COMMON;
+ u8 current_sync_level; /* Mutex Sync (nested acquire) level */
struct acpi_walk_state *walk_state_list; /* Head of list of walk_states for this thread */
union acpi_operand_object *acquired_mutex_list; /* List of all currently acquired mutexes */
acpi_thread_id thread_id; /* Running thread ID */
@@ -629,8 +644,8 @@ struct acpi_thread_state {
* AML arguments
*/
struct acpi_result_values {
- ACPI_STATE_COMMON
- union acpi_operand_object *obj_desc[ACPI_RESULTS_FRAME_OBJ_NUM];
+ ACPI_STATE_COMMON;
+ union acpi_operand_object *obj_desc[ACPI_RESULTS_FRAME_OBJ_NUM];
};
typedef
@@ -652,7 +667,8 @@ struct acpi_global_notify_handler {
* handler/dispatcher.
*/
struct acpi_notify_info {
- ACPI_STATE_COMMON u8 handler_list_id;
+ ACPI_STATE_COMMON;
+ u8 handler_list_id;
struct acpi_namespace_node *node;
union acpi_operand_object *handler_list_head;
struct acpi_global_notify_handler *global;
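
All of these hunks exist because ACPI_STATE_COMMON used to carry its own trailing semicolon, which forced the odd "ACPI_STATE_COMMON u32 index;" spelling at every use site. Dropping the semicolon from the macro lets each struct terminate the common fields like any other member. A self-contained sketch of the idiom (simplified field list, not the real ACPICA macro):

	#include <stdint.h>

	#define STATE_COMMON \
		void *next; \
		uint8_t descriptor_type; \
		uint8_t flags; \
		uint16_t value; \
		uint16_t state  /* no trailing ';' on purpose */

	struct update_state {
		STATE_COMMON;   /* each use site supplies its own ';' */
		void *object;
	};

	int main(void) { struct update_state s = {0}; return s.flags; }
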
diff --git a/drivers/acpi/acpica/acpredef.h b/drivers/acpi/acpica/acpredef.h
index e64aabe3d33a..2e442f5a3123 100644
--- a/drivers/acpi/acpica/acpredef.h
+++ b/drivers/acpi/acpica/acpredef.h
@@ -440,6 +440,9 @@ const union acpi_predefined_info acpi_gbl_predefined_methods[] = {
{{"_DOS", METHOD_1ARGS(ACPI_TYPE_INTEGER),
METHOD_NO_RETURN_VALUE}},
+ {{"_DSC", METHOD_0ARGS,
+ METHOD_RETURNS(ACPI_RTYPE_INTEGER)}},
+
{{"_DSD", METHOD_0ARGS, /* ACPI 6.0 */
METHOD_RETURNS(ACPI_RTYPE_PACKAGE)}}, /* Variable-length (Pkgs) each: 1 Buf, 1 Pkg */
PACKAGE_INFO(ACPI_PTYPE2_UUID_PAIR, ACPI_RTYPE_BUFFER, 1,
diff --git a/drivers/acpi/acpica/dbcmds.c b/drivers/acpi/acpica/dbcmds.c
index 9eb68e0751c7..3d99a9048585 100644
--- a/drivers/acpi/acpica/dbcmds.c
+++ b/drivers/acpi/acpica/dbcmds.c
@@ -1010,6 +1010,64 @@ void acpi_db_display_resources(char *object_arg)
acpi_db_set_output_destination(ACPI_DB_CONSOLE_OUTPUT);
}
+/*******************************************************************************
+ *
+ * FUNCTION: acpi_db_generate_interrupt
+ *
+ * PARAMETERS: gsiv_arg - Raw GSIV number, ASCII string
+ *
+ * RETURN: None
+ *
+ * DESCRIPTION: Simulate firing of a GED
+ *
+ ******************************************************************************/
+
+void acpi_db_generate_interrupt(char *gsiv_arg)
+{
+ u32 gsiv_number;
+ struct acpi_ged_handler_info *ged_info = acpi_gbl_ged_handler_list;
+
+ if (!ged_info) {
+ acpi_os_printf("No GED handling present\n");
+ return;
+ }
+
+ gsiv_number = strtoul(gsiv_arg, NULL, 0);
+
+ while (ged_info) {
+
+ if (ged_info->int_id == gsiv_number) {
+ struct acpi_object_list arg_list;
+ union acpi_object arg0;
+ acpi_handle evt_handle = ged_info->evt_method;
+ acpi_status status;
+
+ acpi_os_printf("Evaluate GED _EVT (GSIV=%d)\n",
+ gsiv_number);
+
+ if (!evt_handle) {
+ acpi_os_printf("Undefined _EVT method\n");
+ return;
+ }
+
+ arg0.integer.type = ACPI_TYPE_INTEGER;
+ arg0.integer.value = gsiv_number;
+
+ arg_list.count = 1;
+ arg_list.pointer = &arg0;
+
+ status =
+ acpi_evaluate_object(evt_handle, NULL, &arg_list,
+ NULL);
+ if (ACPI_FAILURE(status)) {
+ acpi_os_printf("Could not evaluate _EVT\n");
+ return;
+ }
+
+ }
+ ged_info = ged_info->next;
+ }
+}
+
#if (!ACPI_REDUCED_HARDWARE)
/*******************************************************************************
*
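
acpi_db_generate_interrupt() above is essentially a keyed walk of the global GED list: find the record whose int_id matches the requested GSIV and evaluate its _EVT control method with the GSIV as Arg0. A reduced user-space model of that lookup, using plain C types rather than ACPICA objects:

	#include <stdio.h>

	struct ged_info {
		struct ged_info *next;
		unsigned int int_id;
		void (*evt)(unsigned int gsiv);
	};

	static void fire(unsigned int gsiv) { printf("_EVT(%u)\n", gsiv); }

	static void generate_interrupt(struct ged_info *list, unsigned int gsiv)
	{
		for (; list; list = list->next)
			if (list->int_id == gsiv && list->evt)
				list->evt(gsiv);  /* _EVT takes the GSIV as Arg0 */
	}

	int main(void)
	{
		struct ged_info a = { .next = NULL, .int_id = 23, .evt = fire };
		generate_interrupt(&a, 23);
		return 0;
	}
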
diff --git a/drivers/acpi/acpica/dbinput.c b/drivers/acpi/acpica/dbinput.c
index b8a48923064f..861b12c334ab 100644
--- a/drivers/acpi/acpica/dbinput.c
+++ b/drivers/acpi/acpica/dbinput.c
@@ -106,6 +106,7 @@ enum acpi_ex_debugger_commands {
CMD_THREADS,
CMD_TEST,
+ CMD_INTERRUPT,
#endif
};
@@ -185,6 +186,7 @@ static const struct acpi_db_command_info acpi_gbl_db_commands[] = {
{"THREADS", 3},
{"TEST", 1},
+ {"INTERRUPT", 1},
#endif
{NULL, 0}
};
@@ -318,6 +320,7 @@ static const struct acpi_db_command_help acpi_gbl_db_command_help[] = {
{1, " Gpes", "Display info on all GPE devices\n"},
{1, " Sci", "Generate an SCI\n"},
{1, " Sleep [SleepState]", "Simulate sleep/wake sequence(s) (0-5)\n"},
+ {1, " Interrupt <GSIV>", "Simulate an interrupt\n"},
#endif
{0, NULL, NULL}
};
@@ -1064,6 +1067,11 @@ acpi_db_command_dispatch(char *input_buffer,
acpi_os_printf("Event command not implemented\n");
break;
+ case CMD_INTERRUPT:
+
+ acpi_db_generate_interrupt(acpi_gbl_db_args[1]);
+ break;
+
case CMD_GPE:
acpi_db_generate_gpe(acpi_gbl_db_args[1], acpi_gbl_db_args[2]);
diff --git a/drivers/acpi/acpica/dswstate.c b/drivers/acpi/acpica/dswstate.c
index d3841ded3a81..75338a13c802 100644
--- a/drivers/acpi/acpica/dswstate.c
+++ b/drivers/acpi/acpica/dswstate.c
@@ -146,8 +146,8 @@ acpi_ds_result_push(union acpi_operand_object *object,
if (!object) {
ACPI_ERROR((AE_INFO,
- "Null Object! Obj=%p State=%p Num=%u",
- object, walk_state, walk_state->result_count));
+ "Null Object! State=%p Num=%u",
+ walk_state, walk_state->result_count));
return (AE_BAD_PARAMETER);
}
diff --git a/drivers/acpi/acpica/exserial.c b/drivers/acpi/acpica/exserial.c
index 5d99b1a76c83..5241f4c01c76 100644
--- a/drivers/acpi/acpica/exserial.c
+++ b/drivers/acpi/acpica/exserial.c
@@ -343,8 +343,7 @@ acpi_ex_write_serial_bus(union acpi_operand_object *source_desc,
/* Copy the input buffer data to the transfer buffer */
buffer = buffer_desc->buffer.pointer;
- data_length = (buffer_length < source_desc->buffer.length ?
- buffer_length : source_desc->buffer.length);
+ data_length = ACPI_MIN(buffer_length, source_desc->buffer.length);
memcpy(buffer, source_desc->buffer.pointer, data_length);
/* Lock entire transaction if requested */
diff --git a/drivers/acpi/acpica/psopcode.c b/drivers/acpi/acpica/psopcode.c
index 09029fe545f1..39e31030e5f4 100644
--- a/drivers/acpi/acpica/psopcode.c
+++ b/drivers/acpi/acpica/psopcode.c
@@ -603,7 +603,7 @@ const struct acpi_opcode_info acpi_gbl_aml_op_info[AML_NUM_OPCODES] = {
/* 7E */ ACPI_OP("Timer", ARGP_TIMER_OP, ARGI_TIMER_OP, ACPI_TYPE_ANY,
AML_CLASS_EXECUTE, AML_TYPE_EXEC_0A_0T_1R,
- AML_FLAGS_EXEC_0A_0T_1R),
+ AML_FLAGS_EXEC_0A_0T_1R | AML_NO_OPERAND_RESOLVE),
/* ACPI 5.0 opcodes */
diff --git a/drivers/acpi/acpica/utdebug.c b/drivers/acpi/acpica/utdebug.c
index 1bbba8585fa6..c5f6c85a3a09 100644
--- a/drivers/acpi/acpica/utdebug.c
+++ b/drivers/acpi/acpica/utdebug.c
@@ -37,7 +37,12 @@ void acpi_ut_init_stack_ptr_trace(void)
{
acpi_size current_sp;
+#pragma GCC diagnostic push
+#if defined(__GNUC__) && __GNUC__ >= 12
+#pragma GCC diagnostic ignored "-Wdangling-pointer="
+#endif
acpi_gbl_entry_stack_pointer = &current_sp;
+#pragma GCC diagnostic pop
}
/*******************************************************************************
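
The pragmas added above scope a one-statement suppression of GCC 12's -Wdangling-pointer, which (correctly) notices a local's address escaping to a global; ACPICA does this on purpose to record the entry stack pointer. The same pattern in isolation, assuming a GCC-compatible compiler:

	#include <stddef.h>

	static size_t *entry_stack_pointer;

	void init_stack_ptr_trace(void)
	{
		size_t current_sp;

	#pragma GCC diagnostic push
	#if defined(__GNUC__) && __GNUC__ >= 12
	#pragma GCC diagnostic ignored "-Wdangling-pointer="
	#endif
		entry_stack_pointer = &current_sp;  /* deliberate: stack-depth probe */
	#pragma GCC diagnostic pop
	}

	int main(void) { init_stack_ptr_trace(); return 0; }
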
diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile
index f81fe24894b2..143debc1ba4a 100644
--- a/drivers/acpi/arm64/Makefile
+++ b/drivers/acpi/arm64/Makefile
@@ -3,4 +3,5 @@ obj-$(CONFIG_ACPI_AGDI) += agdi.o
obj-$(CONFIG_ACPI_IORT) += iort.o
obj-$(CONFIG_ACPI_GTDT) += gtdt.o
obj-$(CONFIG_ACPI_APMT) += apmt.o
+obj-$(CONFIG_ARM_AMBA) += amba.o
obj-y += dma.o init.o
diff --git a/drivers/acpi/acpi_amba.c b/drivers/acpi/arm64/amba.c
index f5b443ab01c2..b2a7631d7ac7 100644
--- a/drivers/acpi/acpi_amba.c
+++ b/drivers/acpi/arm64/amba.c
@@ -17,7 +17,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
-#include "internal.h"
+#include "init.h"
static const struct acpi_device_id amba_id_list[] = {
{"ARMH0061", 0}, /* PL061 GPIO Device */
diff --git a/drivers/acpi/arm64/init.c b/drivers/acpi/arm64/init.c
index d3ce53dda122..d0c8aed90fd1 100644
--- a/drivers/acpi/arm64/init.c
+++ b/drivers/acpi/arm64/init.c
@@ -10,4 +10,6 @@ void __init acpi_arm_init(void)
acpi_apmt_init();
if (IS_ENABLED(CONFIG_ACPI_IORT))
acpi_iort_init();
+ if (IS_ENABLED(CONFIG_ARM_AMBA))
+ acpi_amba_init();
}
diff --git a/drivers/acpi/arm64/init.h b/drivers/acpi/arm64/init.h
index a1715a2a34e9..dcc277977194 100644
--- a/drivers/acpi/arm64/init.h
+++ b/drivers/acpi/arm64/init.h
@@ -4,3 +4,4 @@
void __init acpi_agdi_init(void);
void __init acpi_apmt_init(void);
void __init acpi_iort_init(void);
+void __init acpi_amba_init(void);
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 56d887323ae5..6496ff5a6ba2 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1708,7 +1708,10 @@ static void __init arm_smmu_v3_pmcg_init_resources(struct resource *res,
static struct acpi_platform_list pmcg_plat_info[] __initdata = {
/* HiSilicon Hip08 Platform */
{"HISI ", "HIP08 ", 0, ACPI_SIG_IORT, greater_than_or_equal,
- "Erratum #162001800", IORT_SMMU_V3_PMCG_HISI_HIP08},
+ "Erratum #162001800, Erratum #162001900", IORT_SMMU_V3_PMCG_HISI_HIP08},
+ /* HiSilicon Hip09 Platform */
+ {"HISI ", "HIP09 ", 0, ACPI_SIG_IORT, greater_than_or_equal,
+ "Erratum #162001900", IORT_SMMU_V3_PMCG_HISI_HIP09},
{ }
};
diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index 9c67ed02d797..969bf81e8d54 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -1034,8 +1034,9 @@ static void acpi_battery_refresh(struct acpi_battery *battery)
}
/* Driver Interface */
-static void acpi_battery_notify(struct acpi_device *device, u32 event)
+static void acpi_battery_notify(acpi_handle handle, u32 event, void *data)
{
+ struct acpi_device *device = data;
struct acpi_battery *battery = acpi_driver_data(device);
struct power_supply *old;
@@ -1212,13 +1213,22 @@ static int acpi_battery_add(struct acpi_device *device)
device_init_wakeup(&device->dev, 1);
- return result;
+ result = acpi_dev_install_notify_handler(device, ACPI_ALL_NOTIFY,
+ acpi_battery_notify);
+ if (result)
+ goto fail_pm;
+
+ return 0;
+fail_pm:
+ device_init_wakeup(&device->dev, 0);
+ unregister_pm_notifier(&battery->pm_nb);
fail:
sysfs_remove_battery(battery);
mutex_destroy(&battery->lock);
mutex_destroy(&battery->sysfs_lock);
kfree(battery);
+
return result;
}
@@ -1228,10 +1238,16 @@ static void acpi_battery_remove(struct acpi_device *device)
if (!device || !acpi_driver_data(device))
return;
- device_init_wakeup(&device->dev, 0);
+
battery = acpi_driver_data(device);
+
+ acpi_dev_remove_notify_handler(device, ACPI_ALL_NOTIFY,
+ acpi_battery_notify);
+
+ device_init_wakeup(&device->dev, 0);
unregister_pm_notifier(&battery->pm_nb);
sysfs_remove_battery(battery);
+
mutex_destroy(&battery->lock);
mutex_destroy(&battery->sysfs_lock);
kfree(battery);
@@ -1264,11 +1280,9 @@ static struct acpi_driver acpi_battery_driver = {
.name = "battery",
.class = ACPI_BATTERY_CLASS,
.ids = battery_device_ids,
- .flags = ACPI_DRIVER_ALL_NOTIFY_EVENTS,
.ops = {
.add = acpi_battery_add,
.remove = acpi_battery_remove,
- .notify = acpi_battery_notify,
},
.drv.pm = &acpi_battery_pm,
};
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 2fc2b43a4ed3..f41dda2d3493 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -554,6 +554,30 @@ static void acpi_device_remove_notify_handler(struct acpi_device *device,
acpi_os_wait_events_complete();
}
+int acpi_dev_install_notify_handler(struct acpi_device *adev,
+ u32 handler_type,
+ acpi_notify_handler handler)
+{
+ acpi_status status;
+
+ status = acpi_install_notify_handler(adev->handle, handler_type,
+ handler, adev);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(acpi_dev_install_notify_handler);
+
+void acpi_dev_remove_notify_handler(struct acpi_device *adev,
+ u32 handler_type,
+ acpi_notify_handler handler)
+{
+ acpi_remove_notify_handler(adev->handle, handler_type, handler);
+ acpi_os_wait_events_complete();
+}
+EXPORT_SYMBOL_GPL(acpi_dev_remove_notify_handler);
+
/* Handle events targeting \_SB device (at present only graceful shutdown) */
#define ACPI_SB_NOTIFY_SHUTDOWN_REQUEST 0x81
@@ -1005,8 +1029,10 @@ static int acpi_device_probe(struct device *dev)
return -ENOSYS;
ret = acpi_drv->ops.add(acpi_dev);
- if (ret)
+ if (ret) {
+ acpi_dev->driver_data = NULL;
return ret;
+ }
pr_debug("Driver [%s] successfully bound to device [%s]\n",
acpi_drv->name, acpi_dev->pnp.bus_id);
@@ -1296,9 +1322,6 @@ static int __init acpi_bus_init(void)
goto error1;
}
- /* Set capability bits for _OSC under processor scope */
- acpi_early_processor_osc();
-
/*
* _OSC method may exist in module level code,
* so it must be run after ACPI_FULL_INITIALIZATION
@@ -1314,7 +1337,7 @@ static int __init acpi_bus_init(void)
acpi_sysfs_init();
- acpi_early_processor_set_pdc();
+ acpi_early_processor_control_setup();
/*
* Maybe EC region is required at bus_scan/acpi_get_devices. So it
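
The two exported helpers above let converted drivers install a notify handler from .add() with the acpi_device passed back as the handler's context, replacing the old .notify driver callback. A skeleton of the conversion pattern (the foo_* names are illustrative, not a real driver; error handling beyond the install call is omitted):

	#include <linux/acpi.h>

	static void foo_notify(acpi_handle handle, u32 event, void *data)
	{
		struct acpi_device *adev = data;  /* context supplied at install */
		/* ... handle the event for adev ... */
	}

	static int foo_add(struct acpi_device *adev)
	{
		/* ... driver setup ... */
		return acpi_dev_install_notify_handler(adev, ACPI_DEVICE_NOTIFY,
						       foo_notify);
	}

	static void foo_remove(struct acpi_device *adev)
	{
		acpi_dev_remove_notify_handler(adev, ACPI_DEVICE_NOTIFY,
					       foo_notify);
		/* ... driver teardown ... */
	}
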
diff --git a/drivers/acpi/hed.c b/drivers/acpi/hed.c
index 78d44e3fe129..46c6f8c35b43 100644
--- a/drivers/acpi/hed.c
+++ b/drivers/acpi/hed.c
@@ -42,22 +42,32 @@ EXPORT_SYMBOL_GPL(unregister_acpi_hed_notifier);
* it is used by HEST Generic Hardware Error Source with notify type
* SCI.
*/
-static void acpi_hed_notify(struct acpi_device *device, u32 event)
+static void acpi_hed_notify(acpi_handle handle, u32 event, void *data)
{
blocking_notifier_call_chain(&acpi_hed_notify_list, 0, NULL);
}
static int acpi_hed_add(struct acpi_device *device)
{
+ int err;
+
/* Only one hardware error device */
if (hed_handle)
return -EINVAL;
hed_handle = device->handle;
- return 0;
+
+ err = acpi_dev_install_notify_handler(device, ACPI_DEVICE_NOTIFY,
+ acpi_hed_notify);
+ if (err)
+ hed_handle = NULL;
+
+ return err;
}
static void acpi_hed_remove(struct acpi_device *device)
{
+ acpi_dev_remove_notify_handler(device, ACPI_DEVICE_NOTIFY,
+ acpi_hed_notify);
hed_handle = NULL;
}
@@ -68,7 +78,6 @@ static struct acpi_driver acpi_hed_driver = {
.ops = {
.add = acpi_hed_add,
.remove = acpi_hed_remove,
- .notify = acpi_hed_notify,
},
};
module_acpi_driver(acpi_hed_driver);
diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index f4148dc50b9c..866c7c4ed233 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -28,11 +28,6 @@ void acpi_processor_init(void);
void acpi_platform_init(void);
void acpi_pnp_init(void);
void acpi_int340x_thermal_init(void);
-#ifdef CONFIG_ARM_AMBA
-void acpi_amba_init(void);
-#else
-static inline void acpi_amba_init(void) {}
-#endif
int acpi_sysfs_init(void);
void acpi_gpe_apply_masked_gpes(void);
void acpi_container_init(void);
@@ -128,7 +123,6 @@ int __acpi_device_uevent_modalias(const struct acpi_device *adev,
/* --------------------------------------------------------------------------
Power Resource
-------------------------------------------------------------------------- */
-int acpi_power_init(void);
void acpi_power_resources_list_free(struct list_head *list);
int acpi_extract_power_resources(union acpi_object *package, unsigned int start,
struct list_head *list);
@@ -152,15 +146,13 @@ int acpi_wakeup_device_init(void);
Processor
-------------------------------------------------------------------------- */
#ifdef CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC
+void acpi_early_processor_control_setup(void);
void acpi_early_processor_set_pdc(void);
-#else
-static inline void acpi_early_processor_set_pdc(void) {}
-#endif
-#ifdef CONFIG_X86
-void acpi_early_processor_osc(void);
+void acpi_proc_quirk_mwait_check(void);
+bool processor_physically_present(acpi_handle handle);
#else
-static inline void acpi_early_processor_osc(void) {}
+static inline void acpi_early_processor_control_setup(void) {}
#endif
/* --------------------------------------------------------------------------
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 07204d482968..f0e6738ae3c9 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -3282,6 +3282,23 @@ static void acpi_nfit_put_table(void *table)
acpi_put_table(table);
}
+static void acpi_nfit_notify(acpi_handle handle, u32 event, void *data)
+{
+ struct acpi_device *adev = data;
+
+ device_lock(&adev->dev);
+ __acpi_nfit_notify(&adev->dev, handle, event);
+ device_unlock(&adev->dev);
+}
+
+static void acpi_nfit_remove_notify_handler(void *data)
+{
+ struct acpi_device *adev = data;
+
+ acpi_dev_remove_notify_handler(adev, ACPI_DEVICE_NOTIFY,
+ acpi_nfit_notify);
+}
+
void acpi_nfit_shutdown(void *data)
{
struct acpi_nfit_desc *acpi_desc = data;
@@ -3368,12 +3385,18 @@ static int acpi_nfit_add(struct acpi_device *adev)
if (rc)
return rc;
- return devm_add_action_or_reset(dev, acpi_nfit_shutdown, acpi_desc);
-}
-static void acpi_nfit_remove(struct acpi_device *adev)
-{
- /* see acpi_nfit_unregister */
+ rc = devm_add_action_or_reset(dev, acpi_nfit_shutdown, acpi_desc);
+ if (rc)
+ return rc;
+
+ rc = acpi_dev_install_notify_handler(adev, ACPI_DEVICE_NOTIFY,
+ acpi_nfit_notify);
+ if (rc)
+ return rc;
+
+ return devm_add_action_or_reset(dev, acpi_nfit_remove_notify_handler,
+ adev);
}
static void acpi_nfit_update_notify(struct device *dev, acpi_handle handle)
@@ -3446,13 +3469,6 @@ void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event)
}
EXPORT_SYMBOL_GPL(__acpi_nfit_notify);
-static void acpi_nfit_notify(struct acpi_device *adev, u32 event)
-{
- device_lock(&adev->dev);
- __acpi_nfit_notify(&adev->dev, adev->handle, event);
- device_unlock(&adev->dev);
-}
-
static const struct acpi_device_id acpi_nfit_ids[] = {
{ "ACPI0012", 0 },
{ "", 0 },
@@ -3464,8 +3480,6 @@ static struct acpi_driver acpi_nfit_driver = {
.ids = acpi_nfit_ids,
.ops = {
.add = acpi_nfit_add,
- .remove = acpi_nfit_remove,
- .notify = acpi_nfit_notify,
},
};
diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c
index 3d4c4620f9f9..7020584096bf 100644
--- a/drivers/acpi/prmt.c
+++ b/drivers/acpi/prmt.c
@@ -53,7 +53,7 @@ static LIST_HEAD(prm_module_list);
struct prm_handler_info {
guid_t guid;
- void *handler_addr;
+ efi_status_t (__efiapi *handler_addr)(u64, void *);
u64 static_data_buffer_addr;
u64 acpi_param_buffer_addr;
@@ -260,9 +260,9 @@ static acpi_status acpi_platformrt_space_handler(u32 function,
context.static_data_buffer = handler->static_data_buffer_addr;
context.mmio_ranges = module->mmio_info;
- status = efi_call_virt_pointer(handler, handler_addr,
- handler->acpi_param_buffer_addr,
- &context);
+ status = efi_call_acpi_prm_handler(handler->handler_addr,
+ handler->acpi_param_buffer_addr,
+ &context);
if (status == EFI_SUCCESS) {
buffer->prm_status = PRM_HANDLER_SUCCESS;
} else {
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index d6606a9f2da6..7dd6dbaa98c3 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -132,6 +132,30 @@ static int map_rintc_hartid(struct acpi_subtable_header *entry,
return -EINVAL;
}
+/*
+ * Retrieve LoongArch CPU physical id
+ */
+static int map_core_pic_id(struct acpi_subtable_header *entry,
+ int device_declaration, u32 acpi_id, phys_cpuid_t *phys_id)
+{
+ struct acpi_madt_core_pic *core_pic =
+ container_of(entry, struct acpi_madt_core_pic, header);
+
+ if (!(core_pic->flags & ACPI_MADT_ENABLED))
+ return -ENODEV;
+
+ /* device_declaration means a Device object in the DSDT. On LoongArch
+ * systems the logical processor acpi_id comes from the _UID property
+ * of the DSDT, so device_declaration must be checked here.
+ */
+ if (device_declaration && (core_pic->processor_id == acpi_id)) {
+ *phys_id = core_pic->core_id;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
static phys_cpuid_t map_madt_entry(struct acpi_table_madt *madt,
int type, u32 acpi_id)
{
@@ -165,6 +189,9 @@ static phys_cpuid_t map_madt_entry(struct acpi_table_madt *madt,
} else if (header->type == ACPI_MADT_TYPE_RINTC) {
if (!map_rintc_hartid(header, type, acpi_id, &phys_id))
break;
+ } else if (header->type == ACPI_MADT_TYPE_CORE_PIC) {
+ if (!map_core_pic_id(header, type, acpi_id, &phys_id))
+ break;
}
entry += header->length;
}
@@ -216,6 +243,8 @@ static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id)
map_x2apic_id(header, type, acpi_id, &phys_id);
else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT)
map_gicc_mpidr(header, type, acpi_id, &phys_id);
+ else if (header->type == ACPI_MADT_TYPE_CORE_PIC)
+ map_core_pic_id(header, type, acpi_id, &phys_id);
exit:
kfree(buffer.pointer);
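
map_core_pic_id() translates a logical processor acpi_id into the physical core_id carried by a LoongArch CORE_PIC MADT entry, skipping disabled entries. A reduced model of that translation with simplified structs (not the real ACPI table layout):

	#include <stdio.h>

	struct core_pic { unsigned int processor_id, core_id, enabled; };

	static int map_core_pic(const struct core_pic *e, unsigned int acpi_id,
				unsigned int *phys_id)
	{
		if (!e->enabled)
			return -1;
		if (e->processor_id != acpi_id)
			return -1;
		*phys_id = e->core_id;  /* physical CPU id for this logical id */
		return 0;
	}

	int main(void)
	{
		struct core_pic e = { .processor_id = 2, .core_id = 0x102, .enabled = 1 };
		unsigned int phys;

		if (!map_core_pic(&e, 2, &phys))
			printf("acpi_id 2 -> core 0x%x\n", phys);
		return 0;
	}
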
diff --git a/drivers/acpi/processor_pdc.c b/drivers/acpi/processor_pdc.c
index 18fb04523f93..1a8591e9a9bf 100644
--- a/drivers/acpi/processor_pdc.c
+++ b/drivers/acpi/processor_pdc.c
@@ -9,71 +9,19 @@
#define pr_fmt(fmt) "ACPI: " fmt
-#include <linux/dmi.h>
#include <linux/slab.h>
#include <linux/acpi.h>
#include <acpi/processor.h>
-#include <xen/xen.h>
-
#include "internal.h"
-static bool __init processor_physically_present(acpi_handle handle)
-{
- int cpuid, type;
- u32 acpi_id;
- acpi_status status;
- acpi_object_type acpi_type;
- unsigned long long tmp;
- union acpi_object object = { 0 };
- struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
-
- status = acpi_get_type(handle, &acpi_type);
- if (ACPI_FAILURE(status))
- return false;
-
- switch (acpi_type) {
- case ACPI_TYPE_PROCESSOR:
- status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
- if (ACPI_FAILURE(status))
- return false;
- acpi_id = object.processor.proc_id;
- break;
- case ACPI_TYPE_DEVICE:
- status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp);
- if (ACPI_FAILURE(status))
- return false;
- acpi_id = tmp;
- break;
- default:
- return false;
- }
-
- if (xen_initial_domain())
- /*
- * When running as a Xen dom0 the number of processors Linux
- * sees can be different from the real number of processors on
- * the system, and we still need to execute _PDC for all of
- * them.
- */
- return xen_processor_present(acpi_id);
-
- type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
- cpuid = acpi_get_cpuid(handle, type, acpi_id);
-
- return !invalid_logical_cpuid(cpuid);
-}
-
static void acpi_set_pdc_bits(u32 *buf)
{
buf[0] = ACPI_PDC_REVISION_ID;
buf[1] = 1;
- /* Enable coordination with firmware's _TSD info */
- buf[2] = ACPI_PDC_SMP_T_SWCOORD;
-
/* Twiddle arch-specific bits needed for _PDC */
- arch_acpi_set_pdc_bits(buf);
+ arch_acpi_set_proc_cap_bits(&buf[2]);
}
static struct acpi_object_list *acpi_processor_alloc_pdc(void)
@@ -123,20 +71,6 @@ acpi_processor_eval_pdc(acpi_handle handle, struct acpi_object_list *pdc_in)
{
acpi_status status = AE_OK;
- if (boot_option_idle_override == IDLE_NOMWAIT) {
- /*
- * If mwait is disabled for CPU C-states, the C2C3_FFH access
- * mode will be disabled in the parameter of _PDC object.
- * Of course C1_FFH access mode will also be disabled.
- */
- union acpi_object *obj;
- u32 *buffer = NULL;
-
- obj = pdc_in->pointer;
- buffer = (u32 *)(obj->buffer.pointer);
- buffer[2] &= ~(ACPI_PDC_C_C2C3_FFH | ACPI_PDC_C_C1_FFH);
-
- }
status = acpi_evaluate_object(handle, "_PDC", pdc_in, NULL);
if (ACPI_FAILURE(status))
@@ -174,36 +108,9 @@ early_init_pdc(acpi_handle handle, u32 lvl, void *context, void **rv)
return AE_OK;
}
-static int __init set_no_mwait(const struct dmi_system_id *id)
-{
- pr_notice("%s detected - disabling mwait for CPU C-states\n",
- id->ident);
- boot_option_idle_override = IDLE_NOMWAIT;
- return 0;
-}
-
-static const struct dmi_system_id processor_idle_dmi_table[] __initconst = {
- {
- set_no_mwait, "Extensa 5220", {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
- DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
- DMI_MATCH(DMI_PRODUCT_VERSION, "0100"),
- DMI_MATCH(DMI_BOARD_NAME, "Columbia") }, NULL},
- {},
-};
-
-static void __init processor_dmi_check(void)
-{
- /*
- * Check whether the system is DMI table. If yes, OSPM
- * should not use mwait for CPU-states.
- */
- dmi_check_system(processor_idle_dmi_table);
-}
-
void __init acpi_early_processor_set_pdc(void)
{
- processor_dmi_check();
+ acpi_proc_quirk_mwait_check();
acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
ACPI_UINT32_MAX,
diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c
index a4d9f149b48d..32cfa3f4efd3 100644
--- a/drivers/acpi/resource.c
+++ b/drivers/acpi/resource.c
@@ -501,9 +501,13 @@ static const struct dmi_system_id maingear_laptop[] = {
static const struct dmi_system_id pcspecialist_laptop[] = {
{
.ident = "PCSpecialist Elimina Pro 16 M",
+ /*
+ * Some models have product-name "Elimina Pro 16 M",
+ * others "GM6BGEQ". Match on board-name to match both.
+ */
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "PCSpecialist"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Elimina Pro 16 M"),
+ DMI_MATCH(DMI_BOARD_NAME, "GM6BGEQ"),
},
},
{ }
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 87e385542576..531a9e3df717 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -795,6 +795,9 @@ static const char * const acpi_ignore_dep_ids[] = {
/* List of HIDs for which we honor deps of matching ACPI devs, when checking _DEP lists. */
static const char * const acpi_honor_dep_ids[] = {
"INT3472", /* Camera sensor PMIC / clk and regulator info */
+ "INTC1059", /* IVSC (TGL) driver must be loaded to allow i2c access to camera sensors */
+ "INTC1095", /* IVSC (ADL) driver must be loaded to allow i2c access to camera sensors */
+ "INTC100A", /* IVSC (RPL) driver must be loaded to allow i2c access to camera sensors */
NULL
};
@@ -2616,7 +2619,6 @@ void __init acpi_scan_init(void)
acpi_watchdog_init();
acpi_pnp_init();
acpi_int340x_thermal_init();
- acpi_amba_init();
acpi_init_lpit();
acpi_scan_add_handler(&generic_device_handler);
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index f9f6ebb08fdb..419590f41ed5 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -82,10 +82,6 @@ static int tzp;
module_param(tzp, int, 0444);
MODULE_PARM_DESC(tzp, "Thermal zone polling frequency, in 1/10 seconds.");
-static int nocrt;
-module_param(nocrt, int, 0);
-MODULE_PARM_DESC(nocrt, "Set to take no action upon ACPI thermal zone critical trips points.");
-
static int off;
module_param(off, int, 0);
MODULE_PARM_DESC(off, "Set to disable ACPI thermal support.");
@@ -96,35 +92,27 @@ MODULE_PARM_DESC(psv, "Disable or override all passive trip points.");
static struct workqueue_struct *acpi_thermal_pm_queue;
-struct acpi_thermal_critical {
- unsigned long temperature;
- bool valid;
-};
-
-struct acpi_thermal_hot {
+struct acpi_thermal_trip {
unsigned long temperature;
bool valid;
};
struct acpi_thermal_passive {
+ struct acpi_thermal_trip trip;
struct acpi_handle_list devices;
- unsigned long temperature;
unsigned long tc1;
unsigned long tc2;
unsigned long tsp;
- bool valid;
};
struct acpi_thermal_active {
+ struct acpi_thermal_trip trip;
struct acpi_handle_list devices;
- unsigned long temperature;
- bool valid;
- bool enabled;
};
struct acpi_thermal_trips {
- struct acpi_thermal_critical critical;
- struct acpi_thermal_hot hot;
+ struct acpi_thermal_trip critical;
+ struct acpi_thermal_trip hot;
struct acpi_thermal_passive passive;
struct acpi_thermal_active active[ACPI_THERMAL_MAX_ACTIVE];
};
@@ -137,6 +125,7 @@ struct acpi_thermal {
unsigned long polling_frequency;
volatile u8 zombie;
struct acpi_thermal_trips trips;
+ struct thermal_trip *trip_table;
struct acpi_handle_list devices;
struct thermal_zone_device *thermal_zone;
int kelvin_offset; /* in millidegrees */
@@ -190,7 +179,16 @@ static int acpi_thermal_get_polling_frequency(struct acpi_thermal *tz)
return 0;
}
-static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag)
+static int acpi_thermal_temp(struct acpi_thermal *tz, int temp_deci_k)
+{
+ if (temp_deci_k == THERMAL_TEMP_INVALID)
+ return THERMAL_TEMP_INVALID;
+
+ return deci_kelvin_to_millicelsius_with_offset(temp_deci_k,
+ tz->kelvin_offset);
+}
+
+static void __acpi_thermal_trips_update(struct acpi_thermal *tz, int flag)
{
acpi_status status;
unsigned long long tmp;
@@ -255,9 +253,9 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag)
}
/* Passive (optional) */
- if (((flag & ACPI_TRIPS_PASSIVE) && tz->trips.passive.valid) ||
+ if (((flag & ACPI_TRIPS_PASSIVE) && tz->trips.passive.trip.valid) ||
flag == ACPI_TRIPS_INIT) {
- valid = tz->trips.passive.valid;
+ valid = tz->trips.passive.trip.valid;
if (psv == -1) {
status = AE_SUPPORT;
} else if (psv > 0) {
@@ -269,44 +267,44 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag)
}
if (ACPI_FAILURE(status)) {
- tz->trips.passive.valid = false;
+ tz->trips.passive.trip.valid = false;
} else {
- tz->trips.passive.temperature = tmp;
- tz->trips.passive.valid = true;
+ tz->trips.passive.trip.temperature = tmp;
+ tz->trips.passive.trip.valid = true;
if (flag == ACPI_TRIPS_INIT) {
status = acpi_evaluate_integer(tz->device->handle,
"_TC1", NULL, &tmp);
if (ACPI_FAILURE(status))
- tz->trips.passive.valid = false;
+ tz->trips.passive.trip.valid = false;
else
tz->trips.passive.tc1 = tmp;
status = acpi_evaluate_integer(tz->device->handle,
"_TC2", NULL, &tmp);
if (ACPI_FAILURE(status))
- tz->trips.passive.valid = false;
+ tz->trips.passive.trip.valid = false;
else
tz->trips.passive.tc2 = tmp;
status = acpi_evaluate_integer(tz->device->handle,
"_TSP", NULL, &tmp);
if (ACPI_FAILURE(status))
- tz->trips.passive.valid = false;
+ tz->trips.passive.trip.valid = false;
else
tz->trips.passive.tsp = tmp;
}
}
}
- if ((flag & ACPI_TRIPS_DEVICES) && tz->trips.passive.valid) {
+ if ((flag & ACPI_TRIPS_DEVICES) && tz->trips.passive.trip.valid) {
memset(&devices, 0, sizeof(struct acpi_handle_list));
status = acpi_evaluate_reference(tz->device->handle, "_PSL",
NULL, &devices);
if (ACPI_FAILURE(status)) {
acpi_handle_info(tz->device->handle,
"Invalid passive threshold\n");
- tz->trips.passive.valid = false;
+ tz->trips.passive.trip.valid = false;
} else {
- tz->trips.passive.valid = true;
+ tz->trips.passive.trip.valid = true;
}
if (memcmp(&tz->trips.passive.devices, &devices,
@@ -317,24 +315,24 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag)
}
}
if ((flag & ACPI_TRIPS_PASSIVE) || (flag & ACPI_TRIPS_DEVICES)) {
- if (valid != tz->trips.passive.valid)
+ if (valid != tz->trips.passive.trip.valid)
ACPI_THERMAL_TRIPS_EXCEPTION(flag, tz, "state");
}
/* Active (optional) */
for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) {
char name[5] = { '_', 'A', 'C', ('0' + i), '\0' };
- valid = tz->trips.active[i].valid;
+ valid = tz->trips.active[i].trip.valid;
if (act == -1)
break; /* disable all active trip points */
if (flag == ACPI_TRIPS_INIT || ((flag & ACPI_TRIPS_ACTIVE) &&
- tz->trips.active[i].valid)) {
+ tz->trips.active[i].trip.valid)) {
status = acpi_evaluate_integer(tz->device->handle,
name, NULL, &tmp);
if (ACPI_FAILURE(status)) {
- tz->trips.active[i].valid = false;
+ tz->trips.active[i].trip.valid = false;
if (i == 0)
break;
@@ -342,35 +340,36 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag)
break;
if (i == 1)
- tz->trips.active[0].temperature = celsius_to_deci_kelvin(act);
+ tz->trips.active[0].trip.temperature =
+ celsius_to_deci_kelvin(act);
else
/*
* Don't allow override higher than
* the next higher trip point
*/
- tz->trips.active[i-1].temperature =
+ tz->trips.active[i-1].trip.temperature =
min_t(unsigned long,
- tz->trips.active[i-2].temperature,
+ tz->trips.active[i-2].trip.temperature,
celsius_to_deci_kelvin(act));
break;
} else {
- tz->trips.active[i].temperature = tmp;
- tz->trips.active[i].valid = true;
+ tz->trips.active[i].trip.temperature = tmp;
+ tz->trips.active[i].trip.valid = true;
}
}
name[2] = 'L';
- if ((flag & ACPI_TRIPS_DEVICES) && tz->trips.active[i].valid) {
+ if ((flag & ACPI_TRIPS_DEVICES) && tz->trips.active[i].trip.valid) {
memset(&devices, 0, sizeof(struct acpi_handle_list));
status = acpi_evaluate_reference(tz->device->handle,
name, NULL, &devices);
if (ACPI_FAILURE(status)) {
acpi_handle_info(tz->device->handle,
"Invalid active%d threshold\n", i);
- tz->trips.active[i].valid = false;
+ tz->trips.active[i].trip.valid = false;
} else {
- tz->trips.active[i].valid = true;
+ tz->trips.active[i].trip.valid = true;
}
if (memcmp(&tz->trips.active[i].devices, &devices,
@@ -381,10 +380,10 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag)
}
}
if ((flag & ACPI_TRIPS_ACTIVE) || (flag & ACPI_TRIPS_DEVICES))
- if (valid != tz->trips.active[i].valid)
+ if (valid != tz->trips.active[i].trip.valid)
ACPI_THERMAL_TRIPS_EXCEPTION(flag, tz, "state");
- if (!tz->trips.active[i].valid)
+ if (!tz->trips.active[i].trip.valid)
break;
}
@@ -398,24 +397,73 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag)
ACPI_THERMAL_TRIPS_EXCEPTION(flag, tz, "device");
}
}
+}
+
+static int acpi_thermal_adjust_trip(struct thermal_trip *trip, void *data)
+{
+ struct acpi_thermal_trip *acpi_trip = trip->priv;
+ struct acpi_thermal *tz = data;
+
+ if (!acpi_trip)
+ return 0;
+
+ if (acpi_trip->valid)
+ trip->temperature = acpi_thermal_temp(tz, acpi_trip->temperature);
+ else
+ trip->temperature = THERMAL_TEMP_INVALID;
return 0;
}
+static void acpi_thermal_adjust_thermal_zone(struct thermal_zone_device *thermal,
+ unsigned long data)
+{
+ struct acpi_thermal *tz = thermal_zone_device_priv(thermal);
+ int flag = data == ACPI_THERMAL_NOTIFY_THRESHOLDS ?
+ ACPI_TRIPS_THRESHOLDS : ACPI_TRIPS_DEVICES;
+
+ __acpi_thermal_trips_update(tz, flag);
+
+ for_each_thermal_trip(tz->thermal_zone, acpi_thermal_adjust_trip, tz);
+}
+
+static void acpi_queue_thermal_check(struct acpi_thermal *tz)
+{
+ if (!work_pending(&tz->thermal_check_work))
+ queue_work(acpi_thermal_pm_queue, &tz->thermal_check_work);
+}
+
+static void acpi_thermal_trips_update(struct acpi_thermal *tz, u32 event)
+{
+ struct acpi_device *adev = tz->device;
+
+ /*
+ * Use thermal_zone_device_exec() to carry out the trip points
+ * update, so as to protect thermal_get_trend() from getting stale
+ * trip point temperatures and to prevent thermal_zone_device_update()
+ * invoked from acpi_thermal_check_fn() from producing inconsistent
+ * results.
+ */
+ thermal_zone_device_exec(tz->thermal_zone,
+ acpi_thermal_adjust_thermal_zone, event);
+ acpi_queue_thermal_check(tz);
+ acpi_bus_generate_netlink_event(adev->pnp.device_class,
+ dev_name(&adev->dev), event, 0);
+}
+
static int acpi_thermal_get_trip_points(struct acpi_thermal *tz)
{
- int i, ret = acpi_thermal_trips_update(tz, ACPI_TRIPS_INIT);
bool valid;
+ int i;
- if (ret)
- return ret;
+ __acpi_thermal_trips_update(tz, ACPI_TRIPS_INIT);
valid = tz->trips.critical.valid |
tz->trips.hot.valid |
- tz->trips.passive.valid;
+ tz->trips.passive.trip.valid;
for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++)
- valid = valid || tz->trips.active[i].valid;
+ valid = valid || tz->trips.active[i].trip.valid;
if (!valid) {
pr_warn(FW_BUG "No valid trip found\n");
@@ -443,159 +491,55 @@ static int thermal_get_temp(struct thermal_zone_device *thermal, int *temp)
return 0;
}
-static int thermal_get_trip_type(struct thermal_zone_device *thermal,
- int trip, enum thermal_trip_type *type)
+static int thermal_get_trend(struct thermal_zone_device *thermal,
+ int trip_index, enum thermal_trend *trend)
{
struct acpi_thermal *tz = thermal_zone_device_priv(thermal);
- int i;
+ struct acpi_thermal_trip *acpi_trip;
+ int t, i;
- if (!tz || trip < 0)
+ if (!tz || trip_index < 0)
return -EINVAL;
- if (tz->trips.critical.valid) {
- if (!trip) {
- *type = THERMAL_TRIP_CRITICAL;
- return 0;
- }
- trip--;
- }
-
- if (tz->trips.hot.valid) {
- if (!trip) {
- *type = THERMAL_TRIP_HOT;
- return 0;
- }
- trip--;
- }
-
- if (tz->trips.passive.valid) {
- if (!trip) {
- *type = THERMAL_TRIP_PASSIVE;
- return 0;
- }
- trip--;
- }
-
- for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].valid; i++) {
- if (!trip) {
- *type = THERMAL_TRIP_ACTIVE;
- return 0;
- }
- trip--;
- }
-
- return -EINVAL;
-}
+ if (tz->trips.critical.valid)
+ trip_index--;
-static int thermal_get_trip_temp(struct thermal_zone_device *thermal,
- int trip, int *temp)
-{
- struct acpi_thermal *tz = thermal_zone_device_priv(thermal);
- int i;
+ if (tz->trips.hot.valid)
+ trip_index--;
- if (!tz || trip < 0)
+ if (trip_index < 0)
return -EINVAL;
- if (tz->trips.critical.valid) {
- if (!trip) {
- *temp = deci_kelvin_to_millicelsius_with_offset(
- tz->trips.critical.temperature,
- tz->kelvin_offset);
- return 0;
- }
- trip--;
- }
-
- if (tz->trips.hot.valid) {
- if (!trip) {
- *temp = deci_kelvin_to_millicelsius_with_offset(
- tz->trips.hot.temperature,
- tz->kelvin_offset);
- return 0;
- }
- trip--;
- }
-
- if (tz->trips.passive.valid) {
- if (!trip) {
- *temp = deci_kelvin_to_millicelsius_with_offset(
- tz->trips.passive.temperature,
- tz->kelvin_offset);
- return 0;
- }
- trip--;
- }
-
- for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE &&
- tz->trips.active[i].valid; i++) {
- if (!trip) {
- *temp = deci_kelvin_to_millicelsius_with_offset(
- tz->trips.active[i].temperature,
- tz->kelvin_offset);
- return 0;
- }
- trip--;
- }
-
- return -EINVAL;
-}
-
-static int thermal_get_crit_temp(struct thermal_zone_device *thermal,
- int *temperature)
-{
- struct acpi_thermal *tz = thermal_zone_device_priv(thermal);
+ acpi_trip = &tz->trips.passive.trip;
+ if (acpi_trip->valid && !trip_index--) {
+ t = tz->trips.passive.tc1 * (tz->temperature -
+ tz->last_temperature) +
+ tz->trips.passive.tc2 * (tz->temperature -
+ acpi_trip->temperature);
+ if (t > 0)
+ *trend = THERMAL_TREND_RAISING;
+ else if (t < 0)
+ *trend = THERMAL_TREND_DROPPING;
+ else
+ *trend = THERMAL_TREND_STABLE;
- if (tz->trips.critical.valid) {
- *temperature = deci_kelvin_to_millicelsius_with_offset(
- tz->trips.critical.temperature,
- tz->kelvin_offset);
return 0;
}
- return -EINVAL;
-}
-
-static int thermal_get_trend(struct thermal_zone_device *thermal,
- int trip, enum thermal_trend *trend)
-{
- struct acpi_thermal *tz = thermal_zone_device_priv(thermal);
- enum thermal_trip_type type;
- int i;
-
- if (thermal_get_trip_type(thermal, trip, &type))
- return -EINVAL;
-
- if (type == THERMAL_TRIP_ACTIVE) {
- int trip_temp;
- int temp = deci_kelvin_to_millicelsius_with_offset(
- tz->temperature, tz->kelvin_offset);
- if (thermal_get_trip_temp(thermal, trip, &trip_temp))
- return -EINVAL;
+ t = acpi_thermal_temp(tz, tz->temperature);
- if (temp > trip_temp) {
- *trend = THERMAL_TREND_RAISING;
- return 0;
- } else {
- /* Fall back on default trend */
- return -EINVAL;
+ for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) {
+ acpi_trip = &tz->trips.active[i].trip;
+ if (acpi_trip->valid && !trip_index--) {
+ if (t > acpi_thermal_temp(tz, acpi_trip->temperature)) {
+ *trend = THERMAL_TREND_RAISING;
+ return 0;
+ }
+ break;
}
}
- /*
- * tz->temperature has already been updated by generic thermal layer,
- * before this callback being invoked
- */
- i = tz->trips.passive.tc1 * (tz->temperature - tz->last_temperature) +
- tz->trips.passive.tc2 * (tz->temperature - tz->trips.passive.temperature);
-
- if (i > 0)
- *trend = THERMAL_TREND_RAISING;
- else if (i < 0)
- *trend = THERMAL_TREND_DROPPING;
- else
- *trend = THERMAL_TREND_STABLE;
-
- return 0;
+ return -EINVAL;
}
static void acpi_thermal_zone_device_hot(struct thermal_zone_device *thermal)
@@ -637,7 +581,7 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal,
if (tz->trips.hot.valid)
trip++;
- if (tz->trips.passive.valid) {
+ if (tz->trips.passive.trip.valid) {
trip++;
for (i = 0; i < tz->trips.passive.devices.count; i++) {
handle = tz->trips.passive.devices.handles[i];
@@ -662,7 +606,7 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal,
}
for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) {
- if (!tz->trips.active[i].valid)
+ if (!tz->trips.active[i].trip.valid)
break;
trip++;
@@ -709,9 +653,6 @@ static struct thermal_zone_device_ops acpi_thermal_zone_ops = {
.bind = acpi_thermal_bind_cooling_device,
.unbind = acpi_thermal_unbind_cooling_device,
.get_temp = thermal_get_temp,
- .get_trip_type = thermal_get_trip_type,
- .get_trip_temp = thermal_get_trip_temp,
- .get_crit_temp = thermal_get_crit_temp,
.get_trend = thermal_get_trend,
.hot = acpi_thermal_zone_device_hot,
.critical = acpi_thermal_zone_device_critical,
@@ -745,63 +686,97 @@ static void acpi_thermal_zone_sysfs_remove(struct acpi_thermal *tz)
static int acpi_thermal_register_thermal_zone(struct acpi_thermal *tz)
{
- int trips = 0;
+ struct acpi_thermal_trip *acpi_trip;
+ struct thermal_trip *trip;
+ int passive_delay = 0;
+ int trip_count = 0;
int result;
- acpi_status status;
int i;
if (tz->trips.critical.valid)
- trips++;
+ trip_count++;
if (tz->trips.hot.valid)
- trips++;
+ trip_count++;
+
+ if (tz->trips.passive.trip.valid) {
+ trip_count++;
+ passive_delay = tz->trips.passive.tsp * 100;
+ }
- if (tz->trips.passive.valid)
- trips++;
+ for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].trip.valid; i++)
+ trip_count++;
- for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].valid;
- i++, trips++);
+ trip = kcalloc(trip_count, sizeof(*trip), GFP_KERNEL);
+ if (!trip)
+ return -ENOMEM;
- if (tz->trips.passive.valid)
- tz->thermal_zone = thermal_zone_device_register("acpitz", trips, 0, tz,
- &acpi_thermal_zone_ops, NULL,
- tz->trips.passive.tsp * 100,
- tz->polling_frequency * 100);
- else
- tz->thermal_zone =
- thermal_zone_device_register("acpitz", trips, 0, tz,
- &acpi_thermal_zone_ops, NULL,
- 0, tz->polling_frequency * 100);
+ tz->trip_table = trip;
- if (IS_ERR(tz->thermal_zone))
- return -ENODEV;
+ if (tz->trips.critical.valid) {
+ trip->type = THERMAL_TRIP_CRITICAL;
+ trip->temperature = acpi_thermal_temp(tz, tz->trips.critical.temperature);
+ trip++;
+ }
+
+ if (tz->trips.hot.valid) {
+ trip->type = THERMAL_TRIP_HOT;
+ trip->temperature = acpi_thermal_temp(tz, tz->trips.hot.temperature);
+ trip++;
+ }
+
+ acpi_trip = &tz->trips.passive.trip;
+ if (acpi_trip->valid) {
+ trip->type = THERMAL_TRIP_PASSIVE;
+ trip->temperature = acpi_thermal_temp(tz, acpi_trip->temperature);
+ trip->priv = acpi_trip;
+ trip++;
+ }
+
+ for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) {
+ acpi_trip = &tz->trips.active[i].trip;
+
+ if (!acpi_trip->valid)
+ break;
+
+ trip->type = THERMAL_TRIP_ACTIVE;
+ trip->temperature = acpi_thermal_temp(tz, acpi_trip->temperature);
+ trip->priv = acpi_trip;
+ trip++;
+ }
+
+ tz->thermal_zone = thermal_zone_device_register_with_trips("acpitz",
+ tz->trip_table,
+ trip_count,
+ 0, tz,
+ &acpi_thermal_zone_ops,
+ NULL,
+ passive_delay,
+ tz->polling_frequency * 100);
+ if (IS_ERR(tz->thermal_zone)) {
+ result = PTR_ERR(tz->thermal_zone);
+ goto free_trip_table;
+ }
result = acpi_thermal_zone_sysfs_add(tz);
if (result)
goto unregister_tzd;
- status = acpi_bus_attach_private_data(tz->device->handle,
- tz->thermal_zone);
- if (ACPI_FAILURE(status)) {
- result = -ENODEV;
- goto remove_links;
- }
-
result = thermal_zone_device_enable(tz->thermal_zone);
if (result)
- goto acpi_bus_detach;
+ goto remove_links;
dev_info(&tz->device->dev, "registered as thermal_zone%d\n",
thermal_zone_device_id(tz->thermal_zone));
return 0;
-acpi_bus_detach:
- acpi_bus_detach_private_data(tz->device->handle);
remove_links:
acpi_thermal_zone_sysfs_remove(tz);
unregister_tzd:
thermal_zone_device_unregister(tz->thermal_zone);
+free_trip_table:
+ kfree(tz->trip_table);
return result;
}
@@ -810,8 +785,8 @@ static void acpi_thermal_unregister_thermal_zone(struct acpi_thermal *tz)
{
acpi_thermal_zone_sysfs_remove(tz);
thermal_zone_device_unregister(tz->thermal_zone);
+ kfree(tz->trip_table);
tz->thermal_zone = NULL;
- acpi_bus_detach_private_data(tz->device->handle);
}
@@ -819,14 +794,9 @@ static void acpi_thermal_unregister_thermal_zone(struct acpi_thermal *tz)
Driver Interface
-------------------------------------------------------------------------- */
-static void acpi_queue_thermal_check(struct acpi_thermal *tz)
-{
- if (!work_pending(&tz->thermal_check_work))
- queue_work(acpi_thermal_pm_queue, &tz->thermal_check_work);
-}
-
-static void acpi_thermal_notify(struct acpi_device *device, u32 event)
+static void acpi_thermal_notify(acpi_handle handle, u32 event, void *data)
{
+ struct acpi_device *device = data;
struct acpi_thermal *tz = acpi_driver_data(device);
if (!tz)
@@ -837,16 +807,8 @@ static void acpi_thermal_notify(struct acpi_device *device, u32 event)
acpi_queue_thermal_check(tz);
break;
case ACPI_THERMAL_NOTIFY_THRESHOLDS:
- acpi_thermal_trips_update(tz, ACPI_TRIPS_THRESHOLDS);
- acpi_queue_thermal_check(tz);
- acpi_bus_generate_netlink_event(device->pnp.device_class,
- dev_name(&device->dev), event, 0);
- break;
case ACPI_THERMAL_NOTIFY_DEVICES:
- acpi_thermal_trips_update(tz, ACPI_TRIPS_DEVICES);
- acpi_queue_thermal_check(tz);
- acpi_bus_generate_netlink_event(device->pnp.device_class,
- dev_name(&device->dev), event, 0);
+ acpi_thermal_trips_update(tz, event);
break;
default:
acpi_handle_debug(device->handle, "Unsupported event [0x%x]\n",
@@ -997,11 +959,20 @@ static int acpi_thermal_add(struct acpi_device *device)
pr_info("%s [%s] (%ld C)\n", acpi_device_name(device),
acpi_device_bid(device), deci_kelvin_to_celsius(tz->temperature));
- goto end;
+ result = acpi_dev_install_notify_handler(device, ACPI_DEVICE_NOTIFY,
+ acpi_thermal_notify);
+ if (result)
+ goto flush_wq;
+
+ return 0;
+
+flush_wq:
+ flush_workqueue(acpi_thermal_pm_queue);
+ acpi_thermal_unregister_thermal_zone(tz);
free_memory:
kfree(tz);
-end:
+
return result;
}
@@ -1012,10 +983,14 @@ static void acpi_thermal_remove(struct acpi_device *device)
if (!device || !acpi_driver_data(device))
return;
- flush_workqueue(acpi_thermal_pm_queue);
tz = acpi_driver_data(device);
+ acpi_dev_remove_notify_handler(device, ACPI_DEVICE_NOTIFY,
+ acpi_thermal_notify);
+
+ flush_workqueue(acpi_thermal_pm_queue);
acpi_thermal_unregister_thermal_zone(tz);
+
kfree(tz);
}
@@ -1030,7 +1005,7 @@ static int acpi_thermal_suspend(struct device *dev)
static int acpi_thermal_resume(struct device *dev)
{
struct acpi_thermal *tz;
- int i, j, power_state, result;
+ int i, j, power_state;
if (!dev)
return -EINVAL;
@@ -1040,18 +1015,12 @@ static int acpi_thermal_resume(struct device *dev)
return -EINVAL;
for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) {
- if (!tz->trips.active[i].valid)
+ if (!tz->trips.active[i].trip.valid)
break;
- tz->trips.active[i].enabled = true;
for (j = 0; j < tz->trips.active[i].devices.count; j++) {
- result = acpi_bus_update_power(
- tz->trips.active[i].devices.handles[j],
- &power_state);
- if (result || (power_state != ACPI_STATE_D0)) {
- tz->trips.active[i].enabled = false;
- break;
- }
+ acpi_bus_update_power(tz->trips.active[i].devices.handles[j],
+ &power_state);
}
}
@@ -1078,7 +1047,6 @@ static struct acpi_driver acpi_thermal_driver = {
.ops = {
.add = acpi_thermal_add,
.remove = acpi_thermal_remove,
- .notify = acpi_thermal_notify,
},
.drv.pm = &acpi_thermal_pm,
};
@@ -1094,7 +1062,7 @@ static int thermal_act(const struct dmi_system_id *d) {
static int thermal_nocrt(const struct dmi_system_id *d) {
pr_notice("%s detected: disabling all critical thermal trip point actions.\n",
d->ident);
- nocrt = 1;
+ crt = -1;
return 0;
}
static int thermal_tzp(const struct dmi_system_id *d) {
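
The registration rework above replaces the per-trip callbacks with a pre-built trip table: count the valid trips, allocate the array once, and fill it in severity order so the thermal core sees a stable layout. A plain C model of the construction step, with invented types and only critical/hot/active trips for brevity:

	#include <stdlib.h>

	enum trip_type { TRIP_CRITICAL, TRIP_HOT, TRIP_PASSIVE, TRIP_ACTIVE };
	struct trip { enum trip_type type; int temp_mc; void *priv; };
	struct src  { int valid; int temp_mc; };

	static struct trip *build_trips(const struct src *crit, const struct src *hot,
					const struct src *act, int nact, int *count)
	{
		int n = crit->valid + hot->valid, i;
		struct trip *t, *p;

		for (i = 0; i < nact && act[i].valid; i++)
			n++;
		p = t = calloc(n, sizeof(*t));
		if (!t)
			return NULL;
		if (crit->valid)
			*p++ = (struct trip){ TRIP_CRITICAL, crit->temp_mc, NULL };
		if (hot->valid)
			*p++ = (struct trip){ TRIP_HOT, hot->temp_mc, NULL };
		for (i = 0; i < nact && act[i].valid; i++)
			*p++ = (struct trip){ TRIP_ACTIVE, act[i].temp_mc, (void *)&act[i] };
		*count = n;
		return t;
	}

	int main(void)
	{
		struct src crit = {1, 95000}, hot = {0, 0}, act[2] = {{1, 60000}, {0, 0}};
		int n;
		struct trip *t = build_trips(&crit, &hot, act, 2, &n);

		free(t);
		return n == 2 ? 0 : 1;
	}
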
diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
index 18cc08c858cf..442396f6ed1f 100644
--- a/drivers/acpi/video_detect.c
+++ b/drivers/acpi/video_detect.c
@@ -446,6 +446,15 @@ static const struct dmi_system_id video_detect_dmi_table[] = {
},
},
{
+ /* https://bugzilla.suse.com/show_bug.cgi?id=1208724 */
+ .callback = video_detect_force_native,
+ /* Lenovo Ideapad Z470 */
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "IdeaPad Z470"),
+ },
+ },
+ {
/* https://bugzilla.redhat.com/show_bug.cgi?id=1187004 */
.callback = video_detect_force_native,
/* Lenovo Ideapad Z570 */
@@ -487,6 +496,24 @@ static const struct dmi_system_id video_detect_dmi_table[] = {
},
},
{
+ /* https://gitlab.freedesktop.org/drm/amd/-/issues/1838 */
+ .callback = video_detect_force_native,
+ /* Apple iMac12,1 */
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac12,1"),
+ },
+ },
+ {
+ /* https://gitlab.freedesktop.org/drm/amd/-/issues/2753 */
+ .callback = video_detect_force_native,
+ /* Apple iMac12,2 */
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac12,2"),
+ },
+ },
+ {
/* https://bugzilla.redhat.com/show_bug.cgi?id=1217249 */
.callback = video_detect_force_native,
/* Apple MacBook Pro 12,1 */
diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index ce62e61a9605..08f7c6708206 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -94,6 +94,11 @@ static struct lpi_constraints *lpi_constraints_table;
static int lpi_constraints_table_size;
static int rev_id;
+#define for_each_lpi_constraint(entry) \
+ for (int i = 0; \
+ entry = &lpi_constraints_table[i], i < lpi_constraints_table_size; \
+ i++)
+
static void lpi_device_get_constraints_amd(void)
{
union acpi_object *out_obj;
@@ -113,6 +118,12 @@ static void lpi_device_get_constraints_amd(void)
union acpi_object *package = &out_obj->package.elements[i];
if (package->type == ACPI_TYPE_PACKAGE) {
+ if (lpi_constraints_table) {
+ acpi_handle_err(lps0_device_handle,
+ "Duplicate constraints list\n");
+ goto free_acpi_buffer;
+ }
+
lpi_constraints_table = kcalloc(package->package.count,
sizeof(*lpi_constraints_table),
GFP_KERNEL);
@@ -123,17 +134,16 @@ static void lpi_device_get_constraints_amd(void)
acpi_handle_debug(lps0_device_handle,
"LPI: constraints list begin:\n");
- for (j = 0; j < package->package.count; ++j) {
+ for (j = 0; j < package->package.count; j++) {
union acpi_object *info_obj = &package->package.elements[j];
struct lpi_device_constraint_amd dev_info = {};
struct lpi_constraints *list;
acpi_status status;
- for (k = 0; k < info_obj->package.count; ++k) {
- union acpi_object *obj = &info_obj->package.elements[k];
+ list = &lpi_constraints_table[lpi_constraints_table_size];
- list = &lpi_constraints_table[lpi_constraints_table_size];
- list->min_dstate = -1;
+ for (k = 0; k < info_obj->package.count; k++) {
+ union acpi_object *obj = &info_obj->package.elements[k];
switch (k) {
case 0:
@@ -149,27 +159,25 @@ static void lpi_device_get_constraints_amd(void)
dev_info.min_dstate = obj->integer.value;
break;
}
+ }
- if (!dev_info.enabled || !dev_info.name ||
- !dev_info.min_dstate)
- continue;
+ acpi_handle_debug(lps0_device_handle,
+ "Name:%s, Enabled: %d, States: %d, MinDstate: %d\n",
+ dev_info.name,
+ dev_info.enabled,
+ dev_info.function_states,
+ dev_info.min_dstate);
- status = acpi_get_handle(NULL, dev_info.name,
- &list->handle);
- if (ACPI_FAILURE(status))
- continue;
+ if (!dev_info.enabled || !dev_info.name ||
+ !dev_info.min_dstate)
+ continue;
- acpi_handle_debug(lps0_device_handle,
- "Name:%s\n", dev_info.name);
+ status = acpi_get_handle(NULL, dev_info.name, &list->handle);
+ if (ACPI_FAILURE(status))
+ continue;
- list->min_dstate = dev_info.min_dstate;
+ list->min_dstate = dev_info.min_dstate;
- if (list->min_dstate < 0) {
- acpi_handle_debug(lps0_device_handle,
- "Incomplete constraint defined\n");
- continue;
- }
- }
lpi_constraints_table_size++;
}
}
@@ -214,7 +222,7 @@ static void lpi_device_get_constraints(void)
if (!package)
continue;
- for (j = 0; j < package->package.count; ++j) {
+ for (j = 0; j < package->package.count; j++) {
union acpi_object *element =
&(package->package.elements[j]);
@@ -246,7 +254,7 @@ static void lpi_device_get_constraints(void)
constraint->min_dstate = -1;
- for (j = 0; j < package_count; ++j) {
+ for (j = 0; j < package_count; j++) {
union acpi_object *info_obj = &info.package[j];
union acpi_object *cnstr_pkg;
union acpi_object *obj;
@@ -291,32 +299,55 @@ free_acpi_buffer:
ACPI_FREE(out_obj);
}
+/**
+ * acpi_get_lps0_constraint - Get the LPS0 constraint for a device.
+ * @adev: Device to get the constraint for.
+ *
+ * The LPS0 constraint is the shallowest (minimum) power state the device
+ * can be in while still allowing the platform as a whole to achieve
+ * additional energy conservation via a system-wide low-power state.
+ *
+ * Returns:
+ * - ACPI power state value of the constraint for @adev on success.
+ * - Otherwise, ACPI_STATE_UNKNOWN.
+ */
+int acpi_get_lps0_constraint(struct acpi_device *adev)
+{
+ struct lpi_constraints *entry;
+
+ for_each_lpi_constraint(entry) {
+ if (adev->handle == entry->handle)
+ return entry->min_dstate;
+ }
+
+ return ACPI_STATE_UNKNOWN;
+}
+
static void lpi_check_constraints(void)
{
- int i;
+ struct lpi_constraints *entry;
- for (i = 0; i < lpi_constraints_table_size; ++i) {
- acpi_handle handle = lpi_constraints_table[i].handle;
- struct acpi_device *adev = acpi_fetch_acpi_dev(handle);
+ for_each_lpi_constraint(entry) {
+ struct acpi_device *adev = acpi_fetch_acpi_dev(entry->handle);
if (!adev)
continue;
- acpi_handle_debug(handle,
+ acpi_handle_debug(entry->handle,
"LPI: required min power state:%s current power state:%s\n",
- acpi_power_state_string(lpi_constraints_table[i].min_dstate),
+ acpi_power_state_string(entry->min_dstate),
acpi_power_state_string(adev->power.state));
if (!adev->flags.power_manageable) {
- acpi_handle_info(handle, "LPI: Device not power manageable\n");
- lpi_constraints_table[i].handle = NULL;
+ acpi_handle_info(entry->handle, "LPI: Device not power manageable\n");
+ entry->handle = NULL;
continue;
}
- if (adev->power.state < lpi_constraints_table[i].min_dstate)
- acpi_handle_info(handle,
+ if (adev->power.state < entry->min_dstate)
+ acpi_handle_info(entry->handle,
"LPI: Constraint not met; min power state:%s current power state:%s\n",
- acpi_power_state_string(lpi_constraints_table[i].min_dstate),
+ acpi_power_state_string(entry->min_dstate),
acpi_power_state_string(adev->power.state));
}
}
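
As an illustration of the new helper (not part of this patch; the policy and the acpi_device_set_power() call are assumptions for the sketch):

/* Hedged sketch: look up and act on a device's LPS0 constraint. */
static void example_apply_lps0_constraint(struct acpi_device *adev)
{
	int state = acpi_get_lps0_constraint(adev);

	if (state == ACPI_STATE_UNKNOWN)
		return;		/* no constraint listed for this device */

	/* Illustrative policy: move the device into at least that D-state. */
	acpi_device_set_power(adev, state);
}
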
diff --git a/drivers/acpi/x86/utils.c b/drivers/acpi/x86/utils.c
index c2b925f8cd4e..63d834dd3811 100644
--- a/drivers/acpi/x86/utils.c
+++ b/drivers/acpi/x86/utils.c
@@ -518,3 +518,38 @@ bool acpi_quirk_skip_acpi_ac_and_battery(void)
return false;
}
EXPORT_SYMBOL_GPL(acpi_quirk_skip_acpi_ac_and_battery);
+
+/*
+ * This section provides a workaround for a specific x86 system that
+ * requires mwait to be disabled in order to work correctly.
+ */
+static int __init acpi_proc_quirk_set_no_mwait(const struct dmi_system_id *id)
+{
+ pr_notice("%s detected - disabling mwait for CPU C-states\n",
+ id->ident);
+ boot_option_idle_override = IDLE_NOMWAIT;
+ return 0;
+}
+
+static const struct dmi_system_id acpi_proc_quirk_mwait_dmi_table[] __initconst = {
+ {
+ .callback = acpi_proc_quirk_set_no_mwait,
+ .ident = "Extensa 5220",
+ .matches = {
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "0100"),
+ DMI_MATCH(DMI_BOARD_NAME, "Columbia"),
+ },
+ .driver_data = NULL,
+ },
+ {}
+};
+
+void __init acpi_proc_quirk_mwait_check(void)
+{
+ /*
+	 * Check whether the system matches an entry in the DMI table.
+	 * If it does, OSPM should not use mwait for CPU C-states.
+ */
+ dmi_check_system(acpi_proc_quirk_mwait_dmi_table);
+}
diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
index 76e7d6676657..faebe9f5412a 100644
--- a/drivers/android/binderfs.c
+++ b/drivers/android/binderfs.c
@@ -153,7 +153,7 @@ static int binderfs_binder_device_create(struct inode *ref_inode,
goto err;
inode->i_ino = minor + INODE_OFFSET;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
init_special_inode(inode, S_IFCHR | 0600,
MKDEV(MAJOR(binderfs_dev), minor));
inode->i_fop = &binder_fops;
@@ -432,7 +432,7 @@ static int binderfs_binder_ctl_create(struct super_block *sb)
}
inode->i_ino = SECOND_INODE;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
init_special_inode(inode, S_IFCHR | 0600,
MKDEV(MAJOR(binderfs_dev), minor));
inode->i_fop = &binder_ctl_fops;
@@ -474,7 +474,7 @@ static struct inode *binderfs_make_inode(struct super_block *sb, int mode)
if (ret) {
ret->i_ino = iunique(sb, BINDERFS_MAX_MINOR + INODE_OFFSET);
ret->i_mode = mode;
- ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret);
+ ret->i_atime = ret->i_mtime = inode_set_ctime_current(ret);
}
return ret;
}
@@ -703,7 +703,7 @@ static int binderfs_fill_super(struct super_block *sb, struct fs_context *fc)
inode->i_ino = FIRST_INODE;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | 0755;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_op = &binderfs_dir_inode_operations;
set_nlink(inode, 2);
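
All four hunks make the same conversion; the point of inode_set_ctime_current() is that it returns the timestamp it stores, so it can still seed atime/mtime in one chained assignment:

/* Before: ctime assigned directly alongside atime/mtime. */
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);

/* After: ctime goes through the accessor; its return value feeds the rest. */
inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
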
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index e460c9799d9f..2b98114a9fe0 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1547,7 +1547,6 @@ static int fd_locked_ioctl(struct block_device *bdev, blk_mode_t mode,
rel_fdc();
return -EBUSY;
}
- fsync_bdev(bdev);
if (fd_motor_on(drive) == 0) {
rel_fdc();
return -ENODEV;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 2db9b186b977..ea4eb88a2e45 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3255,7 +3255,7 @@ static int set_geometry(unsigned int cmd, struct floppy_struct *g,
if (!disk || ITYPE(drive_state[cnt].fd_device) != type)
continue;
- __invalidate_device(disk->part0, true);
+ disk_force_media_change(disk);
}
mutex_unlock(&open_lock);
} else {
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 637c5bda2387..9f2d412fc560 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -603,7 +603,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
goto out_err;
/* and ... switch */
- disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
+ disk_force_media_change(lo->lo_disk);
blk_mq_freeze_queue(lo->lo_queue);
mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
lo->lo_backing_file = file;
@@ -1067,7 +1067,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
/* suppress uevents while reconfiguring the device */
dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1);
- disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
+ disk_force_media_change(lo->lo_disk);
set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);
lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
@@ -1171,7 +1171,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
if (!release)
blk_mq_unfreeze_queue(lo->lo_queue);
- disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
+ disk_force_media_change(lo->lo_disk);
if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
int err;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 8576d696c7a2..42e0159bb258 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1434,12 +1434,10 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd)
return ret;
}
-static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
- struct block_device *bdev)
+static void nbd_clear_sock_ioctl(struct nbd_device *nbd)
{
+ blk_mark_disk_dead(nbd->disk);
nbd_clear_sock(nbd);
- __invalidate_device(bdev, true);
- nbd_bdev_reset(nbd);
if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
&nbd->config->runtime_flags))
nbd_config_put(nbd);
@@ -1465,7 +1463,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
case NBD_DISCONNECT:
return nbd_disconnect(nbd);
case NBD_CLEAR_SOCK:
- nbd_clear_sock_ioctl(nbd, bdev);
+ nbd_clear_sock_ioctl(nbd);
return 0;
case NBD_SET_SOCK:
return nbd_add_socket(nbd, arg, false);
diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
index 4fb4fd4b06bd..737aa70e2cb3 100644
--- a/drivers/clk/clk-devres.c
+++ b/drivers/clk/clk-devres.c
@@ -205,18 +205,19 @@ EXPORT_SYMBOL(devm_clk_put);
struct clk *devm_get_clk_from_child(struct device *dev,
struct device_node *np, const char *con_id)
{
- struct clk **ptr, *clk;
+ struct devm_clk_state *state;
+ struct clk *clk;
- ptr = devres_alloc(devm_clk_release, sizeof(*ptr), GFP_KERNEL);
- if (!ptr)
+ state = devres_alloc(devm_clk_release, sizeof(*state), GFP_KERNEL);
+ if (!state)
return ERR_PTR(-ENOMEM);
clk = of_clk_get_by_name(np, con_id);
if (!IS_ERR(clk)) {
- *ptr = clk;
- devres_add(dev, ptr);
+ state->clk = clk;
+ devres_add(dev, state);
} else {
- devres_free(ptr);
+ devres_free(state);
}
return clk;
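
The bug this hunk fixes: devm_clk_release() treats its devres data as a struct devm_clk_state, while the old code allocated room for only a bare struct clk pointer, which the release callback would then misinterpret. A minimal sketch of the underlying devres pattern, with illustrative names (example_state, example_release, example_track are assumptions):

/*
 * The release callback receives the devres data area, so the layout
 * handed to devres_alloc() must match what the callback expects.
 */
struct example_state {
	struct clk *clk;
};

static void example_release(struct device *dev, void *res)
{
	struct example_state *state = res;

	clk_put(state->clk);
}

static struct clk *example_track(struct device *dev, struct clk *clk)
{
	struct example_state *state;

	state = devres_alloc(example_release, sizeof(*state), GFP_KERNEL);
	if (!state)
		return ERR_PTR(-ENOMEM);

	state->clk = clk;
	devres_add(dev, state);	/* released automatically on unbind */
	return clk;
}
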
diff --git a/drivers/clk/keystone/syscon-clk.c b/drivers/clk/keystone/syscon-clk.c
index d33f74119488..935d9a2d8c2b 100644
--- a/drivers/clk/keystone/syscon-clk.c
+++ b/drivers/clk/keystone/syscon-clk.c
@@ -151,8 +151,10 @@ static int ti_syscon_gate_clk_probe(struct platform_device *pdev)
data[i].name);
}
- return devm_of_clk_add_hw_provider(dev, of_clk_hw_onecell_get,
- hw_data);
+ if (num_clks == 1)
+ return devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get,
+ hw_data->hws[0]);
+ return devm_of_clk_add_hw_provider(dev, of_clk_hw_onecell_get, hw_data);
}
#define TI_SYSCON_CLK_GATE(_name, _offset, _bit_idx) \
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 44e44b8d9ce6..c761952f0dc6 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -70,10 +70,9 @@ config ZCRYPT
select HW_RANDOM
help
Select this option if you want to enable support for
- s390 cryptographic adapters like:
- + Crypto Express 2 up to 7 Coprocessor (CEXxC)
- + Crypto Express 2 up to 7 Accelerator (CEXxA)
- + Crypto Express 4 up to 7 EP11 Coprocessor (CEXxP)
+ s390 cryptographic adapters like Crypto Express 4 up
+ to 8 in Coprocessor (CEXxC), EP11 Coprocessor (CEXxP)
+ or Accelerator (CEXxA) mode.
config ZCRYPT_DEBUG
bool "Enable debug features for s390 cryptographic adapters"
diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index ff9ddbbca377..68e73775392d 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -382,8 +382,8 @@ static void kick_trng(struct device *dev, int ent_delay)
val = ent_delay;
/* min. freq. count, equal to 1/4 of the entropy sample length */
wr_reg32(&r4tst->rtfrqmin, val >> 2);
- /* max. freq. count, equal to 16 times the entropy sample length */
- wr_reg32(&r4tst->rtfrqmax, val << 4);
+ /* disable maximum frequency count */
+ wr_reg32(&r4tst->rtfrqmax, RTFRQMAX_DISABLE);
}
wr_reg32(&r4tst->rtsdctl, (val << RTSDCTL_ENT_DLY_SHIFT) |
diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c
index 63f0aeb66db6..f0a35277fd84 100644
--- a/drivers/dma-buf/sw_sync.c
+++ b/drivers/dma-buf/sw_sync.c
@@ -191,6 +191,7 @@ static const struct dma_fence_ops timeline_fence_ops = {
*/
static void sync_timeline_signal(struct sync_timeline *obj, unsigned int inc)
{
+ LIST_HEAD(signalled);
struct sync_pt *pt, *next;
trace_sync_timeline(obj);
@@ -203,21 +204,20 @@ static void sync_timeline_signal(struct sync_timeline *obj, unsigned int inc)
if (!timeline_fence_signaled(&pt->base))
break;
- list_del_init(&pt->link);
+ dma_fence_get(&pt->base);
+
+ list_move_tail(&pt->link, &signalled);
rb_erase(&pt->node, &obj->pt_tree);
- /*
- * A signal callback may release the last reference to this
- * fence, causing it to be freed. That operation has to be
- * last to avoid a use after free inside this loop, and must
- * be after we remove the fence from the timeline in order to
- * prevent deadlocking on timeline->lock inside
- * timeline_fence_release().
- */
dma_fence_signal_locked(&pt->base);
}
spin_unlock_irq(&obj->lock);
+
+ list_for_each_entry_safe(pt, next, &signalled, link) {
+ list_del_init(&pt->link);
+ dma_fence_put(&pt->base);
+ }
}
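
The rework defers the final dma_fence_put() until after obj->lock is dropped: a fence release callback may re-acquire the timeline lock, so the last reference must not be dropped inside the locked region. As a generic sketch of that pattern (done(), get_ref() and put_ref() are placeholders, not real APIs):

LIST_HEAD(tmp);

spin_lock_irq(&obj->lock);
list_for_each_entry_safe(pt, next, &obj->pt_list, link) {
	if (!done(pt))
		break;
	get_ref(pt);				/* keep alive past unlock */
	list_move_tail(&pt->link, &tmp);	/* unlink from shared state */
}
spin_unlock_irq(&obj->lock);

list_for_each_entry_safe(pt, next, &tmp, link) {
	list_del_init(&pt->link);
	put_ref(pt);	/* final put may free; safe, lock not held */
}
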
/**
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 597dae7692b1..9b6642d00871 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -4150,6 +4150,20 @@ static int per_family_init(struct amd64_pvt *pvt)
}
break;
+ case 0x1A:
+ switch (pvt->model) {
+ case 0x00 ... 0x1f:
+ pvt->ctl_name = "F1Ah";
+ pvt->max_mcs = 12;
+ pvt->flags.zn_regs_v2 = 1;
+ break;
+ case 0x40 ... 0x4f:
+ pvt->ctl_name = "F1Ah_M40h";
+ pvt->flags.zn_regs_v2 = 1;
+ break;
+ }
+ break;
+
default:
amd64_err("Unsupported family!\n");
return -ENODEV;
@@ -4344,6 +4358,7 @@ static const struct x86_cpu_id amd64_cpuids[] = {
X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL),
X86_MATCH_VENDOR_FAM(HYGON, 0x18, NULL),
X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL),
+ X86_MATCH_VENDOR_FAM(AMD, 0x1A, NULL),
{ }
};
MODULE_DEVICE_TABLE(x86cpu, amd64_cpuids);
diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index a897b6aff368..5abf997ca7c1 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -906,7 +906,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = {
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SAPPHIRERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg),
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(EMERALDRAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg),
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(GRANITERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SIERRAFOREST_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg),
+ X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_CRESTMONT_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg),
{}
};
MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids);
diff --git a/drivers/eisa/eisa-bus.c b/drivers/eisa/eisa-bus.c
index 713582cc27d1..33f0ba11c6ad 100644
--- a/drivers/eisa/eisa-bus.c
+++ b/drivers/eisa/eisa-bus.c
@@ -60,7 +60,7 @@ static void __init eisa_name_device(struct eisa_device *edev)
int i;
for (i = 0; i < EISA_INFOS; i++) {
if (!strcmp(edev->id.sig, eisa_table[i].id.sig)) {
- strlcpy(edev->pretty_name,
+ strscpy(edev->pretty_name,
eisa_table[i].name,
sizeof(edev->pretty_name));
return;
diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index f9040bd61081..285fe7ad490d 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -1095,3 +1095,22 @@ int sdei_event_handler(struct pt_regs *regs,
return err;
}
NOKPROBE_SYMBOL(sdei_event_handler);
+
+void sdei_handler_abort(void)
+{
+ /*
+ * If the crash happened in an SDEI event handler then we need to
+ * finish the handler with the firmware so that we can have working
+ * interrupts in the crash kernel.
+ */
+ if (__this_cpu_read(sdei_active_critical_event)) {
+ pr_warn("still in SDEI critical event context, attempting to finish handler.\n");
+ __sdei_handler_abort();
+ __this_cpu_write(sdei_active_critical_event, NULL);
+ }
+ if (__this_cpu_read(sdei_active_normal_event)) {
+ pr_warn("still in SDEI normal event context, attempting to finish handler.\n");
+ __sdei_handler_abort();
+ __this_cpu_write(sdei_active_normal_event, NULL);
+ }
+}
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 16d64a34d1e1..92389a5481ff 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -88,6 +88,7 @@ lib-$(CONFIG_EFI_GENERIC_STUB) += efi-stub.o string.o intrinsics.o systable.o \
lib-$(CONFIG_ARM) += arm32-stub.o
lib-$(CONFIG_ARM64) += arm64.o arm64-stub.o smbios.o
lib-$(CONFIG_X86) += x86-stub.o
+lib-$(CONFIG_X86_64) += x86-5lvl.o
lib-$(CONFIG_RISCV) += riscv.o riscv-stub.o
lib-$(CONFIG_LOONGARCH) += loongarch.o loongarch-stub.o
@@ -146,7 +147,7 @@ STUBCOPY_RELOC-$(CONFIG_ARM64) := R_AARCH64_ABS
# For RISC-V, we don't need anything special other than arm64. Keep all the
# symbols in .init section and make sure that no absolute symbol references
-# doesn't exist.
+# exist.
STUBCOPY_FLAGS-$(CONFIG_RISCV) += --prefix-alloc-sections=.init \
--prefix-symbols=__efistub_
STUBCOPY_RELOC-$(CONFIG_RISCV) := R_RISCV_HI20
diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c
index 770b8ecb7398..8c40fc89f5f9 100644
--- a/drivers/firmware/efi/libstub/arm64-stub.c
+++ b/drivers/firmware/efi/libstub/arm64-stub.c
@@ -106,7 +106,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr,
*/
status = efi_random_alloc(*reserve_size, min_kimg_align,
reserve_addr, phys_seed,
- EFI_LOADER_CODE);
+ EFI_LOADER_CODE, EFI_ALLOC_LIMIT);
if (status != EFI_SUCCESS)
efi_warn("efi_random_alloc() failed: 0x%lx\n", status);
} else {
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 732984295295..bfa30625f5d0 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -73,6 +73,8 @@ efi_status_t efi_parse_options(char const *cmdline)
efi_loglevel = CONSOLE_LOGLEVEL_QUIET;
} else if (!strcmp(param, "noinitrd")) {
efi_noinitrd = true;
+ } else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
+ efi_no5lvl = true;
} else if (!strcmp(param, "efi") && val) {
efi_nochunk = parse_option_str(val, "nochunk");
efi_novamap |= parse_option_str(val, "novamap");
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index 6aa38a1bf126..9823f6fb3e01 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -33,6 +33,7 @@
#define EFI_ALLOC_LIMIT ULONG_MAX
#endif
+extern bool efi_no5lvl;
extern bool efi_nochunk;
extern bool efi_nokaslr;
extern int efi_loglevel;
@@ -955,7 +956,7 @@ efi_status_t efi_get_random_bytes(unsigned long size, u8 *out);
efi_status_t efi_random_alloc(unsigned long size, unsigned long align,
unsigned long *addr, unsigned long random_seed,
- int memory_type);
+ int memory_type, unsigned long alloc_limit);
efi_status_t efi_random_get_seed(void);
diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c
index 32c7a54923b4..674a064b8f7a 100644
--- a/drivers/firmware/efi/libstub/randomalloc.c
+++ b/drivers/firmware/efi/libstub/randomalloc.c
@@ -16,7 +16,8 @@
*/
static unsigned long get_entry_num_slots(efi_memory_desc_t *md,
unsigned long size,
- unsigned long align_shift)
+ unsigned long align_shift,
+ u64 alloc_limit)
{
unsigned long align = 1UL << align_shift;
u64 first_slot, last_slot, region_end;
@@ -29,7 +30,7 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md,
return 0;
region_end = min(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - 1,
- (u64)EFI_ALLOC_LIMIT);
+ alloc_limit);
if (region_end < size)
return 0;
@@ -54,7 +55,8 @@ efi_status_t efi_random_alloc(unsigned long size,
unsigned long align,
unsigned long *addr,
unsigned long random_seed,
- int memory_type)
+ int memory_type,
+ unsigned long alloc_limit)
{
unsigned long total_slots = 0, target_slot;
unsigned long total_mirrored_slots = 0;
@@ -76,7 +78,7 @@ efi_status_t efi_random_alloc(unsigned long size,
efi_memory_desc_t *md = (void *)map->map + map_offset;
unsigned long slots;
- slots = get_entry_num_slots(md, size, ilog2(align));
+ slots = get_entry_num_slots(md, size, ilog2(align), alloc_limit);
MD_NUM_SLOTS(md) = slots;
total_slots += slots;
if (md->attribute & EFI_MEMORY_MORE_RELIABLE)
diff --git a/drivers/firmware/efi/libstub/x86-5lvl.c b/drivers/firmware/efi/libstub/x86-5lvl.c
new file mode 100644
index 000000000000..479dd445acdc
--- /dev/null
+++ b/drivers/firmware/efi/libstub/x86-5lvl.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/efi.h>
+
+#include <asm/boot.h>
+#include <asm/desc.h>
+#include <asm/efi.h>
+
+#include "efistub.h"
+#include "x86-stub.h"
+
+bool efi_no5lvl;
+
+static void (*la57_toggle)(void *cr3);
+
+static const struct desc_struct gdt[] = {
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+};
+
+/*
+ * Enabling (or disabling) 5 level paging is tricky, because it can only be
+ * done from 32-bit mode with paging disabled. This means not only that the
+ * code itself must be running from 32-bit addressable physical memory, but
+ * also that the root page table must be 32-bit addressable, as programming
+ * a 64-bit value into CR3 when running in 32-bit mode is not supported.
+ */
+efi_status_t efi_setup_5level_paging(void)
+{
+ u8 tmpl_size = (u8 *)&trampoline_ljmp_imm_offset - (u8 *)&trampoline_32bit_src;
+ efi_status_t status;
+ u8 *la57_code;
+
+ if (!efi_is_64bit())
+ return EFI_SUCCESS;
+
+ /* check for 5 level paging support */
+ if (native_cpuid_eax(0) < 7 ||
+ !(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+ return EFI_SUCCESS;
+
+ /* allocate some 32-bit addressable memory for code and a page table */
+ status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code,
+ U32_MAX);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ la57_toggle = memcpy(la57_code, trampoline_32bit_src, tmpl_size);
+ memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size);
+
+ /*
+ * To avoid the need to allocate a 32-bit addressable stack, the
+ * trampoline uses a LJMP instruction to switch back to long mode.
+ * LJMP takes an absolute destination address, which needs to be
+ * fixed up at runtime.
+ */
+ *(u32 *)&la57_code[trampoline_ljmp_imm_offset] += (unsigned long)la57_code;
+
+ efi_adjust_memory_range_protection((unsigned long)la57_toggle, PAGE_SIZE);
+
+ return EFI_SUCCESS;
+}
+
+void efi_5level_switch(void)
+{
+ bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl;
+ bool have_la57 = native_read_cr4() & X86_CR4_LA57;
+ bool need_toggle = want_la57 ^ have_la57;
+ u64 *pgt = (void *)la57_toggle + PAGE_SIZE;
+ u64 *cr3 = (u64 *)__native_read_cr3();
+ u64 *new_cr3;
+
+ if (!la57_toggle || !need_toggle)
+ return;
+
+ if (!have_la57) {
+ /*
+ * 5 level paging will be enabled, so a root level page needs
+ * to be allocated from the 32-bit addressable physical region,
+ * with its first entry referring to the existing hierarchy.
+ */
+ new_cr3 = memset(pgt, 0, PAGE_SIZE);
+ new_cr3[0] = (u64)cr3 | _PAGE_TABLE_NOENC;
+ } else {
+ /* take the new root table pointer from the current entry #0 */
+ new_cr3 = (u64 *)(cr3[0] & PAGE_MASK);
+
+ /* copy the new root table if it is not 32-bit addressable */
+ if ((u64)new_cr3 > U32_MAX)
+ new_cr3 = memcpy(pgt, new_cr3, PAGE_SIZE);
+ }
+
+ native_load_gdt(&(struct desc_ptr){ sizeof(gdt) - 1, (u64)gdt });
+
+ la57_toggle(new_cr3);
+}
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index 220be75a5cdc..2fee52ed335d 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -15,16 +15,16 @@
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/boot.h>
+#include <asm/kaslr.h>
+#include <asm/sev.h>
#include "efistub.h"
-
-/* Maximum physical address for 64-bit kernel with 4-level paging */
-#define MAXMEM_X86_64_4LEVEL (1ull << 46)
+#include "x86-stub.h"
const efi_system_table_t *efi_system_table;
const efi_dxe_services_table_t *efi_dxe_table;
-u32 image_offset __section(".data");
static efi_loaded_image_t *image = NULL;
+static efi_memory_attribute_protocol_t *memattr;
typedef union sev_memory_acceptance_protocol sev_memory_acceptance_protocol_t;
union sev_memory_acceptance_protocol {
@@ -72,7 +72,7 @@ preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
rom->data.type = SETUP_PCI;
rom->data.len = size - sizeof(struct setup_data);
rom->data.next = 0;
- rom->pcilen = pci->romsize;
+ rom->pcilen = romsize;
*__rom = rom;
status = efi_call_proto(pci, pci.read, EfiPciIoWidthUint16,
@@ -223,8 +223,8 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params)
}
}
-static void
-adjust_memory_range_protection(unsigned long start, unsigned long size)
+void efi_adjust_memory_range_protection(unsigned long start,
+ unsigned long size)
{
efi_status_t status;
efi_gcd_memory_space_desc_t desc;
@@ -232,12 +232,18 @@ adjust_memory_range_protection(unsigned long start, unsigned long size)
unsigned long rounded_start, rounded_end;
unsigned long unprotect_start, unprotect_size;
- if (efi_dxe_table == NULL)
- return;
-
rounded_start = rounddown(start, EFI_PAGE_SIZE);
rounded_end = roundup(start + size, EFI_PAGE_SIZE);
+ if (memattr != NULL) {
+ efi_call_proto(memattr, clear_memory_attributes, rounded_start,
+ rounded_end - rounded_start, EFI_MEMORY_XP);
+ return;
+ }
+
+ if (efi_dxe_table == NULL)
+ return;
+
/*
* Don't modify memory region attributes, they are
* already suitable, to lower the possibility to
@@ -278,49 +284,6 @@ adjust_memory_range_protection(unsigned long start, unsigned long size)
}
}
-/*
- * Trampoline takes 2 pages and can be loaded in first megabyte of memory
- * with its end placed between 128k and 640k where BIOS might start.
- * (see arch/x86/boot/compressed/pgtable_64.c)
- *
- * We cannot find exact trampoline placement since memory map
- * can be modified by UEFI, and it can alter the computed address.
- */
-
-#define TRAMPOLINE_PLACEMENT_BASE ((128 - 8)*1024)
-#define TRAMPOLINE_PLACEMENT_SIZE (640*1024 - (128 - 8)*1024)
-
-void startup_32(struct boot_params *boot_params);
-
-static void
-setup_memory_protection(unsigned long image_base, unsigned long image_size)
-{
- /*
- * Allow execution of possible trampoline used
- * for switching between 4- and 5-level page tables
- * and relocated kernel image.
- */
-
- adjust_memory_range_protection(TRAMPOLINE_PLACEMENT_BASE,
- TRAMPOLINE_PLACEMENT_SIZE);
-
-#ifdef CONFIG_64BIT
- if (image_base != (unsigned long)startup_32)
- adjust_memory_range_protection(image_base, image_size);
-#else
- /*
- * Clear protection flags on a whole range of possible
- * addresses used for KASLR. We don't need to do that
- * on x86_64, since KASLR/extraction is performed after
- * dedicated identity page tables are built and we only
- * need to remove possible protection on relocated image
- * itself disregarding further relocations.
- */
- adjust_memory_range_protection(LOAD_PHYSICAL_ADDR,
- KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR);
-#endif
-}
-
static void setup_unaccepted_memory(void)
{
efi_guid_t mem_acceptance_proto = OVMF_SEV_MEMORY_ACCEPTANCE_PROTOCOL_GUID;
@@ -346,9 +309,7 @@ static void setup_unaccepted_memory(void)
static const efi_char16_t apple[] = L"Apple";
-static void setup_quirks(struct boot_params *boot_params,
- unsigned long image_base,
- unsigned long image_size)
+static void setup_quirks(struct boot_params *boot_params)
{
efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long)
efi_table_attr(efi_system_table, fw_vendor);
@@ -357,9 +318,6 @@ static void setup_quirks(struct boot_params *boot_params,
if (IS_ENABLED(CONFIG_APPLE_PROPERTIES))
retrieve_apple_device_properties(boot_params);
}
-
- if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES))
- setup_memory_protection(image_base, image_size);
}
/*
@@ -512,7 +470,6 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
}
image_base = efi_table_attr(image, image_base);
- image_offset = (void *)startup_32 - image_base;
status = efi_allocate_pages(sizeof(struct boot_params),
(unsigned long *)&boot_params, ULONG_MAX);
@@ -803,19 +760,96 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle)
return EFI_SUCCESS;
}
+static bool have_unsupported_snp_features(void)
+{
+ u64 unsupported;
+
+ unsupported = snp_get_unsupported_features(sev_get_status());
+ if (unsupported) {
+ efi_err("Unsupported SEV-SNP features detected: 0x%llx\n",
+ unsupported);
+ return true;
+ }
+ return false;
+}
+
+static void efi_get_seed(void *seed, int size)
+{
+ efi_get_random_bytes(size, seed);
+
+ /*
+ * This only updates seed[0] when running on 32-bit, but in that case,
+ * seed[1] is not used anyway, as there is no virtual KASLR on 32-bit.
+ */
+ *(unsigned long *)seed ^= kaslr_get_random_long("EFI");
+}
+
+static void error(char *str)
+{
+ efi_warn("Decompression failed: %s\n", str);
+}
+
+static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry)
+{
+ unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
+ unsigned long addr, alloc_size, entry;
+ efi_status_t status;
+ u32 seed[2] = {};
+
+ /* determine the required size of the allocation */
+ alloc_size = ALIGN(max_t(unsigned long, output_len, kernel_total_size),
+ MIN_KERNEL_ALIGN);
+
+ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && !efi_nokaslr) {
+ u64 range = KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR - kernel_total_size;
+
+ efi_get_seed(seed, sizeof(seed));
+
+ virt_addr += (range * seed[1]) >> 32;
+ virt_addr &= ~(CONFIG_PHYSICAL_ALIGN - 1);
+ }
+
+ status = efi_random_alloc(alloc_size, CONFIG_PHYSICAL_ALIGN, &addr,
+ seed[0], EFI_LOADER_CODE,
+ EFI_X86_KERNEL_ALLOC_LIMIT);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ entry = decompress_kernel((void *)addr, virt_addr, error);
+ if (entry == ULONG_MAX) {
+ efi_free(alloc_size, addr);
+ return EFI_LOAD_ERROR;
+ }
+
+ *kernel_entry = addr + entry;
+
+ efi_adjust_memory_range_protection(addr, kernel_total_size);
+
+ return EFI_SUCCESS;
+}
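
The virt_addr randomization above maps the 32-bit seed[1] into [0, range) without a division: (range * seed) >> 32 is a fixed-point multiply by range/2^32. A standalone sketch of the trick (valid while range fits in 32 bits, as it does here):

#include <stdint.h>

/* Scale a 32-bit random value into [0, range): range * rnd / 2^32. */
static uint64_t scale_random(uint64_t range, uint32_t rnd)
{
	return (range * (uint64_t)rnd) >> 32;
}
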
+
+static void __noreturn enter_kernel(unsigned long kernel_addr,
+ struct boot_params *boot_params)
+{
+ /* enter decompressed kernel with boot_params pointer in RSI/ESI */
+ asm("jmp *%0"::"r"(kernel_addr), "S"(boot_params));
+
+ unreachable();
+}
+
/*
- * On success, we return the address of startup_32, which has potentially been
- * relocated by efi_relocate_kernel.
- * On failure, we exit to the firmware via efi_exit instead of returning.
+ * On success, this routine will jump to the relocated image directly and never
+ * return. On failure, it will exit to the firmware via efi_exit() instead of
+ * returning.
*/
-asmlinkage unsigned long efi_main(efi_handle_t handle,
- efi_system_table_t *sys_table_arg,
- struct boot_params *boot_params)
+void __noreturn efi_stub_entry(efi_handle_t handle,
+ efi_system_table_t *sys_table_arg,
+ struct boot_params *boot_params)
{
- unsigned long bzimage_addr = (unsigned long)startup_32;
- unsigned long buffer_start, buffer_end;
+ efi_guid_t guid = EFI_MEMORY_ATTRIBUTE_PROTOCOL_GUID;
struct setup_header *hdr = &boot_params->hdr;
const struct linux_efi_initrd *initrd = NULL;
+ unsigned long kernel_entry;
efi_status_t status;
efi_system_table = sys_table_arg;
@@ -823,65 +857,25 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
efi_exit(handle, EFI_INVALID_PARAMETER);
- efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID);
- if (efi_dxe_table &&
- efi_dxe_table->hdr.signature != EFI_DXE_SERVICES_TABLE_SIGNATURE) {
- efi_warn("Ignoring DXE services table: invalid signature\n");
- efi_dxe_table = NULL;
+ if (have_unsupported_snp_features())
+ efi_exit(handle, EFI_UNSUPPORTED);
+
+ if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES)) {
+ efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID);
+ if (efi_dxe_table &&
+ efi_dxe_table->hdr.signature != EFI_DXE_SERVICES_TABLE_SIGNATURE) {
+ efi_warn("Ignoring DXE services table: invalid signature\n");
+ efi_dxe_table = NULL;
+ }
}
- /*
- * If the kernel isn't already loaded at a suitable address,
- * relocate it.
- *
- * It must be loaded above LOAD_PHYSICAL_ADDR.
- *
- * The maximum address for 64-bit is 1 << 46 for 4-level paging. This
- * is defined as the macro MAXMEM, but unfortunately that is not a
- * compile-time constant if 5-level paging is configured, so we instead
- * define our own macro for use here.
- *
- * For 32-bit, the maximum address is complicated to figure out, for
- * now use KERNEL_IMAGE_SIZE, which will be 512MiB, the same as what
- * KASLR uses.
- *
- * Also relocate it if image_offset is zero, i.e. the kernel wasn't
- * loaded by LoadImage, but rather by a bootloader that called the
- * handover entry. The reason we must always relocate in this case is
- * to handle the case of systemd-boot booting a unified kernel image,
- * which is a PE executable that contains the bzImage and an initrd as
- * COFF sections. The initrd section is placed after the bzImage
- * without ensuring that there are at least init_size bytes available
- * for the bzImage, and thus the compressed kernel's startup code may
- * overwrite the initrd unless it is moved out of the way.
- */
+ /* grab the memory attributes protocol if it exists */
+ efi_bs_call(locate_protocol, &guid, NULL, (void **)&memattr);
- buffer_start = ALIGN(bzimage_addr - image_offset,
- hdr->kernel_alignment);
- buffer_end = buffer_start + hdr->init_size;
-
- if ((buffer_start < LOAD_PHYSICAL_ADDR) ||
- (IS_ENABLED(CONFIG_X86_32) && buffer_end > KERNEL_IMAGE_SIZE) ||
- (IS_ENABLED(CONFIG_X86_64) && buffer_end > MAXMEM_X86_64_4LEVEL) ||
- (image_offset == 0)) {
- extern char _bss[];
-
- status = efi_relocate_kernel(&bzimage_addr,
- (unsigned long)_bss - bzimage_addr,
- hdr->init_size,
- hdr->pref_address,
- hdr->kernel_alignment,
- LOAD_PHYSICAL_ADDR);
- if (status != EFI_SUCCESS) {
- efi_err("efi_relocate_kernel() failed!\n");
- goto fail;
- }
- /*
- * Now that we've copied the kernel elsewhere, we no longer
- * have a set up block before startup_32(), so reset image_offset
- * to zero in case it was set earlier.
- */
- image_offset = 0;
+ status = efi_setup_5level_paging();
+ if (status != EFI_SUCCESS) {
+ efi_err("efi_setup_5level_paging() failed!\n");
+ goto fail;
}
#ifdef CONFIG_CMDLINE_BOOL
@@ -901,6 +895,12 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
}
}
+ status = efi_decompress_kernel(&kernel_entry);
+ if (status != EFI_SUCCESS) {
+ efi_err("Failed to decompress kernel\n");
+ goto fail;
+ }
+
/*
* At this point, an initrd may already have been loaded by the
* bootloader and passed via bootparams. We permit an initrd loaded
@@ -940,7 +940,7 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
setup_efi_pci(boot_params);
- setup_quirks(boot_params, bzimage_addr, buffer_end - buffer_start);
+ setup_quirks(boot_params);
setup_unaccepted_memory();
@@ -950,9 +950,38 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
goto fail;
}
- return bzimage_addr;
+ /*
+ * Call the SEV init code while still running with the firmware's
+ * GDT/IDT, so #VC exceptions will be handled by EFI.
+ */
+ sev_enable(boot_params);
+
+ efi_5level_switch();
+
+ enter_kernel(kernel_entry, boot_params);
fail:
- efi_err("efi_main() failed!\n");
+ efi_err("efi_stub_entry() failed!\n");
efi_exit(handle, status);
}
+
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+void efi_handover_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
+ struct boot_params *boot_params)
+{
+ extern char _bss[], _ebss[];
+
+ memset(_bss, 0, _ebss - _bss);
+ efi_stub_entry(handle, sys_table_arg, boot_params);
+}
+
+#ifndef CONFIG_EFI_MIXED
+extern __alias(efi_handover_entry)
+void efi32_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
+ struct boot_params *boot_params);
+
+extern __alias(efi_handover_entry)
+void efi64_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
+ struct boot_params *boot_params);
+#endif
+#endif
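
The efi32_stub_entry/efi64_stub_entry declarations bind the two legacy entry-point names to the single efi_handover_entry definition through the compiler's alias attribute, avoiding wrapper thunks. A standalone sketch with hypothetical names:

void real_entry(int arg)
{
	/* single definition */
}

/* Both names resolve to the same code; the alias target must be
 * defined in the same translation unit. */
extern __attribute__((__alias__("real_entry")))
void legacy_entry(int arg);
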
diff --git a/drivers/firmware/efi/libstub/x86-stub.h b/drivers/firmware/efi/libstub/x86-stub.h
new file mode 100644
index 000000000000..37c5a36b9d8c
--- /dev/null
+++ b/drivers/firmware/efi/libstub/x86-stub.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/efi.h>
+
+extern void trampoline_32bit_src(void *, bool);
+extern const u16 trampoline_ljmp_imm_offset;
+
+void efi_adjust_memory_range_protection(unsigned long start,
+ unsigned long size);
+
+#ifdef CONFIG_X86_64
+efi_status_t efi_setup_5level_paging(void);
+void efi_5level_switch(void);
+#else
+static inline efi_status_t efi_setup_5level_paging(void) { return EFI_SUCCESS; }
+static inline void efi_5level_switch(void) {}
+#endif
diff --git a/drivers/firmware/efi/libstub/zboot.c b/drivers/firmware/efi/libstub/zboot.c
index e5d7fa1f1d8f..bdb17eac0cb4 100644
--- a/drivers/firmware/efi/libstub/zboot.c
+++ b/drivers/firmware/efi/libstub/zboot.c
@@ -119,7 +119,7 @@ efi_zboot_entry(efi_handle_t handle, efi_system_table_t *systab)
}
status = efi_random_alloc(alloc_size, min_kimg_align, &image_base,
- seed, EFI_LOADER_CODE);
+ seed, EFI_LOADER_CODE, EFI_ALLOC_LIMIT);
if (status != EFI_SUCCESS) {
efi_err("Failed to allocate memory\n");
goto free_cmdline;
diff --git a/drivers/firmware/efi/riscv-runtime.c b/drivers/firmware/efi/riscv-runtime.c
index d0daacd2c903..09525fb5c240 100644
--- a/drivers/firmware/efi/riscv-runtime.c
+++ b/drivers/firmware/efi/riscv-runtime.c
@@ -130,14 +130,25 @@ static int __init riscv_enable_runtime_services(void)
}
early_initcall(riscv_enable_runtime_services);
-void efi_virtmap_load(void)
+static void efi_virtmap_load(void)
{
preempt_disable();
switch_mm(current->active_mm, &efi_mm, NULL);
}
-void efi_virtmap_unload(void)
+static void efi_virtmap_unload(void)
{
switch_mm(&efi_mm, current->active_mm, NULL);
preempt_enable();
}
+
+void arch_efi_call_virt_setup(void)
+{
+ sync_kernel_mappings(efi_mm.pgd);
+ efi_virtmap_load();
+}
+
+void arch_efi_call_virt_teardown(void)
+{
+ efi_virtmap_unload();
+}
diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c
index a400c4312c82..5d56bc40a79d 100644
--- a/drivers/firmware/efi/runtime-wrappers.c
+++ b/drivers/firmware/efi/runtime-wrappers.c
@@ -40,55 +40,97 @@
* code doesn't get too cluttered:
*/
#define efi_call_virt(f, args...) \
- efi_call_virt_pointer(efi.runtime, f, args)
-#define __efi_call_virt(f, args...) \
- __efi_call_virt_pointer(efi.runtime, f, args)
+ arch_efi_call_virt(efi.runtime, f, args)
+
+union efi_rts_args {
+ struct {
+ efi_time_t *time;
+ efi_time_cap_t *capabilities;
+ } GET_TIME;
+
+ struct {
+ efi_time_t *time;
+ } SET_TIME;
+
+ struct {
+ efi_bool_t *enabled;
+ efi_bool_t *pending;
+ efi_time_t *time;
+ } GET_WAKEUP_TIME;
+
+ struct {
+ efi_bool_t enable;
+ efi_time_t *time;
+ } SET_WAKEUP_TIME;
+
+ struct {
+ efi_char16_t *name;
+ efi_guid_t *vendor;
+ u32 *attr;
+ unsigned long *data_size;
+ void *data;
+ } GET_VARIABLE;
+
+ struct {
+ unsigned long *name_size;
+ efi_char16_t *name;
+ efi_guid_t *vendor;
+ } GET_NEXT_VARIABLE;
+
+ struct {
+ efi_char16_t *name;
+ efi_guid_t *vendor;
+ u32 attr;
+ unsigned long data_size;
+ void *data;
+ } SET_VARIABLE;
+
+ struct {
+ u32 attr;
+ u64 *storage_space;
+ u64 *remaining_space;
+ u64 *max_variable_size;
+ } QUERY_VARIABLE_INFO;
+
+ struct {
+ u32 *high_count;
+ } GET_NEXT_HIGH_MONO_COUNT;
+
+ struct {
+ efi_capsule_header_t **capsules;
+ unsigned long count;
+ unsigned long sg_list;
+ } UPDATE_CAPSULE;
+
+ struct {
+ efi_capsule_header_t **capsules;
+ unsigned long count;
+ u64 *max_size;
+ int *reset_type;
+ } QUERY_CAPSULE_CAPS;
+
+ struct {
+ efi_status_t (__efiapi *acpi_prm_handler)(u64, void *);
+ u64 param_buffer_addr;
+ void *context;
+ } ACPI_PRM_HANDLER;
+};
struct efi_runtime_work efi_rts_work;
/*
- * efi_queue_work: Queue efi_runtime_service() and wait until it's done
- * @rts: efi_runtime_service() function identifier
- * @rts_arg<1-5>: efi_runtime_service() function arguments
+ * efi_queue_work: Queue EFI runtime service call and wait for completion
+ * @_rts: EFI runtime service function identifier
+ * @_args: Arguments to pass to the EFI runtime service
*
* Accesses to efi_runtime_services() are serialized by a binary
* semaphore (efi_runtime_lock) and caller waits until the work is
* finished, hence _only_ one work is queued at a time and the caller
* thread waits for completion.
*/
-#define efi_queue_work(_rts, _arg1, _arg2, _arg3, _arg4, _arg5) \
-({ \
- efi_rts_work.status = EFI_ABORTED; \
- \
- if (!efi_enabled(EFI_RUNTIME_SERVICES)) { \
- pr_warn_once("EFI Runtime Services are disabled!\n"); \
- efi_rts_work.status = EFI_DEVICE_ERROR; \
- goto exit; \
- } \
- \
- init_completion(&efi_rts_work.efi_rts_comp); \
- INIT_WORK(&efi_rts_work.work, efi_call_rts); \
- efi_rts_work.arg1 = _arg1; \
- efi_rts_work.arg2 = _arg2; \
- efi_rts_work.arg3 = _arg3; \
- efi_rts_work.arg4 = _arg4; \
- efi_rts_work.arg5 = _arg5; \
- efi_rts_work.efi_rts_id = _rts; \
- \
- /* \
- * queue_work() returns 0 if work was already on queue, \
- * _ideally_ this should never happen. \
- */ \
- if (queue_work(efi_rts_wq, &efi_rts_work.work)) \
- wait_for_completion(&efi_rts_work.efi_rts_comp); \
- else \
- pr_err("Failed to queue work to efi_rts_wq.\n"); \
- \
- WARN_ON_ONCE(efi_rts_work.status == EFI_ABORTED); \
-exit: \
- efi_rts_work.efi_rts_id = EFI_NONE; \
- efi_rts_work.status; \
-})
+#define efi_queue_work(_rts, _args...) \
+ __efi_queue_work(EFI_ ## _rts, \
+ &(union efi_rts_args){ ._rts = { _args }})
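
The rewritten macro combines token pasting with a C99 compound literal: efi_queue_work(GET_TIME, tm, tc) expands to __efi_queue_work(EFI_GET_TIME, &(union efi_rts_args){ .GET_TIME = { tm, tc }}), so each service's arguments are type-checked against its member struct. A standalone sketch of the same construction (op_ids, op_args and do_op are hypothetical):

enum op_ids { OP_NONE, OP_ADD, OP_NEG };

union op_args {
	struct { int a, b; } ADD;
	struct { int a; } NEG;
};

int do_op(enum op_ids id, union op_args *args);

#define run_op(_op, _args...) \
	do_op(OP_ ## _op, &(union op_args){ ._op = { _args }})

/* run_op(ADD, 1, 2) -> do_op(OP_ADD, &(union op_args){ .ADD = { 1, 2 }}) */
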
#ifndef arch_efi_save_flags
#define arch_efi_save_flags(state_flags) local_save_flags(state_flags)
@@ -103,7 +145,7 @@ unsigned long efi_call_virt_save_flags(void)
return flags;
}
-void efi_call_virt_check_flags(unsigned long flags, const char *call)
+void efi_call_virt_check_flags(unsigned long flags, const void *caller)
{
unsigned long cur_flags, mismatch;
@@ -114,8 +156,8 @@ void efi_call_virt_check_flags(unsigned long flags, const char *call)
return;
add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_NOW_UNRELIABLE);
- pr_err_ratelimited(FW_BUG "IRQ flags corrupted (0x%08lx=>0x%08lx) by EFI %s\n",
- flags, cur_flags, call);
+ pr_err_ratelimited(FW_BUG "IRQ flags corrupted (0x%08lx=>0x%08lx) by EFI call from %pS\n",
+ flags, cur_flags, caller ?: __builtin_return_address(0));
arch_efi_restore_flags(flags);
}
@@ -170,74 +212,90 @@ extern struct semaphore __efi_uv_runtime_lock __alias(efi_runtime_lock);
/*
* Calls the appropriate efi_runtime_service() with the appropriate
* arguments.
- *
- * Semantics followed by efi_call_rts() to understand efi_runtime_work:
- * 1. If argument was a pointer, recast it from void pointer to original
- * pointer type.
- * 2. If argument was a value, recast it from void pointer to original
- * pointer type and dereference it.
*/
static void efi_call_rts(struct work_struct *work)
{
- void *arg1, *arg2, *arg3, *arg4, *arg5;
+ const union efi_rts_args *args = efi_rts_work.args;
efi_status_t status = EFI_NOT_FOUND;
+ unsigned long flags;
- arg1 = efi_rts_work.arg1;
- arg2 = efi_rts_work.arg2;
- arg3 = efi_rts_work.arg3;
- arg4 = efi_rts_work.arg4;
- arg5 = efi_rts_work.arg5;
+ arch_efi_call_virt_setup();
+ flags = efi_call_virt_save_flags();
switch (efi_rts_work.efi_rts_id) {
case EFI_GET_TIME:
- status = efi_call_virt(get_time, (efi_time_t *)arg1,
- (efi_time_cap_t *)arg2);
+ status = efi_call_virt(get_time,
+ args->GET_TIME.time,
+ args->GET_TIME.capabilities);
break;
case EFI_SET_TIME:
- status = efi_call_virt(set_time, (efi_time_t *)arg1);
+ status = efi_call_virt(set_time,
+ args->SET_TIME.time);
break;
case EFI_GET_WAKEUP_TIME:
- status = efi_call_virt(get_wakeup_time, (efi_bool_t *)arg1,
- (efi_bool_t *)arg2, (efi_time_t *)arg3);
+ status = efi_call_virt(get_wakeup_time,
+ args->GET_WAKEUP_TIME.enabled,
+ args->GET_WAKEUP_TIME.pending,
+ args->GET_WAKEUP_TIME.time);
break;
case EFI_SET_WAKEUP_TIME:
- status = efi_call_virt(set_wakeup_time, *(efi_bool_t *)arg1,
- (efi_time_t *)arg2);
+ status = efi_call_virt(set_wakeup_time,
+ args->SET_WAKEUP_TIME.enable,
+ args->SET_WAKEUP_TIME.time);
break;
case EFI_GET_VARIABLE:
- status = efi_call_virt(get_variable, (efi_char16_t *)arg1,
- (efi_guid_t *)arg2, (u32 *)arg3,
- (unsigned long *)arg4, (void *)arg5);
+ status = efi_call_virt(get_variable,
+ args->GET_VARIABLE.name,
+ args->GET_VARIABLE.vendor,
+ args->GET_VARIABLE.attr,
+ args->GET_VARIABLE.data_size,
+ args->GET_VARIABLE.data);
break;
case EFI_GET_NEXT_VARIABLE:
- status = efi_call_virt(get_next_variable, (unsigned long *)arg1,
- (efi_char16_t *)arg2,
- (efi_guid_t *)arg3);
+ status = efi_call_virt(get_next_variable,
+ args->GET_NEXT_VARIABLE.name_size,
+ args->GET_NEXT_VARIABLE.name,
+ args->GET_NEXT_VARIABLE.vendor);
break;
case EFI_SET_VARIABLE:
- status = efi_call_virt(set_variable, (efi_char16_t *)arg1,
- (efi_guid_t *)arg2, *(u32 *)arg3,
- *(unsigned long *)arg4, (void *)arg5);
+ status = efi_call_virt(set_variable,
+ args->SET_VARIABLE.name,
+ args->SET_VARIABLE.vendor,
+ args->SET_VARIABLE.attr,
+ args->SET_VARIABLE.data_size,
+ args->SET_VARIABLE.data);
break;
case EFI_QUERY_VARIABLE_INFO:
- status = efi_call_virt(query_variable_info, *(u32 *)arg1,
- (u64 *)arg2, (u64 *)arg3, (u64 *)arg4);
+ status = efi_call_virt(query_variable_info,
+ args->QUERY_VARIABLE_INFO.attr,
+ args->QUERY_VARIABLE_INFO.storage_space,
+ args->QUERY_VARIABLE_INFO.remaining_space,
+ args->QUERY_VARIABLE_INFO.max_variable_size);
break;
case EFI_GET_NEXT_HIGH_MONO_COUNT:
- status = efi_call_virt(get_next_high_mono_count, (u32 *)arg1);
+ status = efi_call_virt(get_next_high_mono_count,
+ args->GET_NEXT_HIGH_MONO_COUNT.high_count);
break;
case EFI_UPDATE_CAPSULE:
status = efi_call_virt(update_capsule,
- (efi_capsule_header_t **)arg1,
- *(unsigned long *)arg2,
- *(unsigned long *)arg3);
+ args->UPDATE_CAPSULE.capsules,
+ args->UPDATE_CAPSULE.count,
+ args->UPDATE_CAPSULE.sg_list);
break;
case EFI_QUERY_CAPSULE_CAPS:
status = efi_call_virt(query_capsule_caps,
- (efi_capsule_header_t **)arg1,
- *(unsigned long *)arg2, (u64 *)arg3,
- (int *)arg4);
+ args->QUERY_CAPSULE_CAPS.capsules,
+ args->QUERY_CAPSULE_CAPS.count,
+ args->QUERY_CAPSULE_CAPS.max_size,
+ args->QUERY_CAPSULE_CAPS.reset_type);
break;
+ case EFI_ACPI_PRM_HANDLER:
+#ifdef CONFIG_ACPI_PRMT
+ status = arch_efi_call_virt(args, ACPI_PRM_HANDLER.acpi_prm_handler,
+ args->ACPI_PRM_HANDLER.param_buffer_addr,
+ args->ACPI_PRM_HANDLER.context);
+ break;
+#endif
default:
/*
* Ideally, we should never reach here because a caller of this
@@ -246,17 +304,53 @@ static void efi_call_rts(struct work_struct *work)
*/
pr_err("Requested executing invalid EFI Runtime Service.\n");
}
+
+ efi_call_virt_check_flags(flags, efi_rts_work.caller);
+ arch_efi_call_virt_teardown();
+
efi_rts_work.status = status;
complete(&efi_rts_work.efi_rts_comp);
}
+static efi_status_t __efi_queue_work(enum efi_rts_ids id,
+ union efi_rts_args *args)
+{
+ efi_rts_work.efi_rts_id = id;
+ efi_rts_work.args = args;
+ efi_rts_work.caller = __builtin_return_address(0);
+ efi_rts_work.status = EFI_ABORTED;
+
+ if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
+ pr_warn_once("EFI Runtime Services are disabled!\n");
+ efi_rts_work.status = EFI_DEVICE_ERROR;
+ goto exit;
+ }
+
+ init_completion(&efi_rts_work.efi_rts_comp);
+ INIT_WORK(&efi_rts_work.work, efi_call_rts);
+
+ /*
+ * queue_work() returns 0 if work was already on queue,
+ * _ideally_ this should never happen.
+ */
+ if (queue_work(efi_rts_wq, &efi_rts_work.work))
+ wait_for_completion(&efi_rts_work.efi_rts_comp);
+ else
+ pr_err("Failed to queue work to efi_rts_wq.\n");
+
+ WARN_ON_ONCE(efi_rts_work.status == EFI_ABORTED);
+exit:
+ efi_rts_work.efi_rts_id = EFI_NONE;
+ return efi_rts_work.status;
+}
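
__efi_queue_work() serializes each call through a single work item and blocks on a completion; a stripped-down sketch of that queue-and-wait shape (my_work and my_work_fn are illustrative names):

struct my_work {
	struct work_struct work;
	struct completion done;
	int status;
};

static void my_work_fn(struct work_struct *w)
{
	struct my_work *mw = container_of(w, struct my_work, work);

	mw->status = 0;		/* perform the actual call here */
	complete(&mw->done);
}

static int queue_and_wait(struct workqueue_struct *wq, struct my_work *mw)
{
	init_completion(&mw->done);
	INIT_WORK(&mw->work, my_work_fn);

	if (!queue_work(wq, &mw->work))
		return -EBUSY;	/* work was already queued */

	wait_for_completion(&mw->done);
	return mw->status;
}
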
+
static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
{
efi_status_t status;
if (down_interruptible(&efi_runtime_lock))
return EFI_ABORTED;
- status = efi_queue_work(EFI_GET_TIME, tm, tc, NULL, NULL, NULL);
+ status = efi_queue_work(GET_TIME, tm, tc);
up(&efi_runtime_lock);
return status;
}
@@ -267,7 +361,7 @@ static efi_status_t virt_efi_set_time(efi_time_t *tm)
if (down_interruptible(&efi_runtime_lock))
return EFI_ABORTED;
- status = efi_queue_work(EFI_SET_TIME, tm, NULL, NULL, NULL, NULL);
+ status = efi_queue_work(SET_TIME, tm);
up(&efi_runtime_lock);
return status;
}
@@ -280,8 +374,7 @@ static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
if (down_interruptible(&efi_runtime_lock))
return EFI_ABORTED;
- status = efi_queue_work(EFI_GET_WAKEUP_TIME, enabled, pending, tm, NULL,
- NULL);
+ status = efi_queue_work(GET_WAKEUP_TIME, enabled, pending, tm);
up(&efi_runtime_lock);
return status;
}
@@ -292,8 +385,7 @@ static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
if (down_interruptible(&efi_runtime_lock))
return EFI_ABORTED;
- status = efi_queue_work(EFI_SET_WAKEUP_TIME, &enabled, tm, NULL, NULL,
- NULL);
+ status = efi_queue_work(SET_WAKEUP_TIME, enabled, tm);
up(&efi_runtime_lock);
return status;
}
@@ -308,7 +400,7 @@ static efi_status_t virt_efi_get_variable(efi_char16_t *name,
if (down_interruptible(&efi_runtime_lock))
return EFI_ABORTED;
- status = efi_queue_work(EFI_GET_VARIABLE, name, vendor, attr, data_size,
+ status = efi_queue_work(GET_VARIABLE, name, vendor, attr, data_size,
data);
up(&efi_runtime_lock);
return status;
@@ -322,8 +414,7 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
if (down_interruptible(&efi_runtime_lock))
return EFI_ABORTED;
- status = efi_queue_work(EFI_GET_NEXT_VARIABLE, name_size, name, vendor,
- NULL, NULL);
+ status = efi_queue_work(GET_NEXT_VARIABLE, name_size, name, vendor);
up(&efi_runtime_lock);
return status;
}
@@ -338,24 +429,23 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name,
if (down_interruptible(&efi_runtime_lock))
return EFI_ABORTED;
- status = efi_queue_work(EFI_SET_VARIABLE, name, vendor, &attr, &data_size,
+ status = efi_queue_work(SET_VARIABLE, name, vendor, attr, data_size,
data);
up(&efi_runtime_lock);
return status;
}
static efi_status_t
-virt_efi_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor,
- u32 attr, unsigned long data_size,
- void *data)
+virt_efi_set_variable_nb(efi_char16_t *name, efi_guid_t *vendor, u32 attr,
+ unsigned long data_size, void *data)
{
efi_status_t status;
if (down_trylock(&efi_runtime_lock))
return EFI_NOT_READY;
- status = efi_call_virt(set_variable, name, vendor, attr, data_size,
- data);
+ status = efi_call_virt_pointer(efi.runtime, set_variable, name, vendor,
+ attr, data_size, data);
up(&efi_runtime_lock);
return status;
}
@@ -373,17 +463,15 @@ static efi_status_t virt_efi_query_variable_info(u32 attr,
if (down_interruptible(&efi_runtime_lock))
return EFI_ABORTED;
- status = efi_queue_work(EFI_QUERY_VARIABLE_INFO, &attr, storage_space,
- remaining_space, max_variable_size, NULL);
+ status = efi_queue_work(QUERY_VARIABLE_INFO, attr, storage_space,
+ remaining_space, max_variable_size);
up(&efi_runtime_lock);
return status;
}
static efi_status_t
-virt_efi_query_variable_info_nonblocking(u32 attr,
- u64 *storage_space,
- u64 *remaining_space,
- u64 *max_variable_size)
+virt_efi_query_variable_info_nb(u32 attr, u64 *storage_space,
+ u64 *remaining_space, u64 *max_variable_size)
{
efi_status_t status;
@@ -393,8 +481,9 @@ virt_efi_query_variable_info_nonblocking(u32 attr,
if (down_trylock(&efi_runtime_lock))
return EFI_NOT_READY;
- status = efi_call_virt(query_variable_info, attr, storage_space,
- remaining_space, max_variable_size);
+ status = efi_call_virt_pointer(efi.runtime, query_variable_info, attr,
+ storage_space, remaining_space,
+ max_variable_size);
up(&efi_runtime_lock);
return status;
}
@@ -405,8 +494,7 @@ static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
if (down_interruptible(&efi_runtime_lock))
return EFI_ABORTED;
- status = efi_queue_work(EFI_GET_NEXT_HIGH_MONO_COUNT, count, NULL, NULL,
- NULL, NULL);
+ status = efi_queue_work(GET_NEXT_HIGH_MONO_COUNT, count);
up(&efi_runtime_lock);
return status;
}
@@ -421,8 +509,13 @@ static void virt_efi_reset_system(int reset_type,
"could not get exclusive access to the firmware\n");
return;
}
+
+ arch_efi_call_virt_setup();
efi_rts_work.efi_rts_id = EFI_RESET_SYSTEM;
- __efi_call_virt(reset_system, reset_type, status, data_size, data);
+ arch_efi_call_virt(efi.runtime, reset_system, reset_type, status,
+ data_size, data);
+ arch_efi_call_virt_teardown();
+
up(&efi_runtime_lock);
}
@@ -437,8 +530,7 @@ static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
if (down_interruptible(&efi_runtime_lock))
return EFI_ABORTED;
- status = efi_queue_work(EFI_UPDATE_CAPSULE, capsules, &count, &sg_list,
- NULL, NULL);
+ status = efi_queue_work(UPDATE_CAPSULE, capsules, count, sg_list);
up(&efi_runtime_lock);
return status;
}
@@ -455,26 +547,44 @@ static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
if (down_interruptible(&efi_runtime_lock))
return EFI_ABORTED;
- status = efi_queue_work(EFI_QUERY_CAPSULE_CAPS, capsules, &count,
- max_size, reset_type, NULL);
+ status = efi_queue_work(QUERY_CAPSULE_CAPS, capsules, count,
+ max_size, reset_type);
up(&efi_runtime_lock);
return status;
}
-void efi_native_runtime_setup(void)
+void __init efi_native_runtime_setup(void)
{
- efi.get_time = virt_efi_get_time;
- efi.set_time = virt_efi_set_time;
- efi.get_wakeup_time = virt_efi_get_wakeup_time;
- efi.set_wakeup_time = virt_efi_set_wakeup_time;
- efi.get_variable = virt_efi_get_variable;
- efi.get_next_variable = virt_efi_get_next_variable;
- efi.set_variable = virt_efi_set_variable;
- efi.set_variable_nonblocking = virt_efi_set_variable_nonblocking;
- efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
- efi.reset_system = virt_efi_reset_system;
- efi.query_variable_info = virt_efi_query_variable_info;
- efi.query_variable_info_nonblocking = virt_efi_query_variable_info_nonblocking;
- efi.update_capsule = virt_efi_update_capsule;
- efi.query_capsule_caps = virt_efi_query_capsule_caps;
+ efi.get_time = virt_efi_get_time;
+ efi.set_time = virt_efi_set_time;
+ efi.get_wakeup_time = virt_efi_get_wakeup_time;
+ efi.set_wakeup_time = virt_efi_set_wakeup_time;
+ efi.get_variable = virt_efi_get_variable;
+ efi.get_next_variable = virt_efi_get_next_variable;
+ efi.set_variable = virt_efi_set_variable;
+ efi.set_variable_nonblocking = virt_efi_set_variable_nb;
+ efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
+ efi.reset_system = virt_efi_reset_system;
+ efi.query_variable_info = virt_efi_query_variable_info;
+ efi.query_variable_info_nonblocking = virt_efi_query_variable_info_nb;
+ efi.update_capsule = virt_efi_update_capsule;
+ efi.query_capsule_caps = virt_efi_query_capsule_caps;
}
+
+#ifdef CONFIG_ACPI_PRMT
+
+efi_status_t
+efi_call_acpi_prm_handler(efi_status_t (__efiapi *handler_addr)(u64, void *),
+ u64 param_buffer_addr, void *context)
+{
+ efi_status_t status;
+
+ if (down_interruptible(&efi_runtime_lock))
+ return EFI_ABORTED;
+ status = efi_queue_work(ACPI_PRM_HANDLER, handler_addr,
+ param_buffer_addr, context);
+ up(&efi_runtime_lock);
+ return status;
+}
+
+#endif
diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c
index f1f6f1c32987..533d81572579 100644
--- a/drivers/gpio/gpio-sim.c
+++ b/drivers/gpio/gpio-sim.c
@@ -291,6 +291,15 @@ static void gpio_sim_mutex_destroy(void *data)
mutex_destroy(lock);
}
+static void gpio_sim_dispose_mappings(void *data)
+{
+ struct gpio_sim_chip *chip = data;
+ unsigned int i;
+
+ for (i = 0; i < chip->gc.ngpio; i++)
+ irq_dispose_mapping(irq_find_mapping(chip->irq_sim, i));
+}
+
static void gpio_sim_sysfs_remove(void *data)
{
struct gpio_sim_chip *chip = data;
@@ -402,10 +411,14 @@ static int gpio_sim_add_bank(struct fwnode_handle *swnode, struct device *dev)
if (!chip->pull_map)
return -ENOMEM;
- chip->irq_sim = devm_irq_domain_create_sim(dev, NULL, num_lines);
+ chip->irq_sim = devm_irq_domain_create_sim(dev, swnode, num_lines);
if (IS_ERR(chip->irq_sim))
return PTR_ERR(chip->irq_sim);
+ ret = devm_add_action_or_reset(dev, gpio_sim_dispose_mappings, chip);
+ if (ret)
+ return ret;
+
mutex_init(&chip->lock);
ret = devm_add_action_or_reset(dev, gpio_sim_mutex_destroy,
&chip->lock);
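
devm_add_action_or_reset() registers a teardown callback that runs alongside the other device-managed releases on unbind, and invokes it immediately if registration fails. A short sketch with hypothetical helpers (example_setup_hw, example_undo_hw):

static void example_teardown(void *data)
{
	struct example_chip *chip = data;

	example_undo_hw(chip);			/* hypothetical cleanup */
}

static int example_init(struct device *dev, struct example_chip *chip)
{
	int ret = example_setup_hw(chip);	/* hypothetical setup */

	if (ret)
		return ret;

	/* Runs example_teardown() on unbind, or right away on failure. */
	return devm_add_action_or_reset(dev, example_teardown, chip);
}
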
diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c b/drivers/gpu/drm/bridge/samsung-dsim.c
index 043b8109e64a..73ec60757dbc 100644
--- a/drivers/gpu/drm/bridge/samsung-dsim.c
+++ b/drivers/gpu/drm/bridge/samsung-dsim.c
@@ -1386,6 +1386,18 @@ static void samsung_dsim_disable_irq(struct samsung_dsim *dsi)
disable_irq(dsi->irq);
}
+static void samsung_dsim_set_stop_state(struct samsung_dsim *dsi, bool enable)
+{
+ u32 reg = samsung_dsim_read(dsi, DSIM_ESCMODE_REG);
+
+ if (enable)
+ reg |= DSIM_FORCE_STOP_STATE;
+ else
+ reg &= ~DSIM_FORCE_STOP_STATE;
+
+ samsung_dsim_write(dsi, DSIM_ESCMODE_REG, reg);
+}
+
static int samsung_dsim_init(struct samsung_dsim *dsi)
{
const struct samsung_dsim_driver_data *driver_data = dsi->driver_data;
@@ -1445,15 +1457,12 @@ static void samsung_dsim_atomic_enable(struct drm_bridge *bridge,
struct drm_bridge_state *old_bridge_state)
{
struct samsung_dsim *dsi = bridge_to_dsi(bridge);
- u32 reg;
if (samsung_dsim_hw_is_exynos(dsi->plat_data->hw_type)) {
samsung_dsim_set_display_mode(dsi);
samsung_dsim_set_display_enable(dsi, true);
} else {
- reg = samsung_dsim_read(dsi, DSIM_ESCMODE_REG);
- reg &= ~DSIM_FORCE_STOP_STATE;
- samsung_dsim_write(dsi, DSIM_ESCMODE_REG, reg);
+ samsung_dsim_set_stop_state(dsi, false);
}
dsi->state |= DSIM_STATE_VIDOUT_AVAILABLE;
@@ -1463,16 +1472,12 @@ static void samsung_dsim_atomic_disable(struct drm_bridge *bridge,
struct drm_bridge_state *old_bridge_state)
{
struct samsung_dsim *dsi = bridge_to_dsi(bridge);
- u32 reg;
if (!(dsi->state & DSIM_STATE_ENABLED))
return;
- if (!samsung_dsim_hw_is_exynos(dsi->plat_data->hw_type)) {
- reg = samsung_dsim_read(dsi, DSIM_ESCMODE_REG);
- reg |= DSIM_FORCE_STOP_STATE;
- samsung_dsim_write(dsi, DSIM_ESCMODE_REG, reg);
- }
+ if (!samsung_dsim_hw_is_exynos(dsi->plat_data->hw_type))
+ samsung_dsim_set_stop_state(dsi, true);
dsi->state &= ~DSIM_STATE_VIDOUT_AVAILABLE;
}
@@ -1775,6 +1780,8 @@ static ssize_t samsung_dsim_host_transfer(struct mipi_dsi_host *host,
if (ret)
return ret;
+ samsung_dsim_set_stop_state(dsi, false);
+
ret = mipi_dsi_create_packet(&xfer.packet, msg);
if (ret < 0)
return ret;
diff --git a/drivers/gpu/drm/drm_probe_helper.c b/drivers/gpu/drm/drm_probe_helper.c
index 2fb9bf901a2c..3f479483d7d8 100644
--- a/drivers/gpu/drm/drm_probe_helper.c
+++ b/drivers/gpu/drm/drm_probe_helper.c
@@ -262,6 +262,26 @@ static bool drm_kms_helper_enable_hpd(struct drm_device *dev)
}
#define DRM_OUTPUT_POLL_PERIOD (10*HZ)
+static void reschedule_output_poll_work(struct drm_device *dev)
+{
+ unsigned long delay = DRM_OUTPUT_POLL_PERIOD;
+
+ if (dev->mode_config.delayed_event)
+ /*
+ * FIXME:
+ *
+ * Use short (1s) delay to handle the initial delayed event.
+ * This delay should not be needed, but Optimus/nouveau will
+ * fail in a mysterious way if the delayed event is handled as
+ * soon as possible like it is done in
+ * drm_helper_probe_single_connector_modes() in case the poll
+ * was enabled before.
+ */
+ delay = HZ;
+
+ schedule_delayed_work(&dev->mode_config.output_poll_work, delay);
+}
+
/**
* drm_kms_helper_poll_enable - re-enable output polling.
* @dev: drm_device
@@ -279,37 +299,41 @@ static bool drm_kms_helper_enable_hpd(struct drm_device *dev)
*/
void drm_kms_helper_poll_enable(struct drm_device *dev)
{
- bool poll = false;
- unsigned long delay = DRM_OUTPUT_POLL_PERIOD;
-
if (!dev->mode_config.poll_enabled || !drm_kms_helper_poll ||
dev->mode_config.poll_running)
return;
- poll = drm_kms_helper_enable_hpd(dev);
-
- if (dev->mode_config.delayed_event) {
- /*
- * FIXME:
- *
- * Use short (1s) delay to handle the initial delayed event.
- * This delay should not be needed, but Optimus/nouveau will
- * fail in a mysterious way if the delayed event is handled as
- * soon as possible like it is done in
- * drm_helper_probe_single_connector_modes() in case the poll
- * was enabled before.
- */
- poll = true;
- delay = HZ;
- }
-
- if (poll)
- schedule_delayed_work(&dev->mode_config.output_poll_work, delay);
+ if (drm_kms_helper_enable_hpd(dev) ||
+ dev->mode_config.delayed_event)
+ reschedule_output_poll_work(dev);
dev->mode_config.poll_running = true;
}
EXPORT_SYMBOL(drm_kms_helper_poll_enable);
+/**
+ * drm_kms_helper_poll_reschedule - reschedule the output polling work
+ * @dev: drm_device
+ *
+ * This function reschedules the output polling work after polling for a
+ * connector has been enabled.
+ *
+ * Drivers must call this helper after enabling polling for a connector by
+ * setting %DRM_CONNECTOR_POLL_CONNECT / %DRM_CONNECTOR_POLL_DISCONNECT flags
+ * in drm_connector::polled. Note that disabling polling for a connector by
+ * clearing these flags will stop the output polling work automatically,
+ * provided polling is disabled for all other connectors as well.
+ *
+ * The function can be called only after polling has been enabled by calling
+ * drm_kms_helper_poll_init() / drm_kms_helper_poll_enable().
+ */
+void drm_kms_helper_poll_reschedule(struct drm_device *dev)
+{
+ if (dev->mode_config.poll_running)
+ reschedule_output_poll_work(dev);
+}
+EXPORT_SYMBOL(drm_kms_helper_poll_reschedule);
+
static enum drm_connector_status
drm_helper_probe_detect_ctx(struct drm_connector *connector, bool force)
{
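As the kerneldoc above implies, a driver that turns on polling for a connector at runtime would pair the flag update with the new helper. A hedged sketch, not taken from this patch:

/* Sketch: enable polling on one connector, then kick the poll work. */
connector->polled = DRM_CONNECTOR_POLL_CONNECT |
		    DRM_CONNECTOR_POLL_DISCONNECT;
drm_kms_helper_poll_reschedule(connector->dev);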
diff --git a/drivers/gpu/drm/i915/display/intel_hotplug.c b/drivers/gpu/drm/i915/display/intel_hotplug.c
index 1160fa20433b..5eac7032bb5a 100644
--- a/drivers/gpu/drm/i915/display/intel_hotplug.c
+++ b/drivers/gpu/drm/i915/display/intel_hotplug.c
@@ -211,7 +211,7 @@ intel_hpd_irq_storm_switch_to_polling(struct drm_i915_private *dev_priv)
/* Enable polling and queue hotplug re-enabling. */
if (hpd_disabled) {
- drm_kms_helper_poll_enable(&dev_priv->drm);
+ drm_kms_helper_poll_reschedule(&dev_priv->drm);
mod_delayed_work(dev_priv->unordered_wq,
&dev_priv->display.hotplug.reenable_work,
msecs_to_jiffies(HPD_STORM_REENABLE_DELAY));
@@ -649,7 +649,7 @@ static void i915_hpd_poll_init_work(struct work_struct *work)
drm_connector_list_iter_end(&conn_iter);
if (enabled)
- drm_kms_helper_poll_enable(&dev_priv->drm);
+ drm_kms_helper_poll_reschedule(&dev_priv->drm);
mutex_unlock(&dev_priv->drm.mode_config.mutex);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
index ddd146265beb..fa70defcb5b2 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_huc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
@@ -26,6 +26,7 @@
* The kernel driver is only responsible for loading the HuC firmware and
* triggering its security authentication. This is done differently depending
* on the platform:
+ *
* - older platforms (from Gen9 to most Gen12s): the load is performed via DMA
* and the authentication via GuC
* - DG2: load and authentication are both performed via GSC.
@@ -33,6 +34,7 @@
* not-DG2 older platforms), while the authentication is done in 2-steps,
* a first auth for clear-media workloads via GuC and a second one for all
* workloads via GSC.
+ *
* On platforms where the GuC does the authentication, to correctly do so the
* HuC binary must be loaded before the GuC one.
* Loading the HuC is optional; however, not using the HuC might negatively
diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
index 0ad0c5885ec2..7d8671fdf447 100644
--- a/drivers/gpu/drm/i915/i915_driver.c
+++ b/drivers/gpu/drm/i915/i915_driver.c
@@ -443,7 +443,6 @@ static int i915_pcode_init(struct drm_i915_private *i915)
static int i915_driver_hw_probe(struct drm_i915_private *dev_priv)
{
struct pci_dev *pdev = to_pci_dev(dev_priv->drm.dev);
- struct pci_dev *root_pdev;
int ret;
if (i915_inject_probe_failure(dev_priv))
@@ -557,15 +556,6 @@ static int i915_driver_hw_probe(struct drm_i915_private *dev_priv)
intel_bw_init_hw(dev_priv);
- /*
- * FIXME: Temporary hammer to avoid freezing the machine on our DGFX
- * This should be totally removed when we handle the pci states properly
- * on runtime PM and on s2idle cases.
- */
- root_pdev = pcie_find_root_port(pdev);
- if (root_pdev)
- pci_d3cold_disable(root_pdev);
-
return 0;
err_opregion:
@@ -591,7 +581,6 @@ err_perf:
static void i915_driver_hw_remove(struct drm_i915_private *dev_priv)
{
struct pci_dev *pdev = to_pci_dev(dev_priv->drm.dev);
- struct pci_dev *root_pdev;
i915_perf_fini(dev_priv);
@@ -599,10 +588,6 @@ static void i915_driver_hw_remove(struct drm_i915_private *dev_priv)
if (pdev->msi_enabled)
pci_disable_msi(pdev);
-
- root_pdev = pcie_find_root_port(pdev);
- if (root_pdev)
- pci_d3cold_enable(root_pdev);
}
/**
@@ -1517,6 +1502,8 @@ static int intel_runtime_suspend(struct device *kdev)
{
struct drm_i915_private *dev_priv = kdev_to_i915(kdev);
struct intel_runtime_pm *rpm = &dev_priv->runtime_pm;
+ struct pci_dev *pdev = to_pci_dev(dev_priv->drm.dev);
+ struct pci_dev *root_pdev;
struct intel_gt *gt;
int ret, i;
@@ -1568,6 +1555,15 @@ static int intel_runtime_suspend(struct device *kdev)
drm_err(&dev_priv->drm,
"Unclaimed access detected prior to suspending\n");
+ /*
+ * FIXME: Temporary hammer to avoid freezing the machine on our DGFX
+ * This should be totally removed when we handle the pci states properly
+ * on runtime PM.
+ */
+ root_pdev = pcie_find_root_port(pdev);
+ if (root_pdev)
+ pci_d3cold_disable(root_pdev);
+
rpm->suspended = true;
/*
@@ -1606,6 +1602,8 @@ static int intel_runtime_resume(struct device *kdev)
{
struct drm_i915_private *dev_priv = kdev_to_i915(kdev);
struct intel_runtime_pm *rpm = &dev_priv->runtime_pm;
+ struct pci_dev *pdev = to_pci_dev(dev_priv->drm.dev);
+ struct pci_dev *root_pdev;
struct intel_gt *gt;
int ret, i;
@@ -1619,6 +1617,11 @@ static int intel_runtime_resume(struct device *kdev)
intel_opregion_notify_adapter(dev_priv, PCI_D0);
rpm->suspended = false;
+
+ root_pdev = pcie_find_root_port(pdev);
+ if (root_pdev)
+ pci_d3cold_enable(root_pdev);
+
if (intel_uncore_unclaimed_mmio(&dev_priv->uncore))
drm_dbg(&dev_priv->drm,
"Unclaimed access during suspend, bios?\n");
diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
index 58dfb15a8757..e78de99e9933 100644
--- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c
+++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
@@ -96,7 +96,7 @@ static int panfrost_read_speedbin(struct device *dev)
* keep going without it; any other error means that we are
* supposed to read the bin value, but we failed doing so.
*/
- if (ret != -ENOENT) {
+ if (ret != -ENOENT && ret != -EOPNOTSUPP) {
DRM_DEV_ERROR(dev, "Cannot read speed-bin (%d).", ret);
return ret;
}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
index 82094c137855..c43853597776 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
@@ -497,10 +497,9 @@ static int vmw_user_bo_synccpu_release(struct drm_file *filp,
if (!(flags & drm_vmw_synccpu_allow_cs)) {
atomic_dec(&vmw_bo->cpu_writers);
}
- ttm_bo_put(&vmw_bo->tbo);
+ vmw_user_bo_unref(vmw_bo);
}
- drm_gem_object_put(&vmw_bo->tbo.base);
return ret;
}
@@ -540,8 +539,7 @@ int vmw_user_bo_synccpu_ioctl(struct drm_device *dev, void *data,
return ret;
ret = vmw_user_bo_synccpu_grab(vbo, arg->flags);
- vmw_bo_unreference(&vbo);
- drm_gem_object_put(&vbo->tbo.base);
+ vmw_user_bo_unref(vbo);
if (unlikely(ret != 0)) {
if (ret == -ERESTARTSYS || ret == -EBUSY)
return -EBUSY;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h
index 50a836e70994..1d433fceed3d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.h
@@ -195,6 +195,14 @@ static inline struct vmw_bo *vmw_bo_reference(struct vmw_bo *buf)
return buf;
}
+static inline void vmw_user_bo_unref(struct vmw_bo *vbo)
+{
+ if (vbo) {
+ ttm_bo_put(&vbo->tbo);
+ drm_gem_object_put(&vbo->tbo.base);
+ }
+}
+
static inline struct vmw_bo *to_vmw_bo(struct drm_gem_object *gobj)
{
return container_of((gobj), struct vmw_bo, tbo.base);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
index 3810a9984a7f..58bfdf203eca 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
@@ -1513,4 +1513,16 @@ static inline bool vmw_has_fences(struct vmw_private *vmw)
return (vmw_fifo_caps(vmw) & SVGA_FIFO_CAP_FENCE) != 0;
}
+static inline bool vmw_shadertype_is_valid(enum vmw_sm_type shader_model,
+ u32 shader_type)
+{
+ SVGA3dShaderType max_allowed = SVGA3D_SHADERTYPE_PREDX_MAX;
+
+ if (shader_model >= VMW_SM_5)
+ max_allowed = SVGA3D_SHADERTYPE_MAX;
+ else if (shader_model >= VMW_SM_4)
+ max_allowed = SVGA3D_SHADERTYPE_DX10_MAX;
+ return shader_type >= SVGA3D_SHADERTYPE_MIN && shader_type < max_allowed;
+}
+
#endif
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index 6b9aa2b4ef54..98e0723ca6f5 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -1164,8 +1164,7 @@ static int vmw_translate_mob_ptr(struct vmw_private *dev_priv,
}
vmw_bo_placement_set(vmw_bo, VMW_BO_DOMAIN_MOB, VMW_BO_DOMAIN_MOB);
ret = vmw_validation_add_bo(sw_context->ctx, vmw_bo);
- ttm_bo_put(&vmw_bo->tbo);
- drm_gem_object_put(&vmw_bo->tbo.base);
+ vmw_user_bo_unref(vmw_bo);
if (unlikely(ret != 0))
return ret;
@@ -1221,8 +1220,7 @@ static int vmw_translate_guest_ptr(struct vmw_private *dev_priv,
vmw_bo_placement_set(vmw_bo, VMW_BO_DOMAIN_GMR | VMW_BO_DOMAIN_VRAM,
VMW_BO_DOMAIN_GMR | VMW_BO_DOMAIN_VRAM);
ret = vmw_validation_add_bo(sw_context->ctx, vmw_bo);
- ttm_bo_put(&vmw_bo->tbo);
- drm_gem_object_put(&vmw_bo->tbo.base);
+ vmw_user_bo_unref(vmw_bo);
if (unlikely(ret != 0))
return ret;
@@ -1992,7 +1990,7 @@ static int vmw_cmd_set_shader(struct vmw_private *dev_priv,
cmd = container_of(header, typeof(*cmd), header);
- if (cmd->body.type >= SVGA3D_SHADERTYPE_PREDX_MAX) {
+ if (!vmw_shadertype_is_valid(VMW_SM_LEGACY, cmd->body.type)) {
VMW_DEBUG_USER("Illegal shader type %u.\n",
(unsigned int) cmd->body.type);
return -EINVAL;
@@ -2115,8 +2113,6 @@ vmw_cmd_dx_set_single_constant_buffer(struct vmw_private *dev_priv,
SVGA3dCmdHeader *header)
{
VMW_DECLARE_CMD_VAR(*cmd, SVGA3dCmdDXSetSingleConstantBuffer);
- SVGA3dShaderType max_shader_num = has_sm5_context(dev_priv) ?
- SVGA3D_NUM_SHADERTYPE : SVGA3D_NUM_SHADERTYPE_DX10;
struct vmw_resource *res = NULL;
struct vmw_ctx_validation_info *ctx_node = VMW_GET_CTX_NODE(sw_context);
@@ -2133,6 +2129,14 @@ vmw_cmd_dx_set_single_constant_buffer(struct vmw_private *dev_priv,
if (unlikely(ret != 0))
return ret;
+ if (!vmw_shadertype_is_valid(dev_priv->sm_type, cmd->body.type) ||
+ cmd->body.slot >= SVGA3D_DX_MAX_CONSTBUFFERS) {
+ VMW_DEBUG_USER("Illegal const buffer shader %u slot %u.\n",
+ (unsigned int) cmd->body.type,
+ (unsigned int) cmd->body.slot);
+ return -EINVAL;
+ }
+
binding.bi.ctx = ctx_node->ctx;
binding.bi.res = res;
binding.bi.bt = vmw_ctx_binding_cb;
@@ -2141,14 +2145,6 @@ vmw_cmd_dx_set_single_constant_buffer(struct vmw_private *dev_priv,
binding.size = cmd->body.sizeInBytes;
binding.slot = cmd->body.slot;
- if (binding.shader_slot >= max_shader_num ||
- binding.slot >= SVGA3D_DX_MAX_CONSTBUFFERS) {
- VMW_DEBUG_USER("Illegal const buffer shader %u slot %u.\n",
- (unsigned int) cmd->body.type,
- (unsigned int) binding.slot);
- return -EINVAL;
- }
-
vmw_binding_add(ctx_node->staged, &binding.bi, binding.shader_slot,
binding.slot);
@@ -2207,15 +2203,13 @@ static int vmw_cmd_dx_set_shader_res(struct vmw_private *dev_priv,
{
VMW_DECLARE_CMD_VAR(*cmd, SVGA3dCmdDXSetShaderResources) =
container_of(header, typeof(*cmd), header);
- SVGA3dShaderType max_allowed = has_sm5_context(dev_priv) ?
- SVGA3D_SHADERTYPE_MAX : SVGA3D_SHADERTYPE_DX10_MAX;
u32 num_sr_view = (cmd->header.size - sizeof(cmd->body)) /
sizeof(SVGA3dShaderResourceViewId);
if ((u64) cmd->body.startView + (u64) num_sr_view >
(u64) SVGA3D_DX_MAX_SRVIEWS ||
- cmd->body.type >= max_allowed) {
+ !vmw_shadertype_is_valid(dev_priv->sm_type, cmd->body.type)) {
VMW_DEBUG_USER("Invalid shader binding.\n");
return -EINVAL;
}
@@ -2239,8 +2233,6 @@ static int vmw_cmd_dx_set_shader(struct vmw_private *dev_priv,
SVGA3dCmdHeader *header)
{
VMW_DECLARE_CMD_VAR(*cmd, SVGA3dCmdDXSetShader);
- SVGA3dShaderType max_allowed = has_sm5_context(dev_priv) ?
- SVGA3D_SHADERTYPE_MAX : SVGA3D_SHADERTYPE_DX10_MAX;
struct vmw_resource *res = NULL;
struct vmw_ctx_validation_info *ctx_node = VMW_GET_CTX_NODE(sw_context);
struct vmw_ctx_bindinfo_shader binding;
@@ -2251,8 +2243,7 @@ static int vmw_cmd_dx_set_shader(struct vmw_private *dev_priv,
cmd = container_of(header, typeof(*cmd), header);
- if (cmd->body.type >= max_allowed ||
- cmd->body.type < SVGA3D_SHADERTYPE_MIN) {
+ if (!vmw_shadertype_is_valid(dev_priv->sm_type, cmd->body.type)) {
VMW_DEBUG_USER("Illegal shader type %u.\n",
(unsigned int) cmd->body.type);
return -EINVAL;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
index b62207be3363..1489ad73c103 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
@@ -1665,10 +1665,8 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev,
err_out:
 /* vmw_user_lookup_handle takes one ref, and so does new_fb */
- if (bo) {
- vmw_bo_unreference(&bo);
- drm_gem_object_put(&bo->tbo.base);
- }
+ if (bo)
+ vmw_user_bo_unref(bo);
if (surface)
vmw_surface_unreference(&surface);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c b/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c
index 7e112319a23c..fb85f244c3d0 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_overlay.c
@@ -451,8 +451,7 @@ int vmw_overlay_ioctl(struct drm_device *dev, void *data,
ret = vmw_overlay_update_stream(dev_priv, buf, arg, true);
- vmw_bo_unreference(&buf);
- drm_gem_object_put(&buf->tbo.base);
+ vmw_user_bo_unref(buf);
out_unlock:
mutex_unlock(&overlay->mutex);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c
index e7226db8b242..1e81ff2422cf 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_shader.c
@@ -809,8 +809,7 @@ static int vmw_shader_define(struct drm_device *dev, struct drm_file *file_priv,
shader_type, num_input_sig,
num_output_sig, tfile, shader_handle);
out_bad_arg:
- vmw_bo_unreference(&buffer);
- drm_gem_object_put(&buffer->tbo.base);
+ vmw_user_bo_unref(buffer);
return ret;
}
diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
index a267b11731a8..bae0becfa24b 100644
--- a/drivers/hwmon/k10temp.c
+++ b/drivers/hwmon/k10temp.c
@@ -65,7 +65,7 @@ static DEFINE_MUTEX(nb_smu_ind_mutex);
#define F15H_M60H_HARDWARE_TEMP_CTRL_OFFSET 0xd8200c64
#define F15H_M60H_REPORTED_TEMP_CTRL_OFFSET 0xd8200ca4
-/* Common for Zen CPU families (Family 17h and 18h and 19h) */
+/* Common for Zen CPU families (Family 17h, 18h, 19h and 1Ah) */
#define ZEN_REPORTED_TEMP_CTRL_BASE 0x00059800
#define ZEN_CCD_TEMP(offset, x) (ZEN_REPORTED_TEMP_CTRL_BASE + \
@@ -475,6 +475,10 @@ static int k10temp_probe(struct pci_dev *pdev, const struct pci_device_id *id)
k10temp_get_ccd_support(pdev, data, 12);
break;
}
+ } else if (boot_cpu_data.x86 == 0x1a) {
+ data->temp_adjust_mask = ZEN_CUR_TEMP_RANGE_SEL_MASK;
+ data->read_tempreg = read_tempreg_nb_zen;
+ data->is_zen = true;
} else {
data->read_htcreg = read_htcreg_pci;
data->read_tempreg = read_tempreg_pci;
@@ -521,6 +525,8 @@ static const struct pci_device_id k10temp_id_table[] = {
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) },
+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
{ PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{}
};
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 256c2d42e350..ea5a6a14c553 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -923,7 +923,7 @@ static struct cpuidle_state adl_l_cstates[] __initdata = {
.enter = NULL }
};
-static struct cpuidle_state adl_n_cstates[] __initdata = {
+static struct cpuidle_state gmt_cstates[] __initdata = {
{
.name = "C1",
.desc = "MWAIT 0x00",
@@ -1349,8 +1349,8 @@ static const struct idle_cpu idle_cpu_adl_l __initconst = {
.state_table = adl_l_cstates,
};
-static const struct idle_cpu idle_cpu_adl_n __initconst = {
- .state_table = adl_n_cstates,
+static const struct idle_cpu idle_cpu_gmt __initconst = {
+ .state_table = gmt_cstates,
};
static const struct idle_cpu idle_cpu_spr __initconst = {
@@ -1423,7 +1423,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &idle_cpu_adl),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &idle_cpu_adl_l),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &idle_cpu_adl_n),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &idle_cpu_gmt),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr),
X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &idle_cpu_spr),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl),
@@ -1898,7 +1898,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
break;
case INTEL_FAM6_ALDERLAKE:
case INTEL_FAM6_ALDERLAKE_L:
- case INTEL_FAM6_ALDERLAKE_N:
+ case INTEL_FAM6_ATOM_GRACEMONT:
adl_idle_state_table_update();
break;
}
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index a973905afd13..ed7d4b02f45a 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -64,9 +64,8 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_blocks = 0;
- inode->i_atime = current_time(inode);
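+	/* inode_set_ctime_current() sets ctime to the current time and returns it for reuse in atime/mtime. */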
+ inode->i_atime = inode_set_ctime_current(inode);
inode->i_mtime = inode->i_atime;
- inode->i_ctime = inode->i_atime;
inode->i_private = data;
if (S_ISDIR(mode)) {
inode->i_op = &simple_dir_inode_operations;
diff --git a/drivers/irqchip/irq-bcm6345-l1.c b/drivers/irqchip/irq-bcm6345-l1.c
index 6341c0167c4a..9745a119d0e6 100644
--- a/drivers/irqchip/irq-bcm6345-l1.c
+++ b/drivers/irqchip/irq-bcm6345-l1.c
@@ -60,7 +60,6 @@
#include <linux/of.h>
#include <linux/of_irq.h>
#include <linux/of_address.h>
-#include <linux/of_platform.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/smp.h>
diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c
index a62b96237b82..24ca1d656adc 100644
--- a/drivers/irqchip/irq-bcm7038-l1.c
+++ b/drivers/irqchip/irq-bcm7038-l1.c
@@ -20,7 +20,6 @@
#include <linux/of.h>
#include <linux/of_irq.h>
#include <linux/of_address.h>
-#include <linux/of_platform.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/smp.h>
diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c
index 091b0fe7e324..5559c943f03f 100644
--- a/drivers/irqchip/irq-brcmstb-l2.c
+++ b/drivers/irqchip/irq-brcmstb-l2.c
@@ -15,7 +15,6 @@
#include <linux/of.h>
#include <linux/of_irq.h>
#include <linux/of_address.h>
-#include <linux/of_platform.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/io.h>
diff --git a/drivers/irqchip/irq-gic-pm.c b/drivers/irqchip/irq-gic-pm.c
index 3989d16f997b..a275a8071a25 100644
--- a/drivers/irqchip/irq-gic-pm.c
+++ b/drivers/irqchip/irq-gic-pm.c
@@ -4,7 +4,7 @@
*/
#include <linux/module.h>
#include <linux/clk.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
#include <linux/of_irq.h>
#include <linux/irqchip/arm-gic.h>
#include <linux/platform_device.h>
diff --git a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c
index 634263dfd7b5..8e87fc35f8aa 100644
--- a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c
+++ b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c
@@ -9,8 +9,6 @@
#include <linux/acpi.h>
#include <linux/acpi_iort.h>
-#include <linux/of_device.h>
-#include <linux/of_address.h>
#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/of.h>
diff --git a/drivers/irqchip/irq-i8259.c b/drivers/irqchip/irq-i8259.c
index b70ce0d3c092..115bdcffab24 100644
--- a/drivers/irqchip/irq-i8259.c
+++ b/drivers/irqchip/irq-i8259.c
@@ -340,7 +340,7 @@ static void i8259_irq_dispatch(struct irq_desc *desc)
generic_handle_domain_irq(domain, hwirq);
}
-int __init i8259_of_init(struct device_node *node, struct device_node *parent)
+static int __init i8259_of_init(struct device_node *node, struct device_node *parent)
{
struct irq_domain *domain;
unsigned int parent_irq;
diff --git a/drivers/irqchip/irq-imx-intmux.c b/drivers/irqchip/irq-imx-intmux.c
index 80aaea82468a..6d9a08238c9d 100644
--- a/drivers/irqchip/irq-imx-intmux.c
+++ b/drivers/irqchip/irq-imx-intmux.c
@@ -50,8 +50,9 @@
#include <linux/irqchip/chained_irq.h>
#include <linux/irqdomain.h>
#include <linux/kernel.h>
+#include <linux/mod_devicetable.h>
#include <linux/of_irq.h>
-#include <linux/of_platform.h>
+#include <linux/platform_device.h>
#include <linux/spinlock.h>
#include <linux/pm_runtime.h>
diff --git a/drivers/irqchip/irq-imx-irqsteer.c b/drivers/irqchip/irq-imx-irqsteer.c
index 96230a04ec23..bd9543314539 100644
--- a/drivers/irqchip/irq-imx-irqsteer.c
+++ b/drivers/irqchip/irq-imx-irqsteer.c
@@ -10,8 +10,9 @@
#include <linux/irqchip/chained_irq.h>
#include <linux/irqdomain.h>
#include <linux/kernel.h>
+#include <linux/of.h>
#include <linux/of_irq.h>
-#include <linux/of_platform.h>
+#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/spinlock.h>
diff --git a/drivers/irqchip/irq-imx-mu-msi.c b/drivers/irqchip/irq-imx-mu-msi.c
index 229039eda1b1..90d41c1407ac 100644
--- a/drivers/irqchip/irq-imx-mu-msi.c
+++ b/drivers/irqchip/irq-imx-mu-msi.c
@@ -339,8 +339,8 @@ static int __init imx_mu_of_init(struct device_node *dn,
msi_data->msiir_addr = res->start + msi_data->cfg->xTR;
irq = platform_get_irq(pdev, 0);
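+	/* platform_get_irq() never returns 0; negative values are errnos that can be propagated directly. */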
- if (irq <= 0)
- return -ENODEV;
+ if (irq < 0)
+ return irq;
platform_set_drvdata(pdev, msi_data);
diff --git a/drivers/irqchip/irq-keystone.c b/drivers/irqchip/irq-keystone.c
index ba9792e60329..a36396db4b08 100644
--- a/drivers/irqchip/irq-keystone.c
+++ b/drivers/irqchip/irq-keystone.c
@@ -15,7 +15,7 @@
#include <linux/irqdomain.h>
#include <linux/irqchip.h>
#include <linux/of.h>
-#include <linux/of_platform.h>
+#include <linux/platform_device.h>
#include <linux/mfd/syscon.h>
#include <linux/regmap.h>
diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c
index 92d8aa28bdf5..1623cd779175 100644
--- a/drivers/irqchip/irq-loongson-eiointc.c
+++ b/drivers/irqchip/irq-loongson-eiointc.c
@@ -144,7 +144,7 @@ static int eiointc_router_init(unsigned int cpu)
int i, bit;
uint32_t data;
uint32_t node = cpu_to_eio_node(cpu);
- uint32_t index = eiointc_index(node);
+ int index = eiointc_index(node);
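+	/* eiointc_index() can return a negative error code; the old unsigned type made the check below dead code. */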
if (index < 0) {
pr_err("Error: invalid nodemap!\n");
diff --git a/drivers/irqchip/irq-loongson-htvec.c b/drivers/irqchip/irq-loongson-htvec.c
index fc8bf1f5d41b..0bff728b25e3 100644
--- a/drivers/irqchip/irq-loongson-htvec.c
+++ b/drivers/irqchip/irq-loongson-htvec.c
@@ -15,7 +15,6 @@
#include <linux/platform_device.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
-#include <linux/of_platform.h>
#include <linux/syscore_ops.h>
/* Registers */
diff --git a/drivers/irqchip/irq-loongson-pch-pic.c b/drivers/irqchip/irq-loongson-pch-pic.c
index 93a71f66efeb..63db8e2172e0 100644
--- a/drivers/irqchip/irq-loongson-pch-pic.c
+++ b/drivers/irqchip/irq-loongson-pch-pic.c
@@ -12,9 +12,9 @@
#include <linux/irqdomain.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>
+#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
-#include <linux/of_platform.h>
#include <linux/syscore_ops.h>
/* Registers */
diff --git a/drivers/irqchip/irq-ls-scfg-msi.c b/drivers/irqchip/irq-ls-scfg-msi.c
index f5ba3f9f8415..f31a262fe438 100644
--- a/drivers/irqchip/irq-ls-scfg-msi.c
+++ b/drivers/irqchip/irq-ls-scfg-msi.c
@@ -349,8 +349,7 @@ static int ls_scfg_msi_probe(struct platform_device *pdev)
msi_data->cfg = (struct ls_scfg_msi_cfg *) match->data;
- res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- msi_data->regs = devm_ioremap_resource(&pdev->dev, res);
+ msi_data->regs = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
if (IS_ERR(msi_data->regs)) {
dev_err(&pdev->dev, "failed to initialize 'regs'\n");
return PTR_ERR(msi_data->regs);
diff --git a/drivers/irqchip/irq-madera.c b/drivers/irqchip/irq-madera.c
index 8b81271c823c..3eb1f8cdf674 100644
--- a/drivers/irqchip/irq-madera.c
+++ b/drivers/irqchip/irq-madera.c
@@ -10,12 +10,10 @@
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
+#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/regmap.h>
#include <linux/slab.h>
-#include <linux/of.h>
-#include <linux/of_device.h>
-#include <linux/of_irq.h>
#include <linux/irqchip/irq-madera.h>
#include <linux/mfd/madera/core.h>
#include <linux/mfd/madera/pdata.h>
diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c
index 7da18ef95211..f88df39f4129 100644
--- a/drivers/irqchip/irq-meson-gpio.c
+++ b/drivers/irqchip/irq-meson-gpio.c
@@ -150,6 +150,10 @@ static const struct meson_gpio_irq_params s4_params = {
INIT_MESON_S4_COMMON_DATA(82)
};
+static const struct meson_gpio_irq_params c3_params = {
+ INIT_MESON_S4_COMMON_DATA(55)
+};
+
static const struct of_device_id meson_irq_gpio_matches[] __maybe_unused = {
{ .compatible = "amlogic,meson8-gpio-intc", .data = &meson8_params },
{ .compatible = "amlogic,meson8b-gpio-intc", .data = &meson8b_params },
@@ -160,6 +164,7 @@ static const struct of_device_id meson_irq_gpio_matches[] __maybe_unused = {
{ .compatible = "amlogic,meson-sm1-gpio-intc", .data = &sm1_params },
{ .compatible = "amlogic,meson-a1-gpio-intc", .data = &a1_params },
{ .compatible = "amlogic,meson-s4-gpio-intc", .data = &s4_params },
+ { .compatible = "amlogic,c3-gpio-intc", .data = &c3_params },
{ }
};
diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c
index 6d5ecc10a870..76253e864f23 100644
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -557,7 +557,7 @@ static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq,
return gic_irq_domain_map(d, virq, hwirq);
}
-void gic_irq_domain_free(struct irq_domain *d, unsigned int virq,
+static void gic_irq_domain_free(struct irq_domain *d, unsigned int virq,
unsigned int nr_irqs)
{
}
diff --git a/drivers/irqchip/irq-mvebu-sei.c b/drivers/irqchip/irq-mvebu-sei.c
index 4ecef6d83777..a48dbe91b036 100644
--- a/drivers/irqchip/irq-mvebu-sei.c
+++ b/drivers/irqchip/irq-mvebu-sei.c
@@ -377,8 +377,7 @@ static int mvebu_sei_probe(struct platform_device *pdev)
mutex_init(&sei->cp_msi_lock);
raw_spin_lock_init(&sei->mask_lock);
- sei->res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- sei->base = devm_ioremap_resource(sei->dev, sei->res);
+ sei->base = devm_platform_get_and_ioremap_resource(pdev, 0, &sei->res);
if (IS_ERR(sei->base))
return PTR_ERR(sei->base);
diff --git a/drivers/irqchip/irq-orion.c b/drivers/irqchip/irq-orion.c
index 17c2c7a07f10..4e4e874e09a8 100644
--- a/drivers/irqchip/irq-orion.c
+++ b/drivers/irqchip/irq-orion.c
@@ -57,8 +57,7 @@ static int __init orion_irq_init(struct device_node *np,
struct resource r;
/* count number of irq chips by valid reg addresses */
- while (of_address_to_resource(np, num_chips, &r) == 0)
- num_chips++;
+ num_chips = of_address_count(np);
orion_irq_domain = irq_domain_add_linear(np,
num_chips * ORION_IRQS_PER_CHIP,
diff --git a/drivers/irqchip/irq-pruss-intc.c b/drivers/irqchip/irq-pruss-intc.c
index fa8d89b02ec0..0f64ecb9b1f4 100644
--- a/drivers/irqchip/irq-pruss-intc.c
+++ b/drivers/irqchip/irq-pruss-intc.c
@@ -17,7 +17,7 @@
#include <linux/irqchip/chained_irq.h>
#include <linux/irqdomain.h>
#include <linux/module.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
#include <linux/platform_device.h>
/*
@@ -565,8 +565,8 @@ static int pruss_intc_probe(struct platform_device *pdev)
continue;
irq = platform_get_irq_byname(pdev, irq_names[i]);
- if (irq <= 0) {
- ret = (irq == 0) ? -EINVAL : irq;
+ if (irq < 0) {
+ ret = irq;
goto fail_irq;
}
diff --git a/drivers/irqchip/irq-qcom-mpm.c b/drivers/irqchip/irq-qcom-mpm.c
index d30614661eea..7124565234a5 100644
--- a/drivers/irqchip/irq-qcom-mpm.c
+++ b/drivers/irqchip/irq-qcom-mpm.c
@@ -14,7 +14,7 @@
#include <linux/mailbox_client.h>
#include <linux/module.h>
#include <linux/of.h>
-#include <linux/of_device.h>
+#include <linux/of_platform.h>
#include <linux/platform_device.h>
#include <linux/pm_domain.h>
#include <linux/slab.h>
diff --git a/drivers/irqchip/irq-renesas-intc-irqpin.c b/drivers/irqchip/irq-renesas-intc-irqpin.c
index 26e4c17a7bf2..fa19585f3dee 100644
--- a/drivers/irqchip/irq-renesas-intc-irqpin.c
+++ b/drivers/irqchip/irq-renesas-intc-irqpin.c
@@ -17,7 +17,6 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/module.h>
-#include <linux/of_device.h>
#include <linux/pm_runtime.h>
#define INTC_IRQPIN_MAX 8 /* maximum 8 interrupts per driver instance */
diff --git a/drivers/irqchip/irq-st.c b/drivers/irqchip/irq-st.c
index 819a12297b58..de71bb350d57 100644
--- a/drivers/irqchip/irq-st.c
+++ b/drivers/irqchip/irq-st.c
@@ -10,7 +10,7 @@
#include <dt-bindings/interrupt-controller/irq-st.h>
#include <linux/err.h>
#include <linux/mfd/syscon.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
#include <linux/platform_device.h>
#include <linux/regmap.h>
#include <linux/slab.h>
diff --git a/drivers/irqchip/irq-stm32-exti.c b/drivers/irqchip/irq-stm32-exti.c
index b5fa76ce5046..d8ba5fba7450 100644
--- a/drivers/irqchip/irq-stm32-exti.c
+++ b/drivers/irqchip/irq-stm32-exti.c
@@ -14,10 +14,11 @@
#include <linux/irqchip.h>
#include <linux/irqchip/chained_irq.h>
#include <linux/irqdomain.h>
+#include <linux/mod_devicetable.h>
#include <linux/module.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
-#include <linux/of_platform.h>
+#include <linux/platform_device.h>
#include <linux/syscore_ops.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
diff --git a/drivers/irqchip/irq-sunxi-nmi.c b/drivers/irqchip/irq-sunxi-nmi.c
index 21d49791f855..e760b1278143 100644
--- a/drivers/irqchip/irq-sunxi-nmi.c
+++ b/drivers/irqchip/irq-sunxi-nmi.c
@@ -19,7 +19,6 @@
#include <linux/irqdomain.h>
#include <linux/of_irq.h>
#include <linux/of_address.h>
-#include <linux/of_platform.h>
#include <linux/irqchip.h>
#include <linux/irqchip/chained_irq.h>
diff --git a/drivers/irqchip/irq-tb10x.c b/drivers/irqchip/irq-tb10x.c
index 8a0e69298e83..680586354d12 100644
--- a/drivers/irqchip/irq-tb10x.c
+++ b/drivers/irqchip/irq-tb10x.c
@@ -13,7 +13,6 @@
#include <linux/irqchip.h>
#include <linux/of_irq.h>
#include <linux/of_address.h>
-#include <linux/of_platform.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/bitops.h>
diff --git a/drivers/irqchip/irq-ti-sci-inta.c b/drivers/irqchip/irq-ti-sci-inta.c
index 7133f9fa6fd9..b83f5cbab123 100644
--- a/drivers/irqchip/irq-ti-sci-inta.c
+++ b/drivers/irqchip/irq-ti-sci-inta.c
@@ -15,9 +15,9 @@
#include <linux/msi.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
-#include <linux/of_address.h>
+#include <linux/of.h>
#include <linux/of_irq.h>
-#include <linux/of_platform.h>
+#include <linux/platform_device.h>
#include <linux/irqchip/chained_irq.h>
#include <linux/soc/ti/ti_sci_inta_msi.h>
#include <linux/soc/ti/ti_sci_protocol.h>
diff --git a/drivers/irqchip/irq-ti-sci-intr.c b/drivers/irqchip/irq-ti-sci-intr.c
index 1186f1e431a3..c027cd9e4a69 100644
--- a/drivers/irqchip/irq-ti-sci-intr.c
+++ b/drivers/irqchip/irq-ti-sci-intr.c
@@ -12,9 +12,9 @@
#include <linux/io.h>
#include <linux/irqchip.h>
#include <linux/irqdomain.h>
-#include <linux/of_platform.h>
-#include <linux/of_address.h>
+#include <linux/of.h>
#include <linux/of_irq.h>
+#include <linux/platform_device.h>
#include <linux/soc/ti/ti_sci_protocol.h>
/**
diff --git a/drivers/irqchip/irq-uniphier-aidet.c b/drivers/irqchip/irq-uniphier-aidet.c
index 716b1bb88bf2..601f9343d5b3 100644
--- a/drivers/irqchip/irq-uniphier-aidet.c
+++ b/drivers/irqchip/irq-uniphier-aidet.c
@@ -12,7 +12,6 @@
#include <linux/irqdomain.h>
#include <linux/kernel.h>
#include <linux/of.h>
-#include <linux/of_device.h>
#include <linux/of_irq.h>
#include <linux/platform_device.h>
#include <linux/spinlock.h>
diff --git a/drivers/irqchip/irq-xtensa-pic.c b/drivers/irqchip/irq-xtensa-pic.c
index ab12328be5ee..0c18d1f1e264 100644
--- a/drivers/irqchip/irq-xtensa-pic.c
+++ b/drivers/irqchip/irq-xtensa-pic.c
@@ -16,6 +16,7 @@
#include <linux/irqdomain.h>
#include <linux/irq.h>
#include <linux/irqchip.h>
+#include <linux/irqchip/xtensa-pic.h>
#include <linux/of.h>
unsigned int cached_irq_mask;
diff --git a/drivers/irqchip/irqchip.c b/drivers/irqchip/irqchip.c
index 7899607fbee8..1eeb0d0156ce 100644
--- a/drivers/irqchip/irqchip.c
+++ b/drivers/irqchip/irqchip.c
@@ -10,7 +10,7 @@
#include <linux/acpi.h>
#include <linux/init.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
#include <linux/of_irq.h>
#include <linux/irqchip.h>
#include <linux/platform_device.h>
diff --git a/drivers/irqchip/qcom-pdc.c b/drivers/irqchip/qcom-pdc.c
index d96916cf6a41..a32c0d28d038 100644
--- a/drivers/irqchip/qcom-pdc.c
+++ b/drivers/irqchip/qcom-pdc.c
@@ -14,7 +14,6 @@
#include <linux/module.h>
#include <linux/of.h>
#include <linux/of_address.h>
-#include <linux/of_device.h>
#include <linux/of_irq.h>
#include <linux/soc/qcom/irq.h>
#include <linux/spinlock.h>
diff --git a/drivers/leds/trigger/ledtrig-netdev.c b/drivers/leds/trigger/ledtrig-netdev.c
index c9bc5a91ec83..03c58e50cc44 100644
--- a/drivers/leds/trigger/ledtrig-netdev.c
+++ b/drivers/leds/trigger/ledtrig-netdev.c
@@ -406,15 +406,15 @@ static ssize_t interval_store(struct device *dev,
static DEVICE_ATTR_RW(interval);
-static ssize_t hw_control_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t offloaded_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct led_netdev_data *trigger_data = led_trigger_get_drvdata(dev);
return sprintf(buf, "%d\n", trigger_data->hw_control);
}
-static DEVICE_ATTR_RO(hw_control);
+static DEVICE_ATTR_RO(offloaded);
static struct attribute *netdev_trig_attrs[] = {
&dev_attr_device_name.attr,
@@ -427,7 +427,7 @@ static struct attribute *netdev_trig_attrs[] = {
&dev_attr_rx.attr,
&dev_attr_tx.attr,
&dev_attr_interval.attr,
- &dev_attr_hw_control.attr,
+ &dev_attr_offloaded.attr,
NULL
};
ATTRIBUTE_GROUPS(netdev_trig);
diff --git a/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc.c b/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc.c
index 9ff439a50f53..315e97a2450e 100644
--- a/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc.c
+++ b/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc.c
@@ -821,6 +821,8 @@ static int vb2ops_venc_queue_setup(struct vb2_queue *vq,
return -EINVAL;
if (*nplanes) {
+ if (*nplanes != q_data->fmt->num_planes)
+ return -EINVAL;
for (i = 0; i < *nplanes; i++)
if (sizes[i] < q_data->sizeimage[i])
return -EINVAL;
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index 35fec1bf1b3d..5867af9f592c 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -139,7 +139,7 @@ static struct inode *ibmasmfs_make_inode(struct super_block *sb, int mode)
if (ret) {
ret->i_ino = get_next_ino();
ret->i_mode = mode;
- ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret);
+ ret->i_atime = ret->i_mtime = inode_set_ctime_current(ret);
}
return ret;
}
diff --git a/drivers/misc/ibmvmc.c b/drivers/misc/ibmvmc.c
index cbaf6d35e854..2101eb12bcba 100644
--- a/drivers/misc/ibmvmc.c
+++ b/drivers/misc/ibmvmc.c
@@ -1124,7 +1124,7 @@ static ssize_t ibmvmc_write(struct file *file, const char *buffer,
goto out;
inode = file_inode(file);
- inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
dev_dbg(adapter->dev, "write: file = 0x%lx, count = 0x%lx\n",
diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index 3c95600ab2f7..c66cc05a68c4 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -273,8 +273,8 @@ static void lkdtm_HUNG_TASK(void)
schedule();
}
-volatile unsigned int huge = INT_MAX - 2;
-volatile unsigned int ignored;
+static volatile unsigned int huge = INT_MAX - 2;
+static volatile unsigned int ignored;
static void lkdtm_OVERFLOW_SIGNED(void)
{
@@ -305,7 +305,7 @@ static void lkdtm_OVERFLOW_UNSIGNED(void)
ignored = value;
}
-/* Intentionally using old-style flex array definition of 1 byte. */
+/* Intentionally using an unannotated flexible array definition. */
struct array_bounds_flex_array {
int one;
int two;
@@ -357,6 +357,46 @@ static void lkdtm_ARRAY_BOUNDS(void)
pr_expected_config(CONFIG_UBSAN_BOUNDS);
}
+struct lkdtm_annotated {
+ unsigned long flags;
+ int count;
+ int array[] __counted_by(count);
+};
+
+static volatile int fam_count = 4;
+
+static void lkdtm_FAM_BOUNDS(void)
+{
+ struct lkdtm_annotated *inst;
+
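+	/* Allocate one element past "count" so the write at inst->array[fam_count]
+	 * stays inside the allocation while still exceeding the __counted_by bound.
+	 */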
+ inst = kzalloc(struct_size(inst, array, fam_count + 1), GFP_KERNEL);
+ if (!inst) {
+ pr_err("FAIL: could not allocate test struct!\n");
+ return;
+ }
+
+ inst->count = fam_count;
+ pr_info("Array access within bounds ...\n");
+ inst->array[1] = fam_count;
+ ignored = inst->array[1];
+
+ pr_info("Array access beyond bounds ...\n");
+ inst->array[fam_count] = fam_count;
+ ignored = inst->array[fam_count];
+
+ kfree(inst);
+
+ pr_err("FAIL: survived access of invalid flexible array member index!\n");
+
+ if (!__has_attribute(__counted_by__))
+ pr_warn("This is expected since this %s was built a compiler supporting __counted_by\n",
+ lkdtm_kernel_info);
+ else if (IS_ENABLED(CONFIG_UBSAN_BOUNDS))
+ pr_expected_config(CONFIG_UBSAN_TRAP);
+ else
+ pr_expected_config(CONFIG_UBSAN_BOUNDS);
+}
+
static void lkdtm_CORRUPT_LIST_ADD(void)
{
/*
@@ -393,7 +433,7 @@ static void lkdtm_CORRUPT_LIST_ADD(void)
pr_err("Overwrite did not happen, but no BUG?!\n");
else {
pr_err("list_add() corruption not detected!\n");
- pr_expected_config(CONFIG_DEBUG_LIST);
+ pr_expected_config(CONFIG_LIST_HARDENED);
}
}
@@ -420,7 +460,7 @@ static void lkdtm_CORRUPT_LIST_DEL(void)
pr_err("Overwrite did not happen, but no BUG?!\n");
else {
pr_err("list_del() corruption not detected!\n");
- pr_expected_config(CONFIG_DEBUG_LIST);
+ pr_expected_config(CONFIG_LIST_HARDENED);
}
}
@@ -616,6 +656,7 @@ static struct crashtype crashtypes[] = {
CRASHTYPE(OVERFLOW_SIGNED),
CRASHTYPE(OVERFLOW_UNSIGNED),
CRASHTYPE(ARRAY_BOUNDS),
+ CRASHTYPE(FAM_BOUNDS),
CRASHTYPE(CORRUPT_LIST_ADD),
CRASHTYPE(CORRUPT_LIST_DEL),
CRASHTYPE(STACK_GUARD_PAGE_LEADING),
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index b9dbad3a8af8..fc5da5d7744d 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -660,10 +660,10 @@ static struct slave *rlb_arp_xmit(struct sk_buff *skb, struct bonding *bond)
return NULL;
arp = (struct arp_pkt *)skb_network_header(skb);
- /* Don't modify or load balance ARPs that do not originate locally
- * (e.g.,arrive via a bridge).
+ /* Don't modify or load balance ARPs that do not originate
+ * from the bond itself or a VLAN directly above the bond.
*/
- if (!bond_slave_has_mac_rx(bond, arp->mac_src))
+ if (!bond_slave_has_mac_rcu(bond, arp->mac_src))
return NULL;
dev = ip_dev_find(dev_net(bond->dev), arp->ip_src);
diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c
index 4068d962203d..98c669ad5141 100644
--- a/drivers/net/can/vxcan.c
+++ b/drivers/net/can/vxcan.c
@@ -192,12 +192,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev,
nla_peer = data[VXCAN_INFO_PEER];
ifmp = nla_data(nla_peer);
- err = rtnl_nla_parse_ifla(peer_tb,
- nla_data(nla_peer) +
- sizeof(struct ifinfomsg),
- nla_len(nla_peer) -
- sizeof(struct ifinfomsg),
- NULL);
+ err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
if (err < 0)
return err;
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index 38b3c6dda386..b8bb9f3b3609 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -1006,6 +1006,10 @@ mt753x_trap_frames(struct mt7530_priv *priv)
mt7530_rmw(priv, MT753X_BPC, MT753X_BPDU_PORT_FW_MASK,
MT753X_BPDU_CPU_ONLY);
+ /* Trap 802.1X PAE frames to the CPU port(s) */
+ mt7530_rmw(priv, MT753X_BPC, MT753X_PAE_PORT_FW_MASK,
+ MT753X_PAE_PORT_FW(MT753X_BPDU_CPU_ONLY));
+
/* Trap LLDP frames with :0E MAC DA to the CPU port(s) */
mt7530_rmw(priv, MT753X_RGAC2, MT753X_R0E_PORT_FW_MASK,
MT753X_R0E_PORT_FW(MT753X_BPDU_CPU_ONLY));
diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h
index 08045b035e6a..17e42d30fff4 100644
--- a/drivers/net/dsa/mt7530.h
+++ b/drivers/net/dsa/mt7530.h
@@ -66,6 +66,8 @@ enum mt753x_id {
 /* Registers for BPDU and PAE frame control */
#define MT753X_BPC 0x24
#define MT753X_BPDU_PORT_FW_MASK GENMASK(2, 0)
+#define MT753X_PAE_PORT_FW_MASK GENMASK(18, 16)
+#define MT753X_PAE_PORT_FW(x) FIELD_PREP(MT753X_PAE_PORT_FW_MASK, x)
/* Register for :03 and :0E MAC DA frame control */
#define MT753X_RGAC2 0x2c
diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index 1c113957fcf4..f16daa9b1765 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -1069,6 +1069,9 @@ static u64 vsc9959_tas_remaining_gate_len_ps(u64 gate_len_ns)
if (gate_len_ns == U64_MAX)
return U64_MAX;
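+	/* A gate shorter than the minimum can pass no frame; bail out before the subtraction below wraps the u64. */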
+ if (gate_len_ns < VSC9959_TAS_MIN_GATE_LEN_NS)
+ return 0;
+
return (gate_len_ns - VSC9959_TAS_MIN_GATE_LEN_NS) * PSEC_PER_NSEC;
}
diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c
index 10c7c232cc4e..52ee3751187a 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -1448,7 +1448,7 @@ int bgmac_phy_connect_direct(struct bgmac *bgmac)
int err;
phy_dev = fixed_phy_register(PHY_POLL, &fphy_status, NULL);
- if (!phy_dev || IS_ERR(phy_dev)) {
+ if (IS_ERR(phy_dev)) {
dev_err(bgmac->dev, "Failed to register fixed PHY device\n");
return -ENODEV;
}
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 8bcde0a6e011..e2a4e1088b7f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1508,6 +1508,8 @@ struct bnx2x {
bool cnic_loaded;
struct cnic_eth_dev *(*cnic_probe)(struct net_device *);
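+	/* Set once IRQs and NAPI have been torn down, so the unload paths free them at most once. */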
+ bool nic_stopped;
+
/* Flag that indicates that we can start looking for FCoE L2 queue
* completions in the default status block.
*/
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 6ea5521074d3..e9c1e1bb5580 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -2715,6 +2715,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
bnx2x_add_all_napi(bp);
DP(NETIF_MSG_IFUP, "napi added\n");
bnx2x_napi_enable(bp);
+ bp->nic_stopped = false;
if (IS_PF(bp)) {
/* set pf load just before approaching the MCP */
@@ -2960,6 +2961,7 @@ load_error2:
load_error1:
bnx2x_napi_disable(bp);
bnx2x_del_all_napi(bp);
+ bp->nic_stopped = true;
/* clear pf_load status, as it was already set */
if (IS_PF(bp))
@@ -3095,14 +3097,17 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
if (!CHIP_IS_E1x(bp))
bnx2x_pf_disable(bp);
- /* Disable HW interrupts, NAPI */
- bnx2x_netif_stop(bp, 1);
- /* Delete all NAPI objects */
- bnx2x_del_all_napi(bp);
- if (CNIC_LOADED(bp))
- bnx2x_del_all_napi_cnic(bp);
- /* Release IRQs */
- bnx2x_free_irq(bp);
+ if (!bp->nic_stopped) {
+ /* Disable HW interrupts, NAPI */
+ bnx2x_netif_stop(bp, 1);
+ /* Delete all NAPI objects */
+ bnx2x_del_all_napi(bp);
+ if (CNIC_LOADED(bp))
+ bnx2x_del_all_napi_cnic(bp);
+ /* Release IRQs */
+ bnx2x_free_irq(bp);
+ bp->nic_stopped = true;
+ }
/* Report UNLOAD_DONE to MCP */
bnx2x_send_unload_done(bp, false);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 1e7a6f1d4223..0d8e61c63c7c 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -9474,15 +9474,18 @@ unload_error:
}
}
- /* Disable HW interrupts, NAPI */
- bnx2x_netif_stop(bp, 1);
- /* Delete all NAPI objects */
- bnx2x_del_all_napi(bp);
- if (CNIC_LOADED(bp))
- bnx2x_del_all_napi_cnic(bp);
+ if (!bp->nic_stopped) {
+ /* Disable HW interrupts, NAPI */
+ bnx2x_netif_stop(bp, 1);
+ /* Delete all NAPI objects */
+ bnx2x_del_all_napi(bp);
+ if (CNIC_LOADED(bp))
+ bnx2x_del_all_napi_cnic(bp);
- /* Release IRQs */
- bnx2x_free_irq(bp);
+ /* Release IRQs */
+ bnx2x_free_irq(bp);
+ bp->nic_stopped = true;
+ }
/* Reset the chip, unless PCI function is offline. If we reach this
* point following a PCI error handling, it means device is really
@@ -14238,13 +14241,16 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
}
bnx2x_drain_tx_queues(bp);
bnx2x_send_unload_req(bp, UNLOAD_RECOVERY);
- bnx2x_netif_stop(bp, 1);
- bnx2x_del_all_napi(bp);
+ if (!bp->nic_stopped) {
+ bnx2x_netif_stop(bp, 1);
+ bnx2x_del_all_napi(bp);
- if (CNIC_LOADED(bp))
- bnx2x_del_all_napi_cnic(bp);
+ if (CNIC_LOADED(bp))
+ bnx2x_del_all_napi_cnic(bp);
- bnx2x_free_irq(bp);
+ bnx2x_free_irq(bp);
+ bp->nic_stopped = true;
+ }
/* Report UNLOAD_DONE to MCP */
bnx2x_send_unload_done(bp, true);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index 0657a0f5170f..8946a931e87e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -529,13 +529,16 @@ void bnx2x_vfpf_close_vf(struct bnx2x *bp)
bnx2x_vfpf_finalize(bp, &req->first_tlv);
free_irq:
- /* Disable HW interrupts, NAPI */
- bnx2x_netif_stop(bp, 0);
- /* Delete all NAPI objects */
- bnx2x_del_all_napi(bp);
-
- /* Release IRQs */
- bnx2x_free_irq(bp);
+ if (!bp->nic_stopped) {
+ /* Disable HW interrupts, NAPI */
+ bnx2x_netif_stop(bp, 0);
+ /* Delete all NAPI objects */
+ bnx2x_del_all_napi(bp);
+
+ /* Release IRQs */
+ bnx2x_free_irq(bp);
+ bp->nic_stopped = true;
+ }
}
static void bnx2x_leading_vfq_init(struct bnx2x *bp, struct bnx2x_virtf *vf,
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index 0092e46c46f8..cc3afb605b1e 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -617,7 +617,7 @@ static int bcmgenet_mii_pd_init(struct bcmgenet_priv *priv)
};
phydev = fixed_phy_register(PHY_POLL, &fphy_status, NULL);
- if (!phydev || IS_ERR(phydev)) {
+ if (IS_ERR(phydev)) {
dev_err(kdev, "failed to register fixed PHY device\n");
return -ENODEV;
}
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 5ef073a79ce9..cb2810f175cc 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -6881,7 +6881,10 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
ri->data = NULL;
- skb = build_skb(data, frag_size);
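+		/* A zero frag_size means the buffer came from kmalloc(); slab memory must go through slab_build_skb(). */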
+ if (frag_size)
+ skb = build_skb(data, frag_size);
+ else
+ skb = slab_build_skb(data);
if (!skb) {
tg3_frag_free(frag_size != 0, data);
goto drop_it_no_recycle;
diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
index c2e7037c7ba1..7750702900fa 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
@@ -1466,7 +1466,7 @@ static void make_established(struct sock *sk, u32 snd_isn, unsigned int opt)
tp->write_seq = snd_isn;
tp->snd_nxt = snd_isn;
tp->snd_una = snd_isn;
- inet_sk(sk)->inet_id = get_random_u16();
+ atomic_set(&inet_sk(sk)->inet_id, get_random_u16());
assign_rxopt(sk, opt);
if (tp->rcv_wnd > (RCV_BUFSIZ_M << 10))
diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index 113fcb3e353e..832a2ae01950 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -203,7 +203,7 @@ static inline void ibmveth_flush_buffer(void *addr, unsigned long length)
unsigned long offset;
for (offset = 0; offset < length; offset += SMP_CACHE_BYTES)
- asm("dcbfl %0,%1" :: "b" (addr), "r" (offset));
+ asm("dcbf %0,%1,1" :: "b" (addr), "r" (offset));
}
/* replenish the buffers for a pool. note that we don't need to
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 29ad1797adce..a86bfa3bba74 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2609,7 +2609,7 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi)
retval = i40e_correct_mac_vlan_filters
(vsi, &tmp_add_list, &tmp_del_list,
vlan_filters);
- else
+ else if (pf->vf)
retval = i40e_correct_vf_mac_vlan_filters
(vsi, &tmp_add_list, &tmp_del_list,
vlan_filters, pf->vf[vsi->vf_id].trusted);
@@ -2782,7 +2782,8 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi)
}
/* if the VF is not trusted do not do promisc */
- if ((vsi->type == I40E_VSI_SRIOV) && !pf->vf[vsi->vf_id].trusted) {
+ if (vsi->type == I40E_VSI_SRIOV && pf->vf &&
+ !pf->vf[vsi->vf_id].trusted) {
clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state);
goto out;
}
diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
index b678bdf96f3a..074bf9403cd1 100644
--- a/drivers/net/ethernet/intel/ice/ice_base.c
+++ b/drivers/net/ethernet/intel/ice/ice_base.c
@@ -435,7 +435,8 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring)
/* Receive Packet Data Buffer Size.
* The Packet Data Buffer Size is defined in 128 byte units.
*/
- rlan_ctx.dbuf = ring->rx_buf_len >> ICE_RLAN_CTX_DBUF_S;
+ rlan_ctx.dbuf = DIV_ROUND_UP(ring->rx_buf_len,
+ BIT_ULL(ICE_RLAN_CTX_DBUF_S));
/* use 32 byte descriptors */
rlan_ctx.dsize = 1;
diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c
index 1f66914c7a20..31314e7540f8 100644
--- a/drivers/net/ethernet/intel/ice/ice_sriov.c
+++ b/drivers/net/ethernet/intel/ice/ice_sriov.c
@@ -1131,7 +1131,7 @@ int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena)
if (!vf)
return -EINVAL;
- ret = ice_check_vf_ready_for_reset(vf);
+ ret = ice_check_vf_ready_for_cfg(vf);
if (ret)
goto out_put_vf;
@@ -1246,7 +1246,7 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
goto out_put_vf;
}
- ret = ice_check_vf_ready_for_reset(vf);
+ ret = ice_check_vf_ready_for_cfg(vf);
if (ret)
goto out_put_vf;
@@ -1300,7 +1300,7 @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted)
return -EOPNOTSUPP;
}
- ret = ice_check_vf_ready_for_reset(vf);
+ ret = ice_check_vf_ready_for_cfg(vf);
if (ret)
goto out_put_vf;
@@ -1613,7 +1613,7 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos,
if (!vf)
return -EINVAL;
- ret = ice_check_vf_ready_for_reset(vf);
+ ret = ice_check_vf_ready_for_cfg(vf);
if (ret)
goto out_put_vf;
diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.c b/drivers/net/ethernet/intel/ice/ice_vf_lib.c
index b26ce4425f45..ea3310be8354 100644
--- a/drivers/net/ethernet/intel/ice/ice_vf_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.c
@@ -186,25 +186,6 @@ int ice_check_vf_ready_for_cfg(struct ice_vf *vf)
}
/**
- * ice_check_vf_ready_for_reset - check if VF is ready to be reset
- * @vf: VF to check if it's ready to be reset
- *
- * The purpose of this function is to ensure that the VF is not in reset,
- * disabled, and is both initialized and active, thus enabling us to safely
- * initialize another reset.
- */
-int ice_check_vf_ready_for_reset(struct ice_vf *vf)
-{
- int ret;
-
- ret = ice_check_vf_ready_for_cfg(vf);
- if (!ret && !test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states))
- ret = -EAGAIN;
-
- return ret;
-}
-
-/**
* ice_trigger_vf_reset - Reset a VF on HW
* @vf: pointer to the VF structure
* @is_vflr: true if VFLR was issued, false if not
@@ -631,11 +612,17 @@ int ice_reset_vf(struct ice_vf *vf, u32 flags)
return 0;
}
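+	/* Take the cfg lock before the disabled check so a concurrent configuration cannot race the reset. */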
+ if (flags & ICE_VF_RESET_LOCK)
+ mutex_lock(&vf->cfg_lock);
+ else
+ lockdep_assert_held(&vf->cfg_lock);
+
if (ice_is_vf_disabled(vf)) {
vsi = ice_get_vf_vsi(vf);
if (!vsi) {
dev_dbg(dev, "VF is already removed\n");
- return -EINVAL;
+ err = -EINVAL;
+ goto out_unlock;
}
ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, vf->vf_id);
@@ -644,14 +631,9 @@ int ice_reset_vf(struct ice_vf *vf, u32 flags)
dev_dbg(dev, "VF is already disabled, there is no need for resetting it, telling VM, all is fine %d\n",
vf->vf_id);
- return 0;
+ goto out_unlock;
}
- if (flags & ICE_VF_RESET_LOCK)
- mutex_lock(&vf->cfg_lock);
- else
- lockdep_assert_held(&vf->cfg_lock);
-
/* Set VF disable bit state here, before triggering reset */
set_bit(ICE_VF_STATE_DIS, vf->vf_states);
ice_trigger_vf_reset(vf, flags & ICE_VF_RESET_VFLR, false);
diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.h b/drivers/net/ethernet/intel/ice/ice_vf_lib.h
index 67172fdd9bc2..48fea6fa0362 100644
--- a/drivers/net/ethernet/intel/ice/ice_vf_lib.h
+++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.h
@@ -215,7 +215,6 @@ u16 ice_get_num_vfs(struct ice_pf *pf);
struct ice_vsi *ice_get_vf_vsi(struct ice_vf *vf);
bool ice_is_vf_disabled(struct ice_vf *vf);
int ice_check_vf_ready_for_cfg(struct ice_vf *vf);
-int ice_check_vf_ready_for_reset(struct ice_vf *vf);
void ice_set_vf_state_dis(struct ice_vf *vf);
bool ice_is_any_vf_in_unicast_promisc(struct ice_pf *pf);
void
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
index efbc2968a7bf..dcf628b1fccd 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
@@ -3947,7 +3947,6 @@ error_handler:
ice_vc_notify_vf_link_state(vf);
break;
case VIRTCHNL_OP_RESET_VF:
- clear_bit(ICE_VF_STATE_ACTIVE, vf->vf_states);
ops->reset_vf(vf);
break;
case VIRTCHNL_OP_ADD_ETH_ADDR:
diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c
index 405886ee5261..319c544b9f04 100644
--- a/drivers/net/ethernet/intel/igb/igb_ptp.c
+++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
@@ -1385,18 +1385,6 @@ void igb_ptp_init(struct igb_adapter *adapter)
return;
}
- spin_lock_init(&adapter->tmreg_lock);
- INIT_WORK(&adapter->ptp_tx_work, igb_ptp_tx_work);
-
- if (adapter->ptp_flags & IGB_PTP_OVERFLOW_CHECK)
- INIT_DELAYED_WORK(&adapter->ptp_overflow_work,
- igb_ptp_overflow_check);
-
- adapter->tstamp_config.rx_filter = HWTSTAMP_FILTER_NONE;
- adapter->tstamp_config.tx_type = HWTSTAMP_TX_OFF;
-
- igb_ptp_reset(adapter);
-
adapter->ptp_clock = ptp_clock_register(&adapter->ptp_caps,
&adapter->pdev->dev);
if (IS_ERR(adapter->ptp_clock)) {
@@ -1406,6 +1394,18 @@ void igb_ptp_init(struct igb_adapter *adapter)
dev_info(&adapter->pdev->dev, "added PHC on %s\n",
adapter->netdev->name);
adapter->ptp_flags |= IGB_PTP_ENABLED;
+
+ spin_lock_init(&adapter->tmreg_lock);
+ INIT_WORK(&adapter->ptp_tx_work, igb_ptp_tx_work);
+
+ if (adapter->ptp_flags & IGB_PTP_OVERFLOW_CHECK)
+ INIT_DELAYED_WORK(&adapter->ptp_overflow_work,
+ igb_ptp_overflow_check);
+
+ adapter->tstamp_config.rx_filter = HWTSTAMP_FILTER_NONE;
+ adapter->tstamp_config.tx_type = HWTSTAMP_TX_OFF;
+
+ igb_ptp_reset(adapter);
}
}
diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h
index 44a507029946..2f780cc90883 100644
--- a/drivers/net/ethernet/intel/igc/igc_defines.h
+++ b/drivers/net/ethernet/intel/igc/igc_defines.h
@@ -546,7 +546,7 @@
#define IGC_PTM_CTRL_START_NOW BIT(29) /* Start PTM Now */
#define IGC_PTM_CTRL_EN BIT(30) /* Enable PTM */
#define IGC_PTM_CTRL_TRIG BIT(31) /* PTM Cycle trigger */
-#define IGC_PTM_CTRL_SHRT_CYC(usec) (((usec) & 0x2f) << 2)
+#define IGC_PTM_CTRL_SHRT_CYC(usec) (((usec) & 0x3f) << 2)
#define IGC_PTM_CTRL_PTM_TO(usec) (((usec) & 0xff) << 8)
#define IGC_PTM_SHORT_CYC_DEFAULT 10 /* Default Short/interrupted cycle interval */
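The one-character igc fix matters because 0x2f (0b101111) silently drops bit 4 of the short-cycle value, while 0x3f covers the full 6-bit field. A tiny standalone demonstration:

#include <stdio.h>

int main(void)
{
    unsigned int usec = 0x10;  /* 16: exactly the bit the old mask lost */

    printf("old mask 0x2f -> %#x\n", (usec & 0x2f) << 2); /* 0x0  */
    printf("new mask 0x3f -> %#x\n", (usec & 0x3f) << 2); /* 0x40 */
    return 0;
}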
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index 04b0e885f9d2..c2f68678e947 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -4270,9 +4270,10 @@ rx_frscfg:
if (link < 0)
return NIX_AF_ERR_RX_LINK_INVALID;
- nix_find_link_frs(rvu, req, pcifunc);
linkcfg:
+ nix_find_link_frs(rvu, req, pcifunc);
+
cfg = rvu_read64(rvu, blkaddr, NIX_AF_RX_LINKX_CFG(link));
cfg = (cfg & ~(0xFFFFULL << 16)) | ((u64)req->maxlen << 16);
if (req->update_minlen)
diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index 985cff910f30..3b651efcc25e 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -221,9 +221,13 @@ void mtk_wed_fe_reset(void)
for (i = 0; i < ARRAY_SIZE(hw_list); i++) {
struct mtk_wed_hw *hw = hw_list[i];
- struct mtk_wed_device *dev = hw->wed_dev;
+ struct mtk_wed_device *dev;
int err;
+ if (!hw)
+ break;
+
+ dev = hw->wed_dev;
if (!dev || !dev->wlan.reset)
continue;
@@ -244,8 +248,12 @@ void mtk_wed_fe_reset_complete(void)
for (i = 0; i < ARRAY_SIZE(hw_list); i++) {
struct mtk_wed_hw *hw = hw_list[i];
- struct mtk_wed_device *dev = hw->wed_dev;
+ struct mtk_wed_device *dev;
+
+ if (!hw)
+ break;
+ dev = hw->wed_dev;
if (!dev || !dev->wlan.reset_complete)
continue;
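The mtk_wed fix fetches the element pointer only after checking the slot, since hw_list may have unpopulated tail entries. A minimal sketch of the guarded scan (illustrative types):

#include <stdio.h>

struct hw { int id; };

void scan(struct hw *list[], int n)
{
    for (int i = 0; i < n; i++) {
        struct hw *hw = list[i];

        if (!hw)            /* unpopulated tail: stop before dereferencing */
            break;
        printf("hw %d\n", hw->id);
    }
}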
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c
index f0b2963ebac3..973de2adc943 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c
@@ -32,8 +32,8 @@ static const struct mlxsw_afk_element_info mlxsw_afk_element_infos[] = {
MLXSW_AFK_ELEMENT_INFO_U32(IP_TTL_, 0x18, 0, 8),
MLXSW_AFK_ELEMENT_INFO_U32(IP_ECN, 0x18, 9, 2),
MLXSW_AFK_ELEMENT_INFO_U32(IP_DSCP, 0x18, 11, 6),
- MLXSW_AFK_ELEMENT_INFO_U32(VIRT_ROUTER_MSB, 0x18, 17, 3),
- MLXSW_AFK_ELEMENT_INFO_U32(VIRT_ROUTER_LSB, 0x18, 20, 8),
+ MLXSW_AFK_ELEMENT_INFO_U32(VIRT_ROUTER_MSB, 0x18, 17, 4),
+ MLXSW_AFK_ELEMENT_INFO_U32(VIRT_ROUTER_LSB, 0x18, 21, 8),
MLXSW_AFK_ELEMENT_INFO_BUF(SRC_IP_96_127, 0x20, 4),
MLXSW_AFK_ELEMENT_INFO_BUF(SRC_IP_64_95, 0x24, 4),
MLXSW_AFK_ELEMENT_INFO_BUF(SRC_IP_32_63, 0x28, 4),
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index c968309657dd..51eea1f0529c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -517,11 +517,15 @@ static void mlxsw_pci_skb_cb_ts_set(struct mlxsw_pci *mlxsw_pci,
struct sk_buff *skb,
enum mlxsw_pci_cqe_v cqe_v, char *cqe)
{
+ u8 ts_type;
+
if (cqe_v != MLXSW_PCI_CQE_V2)
return;
- if (mlxsw_pci_cqe2_time_stamp_type_get(cqe) !=
- MLXSW_PCI_CQE_TIME_STAMP_TYPE_UTC)
+ ts_type = mlxsw_pci_cqe2_time_stamp_type_get(cqe);
+
+ if (ts_type != MLXSW_PCI_CQE_TIME_STAMP_TYPE_UTC &&
+ ts_type != MLXSW_PCI_CQE_TIME_STAMP_TYPE_MIRROR_UTC)
return;
mlxsw_skb_cb(skb)->cqe_ts.sec = mlxsw_pci_cqe2_time_stamp_sec_get(cqe);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index 8165bf31a99a..17160e867bef 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -97,14 +97,6 @@ MLXSW_ITEM32(reg, sspr, m, 0x00, 31, 1);
*/
MLXSW_ITEM32_LP(reg, sspr, 0x00, 16, 0x00, 12);
-/* reg_sspr_sub_port
- * Virtual port within the physical port.
- * Should be set to 0 when virtual ports are not enabled on the port.
- *
- * Access: RW
- */
-MLXSW_ITEM32(reg, sspr, sub_port, 0x00, 8, 8);
-
/* reg_sspr_system_port
* Unique identifier within the stacking domain that represents all the ports
* that are available in the system (external ports).
@@ -120,7 +112,6 @@ static inline void mlxsw_reg_sspr_pack(char *payload, u16 local_port)
MLXSW_REG_ZERO(sspr, payload);
mlxsw_reg_sspr_m_set(payload, 1);
mlxsw_reg_sspr_local_port_set(payload, local_port);
- mlxsw_reg_sspr_sub_port_set(payload, 0);
mlxsw_reg_sspr_system_port_set(payload, local_port);
}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c
index e4f4cded2b6f..b1178b7a7f51 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c
@@ -193,7 +193,7 @@ mlxsw_sp2_mr_tcam_rule_parse(struct mlxsw_sp_acl_rule *rule,
key->vrid, GENMASK(7, 0));
mlxsw_sp_acl_rulei_keymask_u32(rulei,
MLXSW_AFK_ELEMENT_VIRT_ROUTER_MSB,
- key->vrid >> 8, GENMASK(2, 0));
+ key->vrid >> 8, GENMASK(3, 0));
switch (key->proto) {
case MLXSW_SP_L3_PROTO_IPV4:
return mlxsw_sp2_mr_tcam_rule_parse4(rulei, key);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.c
index 4dea39f2b304..ae2d6f12b799 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.c
@@ -171,7 +171,7 @@ static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_2[] = {
static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_4[] = {
MLXSW_AFK_ELEMENT_INST_U32(VIRT_ROUTER_LSB, 0x04, 24, 8),
- MLXSW_AFK_ELEMENT_INST_U32(VIRT_ROUTER_MSB, 0x00, 0, 3),
+ MLXSW_AFK_ELEMENT_INST_EXT_U32(VIRT_ROUTER_MSB, 0x00, 0, 3, 0, true),
};
static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_0[] = {
@@ -321,7 +321,7 @@ static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_mac_5b[] = {
static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_4b[] = {
MLXSW_AFK_ELEMENT_INST_U32(VIRT_ROUTER_LSB, 0x04, 13, 8),
- MLXSW_AFK_ELEMENT_INST_EXT_U32(VIRT_ROUTER_MSB, 0x04, 21, 4, 0, true),
+ MLXSW_AFK_ELEMENT_INST_U32(VIRT_ROUTER_MSB, 0x04, 21, 4),
};
static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_2b[] = {
diff --git a/drivers/net/ethernet/sfc/falcon/selftest.c b/drivers/net/ethernet/sfc/falcon/selftest.c
index cf1d67b6d86d..c3dc88e6c26c 100644
--- a/drivers/net/ethernet/sfc/falcon/selftest.c
+++ b/drivers/net/ethernet/sfc/falcon/selftest.c
@@ -428,7 +428,7 @@ static int ef4_begin_loopback(struct ef4_tx_queue *tx_queue)
for (i = 0; i < state->packet_count; i++) {
/* Allocate an skb, holding an extra reference for
* transmit completion counting */
- skb = alloc_skb(EF4_LOOPBACK_PAYLOAD_LEN, GFP_KERNEL);
+ skb = alloc_skb(sizeof(state->payload), GFP_KERNEL);
if (!skb)
return -ENOMEM;
state->skbs[i] = skb;
diff --git a/drivers/net/ethernet/sfc/selftest.c b/drivers/net/ethernet/sfc/selftest.c
index 19a0b8584afb..563c1e317ce9 100644
--- a/drivers/net/ethernet/sfc/selftest.c
+++ b/drivers/net/ethernet/sfc/selftest.c
@@ -426,7 +426,7 @@ static int efx_begin_loopback(struct efx_tx_queue *tx_queue)
for (i = 0; i < state->packet_count; i++) {
/* Allocate an skb, holding an extra reference for
* transmit completion counting */
- skb = alloc_skb(EFX_LOOPBACK_PAYLOAD_LEN, GFP_KERNEL);
+ skb = alloc_skb(sizeof(state->payload), GFP_KERNEL);
if (!skb)
return -ENOMEM;
state->skbs[i] = skb;
diff --git a/drivers/net/ethernet/sfc/siena/selftest.c b/drivers/net/ethernet/sfc/siena/selftest.c
index b55fd3346972..526da43d4b61 100644
--- a/drivers/net/ethernet/sfc/siena/selftest.c
+++ b/drivers/net/ethernet/sfc/siena/selftest.c
@@ -426,7 +426,7 @@ static int efx_begin_loopback(struct efx_tx_queue *tx_queue)
for (i = 0; i < state->packet_count; i++) {
/* Allocate an skb, holding an extra reference for
* transmit completion counting */
- skb = alloc_skb(EFX_LOOPBACK_PAYLOAD_LEN, GFP_KERNEL);
+ skb = alloc_skb(sizeof(state->payload), GFP_KERNEL);
if (!skb)
return -ENOMEM;
state->skbs[i] = skb;
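All three sfc variants switch the skb allocation from a length macro to sizeof(state->payload), so the allocated size can never drift out of sync with the structure actually copied in. A standalone sketch of the idea, with a hypothetical payload layout:

#include <stdlib.h>
#include <string.h>

struct loopback_payload { char hdr[14]; char msg[50]; };
struct state { struct loopback_payload payload; };

/* allocate exactly what will be copied, derived from the object itself */
char *alloc_for(struct state *st)
{
    char *buf = malloc(sizeof(st->payload));

    if (buf)
        memcpy(buf, &st->payload, sizeof(st->payload));
    return buf;
}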
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index b15dd9a3ad54..1b55928e89b8 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -748,7 +748,8 @@ static int ipvlan_device_event(struct notifier_block *unused,
write_pnet(&port->pnet, newnet);
- ipvlan_migrate_l3s_hook(oldnet, newnet);
+ if (port->mode == IPVLAN_MODE_L3S)
+ ipvlan_migrate_l3s_hook(oldnet, newnet);
break;
}
case NETDEV_UNREGISTER:
diff --git a/drivers/net/mdio/mdio-bitbang.c b/drivers/net/mdio/mdio-bitbang.c
index b83932562be2..81b7748c10ce 100644
--- a/drivers/net/mdio/mdio-bitbang.c
+++ b/drivers/net/mdio/mdio-bitbang.c
@@ -186,7 +186,7 @@ int mdiobb_read_c45(struct mii_bus *bus, int phy, int devad, int reg)
struct mdiobb_ctrl *ctrl = bus->priv;
mdiobb_cmd_addr(ctrl, phy, devad, reg);
- mdiobb_cmd(ctrl, MDIO_C45_READ, phy, reg);
+ mdiobb_cmd(ctrl, MDIO_C45_READ, phy, devad);
return mdiobb_read_common(bus, phy);
}
@@ -222,7 +222,7 @@ int mdiobb_write_c45(struct mii_bus *bus, int phy, int devad, int reg, u16 val)
struct mdiobb_ctrl *ctrl = bus->priv;
mdiobb_cmd_addr(ctrl, phy, devad, reg);
- mdiobb_cmd(ctrl, MDIO_C45_WRITE, phy, reg);
+ mdiobb_cmd(ctrl, MDIO_C45_WRITE, phy, devad);
return mdiobb_write_common(bus, val);
}
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index bdf00b2b2c1d..a9ecfdd19624 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1184,9 +1184,11 @@ void phy_stop_machine(struct phy_device *phydev)
static void phy_process_error(struct phy_device *phydev)
{
- mutex_lock(&phydev->lock);
+ /* phydev->lock must be held for the state change to be safe */
+ if (!mutex_is_locked(&phydev->lock))
+ phydev_err(phydev, "PHY-device data unsafe context\n");
+
phydev->state = PHY_ERROR;
- mutex_unlock(&phydev->lock);
phy_trigger_machine(phydev);
}
@@ -1195,7 +1197,9 @@ static void phy_error_precise(struct phy_device *phydev,
const void *func, int err)
{
WARN(1, "%pS: returned: %d\n", func, err);
+ mutex_lock(&phydev->lock);
phy_process_error(phydev);
+ mutex_unlock(&phydev->lock);
}
/**
@@ -1204,8 +1208,7 @@ static void phy_error_precise(struct phy_device *phydev,
*
* Moves the PHY to the ERROR state in response to a read
* or write error, and tells the controller the link is down.
- * Must not be called from interrupt context, or while the
- * phydev->lock is held.
+ * Must be called with phydev->lock held.
*/
void phy_error(struct phy_device *phydev)
{
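The phy.c change inverts the locking contract: phy_error() now expects phydev->lock to be held, and phy_process_error() only complains if it is not, which avoids self-deadlock on paths that already hold the lock. A rough pthread analogue; pthreads has no mutex_is_locked(), so a trylock probe stands in:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void process_error_locked(void)
{
    /* closest userspace probe for "lock must already be held" */
    if (pthread_mutex_trylock(&lock) == 0) {
        fprintf(stderr, "called without the lock held\n");
        pthread_mutex_unlock(&lock);
    }
    /* ... state change that relies on the lock ... */
}

void report_error(void)
{
    pthread_mutex_lock(&lock);   /* caller owns the lock/unlock pairing */
    process_error_locked();
    pthread_mutex_unlock(&lock);
}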
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index e8dd47bffe43..208a9393c2df 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -258,6 +258,16 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id,
switch (id->base.extended_cc) {
case SFF8024_ECC_UNSPEC:
break;
+ case SFF8024_ECC_100G_25GAUI_C2M_AOC:
+ if (br_min <= 28000 && br_max >= 25000) {
+ /* 25GBASE-R, possibly with FEC */
+ __set_bit(PHY_INTERFACE_MODE_25GBASER, interfaces);
+ /* There is currently no link mode for 25000base
* with an unspecified range; reuse SR.
+ */
+ phylink_set(modes, 25000baseSR_Full);
+ }
+ break;
case SFF8024_ECC_100GBASE_SR4_25GBASE_SR:
phylink_set(modes, 100000baseSR4_Full);
phylink_set(modes, 25000baseSR_Full);
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 509e901da41d..ef8eacb596f7 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1861,10 +1861,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
nla_peer = data[VETH_INFO_PEER];
ifmp = nla_data(nla_peer);
- err = rtnl_nla_parse_ifla(peer_tb,
- nla_data(nla_peer) + sizeof(struct ifinfomsg),
- nla_len(nla_peer) - sizeof(struct ifinfomsg),
- NULL);
+ err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
if (err < 0)
return err;
diff --git a/drivers/net/wireless/intel/iwlwifi/Kconfig b/drivers/net/wireless/intel/iwlwifi/Kconfig
index b20409f8c13a..20971304fdef 100644
--- a/drivers/net/wireless/intel/iwlwifi/Kconfig
+++ b/drivers/net/wireless/intel/iwlwifi/Kconfig
@@ -66,6 +66,7 @@ config IWLMVM
tristate "Intel Wireless WiFi MVM Firmware support"
select WANT_DEV_COREDUMP
depends on MAC80211
+ depends on PTP_1588_CLOCK_OPTIONAL
help
This is the driver that supports the MVM firmware. The list
of the devices that use this firmware is available here:
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index e311d406b170..4999636eaa92 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -63,15 +63,14 @@ int of_reconfig_notifier_unregister(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(of_reconfig_notifier_unregister);
-#ifdef DEBUG
-const char *action_names[] = {
+static const char *action_names[] = {
+ [0] = "INVALID",
[OF_RECONFIG_ATTACH_NODE] = "ATTACH_NODE",
[OF_RECONFIG_DETACH_NODE] = "DETACH_NODE",
[OF_RECONFIG_ADD_PROPERTY] = "ADD_PROPERTY",
[OF_RECONFIG_REMOVE_PROPERTY] = "REMOVE_PROPERTY",
[OF_RECONFIG_UPDATE_PROPERTY] = "UPDATE_PROPERTY",
};
-#endif
int of_reconfig_notify(unsigned long action, struct of_reconfig_data *p)
{
@@ -620,21 +619,9 @@ static int __of_changeset_entry_apply(struct of_changeset_entry *ce)
}
ret = __of_add_property(ce->np, ce->prop);
- if (ret) {
- pr_err("changeset: add_property failed @%pOF/%s\n",
- ce->np,
- ce->prop->name);
- break;
- }
break;
case OF_RECONFIG_REMOVE_PROPERTY:
ret = __of_remove_property(ce->np, ce->prop);
- if (ret) {
- pr_err("changeset: remove_property failed @%pOF/%s\n",
- ce->np,
- ce->prop->name);
- break;
- }
break;
case OF_RECONFIG_UPDATE_PROPERTY:
@@ -648,20 +635,17 @@ static int __of_changeset_entry_apply(struct of_changeset_entry *ce)
}
ret = __of_update_property(ce->np, ce->prop, &old_prop);
- if (ret) {
- pr_err("changeset: update_property failed @%pOF/%s\n",
- ce->np,
- ce->prop->name);
- break;
- }
break;
default:
ret = -EINVAL;
}
raw_spin_unlock_irqrestore(&devtree_lock, flags);
- if (ret)
+ if (ret) {
+ pr_err("changeset: apply failed: %-15s %pOF:%s\n",
+ action_names[ce->action], ce->np, ce->prop->name);
return ret;
+ }
switch (ce->action) {
case OF_RECONFIG_ATTACH_NODE:
@@ -947,6 +931,9 @@ int of_changeset_action(struct of_changeset *ocs, unsigned long action,
if (!ce)
return -ENOMEM;
+ if (WARN_ON(action >= ARRAY_SIZE(action_names)))
+ return -EINVAL;
+
/* get a reference to the node */
ce->action = action;
ce->np = of_node_get(np);
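The of/dynamic rework replaces three per-case pr_err() calls with one table-driven message emitted after the unlock, guarded by a bounds check before action_names[] is indexed. A compact sketch of that pattern with illustrative actions:

#include <stdio.h>

enum action { ACT_INVALID, ACT_ATTACH, ACT_DETACH, ACT_MAX };

static const char *action_names[] = {
    [ACT_INVALID] = "INVALID",
    [ACT_ATTACH]  = "ATTACH_NODE",
    [ACT_DETACH]  = "DETACH_NODE",
};

static int apply(enum action act)
{
    int ret = -1;                     /* pretend the operation failed */

    if ((unsigned)act >= ACT_MAX)     /* validate before indexing the table */
        return -1;

    if (ret)                          /* one message covers every case */
        fprintf(stderr, "apply failed: %s\n", action_names[act]);
    return ret;
}

int main(void)
{
    return apply(ACT_ATTACH) ? 1 : 0;
}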
diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
index f26d2ba8a371..68278340cecf 100644
--- a/drivers/of/kexec.c
+++ b/drivers/of/kexec.c
@@ -184,7 +184,8 @@ int __init ima_free_kexec_buffer(void)
if (ret)
return ret;
- return memblock_phys_free(addr, size);
+ memblock_free_late(addr, size);
+ return 0;
}
#endif
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 0c3475e7d2ff..6a557eb866d0 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -141,7 +141,7 @@ struct platform_device *of_device_alloc(struct device_node *np,
}
/* setup generic device info */
- device_set_node(&dev->dev, of_fwnode_handle(np));
+ device_set_node(&dev->dev, of_fwnode_handle(of_node_get(np)));
dev->dev.parent = parent ? : &platform_bus;
if (bus_id)
@@ -239,7 +239,7 @@ static struct amba_device *of_amba_device_create(struct device_node *node,
dev->dev.dma_mask = &dev->dev.coherent_dma_mask;
/* setup generic device info */
- device_set_node(&dev->dev, of_fwnode_handle(node));
+ device_set_node(&dev->dev, of_fwnode_handle(of_node_get(node)));
dev->dev.parent = parent ? : &platform_bus;
dev->dev.platform_data = platform_data;
if (bus_id)
diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index a406a12eb208..b545fcb22536 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -664,12 +664,12 @@ static void __init of_unittest_parse_phandle_with_args_map(void)
memset(&args, 0, sizeof(args));
EXPECT_BEGIN(KERN_INFO,
- "OF: /testcase-data/phandle-tests/consumer-b: could not find phandle");
+ "OF: /testcase-data/phandle-tests/consumer-b: could not find phandle 12345678");
rc = of_parse_phandle_with_args_map(np, "phandle-list-bad-phandle",
"phandle", 0, &args);
EXPECT_END(KERN_INFO,
- "OF: /testcase-data/phandle-tests/consumer-b: could not find phandle");
+ "OF: /testcase-data/phandle-tests/consumer-b: could not find phandle 12345678");
unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc);
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index f4572a5cca72..273d67ecf6d2 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -92,7 +92,7 @@ config ARM_PMU_ACPI
config ARM_SMMU_V3_PMU
tristate "ARM SMMUv3 Performance Monitors Extension"
- depends on (ARM64 && ACPI) || (COMPILE_TEST && 64BIT)
+ depends on ARM64 || (COMPILE_TEST && 64BIT)
depends on GENERIC_MSI_IRQ
help
Provides support for the ARM SMMUv3 Performance Monitor Counter
diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c
index 5c5be9fc1b15..19d459a36be5 100644
--- a/drivers/perf/alibaba_uncore_drw_pmu.c
+++ b/drivers/perf/alibaba_uncore_drw_pmu.c
@@ -236,10 +236,37 @@ static const struct attribute_group ali_drw_pmu_cpumask_attr_group = {
.attrs = ali_drw_pmu_cpumask_attrs,
};
+static ssize_t ali_drw_pmu_identifier_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%s\n", "ali_drw_pmu");
+}
+
+static umode_t ali_drw_pmu_identifier_attr_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ return attr->mode;
+}
+
+static struct device_attribute ali_drw_pmu_identifier_attr =
+ __ATTR(identifier, 0444, ali_drw_pmu_identifier_show, NULL);
+
+static struct attribute *ali_drw_pmu_identifier_attrs[] = {
+ &ali_drw_pmu_identifier_attr.attr,
+ NULL
+};
+
+static const struct attribute_group ali_drw_pmu_identifier_attr_group = {
+ .attrs = ali_drw_pmu_identifier_attrs,
+ .is_visible = ali_drw_pmu_identifier_attr_visible
+};
+
static const struct attribute_group *ali_drw_pmu_attr_groups[] = {
&ali_drw_pmu_events_attr_group,
&ali_drw_pmu_cpumask_attr_group,
&ali_drw_pmu_format_group,
+ &ali_drw_pmu_identifier_attr_group,
NULL,
};
diff --git a/drivers/perf/amlogic/meson_ddr_pmu_core.c b/drivers/perf/amlogic/meson_ddr_pmu_core.c
index 0b24dee1ed3c..bbc7285fd934 100644
--- a/drivers/perf/amlogic/meson_ddr_pmu_core.c
+++ b/drivers/perf/amlogic/meson_ddr_pmu_core.c
@@ -9,8 +9,6 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/of.h>
-#include <linux/of_device.h>
-#include <linux/of_irq.h>
#include <linux/perf_event.h>
#include <linux/platform_device.h>
#include <linux/printk.h>
diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
index 998259f1d973..61de861eaf91 100644
--- a/drivers/perf/arm-cci.c
+++ b/drivers/perf/arm-cci.c
@@ -7,10 +7,7 @@
#include <linux/io.h>
#include <linux/interrupt.h>
#include <linux/module.h>
-#include <linux/of_address.h>
-#include <linux/of_device.h>
-#include <linux/of_irq.h>
-#include <linux/of_platform.h>
+#include <linux/of.h>
#include <linux/perf_event.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index b8c15878bc86..913dc04b3a40 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -72,6 +72,8 @@
/* For most nodes, this is all there is */
#define CMN_PMU_EVENT_SEL 0x000
#define CMN__PMU_CBUSY_SNTHROTTLE_SEL GENMASK_ULL(44, 42)
+#define CMN__PMU_SN_HOME_SEL GENMASK_ULL(40, 39)
+#define CMN__PMU_HBT_LBT_SEL GENMASK_ULL(38, 37)
#define CMN__PMU_CLASS_OCCUP_ID GENMASK_ULL(36, 35)
/* Technically this is 4 bits wide on DNs, but we only use 2 there anyway */
#define CMN__PMU_OCCUP1_ID GENMASK_ULL(34, 32)
@@ -226,6 +228,7 @@ enum cmn_revision {
REV_CMN700_R0P0 = 0,
REV_CMN700_R1P0,
REV_CMN700_R2P0,
+ REV_CMN700_R3P0,
REV_CI700_R0P0 = 0,
REV_CI700_R1P0,
REV_CI700_R2P0,
@@ -254,6 +257,9 @@ enum cmn_node_type {
CMN_TYPE_CCHA,
CMN_TYPE_CCLA,
CMN_TYPE_CCLA_RNI,
+ CMN_TYPE_HNS = 0x200,
+ CMN_TYPE_HNS_MPAM_S,
+ CMN_TYPE_HNS_MPAM_NS,
/* Not a real node type */
CMN_TYPE_WP = 0x7770
};
@@ -263,6 +269,8 @@ enum cmn_filter_select {
SEL_OCCUP1ID,
SEL_CLASS_OCCUP_ID,
SEL_CBUSY_SNTHROTTLE_SEL,
+ SEL_HBT_LBT_SEL,
+ SEL_SN_HOME_SEL,
SEL_MAX
};
@@ -742,8 +750,8 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj,
_CMN_EVENT_ATTR(_model, dn_##_name, CMN_TYPE_DVM, _event, _occup, _fsel)
#define CMN_EVENT_DTC(_name) \
CMN_EVENT_ATTR(CMN_ANY, dtc_##_name, CMN_TYPE_DTC, 0)
-#define _CMN_EVENT_HNF(_model, _name, _event, _occup, _fsel) \
- _CMN_EVENT_ATTR(_model, hnf_##_name, CMN_TYPE_HNF, _event, _occup, _fsel)
+#define CMN_EVENT_HNF(_model, _name, _event) \
+ CMN_EVENT_ATTR(_model, hnf_##_name, CMN_TYPE_HNF, _event)
#define CMN_EVENT_HNI(_name, _event) \
CMN_EVENT_ATTR(CMN_ANY, hni_##_name, CMN_TYPE_HNI, _event)
#define CMN_EVENT_HNP(_name, _event) \
@@ -768,6 +776,8 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj,
CMN_EVENT_ATTR(CMN_ANY, ccla_##_name, CMN_TYPE_CCLA, _event)
#define CMN_EVENT_CCLA_RNI(_name, _event) \
CMN_EVENT_ATTR(CMN_ANY, ccla_rni_##_name, CMN_TYPE_CCLA_RNI, _event)
+#define CMN_EVENT_HNS(_name, _event) \
+ CMN_EVENT_ATTR(CMN_ANY, hns_##_name, CMN_TYPE_HNS, _event)
#define CMN_EVENT_DVM(_model, _name, _event) \
_CMN_EVENT_DVM(_model, _name, _event, 0, SEL_NONE)
@@ -775,32 +785,68 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj,
_CMN_EVENT_DVM(_model, _name##_all, _event, 0, SEL_OCCUP1ID), \
_CMN_EVENT_DVM(_model, _name##_dvmop, _event, 1, SEL_OCCUP1ID), \
_CMN_EVENT_DVM(_model, _name##_dvmsync, _event, 2, SEL_OCCUP1ID)
-#define CMN_EVENT_HNF(_model, _name, _event) \
- _CMN_EVENT_HNF(_model, _name, _event, 0, SEL_NONE)
+
+#define CMN_EVENT_HN_OCC(_model, _name, _type, _event) \
+ _CMN_EVENT_ATTR(_model, _name##_all, _type, _event, 0, SEL_OCCUP1ID), \
+ _CMN_EVENT_ATTR(_model, _name##_read, _type, _event, 1, SEL_OCCUP1ID), \
+ _CMN_EVENT_ATTR(_model, _name##_write, _type, _event, 2, SEL_OCCUP1ID), \
+ _CMN_EVENT_ATTR(_model, _name##_atomic, _type, _event, 3, SEL_OCCUP1ID), \
+ _CMN_EVENT_ATTR(_model, _name##_stash, _type, _event, 4, SEL_OCCUP1ID)
+#define CMN_EVENT_HN_CLS(_model, _name, _type, _event) \
+ _CMN_EVENT_ATTR(_model, _name##_class0, _type, _event, 0, SEL_CLASS_OCCUP_ID), \
+ _CMN_EVENT_ATTR(_model, _name##_class1, _type, _event, 1, SEL_CLASS_OCCUP_ID), \
+ _CMN_EVENT_ATTR(_model, _name##_class2, _type, _event, 2, SEL_CLASS_OCCUP_ID), \
+ _CMN_EVENT_ATTR(_model, _name##_class3, _type, _event, 3, SEL_CLASS_OCCUP_ID)
+#define CMN_EVENT_HN_SNT(_model, _name, _type, _event) \
+ _CMN_EVENT_ATTR(_model, _name##_all, _type, _event, 0, SEL_CBUSY_SNTHROTTLE_SEL), \
+ _CMN_EVENT_ATTR(_model, _name##_group0_read, _type, _event, 1, SEL_CBUSY_SNTHROTTLE_SEL), \
+ _CMN_EVENT_ATTR(_model, _name##_group0_write, _type, _event, 2, SEL_CBUSY_SNTHROTTLE_SEL), \
+ _CMN_EVENT_ATTR(_model, _name##_group1_read, _type, _event, 3, SEL_CBUSY_SNTHROTTLE_SEL), \
+ _CMN_EVENT_ATTR(_model, _name##_group1_write, _type, _event, 4, SEL_CBUSY_SNTHROTTLE_SEL), \
+ _CMN_EVENT_ATTR(_model, _name##_read, _type, _event, 5, SEL_CBUSY_SNTHROTTLE_SEL), \
+ _CMN_EVENT_ATTR(_model, _name##_write, _type, _event, 6, SEL_CBUSY_SNTHROTTLE_SEL)
+
+#define CMN_EVENT_HNF_OCC(_model, _name, _event) \
+ CMN_EVENT_HN_OCC(_model, hnf_##_name, CMN_TYPE_HNF, _event)
#define CMN_EVENT_HNF_CLS(_model, _name, _event) \
- _CMN_EVENT_HNF(_model, _name##_class0, _event, 0, SEL_CLASS_OCCUP_ID), \
- _CMN_EVENT_HNF(_model, _name##_class1, _event, 1, SEL_CLASS_OCCUP_ID), \
- _CMN_EVENT_HNF(_model, _name##_class2, _event, 2, SEL_CLASS_OCCUP_ID), \
- _CMN_EVENT_HNF(_model, _name##_class3, _event, 3, SEL_CLASS_OCCUP_ID)
+ CMN_EVENT_HN_CLS(_model, hnf_##_name, CMN_TYPE_HNF, _event)
#define CMN_EVENT_HNF_SNT(_model, _name, _event) \
- _CMN_EVENT_HNF(_model, _name##_all, _event, 0, SEL_CBUSY_SNTHROTTLE_SEL), \
- _CMN_EVENT_HNF(_model, _name##_group0_read, _event, 1, SEL_CBUSY_SNTHROTTLE_SEL), \
- _CMN_EVENT_HNF(_model, _name##_group0_write, _event, 2, SEL_CBUSY_SNTHROTTLE_SEL), \
- _CMN_EVENT_HNF(_model, _name##_group1_read, _event, 3, SEL_CBUSY_SNTHROTTLE_SEL), \
- _CMN_EVENT_HNF(_model, _name##_group1_write, _event, 4, SEL_CBUSY_SNTHROTTLE_SEL), \
- _CMN_EVENT_HNF(_model, _name##_read, _event, 5, SEL_CBUSY_SNTHROTTLE_SEL), \
- _CMN_EVENT_HNF(_model, _name##_write, _event, 6, SEL_CBUSY_SNTHROTTLE_SEL)
-
-#define _CMN_EVENT_XP(_name, _event) \
+ CMN_EVENT_HN_SNT(_model, hnf_##_name, CMN_TYPE_HNF, _event)
+
+#define CMN_EVENT_HNS_OCC(_name, _event) \
+ CMN_EVENT_HN_OCC(CMN_ANY, hns_##_name, CMN_TYPE_HNS, _event), \
+ _CMN_EVENT_ATTR(CMN_ANY, hns_##_name##_rxsnp, CMN_TYPE_HNS, _event, 5, SEL_OCCUP1ID), \
+ _CMN_EVENT_ATTR(CMN_ANY, hns_##_name##_lbt, CMN_TYPE_HNS, _event, 6, SEL_OCCUP1ID), \
+ _CMN_EVENT_ATTR(CMN_ANY, hns_##_name##_hbt, CMN_TYPE_HNS, _event, 7, SEL_OCCUP1ID)
+#define CMN_EVENT_HNS_CLS(_name, _event) \
+ CMN_EVENT_HN_CLS(CMN_ANY, hns_##_name, CMN_TYPE_HNS, _event)
+#define CMN_EVENT_HNS_SNT(_name, _event) \
+ CMN_EVENT_HN_SNT(CMN_ANY, hns_##_name, CMN_TYPE_HNS, _event)
+#define CMN_EVENT_HNS_HBT(_name, _event) \
+ _CMN_EVENT_ATTR(CMN_ANY, hns_##_name##_all, CMN_TYPE_HNS, _event, 0, SEL_HBT_LBT_SEL), \
+ _CMN_EVENT_ATTR(CMN_ANY, hns_##_name##_hbt, CMN_TYPE_HNS, _event, 1, SEL_HBT_LBT_SEL), \
+ _CMN_EVENT_ATTR(CMN_ANY, hns_##_name##_lbt, CMN_TYPE_HNS, _event, 2, SEL_HBT_LBT_SEL)
+#define CMN_EVENT_HNS_SNH(_name, _event) \
+ _CMN_EVENT_ATTR(CMN_ANY, hns_##_name##_all, CMN_TYPE_HNS, _event, 0, SEL_SN_HOME_SEL), \
+ _CMN_EVENT_ATTR(CMN_ANY, hns_##_name##_sn, CMN_TYPE_HNS, _event, 1, SEL_SN_HOME_SEL), \
+ _CMN_EVENT_ATTR(CMN_ANY, hns_##_name##_home, CMN_TYPE_HNS, _event, 2, SEL_SN_HOME_SEL)
+
+#define _CMN_EVENT_XP_MESH(_name, _event) \
__CMN_EVENT_XP(e_##_name, (_event) | (0 << 2)), \
__CMN_EVENT_XP(w_##_name, (_event) | (1 << 2)), \
__CMN_EVENT_XP(n_##_name, (_event) | (2 << 2)), \
- __CMN_EVENT_XP(s_##_name, (_event) | (3 << 2)), \
+ __CMN_EVENT_XP(s_##_name, (_event) | (3 << 2))
+
+#define _CMN_EVENT_XP_PORT(_name, _event) \
__CMN_EVENT_XP(p0_##_name, (_event) | (4 << 2)), \
__CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2)), \
__CMN_EVENT_XP(p2_##_name, (_event) | (6 << 2)), \
__CMN_EVENT_XP(p3_##_name, (_event) | (7 << 2))
+#define _CMN_EVENT_XP(_name, _event) \
+ _CMN_EVENT_XP_MESH(_name, _event), \
+ _CMN_EVENT_XP_PORT(_name, _event)
+
/* Good thing there are only 3 fundamental XP events... */
#define CMN_EVENT_XP(_name, _event) \
_CMN_EVENT_XP(req_##_name, (_event) | (0 << 5)), \
@@ -813,6 +859,10 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj,
_CMN_EVENT_XP(snp2_##_name, (_event) | (7 << 5)), \
_CMN_EVENT_XP(req2_##_name, (_event) | (8 << 5))
+#define CMN_EVENT_XP_DAT(_name, _event) \
+ _CMN_EVENT_XP_PORT(dat_##_name, (_event) | (3 << 5)), \
+ _CMN_EVENT_XP_PORT(dat2_##_name, (_event) | (6 << 5))
+
static struct attribute *arm_cmn_event_attrs[] = {
CMN_EVENT_DTC(cycles),
@@ -862,11 +912,7 @@ static struct attribute *arm_cmn_event_attrs[] = {
CMN_EVENT_HNF(CMN_ANY, mc_retries, 0x0c),
CMN_EVENT_HNF(CMN_ANY, mc_reqs, 0x0d),
CMN_EVENT_HNF(CMN_ANY, qos_hh_retry, 0x0e),
- _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_all, 0x0f, 0, SEL_OCCUP1ID),
- _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_read, 0x0f, 1, SEL_OCCUP1ID),
- _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_write, 0x0f, 2, SEL_OCCUP1ID),
- _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_atomic, 0x0f, 3, SEL_OCCUP1ID),
- _CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_stash, 0x0f, 4, SEL_OCCUP1ID),
+ CMN_EVENT_HNF_OCC(CMN_ANY, qos_pocq_occupancy, 0x0f),
CMN_EVENT_HNF(CMN_ANY, pocq_addrhaz, 0x10),
CMN_EVENT_HNF(CMN_ANY, pocq_atomic_addrhaz, 0x11),
CMN_EVENT_HNF(CMN_ANY, ld_st_swp_adq_full, 0x12),
@@ -943,7 +989,7 @@ static struct attribute *arm_cmn_event_attrs[] = {
CMN_EVENT_XP(txflit_valid, 0x01),
CMN_EVENT_XP(txflit_stall, 0x02),
- CMN_EVENT_XP(partial_dat_flit, 0x03),
+ CMN_EVENT_XP_DAT(partial_dat_flit, 0x03),
/* We treat watchpoints as a special made-up class of XP events */
CMN_EVENT_ATTR(CMN_ANY, watchpoint_up, CMN_TYPE_WP, CMN_WP_UP),
CMN_EVENT_ATTR(CMN_ANY, watchpoint_down, CMN_TYPE_WP, CMN_WP_DOWN),
@@ -1132,6 +1178,66 @@ static struct attribute *arm_cmn_event_attrs[] = {
CMN_EVENT_CCLA(pfwd_sndr_stalls_static_crd, 0x2a),
CMN_EVENT_CCLA(pfwd_sndr_stalls_dynmaic_crd, 0x2b),
+ CMN_EVENT_HNS_HBT(cache_miss, 0x01),
+ CMN_EVENT_HNS_HBT(slc_sf_cache_access, 0x02),
+ CMN_EVENT_HNS_HBT(cache_fill, 0x03),
+ CMN_EVENT_HNS_HBT(pocq_retry, 0x04),
+ CMN_EVENT_HNS_HBT(pocq_reqs_recvd, 0x05),
+ CMN_EVENT_HNS_HBT(sf_hit, 0x06),
+ CMN_EVENT_HNS_HBT(sf_evictions, 0x07),
+ CMN_EVENT_HNS(dir_snoops_sent, 0x08),
+ CMN_EVENT_HNS(brd_snoops_sent, 0x09),
+ CMN_EVENT_HNS_HBT(slc_eviction, 0x0a),
+ CMN_EVENT_HNS_HBT(slc_fill_invalid_way, 0x0b),
+ CMN_EVENT_HNS(mc_retries_local, 0x0c),
+ CMN_EVENT_HNS_SNH(mc_reqs_local, 0x0d),
+ CMN_EVENT_HNS(qos_hh_retry, 0x0e),
+ CMN_EVENT_HNS_OCC(qos_pocq_occupancy, 0x0f),
+ CMN_EVENT_HNS(pocq_addrhaz, 0x10),
+ CMN_EVENT_HNS(pocq_atomic_addrhaz, 0x11),
+ CMN_EVENT_HNS(ld_st_swp_adq_full, 0x12),
+ CMN_EVENT_HNS(cmp_adq_full, 0x13),
+ CMN_EVENT_HNS(txdat_stall, 0x14),
+ CMN_EVENT_HNS(txrsp_stall, 0x15),
+ CMN_EVENT_HNS(seq_full, 0x16),
+ CMN_EVENT_HNS(seq_hit, 0x17),
+ CMN_EVENT_HNS(snp_sent, 0x18),
+ CMN_EVENT_HNS(sfbi_dir_snp_sent, 0x19),
+ CMN_EVENT_HNS(sfbi_brd_snp_sent, 0x1a),
+ CMN_EVENT_HNS(intv_dirty, 0x1c),
+ CMN_EVENT_HNS(stash_snp_sent, 0x1d),
+ CMN_EVENT_HNS(stash_data_pull, 0x1e),
+ CMN_EVENT_HNS(snp_fwded, 0x1f),
+ CMN_EVENT_HNS(atomic_fwd, 0x20),
+ CMN_EVENT_HNS(mpam_hardlim, 0x21),
+ CMN_EVENT_HNS(mpam_softlim, 0x22),
+ CMN_EVENT_HNS(snp_sent_cluster, 0x23),
+ CMN_EVENT_HNS(sf_imprecise_evict, 0x24),
+ CMN_EVENT_HNS(sf_evict_shared_line, 0x25),
+ CMN_EVENT_HNS_CLS(pocq_class_occup, 0x26),
+ CMN_EVENT_HNS_CLS(pocq_class_retry, 0x27),
+ CMN_EVENT_HNS_CLS(class_mc_reqs_local, 0x28),
+ CMN_EVENT_HNS_CLS(class_cgnt_cmin, 0x29),
+ CMN_EVENT_HNS_SNT(sn_throttle, 0x2a),
+ CMN_EVENT_HNS_SNT(sn_throttle_min, 0x2b),
+ CMN_EVENT_HNS(sf_precise_to_imprecise, 0x2c),
+ CMN_EVENT_HNS(snp_intv_cln, 0x2d),
+ CMN_EVENT_HNS(nc_excl, 0x2e),
+ CMN_EVENT_HNS(excl_mon_ovfl, 0x2f),
+ CMN_EVENT_HNS(snp_req_recvd, 0x30),
+ CMN_EVENT_HNS(snp_req_byp_pocq, 0x31),
+ CMN_EVENT_HNS(dir_ccgha_snp_sent, 0x32),
+ CMN_EVENT_HNS(brd_ccgha_snp_sent, 0x33),
+ CMN_EVENT_HNS(ccgha_snp_stall, 0x34),
+ CMN_EVENT_HNS(lbt_req_hardlim, 0x35),
+ CMN_EVENT_HNS(hbt_req_hardlim, 0x36),
+ CMN_EVENT_HNS(sf_reupdate, 0x37),
+ CMN_EVENT_HNS(excl_sf_imprecise, 0x38),
+ CMN_EVENT_HNS(snp_pocq_addrhaz, 0x39),
+ CMN_EVENT_HNS(mc_retries_remote, 0x3a),
+ CMN_EVENT_HNS_SNH(mc_reqs_remote, 0x3b),
+ CMN_EVENT_HNS_CLS(class_mc_reqs_remote, 0x3c),
+
NULL
};
@@ -1373,6 +1479,10 @@ static int arm_cmn_set_event_sel_hi(struct arm_cmn_node *dn,
dn->occupid[fsel].val = occupid;
reg = FIELD_PREP(CMN__PMU_CBUSY_SNTHROTTLE_SEL,
dn->occupid[SEL_CBUSY_SNTHROTTLE_SEL].val) |
+ FIELD_PREP(CMN__PMU_SN_HOME_SEL,
+ dn->occupid[SEL_SN_HOME_SEL].val) |
+ FIELD_PREP(CMN__PMU_HBT_LBT_SEL,
+ dn->occupid[SEL_HBT_LBT_SEL].val) |
FIELD_PREP(CMN__PMU_CLASS_OCCUP_ID,
dn->occupid[SEL_CLASS_OCCUP_ID].val) |
FIELD_PREP(CMN__PMU_OCCUP1_ID,
@@ -2200,6 +2310,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
case CMN_TYPE_CCRA:
case CMN_TYPE_CCHA:
case CMN_TYPE_CCLA:
+ case CMN_TYPE_HNS:
dn++;
break;
/* Nothing to see here */
@@ -2207,6 +2318,8 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
case CMN_TYPE_MPAM_NS:
case CMN_TYPE_RNSAM:
case CMN_TYPE_CXLA:
+ case CMN_TYPE_HNS_MPAM_S:
+ case CMN_TYPE_HNS_MPAM_NS:
break;
/*
* Split "optimised" combination nodes into separate
diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
index 9d0f01c4455a..30cea6859574 100644
--- a/drivers/perf/arm_dmc620_pmu.c
+++ b/drivers/perf/arm_dmc620_pmu.c
@@ -66,8 +66,13 @@
#define DMC620_PMU_COUNTERn_OFFSET(n) \
(DMC620_PMU_COUNTERS_BASE + 0x28 * (n))
-static LIST_HEAD(dmc620_pmu_irqs);
+/*
+ * dmc620_pmu_irqs_lock: protects dmc620_pmu_irqs list
+ * dmc620_pmu_node_lock: protects pmus_node lists in all dmc620_pmu instances
+ */
static DEFINE_MUTEX(dmc620_pmu_irqs_lock);
+static DEFINE_MUTEX(dmc620_pmu_node_lock);
+static LIST_HEAD(dmc620_pmu_irqs);
struct dmc620_pmu_irq {
struct hlist_node node;
@@ -475,9 +480,9 @@ static int dmc620_pmu_get_irq(struct dmc620_pmu *dmc620_pmu, int irq_num)
return PTR_ERR(irq);
dmc620_pmu->irq = irq;
- mutex_lock(&dmc620_pmu_irqs_lock);
+ mutex_lock(&dmc620_pmu_node_lock);
list_add_rcu(&dmc620_pmu->pmus_node, &irq->pmus_node);
- mutex_unlock(&dmc620_pmu_irqs_lock);
+ mutex_unlock(&dmc620_pmu_node_lock);
return 0;
}
@@ -486,9 +491,11 @@ static void dmc620_pmu_put_irq(struct dmc620_pmu *dmc620_pmu)
{
struct dmc620_pmu_irq *irq = dmc620_pmu->irq;
- mutex_lock(&dmc620_pmu_irqs_lock);
+ mutex_lock(&dmc620_pmu_node_lock);
list_del_rcu(&dmc620_pmu->pmus_node);
+ mutex_unlock(&dmc620_pmu_node_lock);
+ mutex_lock(&dmc620_pmu_irqs_lock);
if (!refcount_dec_and_test(&irq->refcount)) {
mutex_unlock(&dmc620_pmu_irqs_lock);
return;
@@ -638,10 +645,10 @@ static int dmc620_pmu_cpu_teardown(unsigned int cpu,
return 0;
/* We're only reading, but this isn't the place to be involving RCU */
- mutex_lock(&dmc620_pmu_irqs_lock);
+ mutex_lock(&dmc620_pmu_node_lock);
list_for_each_entry(dmc620_pmu, &irq->pmus_node, pmus_node)
perf_pmu_migrate_context(&dmc620_pmu->pmu, irq->cpu, target);
- mutex_unlock(&dmc620_pmu_irqs_lock);
+ mutex_unlock(&dmc620_pmu_node_lock);
WARN_ON(irq_set_affinity(irq->irq_num, cpumask_of(target)));
irq->cpu = target;
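The dmc620 change splits the single dmc620_pmu_irqs_lock into two mutexes, one per protected list, so the put path takes them strictly in sequence rather than nested. A pthread sketch of the resulting discipline (names illustrative):

#include <pthread.h>

static pthread_mutex_t irqs_lock = PTHREAD_MUTEX_INITIALIZER; /* irq list */
static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER; /* pmu nodes */

void put_irq(void)
{
    pthread_mutex_lock(&node_lock);
    /* list_del_rcu(&pmu->pmus_node); */
    pthread_mutex_unlock(&node_lock);  /* drop before touching the irq list */

    pthread_mutex_lock(&irqs_lock);
    /* refcount drop and possible irq teardown */
    pthread_mutex_unlock(&irqs_lock);
}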
diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c
index fe2abb412c00..8223c49bd082 100644
--- a/drivers/perf/arm_dsu_pmu.c
+++ b/drivers/perf/arm_dsu_pmu.c
@@ -20,7 +20,7 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
#include <linux/perf_event.h>
#include <linux/platform_device.h>
#include <linux/spinlock.h>
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index f6ccb2cd4dfc..d712a19e47ac 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -877,11 +877,13 @@ struct arm_pmu *armpmu_alloc(void)
.attr_groups = pmu->attr_groups,
/*
* This is a CPU PMU potentially in a heterogeneous
- * configuration (e.g. big.LITTLE). This is not an uncore PMU,
- * and we have taken ctx sharing into account (e.g. with our
- * pmu::filter callback and pmu::event_init group validation).
+ * configuration (e.g. big.LITTLE) so
+ * PERF_PMU_CAP_EXTENDED_HW_TYPE is required to open
+ * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE events on a
+ * specific PMU.
*/
- .capabilities = PERF_PMU_CAP_HETEROGENEOUS_CPUS | PERF_PMU_CAP_EXTENDED_REGS,
+ .capabilities = PERF_PMU_CAP_EXTENDED_REGS |
+ PERF_PMU_CAP_EXTENDED_HW_TYPE,
};
pmu->attr_groups[ARMPMU_ATTR_GROUP_COMMON] =
diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c
index 90815ad762eb..05dda19c5359 100644
--- a/drivers/perf/arm_pmu_acpi.c
+++ b/drivers/perf/arm_pmu_acpi.c
@@ -69,6 +69,62 @@ static void arm_pmu_acpi_unregister_irq(int cpu)
acpi_unregister_gsi(gsi);
}
+static int __maybe_unused
+arm_acpi_register_pmu_device(struct platform_device *pdev, u8 len,
+ u16 (*parse_gsi)(struct acpi_madt_generic_interrupt *))
+{
+ int cpu, this_hetid, hetid, irq, ret;
+ u16 this_gsi = 0, gsi = 0;
+
+ /*
+ * Ensure that platform device must have IORESOURCE_IRQ
+ * resource to hold gsi interrupt.
+ */
+ if (pdev->num_resources != 1)
+ return -ENXIO;
+
+ if (pdev->resource[0].flags != IORESOURCE_IRQ)
+ return -ENXIO;
+
+ /*
+ * Sanity check all the GICC tables for the same interrupt
+ * number. For now, only support homogeneous ACPI machines.
+ */
+ for_each_possible_cpu(cpu) {
+ struct acpi_madt_generic_interrupt *gicc;
+
+ gicc = acpi_cpu_get_madt_gicc(cpu);
+ if (gicc->header.length < len)
+ return gsi ? -ENXIO : 0;
+
+ this_gsi = parse_gsi(gicc);
+ this_hetid = find_acpi_cpu_topology_hetero_id(cpu);
+ if (!gsi) {
+ hetid = this_hetid;
+ gsi = this_gsi;
+ } else if (hetid != this_hetid || gsi != this_gsi) {
+ pr_warn("ACPI: %s: must be homogeneous\n", pdev->name);
+ return -ENXIO;
+ }
+ }
+
+ if (!this_gsi)
+ return 0;
+
+ irq = acpi_register_gsi(NULL, gsi, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_HIGH);
+ if (irq < 0) {
+ pr_warn("ACPI: %s Unable to register interrupt: %d\n", pdev->name, gsi);
+ return -ENXIO;
+ }
+
+ pdev->resource[0].start = irq;
+ ret = platform_device_register(pdev);
+ if (ret)
+ acpi_unregister_gsi(gsi);
+
+ return ret;
+}
+
#if IS_ENABLED(CONFIG_ARM_SPE_PMU)
static struct resource spe_resources[] = {
{
@@ -84,6 +140,11 @@ static struct platform_device spe_dev = {
.num_resources = ARRAY_SIZE(spe_resources)
};
+static u16 arm_spe_parse_gsi(struct acpi_madt_generic_interrupt *gicc)
+{
+ return gicc->spe_interrupt;
+}
+
/*
* For lack of a better place, hook the normal PMU MADT walk
* and create a SPE device if we detect a recent MADT with
@@ -91,53 +152,50 @@ static struct platform_device spe_dev = {
*/
static void arm_spe_acpi_register_device(void)
{
- int cpu, hetid, irq, ret;
- bool first = true;
- u16 gsi = 0;
-
- /*
- * Sanity check all the GICC tables for the same interrupt number.
- * For now, we only support homogeneous ACPI/SPE machines.
- */
- for_each_possible_cpu(cpu) {
- struct acpi_madt_generic_interrupt *gicc;
+ int ret = arm_acpi_register_pmu_device(&spe_dev, ACPI_MADT_GICC_SPE,
+ arm_spe_parse_gsi);
+ if (ret)
+ pr_warn("ACPI: SPE: Unable to register device\n");
+}
+#else
+static inline void arm_spe_acpi_register_device(void)
+{
+}
+#endif /* CONFIG_ARM_SPE_PMU */
- gicc = acpi_cpu_get_madt_gicc(cpu);
- if (gicc->header.length < ACPI_MADT_GICC_SPE)
- return;
-
- if (first) {
- gsi = gicc->spe_interrupt;
- if (!gsi)
- return;
- hetid = find_acpi_cpu_topology_hetero_id(cpu);
- first = false;
- } else if ((gsi != gicc->spe_interrupt) ||
- (hetid != find_acpi_cpu_topology_hetero_id(cpu))) {
- pr_warn("ACPI: SPE must be homogeneous\n");
- return;
- }
+#if IS_ENABLED(CONFIG_CORESIGHT_TRBE)
+static struct resource trbe_resources[] = {
+ {
+ /* irq */
+ .flags = IORESOURCE_IRQ,
}
+};
- irq = acpi_register_gsi(NULL, gsi, ACPI_LEVEL_SENSITIVE,
- ACPI_ACTIVE_HIGH);
- if (irq < 0) {
- pr_warn("ACPI: SPE Unable to register interrupt: %d\n", gsi);
- return;
- }
+static struct platform_device trbe_dev = {
+ .name = ARMV8_TRBE_PDEV_NAME,
+ .id = -1,
+ .resource = trbe_resources,
+ .num_resources = ARRAY_SIZE(trbe_resources)
+};
- spe_resources[0].start = irq;
- ret = platform_device_register(&spe_dev);
- if (ret < 0) {
- pr_warn("ACPI: SPE: Unable to register device\n");
- acpi_unregister_gsi(gsi);
- }
+static u16 arm_trbe_parse_gsi(struct acpi_madt_generic_interrupt *gicc)
+{
+ return gicc->trbe_interrupt;
+}
+
+static void arm_trbe_acpi_register_device(void)
+{
+ int ret = arm_acpi_register_pmu_device(&trbe_dev, ACPI_MADT_GICC_TRBE,
+ arm_trbe_parse_gsi);
+ if (ret)
+ pr_warn("ACPI: TRBE: Unable to register device\n");
}
#else
-static inline void arm_spe_acpi_register_device(void)
+static inline void arm_trbe_acpi_register_device(void)
{
+
}
-#endif /* CONFIG_ARM_SPE_PMU */
+#endif /* CONFIG_CORESIGHT_TRBE */
static int arm_pmu_acpi_parse_irqs(void)
{
@@ -374,6 +432,7 @@ static int arm_pmu_acpi_init(void)
return 0;
arm_spe_acpi_register_device();
+ arm_trbe_acpi_register_device();
return 0;
}
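The arm_pmu_acpi refactor hoists the duplicated MADT walk into arm_acpi_register_pmu_device(), parameterized by a per-device GSI extractor, and then reuses it for both SPE and TRBE. A standalone sketch of callback-parameterized registration; all names are illustrative:

#include <stdio.h>

struct cpu_desc { unsigned short spe_gsi, trbe_gsi; };

typedef unsigned short (*parse_gsi_fn)(const struct cpu_desc *);

static unsigned short parse_spe(const struct cpu_desc *c)  { return c->spe_gsi; }
static unsigned short parse_trbe(const struct cpu_desc *c) { return c->trbe_gsi; }

static int register_pmu_dev(const struct cpu_desc *cpus, int n, parse_gsi_fn parse)
{
    unsigned short gsi = 0;

    for (int i = 0; i < n; i++) {      /* one homogeneity check, any device */
        unsigned short this_gsi = parse(&cpus[i]);

        if (!gsi)
            gsi = this_gsi;
        else if (gsi != this_gsi)
            return -1;                 /* must be homogeneous */
    }
    if (!gsi)
        return 0;                      /* device absent: not an error */
    printf("registering with GSI %u\n", (unsigned)gsi);
    return 0;
}

int main(void)
{
    struct cpu_desc cpus[2] = { { 10, 20 }, { 10, 20 } };

    register_pmu_dev(cpus, 2, parse_spe);
    register_pmu_dev(cpus, 2, parse_trbe);
    return 0;
}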
diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c
index 933b96e243b8..3596db36cbff 100644
--- a/drivers/perf/arm_pmu_platform.c
+++ b/drivers/perf/arm_pmu_platform.c
@@ -16,7 +16,6 @@
#include <linux/irqdesc.h>
#include <linux/kconfig.h>
#include <linux/of.h>
-#include <linux/of_device.h>
#include <linux/percpu.h>
#include <linux/perf/arm_pmu.h>
#include <linux/platform_device.h>
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 08b3a1bf0ef6..e5a2ac4155f6 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -721,38 +721,15 @@ static void armv8pmu_enable_event(struct perf_event *event)
* Enable counter and interrupt, and set the counter to count
* the event that we're interested in.
*/
-
- /*
- * Disable counter
- */
armv8pmu_disable_event_counter(event);
-
- /*
- * Set event.
- */
armv8pmu_write_event_type(event);
-
- /*
- * Enable interrupt for this counter
- */
armv8pmu_enable_event_irq(event);
-
- /*
- * Enable counter
- */
armv8pmu_enable_event_counter(event);
}
static void armv8pmu_disable_event(struct perf_event *event)
{
- /*
- * Disable counter
- */
armv8pmu_disable_event_counter(event);
-
- /*
- * Disable interrupt for this counter
- */
armv8pmu_disable_event_irq(event);
}
@@ -1266,9 +1243,14 @@ PMUV3_INIT_SIMPLE(armv8_cortex_a76)
PMUV3_INIT_SIMPLE(armv8_cortex_a77)
PMUV3_INIT_SIMPLE(armv8_cortex_a78)
PMUV3_INIT_SIMPLE(armv9_cortex_a510)
+PMUV3_INIT_SIMPLE(armv9_cortex_a520)
PMUV3_INIT_SIMPLE(armv9_cortex_a710)
+PMUV3_INIT_SIMPLE(armv9_cortex_a715)
+PMUV3_INIT_SIMPLE(armv9_cortex_a720)
PMUV3_INIT_SIMPLE(armv8_cortex_x1)
PMUV3_INIT_SIMPLE(armv9_cortex_x2)
+PMUV3_INIT_SIMPLE(armv9_cortex_x3)
+PMUV3_INIT_SIMPLE(armv9_cortex_x4)
PMUV3_INIT_SIMPLE(armv8_neoverse_e1)
PMUV3_INIT_SIMPLE(armv8_neoverse_n1)
PMUV3_INIT_SIMPLE(armv9_neoverse_n2)
@@ -1334,9 +1316,14 @@ static const struct of_device_id armv8_pmu_of_device_ids[] = {
{.compatible = "arm,cortex-a77-pmu", .data = armv8_cortex_a77_pmu_init},
{.compatible = "arm,cortex-a78-pmu", .data = armv8_cortex_a78_pmu_init},
{.compatible = "arm,cortex-a510-pmu", .data = armv9_cortex_a510_pmu_init},
+ {.compatible = "arm,cortex-a520-pmu", .data = armv9_cortex_a520_pmu_init},
{.compatible = "arm,cortex-a710-pmu", .data = armv9_cortex_a710_pmu_init},
+ {.compatible = "arm,cortex-a715-pmu", .data = armv9_cortex_a715_pmu_init},
+ {.compatible = "arm,cortex-a720-pmu", .data = armv9_cortex_a720_pmu_init},
{.compatible = "arm,cortex-x1-pmu", .data = armv8_cortex_x1_pmu_init},
{.compatible = "arm,cortex-x2-pmu", .data = armv9_cortex_x2_pmu_init},
+ {.compatible = "arm,cortex-x3-pmu", .data = armv9_cortex_x3_pmu_init},
+ {.compatible = "arm,cortex-x4-pmu", .data = armv9_cortex_x4_pmu_init},
{.compatible = "arm,neoverse-e1-pmu", .data = armv8_neoverse_e1_pmu_init},
{.compatible = "arm,neoverse-n1-pmu", .data = armv8_neoverse_n1_pmu_init},
{.compatible = "arm,neoverse-n2-pmu", .data = armv9_neoverse_n2_pmu_init},
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index 25a269d431e4..6303b82566f9 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -115,6 +115,7 @@
#define SMMU_PMCG_PA_SHIFT 12
#define SMMU_PMCG_EVCNTR_RDONLY BIT(0)
+#define SMMU_PMCG_HARDEN_DISABLE BIT(1)
static int cpuhp_state_num;
@@ -159,6 +160,20 @@ static inline void smmu_pmu_enable(struct pmu *pmu)
writel(SMMU_PMCG_CR_ENABLE, smmu_pmu->reg_base + SMMU_PMCG_CR);
}
+static int smmu_pmu_apply_event_filter(struct smmu_pmu *smmu_pmu,
+ struct perf_event *event, int idx);
+
+static inline void smmu_pmu_enable_quirk_hip08_09(struct pmu *pmu)
+{
+ struct smmu_pmu *smmu_pmu = to_smmu_pmu(pmu);
+ unsigned int idx;
+
+ for_each_set_bit(idx, smmu_pmu->used_counters, smmu_pmu->num_counters)
+ smmu_pmu_apply_event_filter(smmu_pmu, smmu_pmu->events[idx], idx);
+
+ smmu_pmu_enable(pmu);
+}
+
static inline void smmu_pmu_disable(struct pmu *pmu)
{
struct smmu_pmu *smmu_pmu = to_smmu_pmu(pmu);
@@ -167,6 +182,22 @@ static inline void smmu_pmu_disable(struct pmu *pmu)
writel(0, smmu_pmu->reg_base + SMMU_PMCG_IRQ_CTRL);
}
+static inline void smmu_pmu_disable_quirk_hip08_09(struct pmu *pmu)
+{
+ struct smmu_pmu *smmu_pmu = to_smmu_pmu(pmu);
+ unsigned int idx;
+
+ /*
+ * The global disable of the PMU sometimes fails to stop the counting.
+ * Harden this by writing an invalid event type to each used counter
+ * to forcibly stop counting.
+ */
+ for_each_set_bit(idx, smmu_pmu->used_counters, smmu_pmu->num_counters)
+ writel(0xffff, smmu_pmu->reg_base + SMMU_PMCG_EVTYPER(idx));
+
+ smmu_pmu_disable(pmu);
+}
+
static inline void smmu_pmu_counter_set_value(struct smmu_pmu *smmu_pmu,
u32 idx, u64 value)
{
@@ -765,7 +796,10 @@ static void smmu_pmu_get_acpi_options(struct smmu_pmu *smmu_pmu)
switch (model) {
case IORT_SMMU_V3_PMCG_HISI_HIP08:
/* HiSilicon Erratum 162001800 */
- smmu_pmu->options |= SMMU_PMCG_EVCNTR_RDONLY;
+ smmu_pmu->options |= SMMU_PMCG_EVCNTR_RDONLY | SMMU_PMCG_HARDEN_DISABLE;
+ break;
+ case IORT_SMMU_V3_PMCG_HISI_HIP09:
+ smmu_pmu->options |= SMMU_PMCG_HARDEN_DISABLE;
break;
}
@@ -890,6 +924,16 @@ static int smmu_pmu_probe(struct platform_device *pdev)
if (!dev->of_node)
smmu_pmu_get_acpi_options(smmu_pmu);
+ /*
+ * For platforms that suffer this quirk, the PMU disable sometimes fails
+ * to stop the counters, which leads to inaccurate or erroneous counting.
+ * Forcibly disable the counters with the quirk handlers.
+ */
+ if (smmu_pmu->options & SMMU_PMCG_HARDEN_DISABLE) {
+ smmu_pmu->pmu.pmu_enable = smmu_pmu_enable_quirk_hip08_09;
+ smmu_pmu->pmu.pmu_disable = smmu_pmu_disable_quirk_hip08_09;
+ }
+
/* Pick one CPU to be the preferred one to use */
smmu_pmu->on_cpu = raw_smp_processor_id();
WARN_ON(irq_set_affinity(smmu_pmu->irq, cpumask_of(smmu_pmu->on_cpu)));
@@ -984,6 +1028,7 @@ static void __exit arm_smmu_pmu_exit(void)
module_exit(arm_smmu_pmu_exit);
+MODULE_ALIAS("platform:arm-smmu-v3-pmcg");
MODULE_DESCRIPTION("PMU driver for ARM SMMUv3 Performance Monitors Extension");
MODULE_AUTHOR("Neil Leeder <nleeder@codeaurora.org>");
MODULE_AUTHOR("Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>");
diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index b9ba4c4fe5a2..d2b0cbf0e0c4 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -25,8 +25,7 @@
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
-#include <linux/of_address.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
#include <linux/perf_event.h>
#include <linux/perf/arm_pmu.h>
#include <linux/platform_device.h>
diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c
index 5222ba1e79d0..92611c98120f 100644
--- a/drivers/perf/fsl_imx8_ddr_perf.c
+++ b/drivers/perf/fsl_imx8_ddr_perf.c
@@ -10,10 +10,9 @@
#include <linux/io.h>
#include <linux/module.h>
#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_device.h>
#include <linux/of_irq.h>
#include <linux/perf_event.h>
+#include <linux/platform_device.h>
#include <linux/slab.h>
#define COUNTER_CNTL 0x0
@@ -28,6 +27,8 @@
#define CNTL_CLEAR_MASK 0xFFFFFFFD
#define CNTL_OVER_MASK 0xFFFFFFFE
+#define CNTL_CP_SHIFT 16
+#define CNTL_CP_MASK (0xFF << CNTL_CP_SHIFT)
#define CNTL_CSV_SHIFT 24
#define CNTL_CSV_MASK (0xFFU << CNTL_CSV_SHIFT)
@@ -35,6 +36,8 @@
#define EVENT_CYCLES_COUNTER 0
#define NUM_COUNTERS 4
+/* For removing the bias when the cycle counter's CNTL.CP is set to 0xf0 */
+#define CYCLES_COUNTER_MASK 0x0FFFFFFF
#define AXI_MASKING_REVERT 0xffff0000 /* AXI_MASKING(MSB 16bits) + AXI_ID(LSB 16bits) */
#define to_ddr_pmu(p) container_of(p, struct ddr_pmu, pmu)
@@ -101,6 +104,7 @@ struct ddr_pmu {
const struct fsl_ddr_devtype_data *devtype_data;
int irq;
int id;
+ int active_counter;
};
static ssize_t ddr_perf_identifier_show(struct device *dev,
@@ -427,6 +431,17 @@ static void ddr_perf_counter_enable(struct ddr_pmu *pmu, int config,
writel(0, pmu->base + reg);
val = CNTL_EN | CNTL_CLEAR;
val |= FIELD_PREP(CNTL_CSV_MASK, config);
+
+ /*
+ * On i.MX8MP we need to bias the cycle counter to overflow more often.
+ * We do this by initializing bits [23:16] of the counter value via the
+ * COUNTER_CTRL Counter Parameter (CP) field.
+ */
+ if (pmu->devtype_data->quirks & DDR_CAP_AXI_ID_FILTER_ENHANCED) {
+ if (counter == EVENT_CYCLES_COUNTER)
+ val |= FIELD_PREP(CNTL_CP_MASK, 0xf0);
+ }
+
writel(val, pmu->base + reg);
} else {
/* Disable counter */
@@ -466,6 +481,12 @@ static void ddr_perf_event_update(struct perf_event *event)
int ret;
new_raw_count = ddr_perf_read_counter(pmu, counter);
+ /* Remove the bias applied in ddr_perf_counter_enable(). */
+ if (pmu->devtype_data->quirks & DDR_CAP_AXI_ID_FILTER_ENHANCED) {
+ if (counter == EVENT_CYCLES_COUNTER)
+ new_raw_count &= CYCLES_COUNTER_MASK;
+ }
+
local64_add(new_raw_count, &event->count);
/*
@@ -495,6 +516,10 @@ static void ddr_perf_event_start(struct perf_event *event, int flags)
ddr_perf_counter_enable(pmu, event->attr.config, counter, true);
+ if (!pmu->active_counter++)
+ ddr_perf_counter_enable(pmu, EVENT_CYCLES_ID,
+ EVENT_CYCLES_COUNTER, true);
+
hwc->state = 0;
}
@@ -548,6 +573,10 @@ static void ddr_perf_event_stop(struct perf_event *event, int flags)
ddr_perf_counter_enable(pmu, event->attr.config, counter, false);
ddr_perf_event_update(event);
+ if (!--pmu->active_counter)
+ ddr_perf_counter_enable(pmu, EVENT_CYCLES_ID,
+ EVENT_CYCLES_COUNTER, false);
+
hwc->state |= PERF_HES_STOPPED;
}
@@ -565,25 +594,10 @@ static void ddr_perf_event_del(struct perf_event *event, int flags)
static void ddr_perf_pmu_enable(struct pmu *pmu)
{
- struct ddr_pmu *ddr_pmu = to_ddr_pmu(pmu);
-
- /* enable cycle counter if cycle is not active event list */
- if (ddr_pmu->events[EVENT_CYCLES_COUNTER] == NULL)
- ddr_perf_counter_enable(ddr_pmu,
- EVENT_CYCLES_ID,
- EVENT_CYCLES_COUNTER,
- true);
}
static void ddr_perf_pmu_disable(struct pmu *pmu)
{
- struct ddr_pmu *ddr_pmu = to_ddr_pmu(pmu);
-
- if (ddr_pmu->events[EVENT_CYCLES_COUNTER] == NULL)
- ddr_perf_counter_enable(ddr_pmu,
- EVENT_CYCLES_ID,
- EVENT_CYCLES_COUNTER,
- false);
}
static int ddr_perf_init(struct ddr_pmu *pmu, void __iomem *base,
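The i.MX8MP change preloads high bits of the cycle counter so it overflows (and raises its interrupt) more often, then masks the bias back out in ddr_perf_event_update(). A standalone arithmetic sketch; the field positions here are illustrative, not the real register layout:

#include <stdint.h>
#include <stdio.h>

#define BIAS        0xF0000000u  /* preloaded on enable */
#define VALUE_MASK  0x0FFFFFFFu  /* bits that carry the real count */

int main(void)
{
    uint32_t raw = BIAS + 12345;        /* what the hardware would read back */
    uint64_t count = raw & VALUE_MASK;  /* bias stripped before accumulating */

    printf("raw=%#x counted=%llu\n", (unsigned)raw, (unsigned long long)count);
    return 0;
}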
diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c
index 71d5b07e3aff..5cf770a1bc31 100644
--- a/drivers/perf/fsl_imx9_ddr_perf.c
+++ b/drivers/perf/fsl_imx9_ddr_perf.c
@@ -7,9 +7,7 @@
#include <linux/io.h>
#include <linux/module.h>
#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_device.h>
-#include <linux/of_irq.h>
+#include <linux/platform_device.h>
#include <linux/perf_event.h>
/* Performance monitor configuration */
diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c
index e10fc7cb9493..5a00adb2de8c 100644
--- a/drivers/perf/hisilicon/hisi_pcie_pmu.c
+++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c
@@ -665,8 +665,8 @@ static int hisi_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node);
if (pcie_pmu->on_cpu == -1) {
- pcie_pmu->on_cpu = cpu;
- WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(cpu)));
+ pcie_pmu->on_cpu = cpumask_local_spread(0, dev_to_node(&pcie_pmu->pdev->dev));
+ WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(pcie_pmu->on_cpu)));
}
return 0;
@@ -676,14 +676,23 @@ static int hisi_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
{
struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node);
unsigned int target;
+ cpumask_t mask;
+ int numa_node;
/* Nothing to do if this CPU doesn't own the PMU */
if (pcie_pmu->on_cpu != cpu)
return 0;
pcie_pmu->on_cpu = -1;
- /* Choose a new CPU from all online cpus. */
- target = cpumask_any_but(cpu_online_mask, cpu);
+
+ /* Choose a local CPU from all online cpus. */
+ numa_node = dev_to_node(&pcie_pmu->pdev->dev);
+ if (cpumask_and(&mask, cpumask_of_node(numa_node), cpu_online_mask) &&
+ cpumask_andnot(&mask, &mask, cpumask_of(cpu)))
+ target = cpumask_any(&mask);
+ else
+ target = cpumask_any_but(cpu_online_mask, cpu);
+
if (target >= nr_cpu_ids) {
pci_err(pcie_pmu->pdev, "There is no CPU to set\n");
return 0;
diff --git a/drivers/perf/marvell_cn10k_ddr_pmu.c b/drivers/perf/marvell_cn10k_ddr_pmu.c
index b94a5f6cc22b..524ba82bfce2 100644
--- a/drivers/perf/marvell_cn10k_ddr_pmu.c
+++ b/drivers/perf/marvell_cn10k_ddr_pmu.c
@@ -8,11 +8,10 @@
#include <linux/io.h>
#include <linux/module.h>
#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_device.h>
#include <linux/perf_event.h>
#include <linux/hrtimer.h>
#include <linux/acpi.h>
+#include <linux/platform_device.h>
/* Performance Counters Operating Mode Control Registers */
#define DDRC_PERF_CNT_OP_MODE_CTRL 0x8020
diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c b/drivers/perf/marvell_cn10k_tad_pmu.c
index 3972197e2210..fec8e82edb95 100644
--- a/drivers/perf/marvell_cn10k_tad_pmu.c
+++ b/drivers/perf/marvell_cn10k_tad_pmu.c
@@ -6,10 +6,9 @@
#define pr_fmt(fmt) "tad_pmu: " fmt
+#include <linux/io.h>
#include <linux/module.h>
#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_device.h>
#include <linux/cpuhotplug.h>
#include <linux/perf_event.h>
#include <linux/platform_device.h>
diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c
index 0c32dffc7ede..9972bfc11a5c 100644
--- a/drivers/perf/xgene_pmu.c
+++ b/drivers/perf/xgene_pmu.c
@@ -1833,7 +1833,6 @@ static int xgene_pmu_probe(struct platform_device *pdev)
const struct xgene_pmu_data *dev_data;
const struct of_device_id *of_id;
struct xgene_pmu *xgene_pmu;
- struct resource *res;
int irq, rc;
int version;
@@ -1883,8 +1882,7 @@ static int xgene_pmu_probe(struct platform_device *pdev)
xgene_pmu->version = version;
dev_info(&pdev->dev, "X-Gene PMU version %d\n", xgene_pmu->version);
- res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- xgene_pmu->pcppmu_csr = devm_ioremap_resource(&pdev->dev, res);
+ xgene_pmu->pcppmu_csr = devm_platform_ioremap_resource(pdev, 0);
if (IS_ERR(xgene_pmu->pcppmu_csr)) {
dev_err(&pdev->dev, "ioremap failed for PCP PMU resource\n");
return PTR_ERR(xgene_pmu->pcppmu_csr);
diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index 4a8c1b57a90d..4dff656af3ad 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -862,6 +862,33 @@ static const struct pinconf_ops amd_pinconf_ops = {
.pin_config_group_set = amd_pinconf_group_set,
};
+static void amd_gpio_irq_init(struct amd_gpio *gpio_dev)
+{
+ struct pinctrl_desc *desc = gpio_dev->pctrl->desc;
+ unsigned long flags;
+ u32 pin_reg, mask;
+ int i;
+
+ mask = BIT(WAKE_CNTRL_OFF_S0I3) | BIT(WAKE_CNTRL_OFF_S3) |
+ BIT(WAKE_CNTRL_OFF_S4);
+
+ for (i = 0; i < desc->npins; i++) {
+ int pin = desc->pins[i].number;
+ const struct pin_desc *pd = pin_desc_get(gpio_dev->pctrl, pin);
+
+ if (!pd)
+ continue;
+
+ raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+
+ pin_reg = readl(gpio_dev->base + pin * 4);
+ pin_reg &= ~mask;
+ writel(pin_reg, gpio_dev->base + pin * 4);
+
+ raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
+ }
+}
+
#ifdef CONFIG_PM_SLEEP
static bool amd_gpio_should_save(struct amd_gpio *gpio_dev, unsigned int pin)
{
@@ -1099,6 +1126,9 @@ static int amd_gpio_probe(struct platform_device *pdev)
return PTR_ERR(gpio_dev->pctrl);
}
+ /* Disable and mask interrupts */
+ amd_gpio_irq_init(gpio_dev);
+
girq = &gpio_dev->gc.irq;
gpio_irq_chip_set_chip(girq, &amd_gpio_irqchip);
/* This will let us handle the parent IRQ in the driver */
diff --git a/drivers/pinctrl/renesas/pinctrl-rza2.c b/drivers/pinctrl/renesas/pinctrl-rza2.c
index 40b1326a1077..5591ddf16fdf 100644
--- a/drivers/pinctrl/renesas/pinctrl-rza2.c
+++ b/drivers/pinctrl/renesas/pinctrl-rza2.c
@@ -14,6 +14,7 @@
#include <linux/gpio/driver.h>
#include <linux/io.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/of_device.h>
#include <linux/pinctrl/pinmux.h>
@@ -46,6 +47,7 @@ struct rza2_pinctrl_priv {
struct pinctrl_dev *pctl;
struct pinctrl_gpio_range gpio_range;
int npins;
+ struct mutex mutex; /* serialize adding groups and functions */
};
#define RZA2_PDR(port) (0x0000 + (port) * 2) /* Direction 16-bit */
@@ -358,10 +360,14 @@ static int rza2_dt_node_to_map(struct pinctrl_dev *pctldev,
psel_val[i] = MUX_FUNC(value);
}
+ mutex_lock(&priv->mutex);
+
/* Register a single pin group listing all the pins we read from DT */
gsel = pinctrl_generic_add_group(pctldev, np->name, pins, npins, NULL);
- if (gsel < 0)
- return gsel;
+ if (gsel < 0) {
+ ret = gsel;
+ goto unlock;
+ }
/*
* Register a single group function where the 'data' is an array PSEL
@@ -390,6 +396,8 @@ static int rza2_dt_node_to_map(struct pinctrl_dev *pctldev,
(*map)->data.mux.function = np->name;
*num_maps = 1;
+ mutex_unlock(&priv->mutex);
+
return 0;
remove_function:
@@ -398,6 +406,9 @@ remove_function:
remove_group:
pinctrl_generic_remove_group(pctldev, gsel);
+unlock:
+ mutex_unlock(&priv->mutex);
+
dev_err(priv->dev, "Unable to parse DT node %s\n", np->name);
return ret;
@@ -473,6 +484,8 @@ static int rza2_pinctrl_probe(struct platform_device *pdev)
if (IS_ERR(priv->base))
return PTR_ERR(priv->base);
+ mutex_init(&priv->mutex);
+
platform_set_drvdata(pdev, priv);
priv->npins = (int)(uintptr_t)of_device_get_match_data(&pdev->dev) *
diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
index b53d26167da5..6e8a76556e23 100644
--- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c
+++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
@@ -11,6 +11,7 @@
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/of_device.h>
#include <linux/of_irq.h>
#include <linux/seq_file.h>
@@ -149,10 +150,11 @@ struct rzg2l_pinctrl {
struct gpio_chip gpio_chip;
struct pinctrl_gpio_range gpio_range;
DECLARE_BITMAP(tint_slot, RZG2L_TINT_MAX_INTERRUPT);
- spinlock_t bitmap_lock;
+ spinlock_t bitmap_lock; /* protect tint_slot bitmap */
unsigned int hwirq[RZG2L_TINT_MAX_INTERRUPT];
- spinlock_t lock;
+ spinlock_t lock; /* lock read/write registers */
+ struct mutex mutex; /* serialize adding groups and functions */
};
static const unsigned int iolh_groupa_mA[] = { 2, 4, 8, 12 };
@@ -362,11 +364,13 @@ static int rzg2l_dt_subnode_to_map(struct pinctrl_dev *pctldev,
name = np->name;
}
+ mutex_lock(&pctrl->mutex);
+
/* Register a single pin group listing all the pins we read from DT */
gsel = pinctrl_generic_add_group(pctldev, name, pins, num_pinmux, NULL);
if (gsel < 0) {
ret = gsel;
- goto done;
+ goto unlock;
}
/*
@@ -380,6 +384,8 @@ static int rzg2l_dt_subnode_to_map(struct pinctrl_dev *pctldev,
goto remove_group;
}
+ mutex_unlock(&pctrl->mutex);
+
maps[idx].type = PIN_MAP_TYPE_MUX_GROUP;
maps[idx].data.mux.group = name;
maps[idx].data.mux.function = name;
@@ -391,6 +397,8 @@ static int rzg2l_dt_subnode_to_map(struct pinctrl_dev *pctldev,
remove_group:
pinctrl_generic_remove_group(pctldev, gsel);
+unlock:
+ mutex_unlock(&pctrl->mutex);
done:
*index = idx;
kfree(configs);
@@ -1509,6 +1517,7 @@ static int rzg2l_pinctrl_probe(struct platform_device *pdev)
spin_lock_init(&pctrl->lock);
spin_lock_init(&pctrl->bitmap_lock);
+ mutex_init(&pctrl->mutex);
platform_set_drvdata(pdev, pctrl);
diff --git a/drivers/pinctrl/renesas/pinctrl-rzv2m.c b/drivers/pinctrl/renesas/pinctrl-rzv2m.c
index 35b23c1a5684..9146101ea9e2 100644
--- a/drivers/pinctrl/renesas/pinctrl-rzv2m.c
+++ b/drivers/pinctrl/renesas/pinctrl-rzv2m.c
@@ -14,6 +14,7 @@
#include <linux/gpio/driver.h>
#include <linux/io.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/of_device.h>
#include <linux/spinlock.h>
@@ -123,7 +124,8 @@ struct rzv2m_pinctrl {
struct gpio_chip gpio_chip;
struct pinctrl_gpio_range gpio_range;
- spinlock_t lock;
+ spinlock_t lock; /* lock read/write registers */
+ struct mutex mutex; /* serialize adding groups and functions */
};
static const unsigned int drv_1_8V_group2_uA[] = { 1800, 3800, 7800, 11000 };
@@ -322,11 +324,13 @@ static int rzv2m_dt_subnode_to_map(struct pinctrl_dev *pctldev,
name = np->name;
}
+ mutex_lock(&pctrl->mutex);
+
/* Register a single pin group listing all the pins we read from DT */
gsel = pinctrl_generic_add_group(pctldev, name, pins, num_pinmux, NULL);
if (gsel < 0) {
ret = gsel;
- goto done;
+ goto unlock;
}
/*
@@ -340,6 +344,8 @@ static int rzv2m_dt_subnode_to_map(struct pinctrl_dev *pctldev,
goto remove_group;
}
+ mutex_unlock(&pctrl->mutex);
+
maps[idx].type = PIN_MAP_TYPE_MUX_GROUP;
maps[idx].data.mux.group = name;
maps[idx].data.mux.function = name;
@@ -351,6 +357,8 @@ static int rzv2m_dt_subnode_to_map(struct pinctrl_dev *pctldev,
remove_group:
pinctrl_generic_remove_group(pctldev, gsel);
+unlock:
+ mutex_unlock(&pctrl->mutex);
done:
*index = idx;
kfree(configs);
@@ -1071,6 +1079,7 @@ static int rzv2m_pinctrl_probe(struct platform_device *pdev)
}
spin_lock_init(&pctrl->lock);
+ mutex_init(&pctrl->mutex);
platform_set_drvdata(pdev, pctrl);
diff --git a/drivers/platform/chrome/chromeos_acpi.c b/drivers/platform/chrome/chromeos_acpi.c
index 50d8a4d4352d..e6e6dcfc74d1 100644
--- a/drivers/platform/chrome/chromeos_acpi.c
+++ b/drivers/platform/chrome/chromeos_acpi.c
@@ -90,7 +90,36 @@ static int chromeos_acpi_handle_package(struct device *dev, union acpi_object *o
case ACPI_TYPE_STRING:
return sysfs_emit(buf, "%s\n", element->string.pointer);
case ACPI_TYPE_BUFFER:
- return sysfs_emit(buf, "%s\n", element->buffer.pointer);
+ {
+ int i, r, at, room_left;
+ const int byte_per_line = 16;
+
+ at = 0;
+ room_left = PAGE_SIZE - 1;
+ for (i = 0; i < element->buffer.length && room_left; i += byte_per_line) {
+ r = hex_dump_to_buffer(element->buffer.pointer + i,
+ element->buffer.length - i,
+ byte_per_line, 1, buf + at, room_left,
+ false);
+ if (r > room_left)
+ goto truncating;
+ at += r;
+ room_left -= r;
+
+ r = sysfs_emit_at(buf, at, "\n");
+ if (!r)
+ goto truncating;
+ at += r;
+ room_left -= r;
+ }
+
+ buf[at] = 0;
+ return at;
+truncating:
+ dev_info_once(dev, "truncating sysfs content for %s\n", name);
+ sysfs_emit_at(buf, PAGE_SIZE - 4, "..\n");
+ return PAGE_SIZE - 1;
+ }
default:
dev_err(dev, "element type %d not supported\n", element->type);
return -EINVAL;
@@ -235,9 +264,9 @@ static int chromeos_acpi_device_probe(struct platform_device *pdev)
return 0;
}
-/* GGL is valid PNP ID of Google. PNP ID can be used with the ACPI devices. */
static const struct acpi_device_id chromeos_device_ids[] = {
{ "GGL0001", 0 },
+ { "GOOG0016", 0 },
{}
};
MODULE_DEVICE_TABLE(acpi, chromeos_device_ids);
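
The ACPI_TYPE_BUFFER case previously printed the raw buffer with "%s", which is only safe for NUL-terminated strings; an arbitrary ACPI buffer need not contain one. The rewrite renders 16 bytes per line with hex_dump_to_buffer(), which returns the number of characters a row needs, enabling the `r > room_left` truncation check against the sysfs PAGE_SIZE limit. Minimal usage sketch:

    u8 data[5] = { 0xde, 0xad, 0xbe, 0xef, 0x42 };
    char line[64];

    /* yields "de ad be ef 42": 16 bytes/row, 1-byte groups, no ASCII column */
    hex_dump_to_buffer(data, sizeof(data), 16, 1, line, sizeof(line), false);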
diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index 500a61b093e4..356572452898 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -327,8 +327,8 @@ static void cros_ec_lpc_acpi_notify(acpi_handle device, u32 value, void *data)
dev_emerg(ec_dev->dev, "CrOS EC Panic Reported. Shutdown is imminent!");
blocking_notifier_call_chain(&ec_dev->panic_notifier, 0, ec_dev);
kobject_uevent_env(&ec_dev->dev->kobj, KOBJ_CHANGE, (char **)env);
- /* Begin orderly shutdown. Force shutdown after 1 second. */
- hw_protection_shutdown("CrOS EC Panic", 1000);
+ /* Begin orderly shutdown. EC will force reset after a short period. */
+ hw_protection_shutdown("CrOS EC Panic", -1);
/* Do not query for other events after a panic is reported */
return;
}
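
hw_protection_shutdown() takes a reason string and a timeout after which poweroff is forced if the orderly path stalls. This change relies on a negative timeout meaning "no kernel-forced fallback", which is appropriate here because the EC hardware resets the machine on its own shortly after reporting a panic. A sketch of the two modes (the first call is illustrative only):

    hw_protection_shutdown("overheat", 5000);      /* force poweroff after 5s */
    hw_protection_shutdown("CrOS EC Panic", -1);   /* orderly path only */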
diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c
index a79318e90a13..b600b77d91ef 100644
--- a/drivers/platform/mellanox/mlxbf-tmfifo.c
+++ b/drivers/platform/mellanox/mlxbf-tmfifo.c
@@ -887,6 +887,7 @@ static bool mlxbf_tmfifo_virtio_notify(struct virtqueue *vq)
tm_vdev = fifo->vdev[VIRTIO_ID_CONSOLE];
mlxbf_tmfifo_console_output(tm_vdev, vring);
spin_unlock_irqrestore(&fifo->spin_lock[0], flags);
+ set_bit(MLXBF_TM_TX_LWM_IRQ, &fifo->pend_events);
} else if (test_and_set_bit(MLXBF_TM_TX_LWM_IRQ,
&fifo->pend_events)) {
return true;
diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c
index d2fee9a3e239..6d9297c1d96c 100644
--- a/drivers/platform/x86/ideapad-laptop.c
+++ b/drivers/platform/x86/ideapad-laptop.c
@@ -1049,6 +1049,11 @@ static const struct key_entry ideapad_keymap[] = {
{ KE_IGNORE, 0x03 | IDEAPAD_WMI_KEY },
/* Customizable Lenovo Hotkey ("star" with 'S' inside) */
{ KE_KEY, 0x01 | IDEAPAD_WMI_KEY, { KEY_FAVORITES } },
+ { KE_KEY, 0x04 | IDEAPAD_WMI_KEY, { KEY_SELECTIVE_SCREENSHOT } },
+ /* Lenovo Support */
+ { KE_KEY, 0x07 | IDEAPAD_WMI_KEY, { KEY_HELP } },
+ { KE_KEY, 0x0e | IDEAPAD_WMI_KEY, { KEY_PICKUP_PHONE } },
+ { KE_KEY, 0x0f | IDEAPAD_WMI_KEY, { KEY_HANGUP_PHONE } },
/* Dark mode toggle */
{ KE_KEY, 0x13 | IDEAPAD_WMI_KEY, { KEY_PROG1 } },
/* Sound profile switch */
diff --git a/drivers/platform/x86/intel/ifs/load.c b/drivers/platform/x86/intel/ifs/load.c
index e6ae8265f3a3..cefd0d886cfd 100644
--- a/drivers/platform/x86/intel/ifs/load.c
+++ b/drivers/platform/x86/intel/ifs/load.c
@@ -3,7 +3,7 @@
#include <linux/firmware.h>
#include <asm/cpu.h>
-#include <asm/microcode_intel.h>
+#include <asm/microcode.h>
#include "ifs.h"
@@ -56,12 +56,13 @@ struct metadata_header {
static struct metadata_header *find_meta_data(void *ucode, unsigned int meta_type)
{
+ struct microcode_header_intel *hdr = &((struct microcode_intel *)ucode)->hdr;
struct metadata_header *meta_header;
unsigned long data_size, total_meta;
unsigned long meta_size = 0;
- data_size = get_datasize(ucode);
- total_meta = ((struct microcode_intel *)ucode)->hdr.metasize;
+ data_size = intel_microcode_get_datasize(hdr);
+ total_meta = hdr->metasize;
if (!total_meta)
return NULL;
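
With <asm/microcode_intel.h> gone, the IFS loader reads the data size through the exported intel_microcode_get_datasize() accessor instead of the old get_datasize() macro. Presumably the accessor keeps the long-standing convention that a zero datasize field denotes the legacy default of 2000 bytes; a sketch of that assumed contract (not the verified in-tree body):

    static inline u32 datasize_sketch(struct microcode_header_intel *hdr)
    {
            /* 0 in the header means the pre-Pentium 4 default payload size */
            return hdr->datasize ? hdr->datasize : 2000;
    }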
diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c
index 5a36b3f77bc5..84c175b9721a 100644
--- a/drivers/platform/x86/intel/pmc/core.c
+++ b/drivers/platform/x86/intel/pmc/core.c
@@ -1123,7 +1123,7 @@ static const struct x86_cpu_id intel_pmc_core_ids[] = {
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, icl_core_init),
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, tgl_core_init),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, tgl_core_init),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, tgl_core_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, tgl_core_init),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, adl_core_init),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, tgl_core_init),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, adl_core_init),
diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
index a95004e3d80b..08df9494603c 100644
--- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
+++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
@@ -720,7 +720,7 @@ static struct miscdevice isst_if_char_driver = {
static const struct x86_cpu_id hpm_cpu_ids[] = {
X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(SIERRAFOREST_X, NULL),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, NULL),
{}
};
diff --git a/drivers/platform/x86/lenovo-ymc.c b/drivers/platform/x86/lenovo-ymc.c
index f360370d5002..e1fbc35504d4 100644
--- a/drivers/platform/x86/lenovo-ymc.c
+++ b/drivers/platform/x86/lenovo-ymc.c
@@ -36,6 +36,13 @@ static const struct dmi_system_id ec_trigger_quirk_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "82QF"),
},
},
+ {
+ /* Lenovo Yoga 7 14ACN6 */
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "82N7"),
+ },
+ },
{ }
};
diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c
index 38928ff7472b..6ab272c84b7b 100644
--- a/drivers/pnp/pnpacpi/core.c
+++ b/drivers/pnp/pnpacpi/core.c
@@ -254,6 +254,9 @@ static int __init pnpacpi_add_device(struct acpi_device *device)
else
strncpy(dev->name, acpi_device_bid(device), sizeof(dev->name));
+ /* Handle possible string truncation */
+ dev->name[sizeof(dev->name) - 1] = '\0';
+
if (dev->active)
pnpacpi_parse_allocated_resource(dev);
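
strncpy() does not write a NUL when the source is at least as long as the destination, so the explicit terminator closes a potential overread of dev->name. The modern idiom would be strscpy(), which always terminates and reports truncation; a sketch of that alternative (not what the patch does):

    ssize_t n;

    n = strscpy(dev->name, acpi_device_bid(device), sizeof(dev->name));
    if (n < 0)                      /* -E2BIG: the name was truncated */
            pr_debug("pnpacpi: device name truncated\n");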
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index f173fba828ef..5c2e6d5eea2a 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -1250,7 +1250,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rapl_defaults_core),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &rapl_defaults_core),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &rapl_defaults_core),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &rapl_defaults_core),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &rapl_defaults_core),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &rapl_defaults_core),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &rapl_defaults_core),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &rapl_defaults_core),
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
index dd471021f237..250bd41a588c 100644
--- a/drivers/powercap/intel_rapl_msr.c
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -142,7 +142,7 @@ static const struct x86_cpu_id pl4_support_ids[] = {
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, NULL),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, NULL),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, NULL),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, NULL),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, NULL),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, NULL),
X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, NULL),
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 50a5ff70814a..215597f73be4 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -3636,11 +3636,8 @@ int dasd_generic_set_offline(struct ccw_device *cdev)
* so sync bdev first and then wait for our queues to become
* empty
*/
- if (device->block) {
- rc = fsync_bdev(device->block->bdev);
- if (rc != 0)
- goto interrupted;
- }
+ if (device->block)
+ bdev_mark_dead(device->block->bdev, false);
dasd_schedule_device_bh(device);
rc = wait_event_interruptible(shutdown_waitq,
_wait_for_empty_queues(device));
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 09acf3853a77..06bcb6c78909 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -412,6 +412,7 @@ removeseg:
}
list_del(&dev_info->lh);
+ dax_remove_host(dev_info->gd);
kill_dax(dev_info->dax_dev);
put_dax(dev_info->dax_dev);
del_gendisk(dev_info->gd);
@@ -707,9 +708,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
goto out;
out_dax_host:
+ put_device(&dev_info->dev);
dax_remove_host(dev_info->gd);
out_dax:
- put_device(&dev_info->dev);
kill_dax(dev_info->dax_dev);
put_dax(dev_info->dax_dev);
put_dev:
@@ -789,6 +790,7 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch
}
list_del(&dev_info->lh);
+ dax_remove_host(dev_info->gd);
kill_dax(dev_info->dax_dev);
put_dax(dev_info->dax_dev);
del_gendisk(dev_info->gd);
@@ -860,7 +862,7 @@ dcssblk_submit_bio(struct bio *bio)
struct bio_vec bvec;
struct bvec_iter iter;
unsigned long index;
- unsigned long page_addr;
+ void *page_addr;
unsigned long source_addr;
unsigned long bytes_done;
@@ -868,8 +870,8 @@ dcssblk_submit_bio(struct bio *bio)
dev_info = bio->bi_bdev->bd_disk->private_data;
if (dev_info == NULL)
goto fail;
- if ((bio->bi_iter.bi_sector & 7) != 0 ||
- (bio->bi_iter.bi_size & 4095) != 0)
+ if (!IS_ALIGNED(bio->bi_iter.bi_sector, 8) ||
+ !IS_ALIGNED(bio->bi_iter.bi_size, PAGE_SIZE))
/* Request is not page-aligned. */
goto fail;
/* verify data transfer direction */
@@ -889,18 +891,16 @@ dcssblk_submit_bio(struct bio *bio)
index = (bio->bi_iter.bi_sector >> 3);
bio_for_each_segment(bvec, bio, iter) {
- page_addr = (unsigned long)bvec_virt(&bvec);
+ page_addr = bvec_virt(&bvec);
source_addr = dev_info->start + (index<<12) + bytes_done;
- if (unlikely((page_addr & 4095) != 0) || (bvec.bv_len & 4095) != 0)
+ if (unlikely(!IS_ALIGNED((unsigned long)page_addr, PAGE_SIZE) ||
+ !IS_ALIGNED(bvec.bv_len, PAGE_SIZE)))
// More paranoia.
goto fail;
- if (bio_data_dir(bio) == READ) {
- memcpy((void*)page_addr, (void*)source_addr,
- bvec.bv_len);
- } else {
- memcpy((void*)source_addr, (void*)page_addr,
- bvec.bv_len);
- }
+ if (bio_data_dir(bio) == READ)
+ memcpy(page_addr, __va(source_addr), bvec.bv_len);
+ else
+ memcpy(__va(source_addr), page_addr, bvec.bv_len);
bytes_done += bvec.bv_len;
}
bio_endio(bio);
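
The dcssblk cleanup swaps the open-coded `& 7` / `& 4095` masks for IS_ALIGNED() and routes the segment's physical start address through __va(), making the virtual/physical distinction explicit instead of hiding it behind casts. IS_ALIGNED() is just a power-of-two mask test:

    /* IS_ALIGNED(x, a) expands to ((x) & ((typeof(x))(a) - 1)) == 0 */
    if (!IS_ALIGNED(addr, PAGE_SIZE))       /* 'addr' is hypothetical */
            return -EINVAL;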
diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c
index 0c1df1d5f1ac..3a9cc8a4a230 100644
--- a/drivers/s390/block/scm_blk.c
+++ b/drivers/s390/block/scm_blk.c
@@ -134,7 +134,7 @@ static void scm_request_done(struct scm_request *scmrq)
if ((msb->flags & MSB_FLAG_IDA) && aidaw &&
IS_ALIGNED(aidaw, PAGE_SIZE))
- mempool_free(virt_to_page(aidaw), aidaw_pool);
+ mempool_free(virt_to_page((void *)aidaw), aidaw_pool);
}
spin_lock_irqsave(&list_lock, flags);
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index 3c87057436d5..8b4575a0db9f 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -392,10 +392,6 @@ static void __init add_memory_merged(u16 rn)
goto skip_add;
start = rn2addr(first_rn);
size = (unsigned long long) num * sclp.rzm;
- if (start >= VMEM_MAX_PHYS)
- goto skip_add;
- if (start + size > VMEM_MAX_PHYS)
- size = VMEM_MAX_PHYS - start;
if (start >= ident_map_size)
goto skip_add;
if (start + size > ident_map_size)
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index f480d6c7fd39..fdc8668f3fba 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -55,6 +55,7 @@ static void __init sclp_early_facilities_detect(void)
S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_GUEST;
if (sccb->cpuoff > 134) {
sclp.has_diag318 = !!(sccb->byte_134 & 0x80);
+ sclp.has_diag320 = !!(sccb->byte_134 & 0x04);
sclp.has_iplcc = !!(sccb->byte_134 & 0x02);
}
if (sccb->cpuoff > 137) {
diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c
index 4cebfaaa22b4..eb0520a9d4af 100644
--- a/drivers/s390/char/vmcp.c
+++ b/drivers/s390/char/vmcp.c
@@ -89,7 +89,7 @@ static void vmcp_response_free(struct vmcp_session *session)
order = get_order(session->bufsize);
nr_pages = ALIGN(session->bufsize, PAGE_SIZE) >> PAGE_SHIFT;
if (session->cma_alloc) {
- page = virt_to_page((unsigned long)session->response);
+ page = virt_to_page(session->response);
cma_release(vmcp_cma, page, nr_pages);
session->cma_alloc = 0;
} else {
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index 942c73a11ca3..bc3be0330f1d 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -3,7 +3,7 @@
* zcore module to export memory content and register sets for creating system
* dumps on SCSI/NVMe disks (zfcp/nvme dump).
*
- * For more information please refer to Documentation/s390/zfcpdump.rst
+ * For more information please refer to Documentation/arch/s390/zfcpdump.rst
*
* Copyright IBM Corp. 2003, 2008
* Author(s): Michael Holzheu
diff --git a/drivers/s390/crypto/Makefile b/drivers/s390/crypto/Makefile
index 22d2db690cd3..0edacd101c12 100644
--- a/drivers/s390/crypto/Makefile
+++ b/drivers/s390/crypto/Makefile
@@ -11,7 +11,7 @@ zcrypt-objs += zcrypt_msgtype6.o zcrypt_msgtype50.o
zcrypt-objs += zcrypt_ccamisc.o zcrypt_ep11misc.o
obj-$(CONFIG_ZCRYPT) += zcrypt.o
# adapter drivers depend on ap.o and zcrypt.o
-obj-$(CONFIG_ZCRYPT) += zcrypt_cex2c.o zcrypt_cex2a.o zcrypt_cex4.o
+obj-$(CONFIG_ZCRYPT) += zcrypt_cex4.o
# pkey kernel module
pkey-objs := pkey_api.o
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index 420120be300f..339812efe822 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * Copyright IBM Corp. 2006, 2021
+ * Copyright IBM Corp. 2006, 2023
* Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
* Martin Schwidefsky <schwidefsky@de.ibm.com>
* Ralph Wuerthner <rwuerthn@de.ibm.com>
@@ -219,6 +219,15 @@ int ap_sb_available(void)
}
/*
+ * ap_is_se_guest(): Check for SE guest with AP pass-through support.
+ */
+bool ap_is_se_guest(void)
+{
+ return is_prot_virt_guest() && ap_sb_available();
+}
+EXPORT_SYMBOL(ap_is_se_guest);
+
+/*
* ap_fetch_qci_info(): Fetch cryptographic config info
*
* Returns the ap configuration info fetched via PQAP(QCI).
@@ -387,23 +396,6 @@ static int ap_queue_info(ap_qid_t qid, int *q_type, unsigned int *q_fac,
*q_ml = tapq_info.ml;
*q_decfg = status.response_code == AP_RESPONSE_DECONFIGURED;
*q_cstop = status.response_code == AP_RESPONSE_CHECKSTOPPED;
- switch (*q_type) {
- /* For CEX2 and CEX3 the available functions
- * are not reflected by the facilities bits.
- * Instead it is coded into the type. So here
- * modify the function bits based on the type.
- */
- case AP_DEVICE_TYPE_CEX2A:
- case AP_DEVICE_TYPE_CEX3A:
- *q_fac |= 0x08000000;
- break;
- case AP_DEVICE_TYPE_CEX2C:
- case AP_DEVICE_TYPE_CEX3C:
- *q_fac |= 0x10000000;
- break;
- default:
- break;
- }
return 1;
default:
/*
@@ -1678,8 +1670,8 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func)
{
int comp_type = 0;
- /* < CEX2A is not supported */
- if (rawtype < AP_DEVICE_TYPE_CEX2A) {
+ /* < CEX4 is not supported */
+ if (rawtype < AP_DEVICE_TYPE_CEX4) {
AP_DBF_WARN("%s queue=%02x.%04x unsupported type %d\n",
__func__, AP_QID_CARD(qid),
AP_QID_QUEUE(qid), rawtype);
@@ -1701,7 +1693,7 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func)
apinfo.cat = AP_DEVICE_TYPE_CEX8;
status = ap_qact(qid, 0, &apinfo);
if (status.response_code == AP_RESPONSE_NORMAL &&
- apinfo.cat >= AP_DEVICE_TYPE_CEX2A &&
+ apinfo.cat >= AP_DEVICE_TYPE_CEX4 &&
apinfo.cat <= AP_DEVICE_TYPE_CEX8)
comp_type = apinfo.cat;
}
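
ap_is_se_guest() packages the two conditions for a Secure Execution guest with AP pass-through support. Its consumers in the pkey changes further down use it to pick the minimum EP11 firmware API level, along these lines:

    int api = ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4;

    rc = ep11_findcard2(&apqns, &nr_apqns, 0xFFFF, 0xFFFF,
                        ZCRYPT_CEX7, api, wkvp);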
diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h
index 0d7b7eb374ad..be54b070c031 100644
--- a/drivers/s390/crypto/ap_bus.h
+++ b/drivers/s390/crypto/ap_bus.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * Copyright IBM Corp. 2006, 2019
+ * Copyright IBM Corp. 2006, 2023
* Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
* Martin Schwidefsky <schwidefsky@de.ibm.com>
* Ralph Wuerthner <rwuerthn@de.ibm.com>
@@ -67,15 +67,8 @@ static inline int ap_test_bit(unsigned int *ptr, unsigned int nr)
#define AP_RESPONSE_INVALID_DOMAIN 0x42
/*
- * Known device types
+ * Supported AP device types
*/
-#define AP_DEVICE_TYPE_PCICC 3
-#define AP_DEVICE_TYPE_PCICA 4
-#define AP_DEVICE_TYPE_PCIXCC 5
-#define AP_DEVICE_TYPE_CEX2A 6
-#define AP_DEVICE_TYPE_CEX2C 7
-#define AP_DEVICE_TYPE_CEX3A 8
-#define AP_DEVICE_TYPE_CEX3C 9
#define AP_DEVICE_TYPE_CEX4 10
#define AP_DEVICE_TYPE_CEX5 11
#define AP_DEVICE_TYPE_CEX6 12
@@ -272,14 +265,6 @@ static inline void ap_release_message(struct ap_message *ap_msg)
kfree_sensitive(ap_msg->private);
}
-/*
- * Note: don't use ap_send/ap_recv after using ap_queue_message
- * for the first time. Otherwise the ap message queue will get
- * confused.
- */
-int ap_send(ap_qid_t qid, unsigned long psmid, void *msg, size_t msglen);
-int ap_recv(ap_qid_t qid, unsigned long *psmid, void *msg, size_t msglen);
-
enum ap_sm_wait ap_sm_event(struct ap_queue *aq, enum ap_sm_event event);
enum ap_sm_wait ap_sm_event_loop(struct ap_queue *aq, enum ap_sm_event event);
@@ -289,6 +274,7 @@ void ap_flush_queue(struct ap_queue *aq);
void *ap_airq_ptr(void);
int ap_sb_available(void);
+bool ap_is_se_guest(void);
void ap_wait(enum ap_sm_wait wait);
void ap_request_timeout(struct timer_list *t);
void ap_bus_force_rescan(void);
diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c
index 30df83735adf..1336e632adc4 100644
--- a/drivers/s390/crypto/ap_queue.c
+++ b/drivers/s390/crypto/ap_queue.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright IBM Corp. 2016
+ * Copyright IBM Corp. 2016, 2023
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
*
* Adjunct processor bus, queue related code.
@@ -93,51 +93,6 @@ __ap_send(ap_qid_t qid, unsigned long psmid, void *msg, size_t msglen,
return ap_nqap(qid, psmid, msg, msglen);
}
-int ap_send(ap_qid_t qid, unsigned long psmid, void *msg, size_t msglen)
-{
- struct ap_queue_status status;
-
- status = __ap_send(qid, psmid, msg, msglen, 0);
- if (status.async)
- return -EPERM;
- switch (status.response_code) {
- case AP_RESPONSE_NORMAL:
- return 0;
- case AP_RESPONSE_Q_FULL:
- case AP_RESPONSE_RESET_IN_PROGRESS:
- return -EBUSY;
- case AP_RESPONSE_REQ_FAC_NOT_INST:
- return -EINVAL;
- default: /* Device is gone. */
- return -ENODEV;
- }
-}
-EXPORT_SYMBOL(ap_send);
-
-int ap_recv(ap_qid_t qid, unsigned long *psmid, void *msg, size_t msglen)
-{
- struct ap_queue_status status;
-
- if (!msg)
- return -EINVAL;
- status = ap_dqap(qid, psmid, msg, msglen, NULL, NULL, NULL);
- if (status.async)
- return -EPERM;
- switch (status.response_code) {
- case AP_RESPONSE_NORMAL:
- return 0;
- case AP_RESPONSE_NO_PENDING_REPLY:
- if (status.queue_empty)
- return -ENOENT;
- return -EBUSY;
- case AP_RESPONSE_RESET_IN_PROGRESS:
- return -EBUSY;
- default:
- return -ENODEV;
- }
-}
-EXPORT_SYMBOL(ap_recv);
-
/* State machine definitions and helpers */
static enum ap_sm_wait ap_sm_nop(struct ap_queue *aq)
diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c
index e58bfd225323..6cfb6b2340c9 100644
--- a/drivers/s390/crypto/pkey_api.c
+++ b/drivers/s390/crypto/pkey_api.c
@@ -263,7 +263,9 @@ static int pkey_clr2ep11key(const u8 *clrkey, size_t clrkeylen,
/* build a list of apqns suitable for ep11 keys with cpacf support */
rc = ep11_findcard2(&apqns, &nr_apqns, 0xFFFF, 0xFFFF,
- ZCRYPT_CEX7, EP11_API_V, NULL);
+ ZCRYPT_CEX7,
+ ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4,
+ NULL);
if (rc)
goto out;
@@ -272,7 +274,8 @@ static int pkey_clr2ep11key(const u8 *clrkey, size_t clrkeylen,
card = apqns[i] >> 16;
dom = apqns[i] & 0xFFFF;
rc = ep11_clr2keyblob(card, dom, clrkeylen * 8,
- 0, clrkey, keybuf, keybuflen);
+ 0, clrkey, keybuf, keybuflen,
+ PKEY_TYPE_EP11);
if (rc == 0)
break;
}
@@ -287,10 +290,9 @@ out:
/*
* Find card and transform EP11 secure key into protected key.
*/
-static int pkey_ep11key2pkey(const u8 *key, u8 *protkey,
- u32 *protkeylen, u32 *protkeytype)
+static int pkey_ep11key2pkey(const u8 *key, size_t keylen,
+ u8 *protkey, u32 *protkeylen, u32 *protkeytype)
{
- struct ep11keyblob *kb = (struct ep11keyblob *)key;
u32 nr_apqns, *apqns = NULL;
u16 card, dom;
int i, rc;
@@ -299,7 +301,9 @@ static int pkey_ep11key2pkey(const u8 *key, u8 *protkey,
/* build a list of apqns suitable for this key */
rc = ep11_findcard2(&apqns, &nr_apqns, 0xFFFF, 0xFFFF,
- ZCRYPT_CEX7, EP11_API_V, kb->wkvp);
+ ZCRYPT_CEX7,
+ ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4,
+ ep11_kb_wkvp(key, keylen));
if (rc)
goto out;
@@ -307,7 +311,7 @@ static int pkey_ep11key2pkey(const u8 *key, u8 *protkey,
for (rc = -ENODEV, i = 0; i < nr_apqns; i++) {
card = apqns[i] >> 16;
dom = apqns[i] & 0xFFFF;
- rc = ep11_kblob2protkey(card, dom, key, kb->head.len,
+ rc = ep11_kblob2protkey(card, dom, key, keylen,
protkey, protkeylen, protkeytype);
if (rc == 0)
break;
@@ -495,7 +499,7 @@ try_via_ep11:
tmpbuf, &tmpbuflen);
if (rc)
goto failure;
- rc = pkey_ep11key2pkey(tmpbuf,
+ rc = pkey_ep11key2pkey(tmpbuf, tmpbuflen,
protkey, protkeylen, protkeytype);
if (!rc)
goto out;
@@ -611,7 +615,7 @@ static int pkey_nonccatok2pkey(const u8 *key, u32 keylen,
rc = ep11_check_aes_key(debug_info, 3, key, keylen, 1);
if (rc)
goto out;
- rc = pkey_ep11key2pkey(key,
+ rc = pkey_ep11key2pkey(key, keylen,
protkey, protkeylen, protkeytype);
break;
}
@@ -620,7 +624,7 @@ static int pkey_nonccatok2pkey(const u8 *key, u32 keylen,
rc = ep11_check_aes_key_with_hdr(debug_info, 3, key, keylen, 1);
if (rc)
goto out;
- rc = pkey_ep11key2pkey(key + sizeof(struct ep11kblob_header),
+ rc = pkey_ep11key2pkey(key, keylen,
protkey, protkeylen, protkeytype);
break;
default:
@@ -713,6 +717,11 @@ static int pkey_genseckey2(const struct pkey_apqn *apqns, size_t nr_apqns,
if (*keybufsize < MINEP11AESKEYBLOBSIZE)
return -EINVAL;
break;
+ case PKEY_TYPE_EP11_AES:
+ if (*keybufsize < (sizeof(struct ep11kblob_header) +
+ MINEP11AESKEYBLOBSIZE))
+ return -EINVAL;
+ break;
default:
return -EINVAL;
}
@@ -729,9 +738,10 @@ static int pkey_genseckey2(const struct pkey_apqn *apqns, size_t nr_apqns,
for (i = 0, rc = -ENODEV; i < nr_apqns; i++) {
card = apqns[i].card;
dom = apqns[i].domain;
- if (ktype == PKEY_TYPE_EP11) {
+ if (ktype == PKEY_TYPE_EP11 ||
+ ktype == PKEY_TYPE_EP11_AES) {
rc = ep11_genaeskey(card, dom, ksize, kflags,
- keybuf, keybufsize);
+ keybuf, keybufsize, ktype);
} else if (ktype == PKEY_TYPE_CCA_DATA) {
rc = cca_genseckey(card, dom, ksize, keybuf);
*keybufsize = (rc ? 0 : SECKEYBLOBSIZE);
@@ -769,6 +779,11 @@ static int pkey_clr2seckey2(const struct pkey_apqn *apqns, size_t nr_apqns,
if (*keybufsize < MINEP11AESKEYBLOBSIZE)
return -EINVAL;
break;
+ case PKEY_TYPE_EP11_AES:
+ if (*keybufsize < (sizeof(struct ep11kblob_header) +
+ MINEP11AESKEYBLOBSIZE))
+ return -EINVAL;
+ break;
default:
return -EINVAL;
}
@@ -787,9 +802,11 @@ static int pkey_clr2seckey2(const struct pkey_apqn *apqns, size_t nr_apqns,
for (i = 0, rc = -ENODEV; i < nr_apqns; i++) {
card = apqns[i].card;
dom = apqns[i].domain;
- if (ktype == PKEY_TYPE_EP11) {
+ if (ktype == PKEY_TYPE_EP11 ||
+ ktype == PKEY_TYPE_EP11_AES) {
rc = ep11_clr2keyblob(card, dom, ksize, kflags,
- clrkey, keybuf, keybufsize);
+ clrkey, keybuf, keybufsize,
+ ktype);
} else if (ktype == PKEY_TYPE_CCA_DATA) {
rc = cca_clr2seckey(card, dom, ksize,
clrkey, keybuf);
@@ -888,6 +905,7 @@ static int pkey_verifykey2(const u8 *key, size_t keylen,
} else if (hdr->type == TOKTYPE_NON_CCA &&
hdr->version == TOKVER_EP11_AES) {
struct ep11keyblob *kb = (struct ep11keyblob *)key;
+ int api;
rc = ep11_check_aes_key(debug_info, 3, key, keylen, 1);
if (rc)
@@ -895,10 +913,12 @@ static int pkey_verifykey2(const u8 *key, size_t keylen,
if (ktype)
*ktype = PKEY_TYPE_EP11;
if (ksize)
- *ksize = kb->head.keybitlen;
+ *ksize = kb->head.bitlen;
+ api = ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4;
rc = ep11_findcard2(&_apqns, &_nr_apqns, *cardnr, *domain,
- ZCRYPT_CEX7, EP11_API_V, kb->wkvp);
+ ZCRYPT_CEX7, api,
+ ep11_kb_wkvp(key, keylen));
if (rc)
goto out;
@@ -908,6 +928,32 @@ static int pkey_verifykey2(const u8 *key, size_t keylen,
*cardnr = ((struct pkey_apqn *)_apqns)->card;
*domain = ((struct pkey_apqn *)_apqns)->domain;
+ } else if (hdr->type == TOKTYPE_NON_CCA &&
+ hdr->version == TOKVER_EP11_AES_WITH_HEADER) {
+ struct ep11kblob_header *kh = (struct ep11kblob_header *)key;
+ int api;
+
+ rc = ep11_check_aes_key_with_hdr(debug_info, 3,
+ key, keylen, 1);
+ if (rc)
+ goto out;
+ if (ktype)
+ *ktype = PKEY_TYPE_EP11_AES;
+ if (ksize)
+ *ksize = kh->bitlen;
+
+ api = ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4;
+ rc = ep11_findcard2(&_apqns, &_nr_apqns, *cardnr, *domain,
+ ZCRYPT_CEX7, api,
+ ep11_kb_wkvp(key, keylen));
+ if (rc)
+ goto out;
+
+ if (flags)
+ *flags = PKEY_FLAGS_MATCH_CUR_MKVP;
+
+ *cardnr = ((struct pkey_apqn *)_apqns)->card;
+ *domain = ((struct pkey_apqn *)_apqns)->domain;
} else {
rc = -EINVAL;
}
@@ -949,10 +995,12 @@ static int pkey_keyblob2pkey2(const struct pkey_apqn *apqns, size_t nr_apqns,
}
} else if (hdr->type == TOKTYPE_NON_CCA) {
if (hdr->version == TOKVER_EP11_AES) {
- if (keylen < sizeof(struct ep11keyblob))
- return -EINVAL;
if (ep11_check_aes_key(debug_info, 3, key, keylen, 1))
return -EINVAL;
+ } else if (hdr->version == TOKVER_EP11_AES_WITH_HEADER) {
+ if (ep11_check_aes_key_with_hdr(debug_info, 3,
+ key, keylen, 1))
+ return -EINVAL;
} else {
return pkey_nonccatok2pkey(key, keylen,
protkey, protkeylen,
@@ -980,10 +1028,7 @@ static int pkey_keyblob2pkey2(const struct pkey_apqn *apqns, size_t nr_apqns,
protkey, protkeylen,
protkeytype);
} else {
- /* EP11 AES secure key blob */
- struct ep11keyblob *kb = (struct ep11keyblob *)key;
-
- rc = ep11_kblob2protkey(card, dom, key, kb->head.len,
+ rc = ep11_kblob2protkey(card, dom, key, keylen,
protkey, protkeylen,
protkeytype);
}
@@ -1018,7 +1063,7 @@ static int pkey_apqns4key(const u8 *key, size_t keylen, u32 flags,
return -EINVAL;
if (kb->attr & EP11_BLOB_PKEY_EXTRACTABLE) {
minhwtype = ZCRYPT_CEX7;
- api = EP11_API_V;
+ api = ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4;
}
rc = ep11_findcard2(&_apqns, &_nr_apqns, 0xFFFF, 0xFFFF,
minhwtype, api, kb->wkvp);
@@ -1034,7 +1079,7 @@ static int pkey_apqns4key(const u8 *key, size_t keylen, u32 flags,
return -EINVAL;
if (kb->attr & EP11_BLOB_PKEY_EXTRACTABLE) {
minhwtype = ZCRYPT_CEX7;
- api = EP11_API_V;
+ api = ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4;
}
rc = ep11_findcard2(&_apqns, &_nr_apqns, 0xFFFF, 0xFFFF,
minhwtype, api, kb->wkvp);
@@ -1144,11 +1189,13 @@ static int pkey_apqns4keytype(enum pkey_key_type ktype,
ktype == PKEY_TYPE_EP11_AES ||
ktype == PKEY_TYPE_EP11_ECC) {
u8 *wkvp = NULL;
+ int api;
if (flags & PKEY_FLAGS_MATCH_CUR_MKVP)
wkvp = cur_mkvp;
+ api = ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4;
rc = ep11_findcard2(&_apqns, &_nr_apqns, 0xFFFF, 0xFFFF,
- ZCRYPT_CEX7, EP11_API_V, wkvp);
+ ZCRYPT_CEX7, api, wkvp);
if (rc)
goto out;
@@ -1243,12 +1290,14 @@ static int pkey_keyblob2pkey3(const struct pkey_apqn *apqns, size_t nr_apqns,
hdr->version == TOKVER_EP11_ECC_WITH_HEADER) &&
is_ep11_keyblob(key + sizeof(struct ep11kblob_header)))
rc = ep11_kblob2protkey(card, dom, key, hdr->len,
- protkey, protkeylen, protkeytype);
+ protkey, protkeylen,
+ protkeytype);
else if (hdr->type == TOKTYPE_NON_CCA &&
hdr->version == TOKVER_EP11_AES &&
is_ep11_keyblob(key))
rc = ep11_kblob2protkey(card, dom, key, hdr->len,
- protkey, protkeylen, protkeytype);
+ protkey, protkeylen,
+ protkeytype);
else if (hdr->type == TOKTYPE_CCA_INTERNAL &&
hdr->version == TOKVER_CCA_AES)
rc = cca_sec2protkey(card, dom, key, protkey,
@@ -1466,7 +1515,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
apqns = _copy_apqns_from_user(kgs.apqns, kgs.apqn_entries);
if (IS_ERR(apqns))
return PTR_ERR(apqns);
- kkey = kmalloc(klen, GFP_KERNEL);
+ kkey = kzalloc(klen, GFP_KERNEL);
if (!kkey) {
kfree(apqns);
return -ENOMEM;
@@ -1508,7 +1557,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
apqns = _copy_apqns_from_user(kcs.apqns, kcs.apqn_entries);
if (IS_ERR(apqns))
return PTR_ERR(apqns);
- kkey = kmalloc(klen, GFP_KERNEL);
+ kkey = kzalloc(klen, GFP_KERNEL);
if (!kkey) {
kfree(apqns);
return -ENOMEM;
@@ -2102,7 +2151,7 @@ static struct attribute_group ccacipher_attr_group = {
* (i.e. off != 0 or count < key blob size) -EINVAL is returned.
* This function and the sysfs attributes using it provide EP11 key blobs
* padded to the upper limit of MAXEP11AESKEYBLOBSIZE which is currently
- * 320 bytes.
+ * 336 bytes.
*/
static ssize_t pkey_ep11_aes_attr_read(enum pkey_key_size keybits,
bool is_xts, char *buf, loff_t off,
@@ -2120,7 +2169,9 @@ static ssize_t pkey_ep11_aes_attr_read(enum pkey_key_size keybits,
	/* build a list of apqns able to generate a cipher key */
rc = ep11_findcard2(&apqns, &nr_apqns, 0xFFFF, 0xFFFF,
- ZCRYPT_CEX7, EP11_API_V, NULL);
+ ZCRYPT_CEX7,
+ ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4,
+ NULL);
if (rc)
return rc;
@@ -2130,7 +2181,8 @@ static ssize_t pkey_ep11_aes_attr_read(enum pkey_key_size keybits,
for (i = 0, rc = -ENODEV; i < nr_apqns; i++) {
card = apqns[i] >> 16;
dom = apqns[i] & 0xFFFF;
- rc = ep11_genaeskey(card, dom, keybits, 0, buf, &keysize);
+ rc = ep11_genaeskey(card, dom, keybits, 0, buf, &keysize,
+ PKEY_TYPE_EP11_AES);
if (rc == 0)
break;
}
@@ -2140,7 +2192,8 @@ static ssize_t pkey_ep11_aes_attr_read(enum pkey_key_size keybits,
if (is_xts) {
keysize = MAXEP11AESKEYBLOBSIZE;
buf += MAXEP11AESKEYBLOBSIZE;
- rc = ep11_genaeskey(card, dom, keybits, 0, buf, &keysize);
+ rc = ep11_genaeskey(card, dom, keybits, 0, buf, &keysize,
+ PKEY_TYPE_EP11_AES);
if (rc == 0)
return 2 * MAXEP11AESKEYBLOBSIZE;
}
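
Note also the kmalloc() -> kzalloc() switches for the key buffers in the ioctl paths above: zeroing the allocation up front guarantees that any tail of the blob the card does not fill is all-zero rather than stale heap data, which matters because the buffer is later copied back to user space. The shape of the fix:

    kkey = kzalloc(klen, GFP_KERNEL);       /* zeroed, unlike kmalloc() */
    if (!kkey) {
            kfree(apqns);
            return -ENOMEM;
    }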
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index b441745b0418..0509f80622cd 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -30,13 +30,12 @@
#define AP_QUEUE_UNASSIGNED "unassigned"
#define AP_QUEUE_IN_USE "in use"
-#define MAX_RESET_CHECK_WAIT 200 /* Sleep max 200ms for reset check */
#define AP_RESET_INTERVAL 20 /* Reset sleep interval (20ms) */
static int vfio_ap_mdev_reset_queues(struct ap_queue_table *qtable);
static struct vfio_ap_queue *vfio_ap_find_queue(int apqn);
static const struct vfio_device_ops vfio_ap_matrix_dev_ops;
-static int vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q);
+static void vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q);
/**
* get_update_locks_for_kvm: Acquire the locks required to dynamically update a
@@ -360,6 +359,28 @@ static int vfio_ap_validate_nib(struct kvm_vcpu *vcpu, dma_addr_t *nib)
return 0;
}
+static int ensure_nib_shared(unsigned long addr, struct gmap *gmap)
+{
+ int ret;
+
+ /*
+ * The nib has to be located in shared storage since guest and
+ * host access it. vfio_pin_pages() will do a pin shared and
+ * if that fails (possibly because it's not a shared page) it
+ * calls export. We try to do a second pin shared here so that
+ * the UV gives us an error code if we try to pin a non-shared
+ * page.
+ *
+ * If the page is already pinned shared the UV will return a success.
+ */
+ ret = uv_pin_shared(addr);
+ if (ret) {
+ /* vfio_pin_pages() likely exported the page so let's re-import */
+ gmap_convert_to_secure(gmap, addr);
+ }
+ return ret;
+}
+
/**
* vfio_ap_irq_enable - Enable Interruption for a APQN
*
@@ -423,6 +444,14 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
h_nib = page_to_phys(h_page) | (nib & ~PAGE_MASK);
aqic_gisa.gisc = isc;
+ /* NIB in non-shared storage is a rc 6 for PV guests */
+ if (kvm_s390_pv_cpu_is_protected(vcpu) &&
+ ensure_nib_shared(h_nib & PAGE_MASK, kvm->arch.gmap)) {
+ vfio_unpin_pages(&q->matrix_mdev->vdev, nib, 1);
+ status.response_code = AP_RESPONSE_INVALID_ADDRESS;
+ return status;
+ }
+
nisc = kvm_s390_gisc_register(kvm, isc);
if (nisc < 0) {
VFIO_AP_DBF_WARN("%s: gisc registration failed: nisc=%d, isc=%d, apqn=%#04x\n",
@@ -675,7 +704,7 @@ static bool vfio_ap_mdev_filter_matrix(unsigned long *apm, unsigned long *aqm,
*/
apqn = AP_MKQID(apid, apqi);
q = vfio_ap_mdev_get_queue(matrix_mdev, apqn);
- if (!q || q->reset_rc) {
+ if (!q || q->reset_status.response_code) {
clear_bit_inv(apid,
matrix_mdev->shadow_apcb.apm);
break;
@@ -1608,19 +1637,21 @@ static int apq_status_check(int apqn, struct ap_queue_status *status)
{
switch (status->response_code) {
case AP_RESPONSE_NORMAL:
+ case AP_RESPONSE_DECONFIGURED:
+ return 0;
case AP_RESPONSE_RESET_IN_PROGRESS:
- if (status->queue_empty && !status->irq_enabled)
- return 0;
+ case AP_RESPONSE_BUSY:
return -EBUSY;
- case AP_RESPONSE_DECONFIGURED:
+ case AP_RESPONSE_ASSOC_SECRET_NOT_UNIQUE:
+ case AP_RESPONSE_ASSOC_FAILED:
/*
- * If the AP queue is deconfigured, any subsequent AP command
- * targeting the queue will fail with the same response code. On the
- * other hand, when an AP adapter is deconfigured, the associated
- * queues are reset, so let's return a value indicating the reset
- * for which we're waiting completed successfully.
+ * These asynchronous response codes indicate a PQAP(AAPQ)
+ * instruction to associate a secret with the guest failed. All
+ * subsequent AP instructions will end with the asynchronous
+ * response code until the AP queue is reset; so, let's return
+ * a value indicating a reset needs to be performed again.
*/
- return 0;
+ return -EAGAIN;
default:
WARN(true,
"failed to verify reset of queue %02x.%04x: TAPQ rc=%u\n",
@@ -1630,91 +1661,105 @@ static int apq_status_check(int apqn, struct ap_queue_status *status)
}
}
-static int apq_reset_check(struct vfio_ap_queue *q)
+#define WAIT_MSG "Waited %dms for reset of queue %02x.%04x (%u, %u, %u)"
+
+static void apq_reset_check(struct work_struct *reset_work)
{
- int ret;
- int iters = MAX_RESET_CHECK_WAIT / AP_RESET_INTERVAL;
+ int ret = -EBUSY, elapsed = 0;
struct ap_queue_status status;
+ struct vfio_ap_queue *q;
- for (; iters > 0; iters--) {
+ q = container_of(reset_work, struct vfio_ap_queue, reset_work);
+ memcpy(&status, &q->reset_status, sizeof(status));
+ while (true) {
msleep(AP_RESET_INTERVAL);
+ elapsed += AP_RESET_INTERVAL;
status = ap_tapq(q->apqn, NULL);
ret = apq_status_check(q->apqn, &status);
- if (ret != -EBUSY)
- return ret;
+ if (ret == -EIO)
+ return;
+ if (ret == -EBUSY) {
+ pr_notice_ratelimited(WAIT_MSG, elapsed,
+ AP_QID_CARD(q->apqn),
+ AP_QID_QUEUE(q->apqn),
+ status.response_code,
+ status.queue_empty,
+ status.irq_enabled);
+ } else {
+ if (q->reset_status.response_code == AP_RESPONSE_RESET_IN_PROGRESS ||
+ q->reset_status.response_code == AP_RESPONSE_BUSY ||
+ q->reset_status.response_code == AP_RESPONSE_STATE_CHANGE_IN_PROGRESS ||
+ ret == -EAGAIN) {
+ status = ap_zapq(q->apqn, 0);
+ memcpy(&q->reset_status, &status, sizeof(status));
+ continue;
+ }
+ /*
+ * When an AP adapter is deconfigured, the
+ * associated queues are reset, so let's set the
+ * status response code to 0 so the queue may be
+ * passed through (i.e., not filtered)
+ */
+ if (status.response_code == AP_RESPONSE_DECONFIGURED)
+ q->reset_status.response_code = 0;
+ if (q->saved_isc != VFIO_AP_ISC_INVALID)
+ vfio_ap_free_aqic_resources(q);
+ break;
+ }
}
- WARN_ONCE(iters <= 0,
- "timeout verifying reset of queue %02x.%04x (%u, %u, %u)",
- AP_QID_CARD(q->apqn), AP_QID_QUEUE(q->apqn),
- status.queue_empty, status.irq_enabled, status.response_code);
- return ret;
}
-static int vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q)
+static void vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q)
{
struct ap_queue_status status;
- int ret;
if (!q)
- return 0;
-retry_zapq:
+ return;
status = ap_zapq(q->apqn, 0);
- q->reset_rc = status.response_code;
+ memcpy(&q->reset_status, &status, sizeof(status));
switch (status.response_code) {
case AP_RESPONSE_NORMAL:
- ret = 0;
- /* if the reset has not completed, wait for it to take effect */
- if (!status.queue_empty || status.irq_enabled)
- ret = apq_reset_check(q);
- break;
case AP_RESPONSE_RESET_IN_PROGRESS:
+ case AP_RESPONSE_BUSY:
+ case AP_RESPONSE_STATE_CHANGE_IN_PROGRESS:
/*
- * There is a reset issued by another process in progress. Let's wait
- * for that to complete. Since we have no idea whether it was a RAPQ or
- * ZAPQ, then if it completes successfully, let's issue the ZAPQ.
+ * Let's verify whether the ZAPQ completed successfully on a work queue.
*/
- ret = apq_reset_check(q);
- if (ret)
- break;
- goto retry_zapq;
+ queue_work(system_long_wq, &q->reset_work);
+ break;
case AP_RESPONSE_DECONFIGURED:
/*
* When an AP adapter is deconfigured, the associated
- * queues are reset, so let's return a value indicating the reset
- * completed successfully.
+ * queues are reset, so let's set the status response code to 0
+ * so the queue may be passed through (i.e., not filtered).
*/
- ret = 0;
+ q->reset_status.response_code = 0;
+ vfio_ap_free_aqic_resources(q);
break;
default:
WARN(true,
"PQAP/ZAPQ for %02x.%04x failed with invalid rc=%u\n",
AP_QID_CARD(q->apqn), AP_QID_QUEUE(q->apqn),
status.response_code);
- return -EIO;
}
-
- vfio_ap_free_aqic_resources(q);
-
- return ret;
}
static int vfio_ap_mdev_reset_queues(struct ap_queue_table *qtable)
{
- int ret, loop_cursor, rc = 0;
+ int ret = 0, loop_cursor;
struct vfio_ap_queue *q;
+ hash_for_each(qtable->queues, loop_cursor, q, mdev_qnode)
+ vfio_ap_mdev_reset_queue(q);
+
hash_for_each(qtable->queues, loop_cursor, q, mdev_qnode) {
- ret = vfio_ap_mdev_reset_queue(q);
- /*
- * Regardless whether a queue turns out to be busy, or
- * is not operational, we need to continue resetting
- * the remaining queues.
- */
- if (ret)
- rc = ret;
+ flush_work(&q->reset_work);
+
+ if (q->reset_status.response_code)
+ ret = -EIO;
}
- return rc;
+ return ret;
}
static int vfio_ap_mdev_open_device(struct vfio_device *vdev)
@@ -2038,6 +2083,8 @@ int vfio_ap_mdev_probe_queue(struct ap_device *apdev)
q->apqn = to_ap_queue(&apdev->device)->qid;
q->saved_isc = VFIO_AP_ISC_INVALID;
+ memset(&q->reset_status, 0, sizeof(q->reset_status));
+ INIT_WORK(&q->reset_work, apq_reset_check);
matrix_mdev = get_update_locks_by_apqn(q->apqn);
if (matrix_mdev) {
@@ -2087,6 +2134,7 @@ void vfio_ap_mdev_remove_queue(struct ap_device *apdev)
}
vfio_ap_mdev_reset_queue(q);
+ flush_work(&q->reset_work);
dev_set_drvdata(&apdev->device, NULL);
kfree(q);
release_update_locks_for_mdev(matrix_mdev);
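
The reset rework replaces the inline 200 ms busy-wait with a work item: vfio_ap_mdev_reset_queue() issues the ZAPQ and, when the queue reports busy, defers the completion polling to system_long_wq; vfio_ap_mdev_reset_queues() then kicks every reset first and flushes each work item afterwards, so the queues reset in parallel and the outcome is read back from q->reset_status. A sketch of that deferred-completion shape (names hypothetical):

    struct my_queue {
            struct work_struct reset_work;
            struct ap_queue_status reset_status;
    };

    static void my_reset_poll(struct work_struct *work)
    {
            struct my_queue *q = container_of(work, struct my_queue, reset_work);
            /* poll the hardware, store the final state in q->reset_status */
    }

    INIT_WORK(&q->reset_work, my_reset_poll);
    queue_work(system_long_wq, &q->reset_work);     /* fire... */
    flush_work(&q->reset_work);                     /* ...and later wait */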
diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
index 4642bbdbd1b2..88aff8b81f2f 100644
--- a/drivers/s390/crypto/vfio_ap_private.h
+++ b/drivers/s390/crypto/vfio_ap_private.h
@@ -133,7 +133,8 @@ struct ap_matrix_mdev {
* @apqn: the APQN of the AP queue device
* @saved_isc: the guest ISC registered with the GIB interface
* @mdev_qnode: allows the vfio_ap_queue struct to be added to a hashtable
- * @reset_rc: the status response code from the last reset of the queue
+ * @reset_status: the status from the last reset of the queue
+ * @reset_work: work to wait for queue reset to complete
*/
struct vfio_ap_queue {
struct ap_matrix_mdev *matrix_mdev;
@@ -142,7 +143,8 @@ struct vfio_ap_queue {
#define VFIO_AP_ISC_INVALID 0xff
unsigned char saved_isc;
struct hlist_node mdev_qnode;
- unsigned int reset_rc;
+ struct ap_queue_status reset_status;
+ struct work_struct reset_work;
};
int vfio_ap_mdev_register(void);
diff --git a/drivers/s390/crypto/zcrypt_cex2a.c b/drivers/s390/crypto/zcrypt_cex2a.c
index 83f692c9c197..e69de29bb2d1 100644
--- a/drivers/s390/crypto/zcrypt_cex2a.c
+++ b/drivers/s390/crypto/zcrypt_cex2a.c
@@ -1,227 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright IBM Corp. 2001, 2012
- * Author(s): Robert Burroughs
- * Eric Rossman (edrossma@us.ibm.com)
- *
- * Hotplug & misc device support: Jochen Roehrig (roehrig@de.ibm.com)
- * Major cleanup & driver split: Martin Schwidefsky <schwidefsky@de.ibm.com>
- * Ralph Wuerthner <rwuerthn@de.ibm.com>
- * MSGTYPE restruct: Holger Dengler <hd@linux.vnet.ibm.com>
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/err.h>
-#include <linux/atomic.h>
-#include <linux/uaccess.h>
-#include <linux/mod_devicetable.h>
-
-#include "ap_bus.h"
-#include "zcrypt_api.h"
-#include "zcrypt_error.h"
-#include "zcrypt_cex2a.h"
-#include "zcrypt_msgtype50.h"
-
-#define CEX2A_MIN_MOD_SIZE 1 /* 8 bits */
-#define CEX2A_MAX_MOD_SIZE 256 /* 2048 bits */
-#define CEX3A_MIN_MOD_SIZE CEX2A_MIN_MOD_SIZE
-#define CEX3A_MAX_MOD_SIZE 512 /* 4096 bits */
-
-#define CEX2A_MAX_MESSAGE_SIZE 0x390 /* sizeof(struct type50_crb2_msg) */
-#define CEX2A_MAX_RESPONSE_SIZE 0x110 /* max outputdatalength + type80_hdr */
-
-#define CEX3A_MAX_RESPONSE_SIZE 0x210 /* 512 bit modulus
- * (max outputdatalength) +
- * type80_hdr
- */
-#define CEX3A_MAX_MESSAGE_SIZE sizeof(struct type50_crb3_msg)
-
-#define CEX2A_CLEANUP_TIME (15 * HZ)
-#define CEX3A_CLEANUP_TIME CEX2A_CLEANUP_TIME
-
-MODULE_AUTHOR("IBM Corporation");
-MODULE_DESCRIPTION("CEX2A/CEX3A Cryptographic Coprocessor device driver, " \
- "Copyright IBM Corp. 2001, 2018");
-MODULE_LICENSE("GPL");
-
-static struct ap_device_id zcrypt_cex2a_card_ids[] = {
- { .dev_type = AP_DEVICE_TYPE_CEX2A,
- .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE },
- { .dev_type = AP_DEVICE_TYPE_CEX3A,
- .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE },
- { /* end of list */ },
-};
-
-MODULE_DEVICE_TABLE(ap, zcrypt_cex2a_card_ids);
-
-static struct ap_device_id zcrypt_cex2a_queue_ids[] = {
- { .dev_type = AP_DEVICE_TYPE_CEX2A,
- .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
- { .dev_type = AP_DEVICE_TYPE_CEX3A,
- .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
- { /* end of list */ },
-};
-
-MODULE_DEVICE_TABLE(ap, zcrypt_cex2a_queue_ids);
-
-/*
- * Probe function for CEX2A card devices. It always accepts the AP device
- * since the bus_match already checked the card type.
- * @ap_dev: pointer to the AP device.
- */
-static int zcrypt_cex2a_card_probe(struct ap_device *ap_dev)
-{
- /*
- * Normalized speed ratings per crypto adapter
- * MEX_1k, MEX_2k, MEX_4k, CRT_1k, CRT_2k, CRT_4k, RNG, SECKEY
- */
- static const int CEX2A_SPEED_IDX[] = {
- 800, 1000, 2000, 900, 1200, 2400, 0, 0};
- static const int CEX3A_SPEED_IDX[] = {
- 400, 500, 1000, 450, 550, 1200, 0, 0};
-
- struct ap_card *ac = to_ap_card(&ap_dev->device);
- struct zcrypt_card *zc;
- int rc = 0;
-
- zc = zcrypt_card_alloc();
- if (!zc)
- return -ENOMEM;
- zc->card = ac;
- dev_set_drvdata(&ap_dev->device, zc);
-
- if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX2A) {
- zc->min_mod_size = CEX2A_MIN_MOD_SIZE;
- zc->max_mod_size = CEX2A_MAX_MOD_SIZE;
- zc->speed_rating = CEX2A_SPEED_IDX;
- zc->max_exp_bit_length = CEX2A_MAX_MOD_SIZE;
- zc->type_string = "CEX2A";
- zc->user_space_type = ZCRYPT_CEX2A;
- } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX3A) {
- zc->min_mod_size = CEX2A_MIN_MOD_SIZE;
- zc->max_mod_size = CEX2A_MAX_MOD_SIZE;
- zc->max_exp_bit_length = CEX2A_MAX_MOD_SIZE;
- if (ap_test_bit(&ac->functions, AP_FUNC_MEX4K) &&
- ap_test_bit(&ac->functions, AP_FUNC_CRT4K)) {
- zc->max_mod_size = CEX3A_MAX_MOD_SIZE;
- zc->max_exp_bit_length = CEX3A_MAX_MOD_SIZE;
- }
- zc->speed_rating = CEX3A_SPEED_IDX;
- zc->type_string = "CEX3A";
- zc->user_space_type = ZCRYPT_CEX3A;
- } else {
- zcrypt_card_free(zc);
- return -ENODEV;
- }
- zc->online = 1;
-
- rc = zcrypt_card_register(zc);
- if (rc)
- zcrypt_card_free(zc);
-
- return rc;
-}
-
-/*
- * This is called to remove the CEX2A card driver information
- * if an AP card device is removed.
- */
-static void zcrypt_cex2a_card_remove(struct ap_device *ap_dev)
-{
- struct zcrypt_card *zc = dev_get_drvdata(&ap_dev->device);
-
- zcrypt_card_unregister(zc);
-}
-
-static struct ap_driver zcrypt_cex2a_card_driver = {
- .probe = zcrypt_cex2a_card_probe,
- .remove = zcrypt_cex2a_card_remove,
- .ids = zcrypt_cex2a_card_ids,
- .flags = AP_DRIVER_FLAG_DEFAULT,
-};
-
-/*
- * Probe function for CEX2A queue devices. It always accepts the AP device
- * since the bus_match already checked the queue type.
- * @ap_dev: pointer to the AP device.
- */
-static int zcrypt_cex2a_queue_probe(struct ap_device *ap_dev)
-{
- struct ap_queue *aq = to_ap_queue(&ap_dev->device);
- struct zcrypt_queue *zq = NULL;
- int rc;
-
- switch (ap_dev->device_type) {
- case AP_DEVICE_TYPE_CEX2A:
- zq = zcrypt_queue_alloc(CEX2A_MAX_RESPONSE_SIZE);
- if (!zq)
- return -ENOMEM;
- break;
- case AP_DEVICE_TYPE_CEX3A:
- zq = zcrypt_queue_alloc(CEX3A_MAX_RESPONSE_SIZE);
- if (!zq)
- return -ENOMEM;
- break;
- }
- if (!zq)
- return -ENODEV;
- zq->ops = zcrypt_msgtype(MSGTYPE50_NAME, MSGTYPE50_VARIANT_DEFAULT);
- zq->queue = aq;
- zq->online = 1;
- atomic_set(&zq->load, 0);
- ap_queue_init_state(aq);
- ap_queue_init_reply(aq, &zq->reply);
- aq->request_timeout = CEX2A_CLEANUP_TIME;
- dev_set_drvdata(&ap_dev->device, zq);
- rc = zcrypt_queue_register(zq);
- if (rc)
- zcrypt_queue_free(zq);
-
- return rc;
-}
-
-/*
- * This is called to remove the CEX2A queue driver information
- * if an AP queue device is removed.
- */
-static void zcrypt_cex2a_queue_remove(struct ap_device *ap_dev)
-{
- struct zcrypt_queue *zq = dev_get_drvdata(&ap_dev->device);
-
- zcrypt_queue_unregister(zq);
-}
-
-static struct ap_driver zcrypt_cex2a_queue_driver = {
- .probe = zcrypt_cex2a_queue_probe,
- .remove = zcrypt_cex2a_queue_remove,
- .ids = zcrypt_cex2a_queue_ids,
- .flags = AP_DRIVER_FLAG_DEFAULT,
-};
-
-int __init zcrypt_cex2a_init(void)
-{
- int rc;
-
- rc = ap_driver_register(&zcrypt_cex2a_card_driver,
- THIS_MODULE, "cex2acard");
- if (rc)
- return rc;
-
- rc = ap_driver_register(&zcrypt_cex2a_queue_driver,
- THIS_MODULE, "cex2aqueue");
- if (rc)
- ap_driver_unregister(&zcrypt_cex2a_card_driver);
-
- return rc;
-}
-
-void __exit zcrypt_cex2a_exit(void)
-{
- ap_driver_unregister(&zcrypt_cex2a_queue_driver);
- ap_driver_unregister(&zcrypt_cex2a_card_driver);
-}
-
-module_init(zcrypt_cex2a_init);
-module_exit(zcrypt_cex2a_exit);
diff --git a/drivers/s390/crypto/zcrypt_cex2a.h b/drivers/s390/crypto/zcrypt_cex2a.h
index 7842214d9d09..e69de29bb2d1 100644
--- a/drivers/s390/crypto/zcrypt_cex2a.h
+++ b/drivers/s390/crypto/zcrypt_cex2a.h
@@ -1,134 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Copyright IBM Corp. 2001, 2006
- * Author(s): Robert Burroughs
- * Eric Rossman (edrossma@us.ibm.com)
- *
- * Hotplug & misc device support: Jochen Roehrig (roehrig@de.ibm.com)
- * Major cleanup & driver split: Martin Schwidefsky <schwidefsky@de.ibm.com>
- */
-
-#ifndef _ZCRYPT_CEX2A_H_
-#define _ZCRYPT_CEX2A_H_
-
-/**
- * The type 50 message family is associated with CEXxA cards.
- *
- * The four members of the family are described below.
- *
- * Note that all unsigned char arrays are right-justified and left-padded
- * with zeroes.
- *
- * Note that all reserved fields must be zeroes.
- */
-struct type50_hdr {
- unsigned char reserved1;
- unsigned char msg_type_code; /* 0x50 */
- unsigned short msg_len;
- unsigned char reserved2;
- unsigned char ignored;
- unsigned short reserved3;
-} __packed;
-
-#define TYPE50_TYPE_CODE 0x50
-
-#define TYPE50_MEB1_FMT 0x0001
-#define TYPE50_MEB2_FMT 0x0002
-#define TYPE50_MEB3_FMT 0x0003
-#define TYPE50_CRB1_FMT 0x0011
-#define TYPE50_CRB2_FMT 0x0012
-#define TYPE50_CRB3_FMT 0x0013
-
-/* Mod-Exp, with a small modulus */
-struct type50_meb1_msg {
- struct type50_hdr header;
- unsigned short keyblock_type; /* 0x0001 */
- unsigned char reserved[6];
- unsigned char exponent[128];
- unsigned char modulus[128];
- unsigned char message[128];
-} __packed;
-
-/* Mod-Exp, with a large modulus */
-struct type50_meb2_msg {
- struct type50_hdr header;
- unsigned short keyblock_type; /* 0x0002 */
- unsigned char reserved[6];
- unsigned char exponent[256];
- unsigned char modulus[256];
- unsigned char message[256];
-} __packed;
-
-/* Mod-Exp, with a larger modulus */
-struct type50_meb3_msg {
- struct type50_hdr header;
- unsigned short keyblock_type; /* 0x0003 */
- unsigned char reserved[6];
- unsigned char exponent[512];
- unsigned char modulus[512];
- unsigned char message[512];
-} __packed;
-
-/* CRT, with a small modulus */
-struct type50_crb1_msg {
- struct type50_hdr header;
- unsigned short keyblock_type; /* 0x0011 */
- unsigned char reserved[6];
- unsigned char p[64];
- unsigned char q[64];
- unsigned char dp[64];
- unsigned char dq[64];
- unsigned char u[64];
- unsigned char message[128];
-} __packed;
-
-/* CRT, with a large modulus */
-struct type50_crb2_msg {
- struct type50_hdr header;
- unsigned short keyblock_type; /* 0x0012 */
- unsigned char reserved[6];
- unsigned char p[128];
- unsigned char q[128];
- unsigned char dp[128];
- unsigned char dq[128];
- unsigned char u[128];
- unsigned char message[256];
-} __packed;
-
-/* CRT, with a larger modulus */
-struct type50_crb3_msg {
- struct type50_hdr header;
- unsigned short keyblock_type; /* 0x0013 */
- unsigned char reserved[6];
- unsigned char p[256];
- unsigned char q[256];
- unsigned char dp[256];
- unsigned char dq[256];
- unsigned char u[256];
- unsigned char message[512];
-} __packed;
-
-/**
- * The type 80 response family is associated with a CEXxA cards.
- *
- * Note that all unsigned char arrays are right-justified and left-padded
- * with zeroes.
- *
- * Note that all reserved fields must be zeroes.
- */
-
-#define TYPE80_RSP_CODE 0x80
-
-struct type80_hdr {
- unsigned char reserved1;
- unsigned char type; /* 0x80 */
- unsigned short len;
- unsigned char code; /* 0x00 */
- unsigned char reserved2[3];
- unsigned char reserved3[8];
-} __packed;
-
-int zcrypt_cex2a_init(void);
-void zcrypt_cex2a_exit(void);
-
-#endif /* _ZCRYPT_CEX2A_H_ */
diff --git a/drivers/s390/crypto/zcrypt_cex2c.c b/drivers/s390/crypto/zcrypt_cex2c.c
index 251b5bd3d19c..e69de29bb2d1 100644
--- a/drivers/s390/crypto/zcrypt_cex2c.c
+++ b/drivers/s390/crypto/zcrypt_cex2c.c
@@ -1,421 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright IBM Corp. 2001, 2018
- * Author(s): Robert Burroughs
- * Eric Rossman (edrossma@us.ibm.com)
- *
- * Hotplug & misc device support: Jochen Roehrig (roehrig@de.ibm.com)
- * Major cleanup & driver split: Martin Schwidefsky <schwidefsky@de.ibm.com>
- * Ralph Wuerthner <rwuerthn@de.ibm.com>
- * MSGTYPE restruct: Holger Dengler <hd@linux.vnet.ibm.com>
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/err.h>
-#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/atomic.h>
-#include <linux/uaccess.h>
-#include <linux/mod_devicetable.h>
-
-#include "ap_bus.h"
-#include "zcrypt_api.h"
-#include "zcrypt_error.h"
-#include "zcrypt_msgtype6.h"
-#include "zcrypt_cex2c.h"
-#include "zcrypt_cca_key.h"
-#include "zcrypt_ccamisc.h"
-
-#define CEX2C_MIN_MOD_SIZE 16 /* 128 bits */
-#define CEX2C_MAX_MOD_SIZE 256 /* 2048 bits */
-#define CEX3C_MIN_MOD_SIZE 16 /* 128 bits */
-#define CEX3C_MAX_MOD_SIZE 512 /* 4096 bits */
-#define CEX2C_MAX_XCRB_MESSAGE_SIZE (12 * 1024)
-#define CEX2C_CLEANUP_TIME (15 * HZ)
-
-MODULE_AUTHOR("IBM Corporation");
-MODULE_DESCRIPTION("CEX2C/CEX3C Cryptographic Coprocessor device driver, " \
- "Copyright IBM Corp. 2001, 2018");
-MODULE_LICENSE("GPL");
-
-static struct ap_device_id zcrypt_cex2c_card_ids[] = {
- { .dev_type = AP_DEVICE_TYPE_CEX2C,
- .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE },
- { .dev_type = AP_DEVICE_TYPE_CEX3C,
- .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE },
- { /* end of list */ },
-};
-
-MODULE_DEVICE_TABLE(ap, zcrypt_cex2c_card_ids);
-
-static struct ap_device_id zcrypt_cex2c_queue_ids[] = {
- { .dev_type = AP_DEVICE_TYPE_CEX2C,
- .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
- { .dev_type = AP_DEVICE_TYPE_CEX3C,
- .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
- { /* end of list */ },
-};
-
-MODULE_DEVICE_TABLE(ap, zcrypt_cex2c_queue_ids);
-
-/*
- * CCA card additional device attributes
- */
-static ssize_t cca_serialnr_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct zcrypt_card *zc = dev_get_drvdata(dev);
- struct cca_info ci;
- struct ap_card *ac = to_ap_card(dev);
-
- memset(&ci, 0, sizeof(ci));
-
- if (ap_domain_index >= 0)
- cca_get_info(ac->id, ap_domain_index, &ci, zc->online);
-
- return sysfs_emit(buf, "%s\n", ci.serial);
-}
-
-static struct device_attribute dev_attr_cca_serialnr =
- __ATTR(serialnr, 0444, cca_serialnr_show, NULL);
-
-static struct attribute *cca_card_attrs[] = {
- &dev_attr_cca_serialnr.attr,
- NULL,
-};
-
-static const struct attribute_group cca_card_attr_grp = {
- .attrs = cca_card_attrs,
-};
-
- /*
- * CCA queue additional device attributes
- */
-static ssize_t cca_mkvps_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct zcrypt_queue *zq = dev_get_drvdata(dev);
- int n = 0;
- struct cca_info ci;
- static const char * const cao_state[] = { "invalid", "valid" };
- static const char * const new_state[] = { "empty", "partial", "full" };
-
- memset(&ci, 0, sizeof(ci));
-
- cca_get_info(AP_QID_CARD(zq->queue->qid),
- AP_QID_QUEUE(zq->queue->qid),
- &ci, zq->online);
-
- if (ci.new_aes_mk_state >= '1' && ci.new_aes_mk_state <= '3')
- n = sysfs_emit(buf, "AES NEW: %s 0x%016llx\n",
- new_state[ci.new_aes_mk_state - '1'],
- ci.new_aes_mkvp);
- else
- n = sysfs_emit(buf, "AES NEW: - -\n");
-
- if (ci.cur_aes_mk_state >= '1' && ci.cur_aes_mk_state <= '2')
- n += sysfs_emit_at(buf, n, "AES CUR: %s 0x%016llx\n",
- cao_state[ci.cur_aes_mk_state - '1'],
- ci.cur_aes_mkvp);
- else
- n += sysfs_emit_at(buf, n, "AES CUR: - -\n");
-
- if (ci.old_aes_mk_state >= '1' && ci.old_aes_mk_state <= '2')
- n += sysfs_emit_at(buf, n, "AES OLD: %s 0x%016llx\n",
- cao_state[ci.old_aes_mk_state - '1'],
- ci.old_aes_mkvp);
- else
- n += sysfs_emit_at(buf, n, "AES OLD: - -\n");
-
- if (ci.new_apka_mk_state >= '1' && ci.new_apka_mk_state <= '3')
- n += sysfs_emit_at(buf, n, "APKA NEW: %s 0x%016llx\n",
- new_state[ci.new_apka_mk_state - '1'],
- ci.new_apka_mkvp);
- else
- n += sysfs_emit_at(buf, n, "APKA NEW: - -\n");
-
- if (ci.cur_apka_mk_state >= '1' && ci.cur_apka_mk_state <= '2')
- n += sysfs_emit_at(buf, n, "APKA CUR: %s 0x%016llx\n",
- cao_state[ci.cur_apka_mk_state - '1'],
- ci.cur_apka_mkvp);
- else
- n += sysfs_emit_at(buf, n, "APKA CUR: - -\n");
-
- if (ci.old_apka_mk_state >= '1' && ci.old_apka_mk_state <= '2')
- n += sysfs_emit_at(buf, n, "APKA OLD: %s 0x%016llx\n",
- cao_state[ci.old_apka_mk_state - '1'],
- ci.old_apka_mkvp);
- else
- n += sysfs_emit_at(buf, n, "APKA OLD: - -\n");
-
- return n;
-}
-
-static struct device_attribute dev_attr_cca_mkvps =
- __ATTR(mkvps, 0444, cca_mkvps_show, NULL);
-
-static struct attribute *cca_queue_attrs[] = {
- &dev_attr_cca_mkvps.attr,
- NULL,
-};
-
-static const struct attribute_group cca_queue_attr_grp = {
- .attrs = cca_queue_attrs,
-};
-
-/*
-/* Large random number detection function. It sends a message to a CEX2C/CEX3C
- * card to find out if large random numbers are supported.
- * @ap_dev: pointer to the AP device.
- *
- * Returns 1 if large random numbers are supported, 0 if not and < 0 on error.
- */
-static int zcrypt_cex2c_rng_supported(struct ap_queue *aq)
-{
- struct ap_message ap_msg;
- unsigned long psmid;
- unsigned int domain;
- struct {
- struct type86_hdr hdr;
- struct type86_fmt2_ext fmt2;
- struct CPRBX cprbx;
- } __packed *reply;
- struct {
- struct type6_hdr hdr;
- struct CPRBX cprbx;
- char function_code[2];
- short int rule_length;
- char rule[8];
- short int verb_length;
- short int key_length;
- } __packed *msg;
- int rc, i;
-
- ap_init_message(&ap_msg);
- ap_msg.msg = (void *)get_zeroed_page(GFP_KERNEL);
- if (!ap_msg.msg)
- return -ENOMEM;
- ap_msg.bufsize = PAGE_SIZE;
-
- rng_type6cprb_msgx(&ap_msg, 4, &domain);
-
- msg = ap_msg.msg;
- msg->cprbx.domain = AP_QID_QUEUE(aq->qid);
-
- rc = ap_send(aq->qid, 0x0102030405060708UL, ap_msg.msg, ap_msg.len);
- if (rc)
- goto out_free;
-
- /* Wait for the test message to complete. */
- for (i = 0; i < 2 * HZ; i++) {
- msleep(1000 / HZ);
- rc = ap_recv(aq->qid, &psmid, ap_msg.msg, ap_msg.bufsize);
- if (rc == 0 && psmid == 0x0102030405060708UL)
- break;
- }
-
- if (i >= 2 * HZ) {
- /* Got no answer. */
- rc = -ENODEV;
- goto out_free;
- }
-
- reply = ap_msg.msg;
- if (reply->cprbx.ccp_rtcode == 0 && reply->cprbx.ccp_rscode == 0)
- rc = 1;
- else
- rc = 0;
-out_free:
- free_page((unsigned long)ap_msg.msg);
- return rc;
-}
-
-/*
- * Probe function for CEX2C/CEX3C card devices. It always accepts the
- * AP device since the bus_match already checked the hardware type.
- * @ap_dev: pointer to the AP card device.
- */
-static int zcrypt_cex2c_card_probe(struct ap_device *ap_dev)
-{
- /*
- * Normalized speed ratings per crypto adapter
- * MEX_1k, MEX_2k, MEX_4k, CRT_1k, CRT_2k, CRT_4k, RNG, SECKEY
- */
- static const int CEX2C_SPEED_IDX[] = {
- 1000, 1400, 2400, 1100, 1500, 2600, 100, 12};
- static const int CEX3C_SPEED_IDX[] = {
- 500, 700, 1400, 550, 800, 1500, 80, 10};
-
- struct ap_card *ac = to_ap_card(&ap_dev->device);
- struct zcrypt_card *zc;
- int rc = 0;
-
- zc = zcrypt_card_alloc();
- if (!zc)
- return -ENOMEM;
- zc->card = ac;
- dev_set_drvdata(&ap_dev->device, zc);
- switch (ac->ap_dev.device_type) {
- case AP_DEVICE_TYPE_CEX2C:
- zc->user_space_type = ZCRYPT_CEX2C;
- zc->type_string = "CEX2C";
- zc->speed_rating = CEX2C_SPEED_IDX;
- zc->min_mod_size = CEX2C_MIN_MOD_SIZE;
- zc->max_mod_size = CEX2C_MAX_MOD_SIZE;
- zc->max_exp_bit_length = CEX2C_MAX_MOD_SIZE;
- break;
- case AP_DEVICE_TYPE_CEX3C:
- zc->user_space_type = ZCRYPT_CEX3C;
- zc->type_string = "CEX3C";
- zc->speed_rating = CEX3C_SPEED_IDX;
- zc->min_mod_size = CEX3C_MIN_MOD_SIZE;
- zc->max_mod_size = CEX3C_MAX_MOD_SIZE;
- zc->max_exp_bit_length = CEX3C_MAX_MOD_SIZE;
- break;
- default:
- zcrypt_card_free(zc);
- return -ENODEV;
- }
- zc->online = 1;
-
- rc = zcrypt_card_register(zc);
- if (rc) {
- zcrypt_card_free(zc);
- return rc;
- }
-
- if (ap_test_bit(&ac->functions, AP_FUNC_COPRO)) {
- rc = sysfs_create_group(&ap_dev->device.kobj,
- &cca_card_attr_grp);
- if (rc) {
- zcrypt_card_unregister(zc);
- zcrypt_card_free(zc);
- }
- }
-
- return rc;
-}
-
-/*
- * This is called to remove the CEX2C/CEX3C card driver information
- * if an AP card device is removed.
- */
-static void zcrypt_cex2c_card_remove(struct ap_device *ap_dev)
-{
- struct zcrypt_card *zc = dev_get_drvdata(&ap_dev->device);
- struct ap_card *ac = to_ap_card(&ap_dev->device);
-
- if (ap_test_bit(&ac->functions, AP_FUNC_COPRO))
- sysfs_remove_group(&ap_dev->device.kobj, &cca_card_attr_grp);
-
- zcrypt_card_unregister(zc);
-}
-
-static struct ap_driver zcrypt_cex2c_card_driver = {
- .probe = zcrypt_cex2c_card_probe,
- .remove = zcrypt_cex2c_card_remove,
- .ids = zcrypt_cex2c_card_ids,
- .flags = AP_DRIVER_FLAG_DEFAULT,
-};
-
-/*
- * Probe function for CEX2C/CEX3C queue devices. It always accepts the
- * AP device since the bus_match already checked the hardware type.
- * @ap_dev: pointer to the AP card device.
- */
-static int zcrypt_cex2c_queue_probe(struct ap_device *ap_dev)
-{
- struct ap_queue *aq = to_ap_queue(&ap_dev->device);
- struct zcrypt_queue *zq;
- int rc;
-
- zq = zcrypt_queue_alloc(CEX2C_MAX_XCRB_MESSAGE_SIZE);
- if (!zq)
- return -ENOMEM;
- zq->queue = aq;
- zq->online = 1;
- atomic_set(&zq->load, 0);
- ap_rapq(aq->qid, 0);
- rc = zcrypt_cex2c_rng_supported(aq);
- if (rc < 0) {
- zcrypt_queue_free(zq);
- return rc;
- }
- if (rc)
- zq->ops = zcrypt_msgtype(MSGTYPE06_NAME,
- MSGTYPE06_VARIANT_DEFAULT);
- else
- zq->ops = zcrypt_msgtype(MSGTYPE06_NAME,
- MSGTYPE06_VARIANT_NORNG);
- ap_queue_init_state(aq);
- ap_queue_init_reply(aq, &zq->reply);
- aq->request_timeout = CEX2C_CLEANUP_TIME;
- dev_set_drvdata(&ap_dev->device, zq);
- rc = zcrypt_queue_register(zq);
- if (rc) {
- zcrypt_queue_free(zq);
- return rc;
- }
-
- if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO)) {
- rc = sysfs_create_group(&ap_dev->device.kobj,
- &cca_queue_attr_grp);
- if (rc) {
- zcrypt_queue_unregister(zq);
- zcrypt_queue_free(zq);
- }
- }
-
- return rc;
-}
-
-/*
- * This is called to remove the CEX2C/CEX3C queue driver information
- * if an AP queue device is removed.
- */
-static void zcrypt_cex2c_queue_remove(struct ap_device *ap_dev)
-{
- struct zcrypt_queue *zq = dev_get_drvdata(&ap_dev->device);
- struct ap_queue *aq = to_ap_queue(&ap_dev->device);
-
- if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO))
- sysfs_remove_group(&ap_dev->device.kobj, &cca_queue_attr_grp);
-
- zcrypt_queue_unregister(zq);
-}
-
-static struct ap_driver zcrypt_cex2c_queue_driver = {
- .probe = zcrypt_cex2c_queue_probe,
- .remove = zcrypt_cex2c_queue_remove,
- .ids = zcrypt_cex2c_queue_ids,
- .flags = AP_DRIVER_FLAG_DEFAULT,
-};
-
-int __init zcrypt_cex2c_init(void)
-{
- int rc;
-
- rc = ap_driver_register(&zcrypt_cex2c_card_driver,
- THIS_MODULE, "cex2card");
- if (rc)
- return rc;
-
- rc = ap_driver_register(&zcrypt_cex2c_queue_driver,
- THIS_MODULE, "cex2cqueue");
- if (rc)
- ap_driver_unregister(&zcrypt_cex2c_card_driver);
-
- return rc;
-}
-
-void zcrypt_cex2c_exit(void)
-{
- ap_driver_unregister(&zcrypt_cex2c_queue_driver);
- ap_driver_unregister(&zcrypt_cex2c_card_driver);
-}
-
-module_init(zcrypt_cex2c_init);
-module_exit(zcrypt_cex2c_exit);
diff --git a/drivers/s390/crypto/zcrypt_cex2c.h b/drivers/s390/crypto/zcrypt_cex2c.h
index 6ec405c2bec2..e69de29bb2d1 100644
--- a/drivers/s390/crypto/zcrypt_cex2c.h
+++ b/drivers/s390/crypto/zcrypt_cex2c.h
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Copyright IBM Corp. 2001, 2018
- * Author(s): Robert Burroughs
- * Eric Rossman (edrossma@us.ibm.com)
- *
- * Hotplug & misc device support: Jochen Roehrig (roehrig@de.ibm.com)
- * Major cleanup & driver split: Martin Schwidefsky <schwidefsky@de.ibm.com>
- * MSGTYPE restruct: Holger Dengler <hd@linux.vnet.ibm.com>
- */
-
-#ifndef _ZCRYPT_CEX2C_H_
-#define _ZCRYPT_CEX2C_H_
-
-int zcrypt_cex2c_init(void);
-void zcrypt_cex2c_exit(void);
-
-#endif /* _ZCRYPT_CEX2C_H_ */
diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c
index 958f5ee47f1b..0a877f9792c2 100644
--- a/drivers/s390/crypto/zcrypt_ep11misc.c
+++ b/drivers/s390/crypto/zcrypt_ep11misc.c
@@ -29,6 +29,8 @@
#define DEBUG_WARN(...) ZCRYPT_DBF(DBF_WARN, ##__VA_ARGS__)
#define DEBUG_ERR(...) ZCRYPT_DBF(DBF_ERR, ##__VA_ARGS__)
+#define EP11_PINBLOB_V1_BYTES 56
+
/* default iv used here */
static const u8 def_iv[16] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff };
@@ -113,6 +115,109 @@ static void __exit card_cache_free(void)
spin_unlock_bh(&card_list_lock);
}
+static int ep11_kb_split(const u8 *kb, size_t kblen, u32 kbver,
+ struct ep11kblob_header **kbhdr, size_t *kbhdrsize,
+ u8 **kbpl, size_t *kbplsize)
+{
+ struct ep11kblob_header *hdr = NULL;
+ size_t hdrsize, plsize = 0;
+ int rc = -EINVAL;
+ u8 *pl = NULL;
+
+ if (kblen < sizeof(struct ep11kblob_header))
+ goto out;
+ hdr = (struct ep11kblob_header *)kb;
+
+ switch (kbver) {
+ case TOKVER_EP11_AES:
+ /* header overlays the payload */
+ hdrsize = 0;
+ break;
+ case TOKVER_EP11_ECC_WITH_HEADER:
+ case TOKVER_EP11_AES_WITH_HEADER:
+ /* payload starts after the header */
+ hdrsize = sizeof(struct ep11kblob_header);
+ break;
+ default:
+ goto out;
+ }
+
+ plsize = kblen - hdrsize;
+ pl = (u8 *)kb + hdrsize;
+
+ if (kbhdr)
+ *kbhdr = hdr;
+ if (kbhdrsize)
+ *kbhdrsize = hdrsize;
+ if (kbpl)
+ *kbpl = pl;
+ if (kbplsize)
+ *kbplsize = plsize;
+
+ rc = 0;
+out:
+ return rc;
+}
+
+static int ep11_kb_decode(const u8 *kb, size_t kblen,
+ struct ep11kblob_header **kbhdr, size_t *kbhdrsize,
+ struct ep11keyblob **kbpl, size_t *kbplsize)
+{
+ struct ep11kblob_header *tmph, *hdr = NULL;
+ size_t hdrsize = 0, plsize = 0;
+ struct ep11keyblob *pl = NULL;
+ int rc = -EINVAL;
+ u8 *tmpp;
+
+ if (kblen < sizeof(struct ep11kblob_header))
+ goto out;
+ tmph = (struct ep11kblob_header *)kb;
+
+ if (tmph->type != TOKTYPE_NON_CCA &&
+ tmph->len > kblen)
+ goto out;
+
+ if (ep11_kb_split(kb, kblen, tmph->version,
+ &hdr, &hdrsize, &tmpp, &plsize))
+ goto out;
+
+ if (plsize < sizeof(struct ep11keyblob))
+ goto out;
+
+ if (!is_ep11_keyblob(tmpp))
+ goto out;
+
+ pl = (struct ep11keyblob *)tmpp;
+ plsize = hdr->len - hdrsize;
+
+ if (kbhdr)
+ *kbhdr = hdr;
+ if (kbhdrsize)
+ *kbhdrsize = hdrsize;
+ if (kbpl)
+ *kbpl = pl;
+ if (kbplsize)
+ *kbplsize = plsize;
+
+ rc = 0;
+out:
+ return rc;
+}
+
+/*
+ * For valid ep11 keyblobs, returns a reference to the wrapping key verification
+ * pattern. Otherwise NULL.
+ */
+const u8 *ep11_kb_wkvp(const u8 *keyblob, size_t keybloblen)
+{
+ struct ep11keyblob *kb;
+
+ if (ep11_kb_decode(keyblob, keybloblen, NULL, NULL, &kb, NULL))
+ return NULL;
+ return kb->wkvp;
+}
+EXPORT_SYMBOL(ep11_kb_wkvp);
+
/*
* Simple check if the key blob is a valid EP11 AES key blob with header.
*/
@@ -489,7 +594,7 @@ static int ep11_query_info(u16 cardnr, u16 domain, u32 query_type,
struct ep11_cprb *req = NULL, *rep = NULL;
struct ep11_target_dev target;
struct ep11_urb *urb = NULL;
- int api = 1, rc = -ENOMEM;
+ int api = EP11_API_V1, rc = -ENOMEM;
/* request cprb and payload */
req = alloc_cprb(sizeof(struct ep11_info_req_pl));
@@ -664,8 +769,9 @@ EXPORT_SYMBOL(ep11_get_domain_info);
*/
#define KEY_ATTR_DEFAULTS 0x00200c00
-int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
- u8 *keybuf, size_t *keybufsize)
+static int _ep11_genaeskey(u16 card, u16 domain,
+ u32 keybitsize, u32 keygenflags,
+ u8 *keybuf, size_t *keybufsize)
{
struct keygen_req_pl {
struct pl_head head;
@@ -685,8 +791,7 @@ int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
u32 attr_bool_bits;
u32 attr_val_len_type;
u32 attr_val_len_value;
- u8 pin_tag;
- u8 pin_len;
+ /* followed by empty pin tag or empty pinblob tag */
} __packed * req_pl;
struct keygen_rep_pl {
struct pl_head head;
@@ -699,10 +804,11 @@ int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
u8 data[512];
} __packed * rep_pl;
struct ep11_cprb *req = NULL, *rep = NULL;
+ size_t req_pl_size, pinblob_size = 0;
struct ep11_target_dev target;
struct ep11_urb *urb = NULL;
- struct ep11keyblob *kb;
int api, rc = -ENOMEM;
+ u8 *p;
switch (keybitsize) {
case 128:
@@ -718,12 +824,22 @@ int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
}
/* request cprb and payload */
- req = alloc_cprb(sizeof(struct keygen_req_pl));
+ api = (!keygenflags || keygenflags & 0x00200000) ?
+ EP11_API_V4 : EP11_API_V1;
+ if (ap_is_se_guest()) {
+ /*
+ * genkey within SE environment requires API ordinal 6
+ * with empty pinblob
+ */
+ api = EP11_API_V6;
+ pinblob_size = EP11_PINBLOB_V1_BYTES;
+ }
+ req_pl_size = sizeof(struct keygen_req_pl) + ASN1TAGLEN(pinblob_size);
+ req = alloc_cprb(req_pl_size);
if (!req)
goto out;
req_pl = (struct keygen_req_pl *)(((u8 *)req) + sizeof(*req));
- api = (!keygenflags || keygenflags & 0x00200000) ? 4 : 1;
- prep_head(&req_pl->head, sizeof(*req_pl), api, 21); /* GenerateKey */
+ prep_head(&req_pl->head, req_pl_size, api, 21); /* GenerateKey */
req_pl->var_tag = 0x04;
req_pl->var_len = sizeof(u32);
req_pl->keybytes_tag = 0x04;
@@ -739,7 +855,10 @@ int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
req_pl->attr_bool_bits = keygenflags ? keygenflags : KEY_ATTR_DEFAULTS;
req_pl->attr_val_len_type = 0x00000161; /* CKA_VALUE_LEN */
req_pl->attr_val_len_value = keybitsize / 8;
- req_pl->pin_tag = 0x04;
+ p = ((u8 *)req_pl) + sizeof(*req_pl);
+ /* pin tag */
+ *p++ = 0x04;
+ *p++ = pinblob_size;
/* reply cprb and payload */
rep = alloc_cprb(sizeof(struct keygen_rep_pl));
@@ -754,7 +873,7 @@ int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
target.ap_id = card;
target.dom_id = domain;
prep_urb(urb, &target, 1,
- req, sizeof(*req) + sizeof(*req_pl),
+ req, sizeof(*req) + req_pl_size,
rep, sizeof(*rep) + sizeof(*rep_pl));
rc = zcrypt_send_ep11_cprb(urb);
@@ -780,14 +899,9 @@ int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
goto out;
}
- /* copy key blob and set header values */
+ /* copy key blob */
memcpy(keybuf, rep_pl->data, rep_pl->data_len);
*keybufsize = rep_pl->data_len;
- kb = (struct ep11keyblob *)keybuf;
- kb->head.type = TOKTYPE_NON_CCA;
- kb->head.len = rep_pl->data_len;
- kb->head.version = TOKVER_EP11_AES;
- kb->head.keybitlen = keybitsize;
out:
kfree(req);
@@ -795,6 +909,43 @@ out:
kfree(urb);
return rc;
}
+
+int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
+ u8 *keybuf, size_t *keybufsize, u32 keybufver)
+{
+ struct ep11kblob_header *hdr;
+ size_t hdr_size, pl_size;
+ u8 *pl;
+ int rc;
+
+ switch (keybufver) {
+ case TOKVER_EP11_AES:
+ case TOKVER_EP11_AES_WITH_HEADER:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ rc = ep11_kb_split(keybuf, *keybufsize, keybufver,
+ &hdr, &hdr_size, &pl, &pl_size);
+ if (rc)
+ return rc;
+
+ rc = _ep11_genaeskey(card, domain, keybitsize, keygenflags,
+ pl, &pl_size);
+ if (rc)
+ return rc;
+
+ *keybufsize = hdr_size + pl_size;
+
+ /* update header information */
+ hdr->type = TOKTYPE_NON_CCA;
+ hdr->len = *keybufsize;
+ hdr->version = keybufver;
+ hdr->bitlen = keybitsize;
+
+ return 0;
+}
EXPORT_SYMBOL(ep11_genaeskey);
static int ep11_cryptsingle(u16 card, u16 domain,
@@ -830,7 +981,7 @@ static int ep11_cryptsingle(u16 card, u16 domain,
struct ep11_target_dev target;
struct ep11_urb *urb = NULL;
size_t req_pl_size, rep_pl_size;
- int n, api = 1, rc = -ENOMEM;
+ int n, api = EP11_API_V1, rc = -ENOMEM;
u8 *p;
/* the simple asn1 coding used has length limits */
@@ -924,12 +1075,12 @@ out:
return rc;
}
-static int ep11_unwrapkey(u16 card, u16 domain,
- const u8 *kek, size_t keksize,
- const u8 *enckey, size_t enckeysize,
- u32 mech, const u8 *iv,
- u32 keybitsize, u32 keygenflags,
- u8 *keybuf, size_t *keybufsize)
+static int _ep11_unwrapkey(u16 card, u16 domain,
+ const u8 *kek, size_t keksize,
+ const u8 *enckey, size_t enckeysize,
+ u32 mech, const u8 *iv,
+ u32 keybitsize, u32 keygenflags,
+ u8 *keybuf, size_t *keybufsize)
{
struct uw_req_pl {
struct pl_head head;
@@ -949,7 +1100,7 @@ static int ep11_unwrapkey(u16 card, u16 domain,
* maybe followed by iv data
* followed by kek tag + kek blob
* followed by empty mac tag
- * followed by empty pin tag
+ * followed by empty pin tag or empty pinblob tag
 * followed by encrypted key tag + bytes
*/
} __packed * req_pl;
@@ -964,21 +1115,30 @@ static int ep11_unwrapkey(u16 card, u16 domain,
u8 data[512];
} __packed * rep_pl;
struct ep11_cprb *req = NULL, *rep = NULL;
+ size_t req_pl_size, pinblob_size = 0;
struct ep11_target_dev target;
struct ep11_urb *urb = NULL;
- struct ep11keyblob *kb;
- size_t req_pl_size;
int api, rc = -ENOMEM;
u8 *p;
/* request cprb and payload */
+ api = (!keygenflags || keygenflags & 0x00200000) ?
+ EP11_API_V4 : EP11_API_V1;
+ if (ap_is_se_guest()) {
+ /*
+ * unwrap within SE environment requires API ordinal 6
+ * with empty pinblob
+ */
+ api = EP11_API_V6;
+ pinblob_size = EP11_PINBLOB_V1_BYTES;
+ }
req_pl_size = sizeof(struct uw_req_pl) + (iv ? 16 : 0)
- + ASN1TAGLEN(keksize) + 4 + ASN1TAGLEN(enckeysize);
+ + ASN1TAGLEN(keksize) + ASN1TAGLEN(0)
+ + ASN1TAGLEN(pinblob_size) + ASN1TAGLEN(enckeysize);
req = alloc_cprb(req_pl_size);
if (!req)
goto out;
req_pl = (struct uw_req_pl *)(((u8 *)req) + sizeof(*req));
- api = (!keygenflags || keygenflags & 0x00200000) ? 4 : 1;
prep_head(&req_pl->head, req_pl_size, api, 34); /* UnwrapKey */
req_pl->attr_tag = 0x04;
req_pl->attr_len = 7 * sizeof(u32);
@@ -1003,9 +1163,10 @@ static int ep11_unwrapkey(u16 card, u16 domain,
/* empty mac key tag */
*p++ = 0x04;
*p++ = 0;
- /* empty pin tag */
+ /* pin tag */
*p++ = 0x04;
- *p++ = 0;
+ *p++ = pinblob_size;
+ p += pinblob_size;
/* encrypted key value tag and bytes */
p += asn1tag_write(p, 0x04, enckey, enckeysize);
@@ -1048,14 +1209,9 @@ static int ep11_unwrapkey(u16 card, u16 domain,
goto out;
}
- /* copy key blob and set header values */
+ /* copy key blob */
memcpy(keybuf, rep_pl->data, rep_pl->data_len);
*keybufsize = rep_pl->data_len;
- kb = (struct ep11keyblob *)keybuf;
- kb->head.type = TOKTYPE_NON_CCA;
- kb->head.len = rep_pl->data_len;
- kb->head.version = TOKVER_EP11_AES;
- kb->head.keybitlen = keybitsize;
out:
kfree(req);
@@ -1064,10 +1220,46 @@ out:
return rc;
}
-static int ep11_wrapkey(u16 card, u16 domain,
- const u8 *key, size_t keysize,
- u32 mech, const u8 *iv,
- u8 *databuf, size_t *datasize)
+static int ep11_unwrapkey(u16 card, u16 domain,
+ const u8 *kek, size_t keksize,
+ const u8 *enckey, size_t enckeysize,
+ u32 mech, const u8 *iv,
+ u32 keybitsize, u32 keygenflags,
+ u8 *keybuf, size_t *keybufsize,
+ u8 keybufver)
+{
+ struct ep11kblob_header *hdr;
+ size_t hdr_size, pl_size;
+ u8 *pl;
+ int rc;
+
+ rc = ep11_kb_split(keybuf, *keybufsize, keybufver,
+ &hdr, &hdr_size, &pl, &pl_size);
+ if (rc)
+ return rc;
+
+ rc = _ep11_unwrapkey(card, domain, kek, keksize, enckey, enckeysize,
+ mech, iv, keybitsize, keygenflags,
+ pl, &pl_size);
+ if (rc)
+ return rc;
+
+ *keybufsize = hdr_size + pl_size;
+
+ /* update header information */
+ hdr = (struct ep11kblob_header *)keybuf;
+ hdr->type = TOKTYPE_NON_CCA;
+ hdr->len = *keybufsize;
+ hdr->version = keybufver;
+ hdr->bitlen = keybitsize;
+
+ return 0;
+}
+
+static int _ep11_wrapkey(u16 card, u16 domain,
+ const u8 *key, size_t keysize,
+ u32 mech, const u8 *iv,
+ u8 *databuf, size_t *datasize)
{
struct wk_req_pl {
struct pl_head head;
@@ -1097,20 +1289,10 @@ static int ep11_wrapkey(u16 card, u16 domain,
struct ep11_cprb *req = NULL, *rep = NULL;
struct ep11_target_dev target;
struct ep11_urb *urb = NULL;
- struct ep11keyblob *kb;
size_t req_pl_size;
int api, rc = -ENOMEM;
- bool has_header = false;
u8 *p;
- /* maybe the session field holds a header with key info */
- kb = (struct ep11keyblob *)key;
- if (kb->head.type == TOKTYPE_NON_CCA &&
- kb->head.version == TOKVER_EP11_AES) {
- has_header = true;
- keysize = min_t(size_t, kb->head.len, keysize);
- }
-
/* request cprb and payload */
req_pl_size = sizeof(struct wk_req_pl) + (iv ? 16 : 0)
+ ASN1TAGLEN(keysize) + 4;
@@ -1120,7 +1302,8 @@ static int ep11_wrapkey(u16 card, u16 domain,
if (!mech || mech == 0x80060001)
req->flags |= 0x20; /* CPACF_WRAP needs special bit */
req_pl = (struct wk_req_pl *)(((u8 *)req) + sizeof(*req));
- api = (!mech || mech == 0x80060001) ? 4 : 1; /* CKM_IBM_CPACF_WRAP */
+ api = (!mech || mech == 0x80060001) ? /* CKM_IBM_CPACF_WRAP */
+ EP11_API_V4 : EP11_API_V1;
prep_head(&req_pl->head, req_pl_size, api, 33); /* WrapKey */
req_pl->var_tag = 0x04;
req_pl->var_len = sizeof(u32);
@@ -1135,11 +1318,6 @@ static int ep11_wrapkey(u16 card, u16 domain,
}
/* key blob */
p += asn1tag_write(p, 0x04, key, keysize);
- /* maybe the key argument needs the head data cleaned out */
- if (has_header) {
- kb = (struct ep11keyblob *)(p - keysize);
- memset(&kb->head, 0, sizeof(kb->head));
- }
/* empty kek tag */
*p++ = 0x04;
*p++ = 0;
@@ -1198,10 +1376,10 @@ out:
}
int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
- const u8 *clrkey, u8 *keybuf, size_t *keybufsize)
+ const u8 *clrkey, u8 *keybuf, size_t *keybufsize,
+ u32 keytype)
{
int rc;
- struct ep11keyblob *kb;
u8 encbuf[64], *kek = NULL;
size_t clrkeylen, keklen, encbuflen = sizeof(encbuf);
@@ -1223,17 +1401,15 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
}
/* Step 1: generate AES 256 bit random kek key */
- rc = ep11_genaeskey(card, domain, 256,
- 0x00006c00, /* EN/DECRYPT, WRAP/UNWRAP */
- kek, &keklen);
+ rc = _ep11_genaeskey(card, domain, 256,
+ 0x00006c00, /* EN/DECRYPT, WRAP/UNWRAP */
+ kek, &keklen);
if (rc) {
DEBUG_ERR(
"%s generate kek key failed, rc=%d\n",
__func__, rc);
goto out;
}
- kb = (struct ep11keyblob *)kek;
- memset(&kb->head, 0, sizeof(kb->head));
/* Step 2: encrypt clear key value with the kek key */
rc = ep11_cryptsingle(card, domain, 0, 0, def_iv, kek, keklen,
@@ -1248,7 +1424,7 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
/* Step 3: import the encrypted key value as a new key */
rc = ep11_unwrapkey(card, domain, kek, keklen,
encbuf, encbuflen, 0, def_iv,
- keybitsize, 0, keybuf, keybufsize);
+ keybitsize, 0, keybuf, keybufsize, keytype);
if (rc) {
DEBUG_ERR(
"%s importing key value as new key failed,, rc=%d\n",
@@ -1262,11 +1438,12 @@ out:
}
EXPORT_SYMBOL(ep11_clr2keyblob);
-int ep11_kblob2protkey(u16 card, u16 dom, const u8 *keyblob, size_t keybloblen,
+int ep11_kblob2protkey(u16 card, u16 dom,
+ const u8 *keyblob, size_t keybloblen,
u8 *protkey, u32 *protkeylen, u32 *protkeytype)
{
- int rc = -EIO;
- u8 *wkbuf = NULL;
+ struct ep11kblob_header *hdr;
+ struct ep11keyblob *key;
size_t wkbuflen, keylen;
struct wk_info {
u16 version;
@@ -1277,31 +1454,17 @@ int ep11_kblob2protkey(u16 card, u16 dom, const u8 *keyblob, size_t keybloblen,
u8 res2[8];
u8 pkey[];
} __packed * wki;
- const u8 *key;
- struct ep11kblob_header *hdr;
+ u8 *wkbuf = NULL;
+ int rc = -EIO;
- /* key with or without header ? */
- hdr = (struct ep11kblob_header *)keyblob;
- if (hdr->type == TOKTYPE_NON_CCA &&
- (hdr->version == TOKVER_EP11_AES_WITH_HEADER ||
- hdr->version == TOKVER_EP11_ECC_WITH_HEADER) &&
- is_ep11_keyblob(keyblob + sizeof(struct ep11kblob_header))) {
- /* EP11 AES or ECC key with header */
- key = keyblob + sizeof(struct ep11kblob_header);
- keylen = hdr->len - sizeof(struct ep11kblob_header);
- } else if (hdr->type == TOKTYPE_NON_CCA &&
- hdr->version == TOKVER_EP11_AES &&
- is_ep11_keyblob(keyblob)) {
- /* EP11 AES key (old style) */
- key = keyblob;
- keylen = hdr->len;
- } else if (is_ep11_keyblob(keyblob)) {
- /* raw EP11 key blob */
- key = keyblob;
- keylen = keybloblen;
- } else {
+ if (ep11_kb_decode((u8 *)keyblob, keybloblen, &hdr, NULL, &key, &keylen))
return -EINVAL;
+
+ if (hdr->version == TOKVER_EP11_AES) {
+ /* wipe overlaid header */
+ memset(hdr, 0, sizeof(*hdr));
}
+ /* !!! hdr is no longer a valid header !!! */
/* alloc temp working buffer */
wkbuflen = (keylen + AES_BLOCK_SIZE) & (~(AES_BLOCK_SIZE - 1));
@@ -1310,8 +1473,8 @@ int ep11_kblob2protkey(u16 card, u16 dom, const u8 *keyblob, size_t keybloblen,
return -ENOMEM;
/* ep11 secure key -> protected key + info */
- rc = ep11_wrapkey(card, dom, key, keylen,
- 0, def_iv, wkbuf, &wkbuflen);
+ rc = _ep11_wrapkey(card, dom, (u8 *)key, keylen,
+ 0, def_iv, wkbuf, &wkbuflen);
if (rc) {
DEBUG_ERR(
"%s rewrapping ep11 key to pkey failed, rc=%d\n",
diff --git a/drivers/s390/crypto/zcrypt_ep11misc.h b/drivers/s390/crypto/zcrypt_ep11misc.h
index a3eddf51242d..9d17fd5228a7 100644
--- a/drivers/s390/crypto/zcrypt_ep11misc.h
+++ b/drivers/s390/crypto/zcrypt_ep11misc.h
@@ -12,7 +12,9 @@
#include <asm/zcrypt.h>
#include <asm/pkey.h>
-#define EP11_API_V 4 /* highest known and supported EP11 API version */
+#define EP11_API_V1 1 /* min EP11 API, default if no higher api required */
+#define EP11_API_V4 4 /* supported EP11 API for the ep11misc cprbs */
+#define EP11_API_V6 6 /* min EP11 API for some cprbs in SE environment */
#define EP11_STRUCT_MAGIC 0x1234
#define EP11_BLOB_PKEY_EXTRACTABLE 0x00200000
@@ -29,14 +31,7 @@ struct ep11keyblob {
union {
u8 session[32];
/* only used for PKEY_TYPE_EP11: */
- struct {
- u8 type; /* 0x00 (TOKTYPE_NON_CCA) */
- u8 res0; /* unused */
- u16 len; /* total length in bytes of this blob */
- u8 version; /* 0x03 (TOKVER_EP11_AES) */
- u8 res1; /* unused */
- u16 keybitlen; /* clear key bit len, 0 for unknown */
- } head;
+ struct ep11kblob_header head;
};
u8 wkvp[16]; /* wrapping key verification pattern */
u64 attr; /* boolean key attributes */
@@ -56,6 +51,12 @@ static inline bool is_ep11_keyblob(const u8 *key)
}
/*
+ * For valid ep11 keyblobs, returns a reference to the wrapping key verification
+ * pattern. Otherwise NULL.
+ */
+const u8 *ep11_kb_wkvp(const u8 *kblob, size_t kbloblen);
+
+/*
* Simple check if the key blob is a valid EP11 AES key blob with header.
* If checkcpacfexport is enabled, the key is also checked for the
* attributes needed to export this key for CPACF use.
@@ -114,13 +115,14 @@ int ep11_get_domain_info(u16 card, u16 domain, struct ep11_domain_info *info);
* Generate (random) EP11 AES secure key.
*/
int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
- u8 *keybuf, size_t *keybufsize);
+ u8 *keybuf, size_t *keybufsize, u32 keybufver);
/*
* Generate EP11 AES secure key with given clear key value.
*/
int ep11_clr2keyblob(u16 cardnr, u16 domain, u32 keybitsize, u32 keygenflags,
- const u8 *clrkey, u8 *keybuf, size_t *keybufsize);
+ const u8 *clrkey, u8 *keybuf, size_t *keybufsize,
+ u32 keytype);
/*
 * Build a list of ep11 apqns meeting the following constraints:
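With the widened prototypes above, callers pick the blob layout via the new keybufver/keytype argument. A hedged in-kernel usage sketch, assuming placeholder card/domain values and a buffer size chosen only for illustration:

u16 card = 0, domain = 0;   /* placeholder APQN */
u8 keybuf[512];             /* size picked for the sketch only */
size_t keybufsize = sizeof(keybuf);
int rc;

/* generate a 256-bit AES key as a headered blob */
rc = ep11_genaeskey(card, domain, 256, 0 /* default keygenflags */,
                    keybuf, &keybufsize, TOKVER_EP11_AES_WITH_HEADER);
if (rc)
    pr_warn("ep11_genaeskey failed, rc=%d\n", rc);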
diff --git a/drivers/s390/crypto/zcrypt_msgtype50.c b/drivers/s390/crypto/zcrypt_msgtype50.c
index 51f8f7a463f7..2e155de8abe5 100644
--- a/drivers/s390/crypto/zcrypt_msgtype50.c
+++ b/drivers/s390/crypto/zcrypt_msgtype50.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * Copyright IBM Corp. 2001, 2012
+ * Copyright IBM Corp. 2001, 2023
* Author(s): Robert Burroughs
* Eric Rossman (edrossma@us.ibm.com)
*
@@ -28,15 +28,12 @@
/* >= CEX3A: 4096 bits */
#define CEX3A_MAX_MOD_SIZE 512
-/* CEX2A: max outputdatalength + type80_hdr */
-#define CEX2A_MAX_RESPONSE_SIZE 0x110
-
/* >= CEX3A: 512 bit modulus, (max outputdatalength) + type80_hdr */
#define CEX3A_MAX_RESPONSE_SIZE 0x210
MODULE_AUTHOR("IBM Corporation");
MODULE_DESCRIPTION("Cryptographic Accelerator (message type 50), " \
- "Copyright IBM Corp. 2001, 2012");
+ "Copyright IBM Corp. 2001, 2023");
MODULE_LICENSE("GPL");
/*
@@ -366,20 +363,17 @@ static int convert_type80(struct zcrypt_queue *zq,
ap_send_online_uevent(&zq->queue->ap_dev, zq->online);
return -EAGAIN;
}
- if (zq->zcard->user_space_type == ZCRYPT_CEX2A)
- BUG_ON(t80h->len > CEX2A_MAX_RESPONSE_SIZE);
- else
- BUG_ON(t80h->len > CEX3A_MAX_RESPONSE_SIZE);
+ BUG_ON(t80h->len > CEX3A_MAX_RESPONSE_SIZE);
data = reply->msg + t80h->len - outputdatalength;
if (copy_to_user(outputdata, data, outputdatalength))
return -EFAULT;
return 0;
}
-static int convert_response_cex2a(struct zcrypt_queue *zq,
- struct ap_message *reply,
- char __user *outputdata,
- unsigned int outputdatalength)
+static int convert_response(struct zcrypt_queue *zq,
+ struct ap_message *reply,
+ char __user *outputdata,
+ unsigned int outputdatalength)
{
/* Response type byte is the second byte in the response. */
unsigned char rtype = ((unsigned char *)reply->msg)[1];
@@ -414,9 +408,9 @@ static int convert_response_cex2a(struct zcrypt_queue *zq,
* @msg: pointer to the AP message
* @reply: pointer to the AP reply message
*/
-static void zcrypt_cex2a_receive(struct ap_queue *aq,
- struct ap_message *msg,
- struct ap_message *reply)
+static void zcrypt_msgtype50_receive(struct ap_queue *aq,
+ struct ap_message *msg,
+ struct ap_message *reply)
{
static struct error_hdr error_reply = {
.type = TYPE82_RSP_CODE,
@@ -456,19 +450,18 @@ static atomic_t zcrypt_step = ATOMIC_INIT(0);
* CEXxA device to the request distributor
* @mex: pointer to the modexpo request buffer
*/
-static long zcrypt_cex2a_modexpo(struct zcrypt_queue *zq,
- struct ica_rsa_modexpo *mex,
- struct ap_message *ap_msg)
+static long zcrypt_msgtype50_modexpo(struct zcrypt_queue *zq,
+ struct ica_rsa_modexpo *mex,
+ struct ap_message *ap_msg)
{
struct completion work;
int rc;
- ap_msg->bufsize = (zq->zcard->user_space_type == ZCRYPT_CEX2A) ?
- MSGTYPE50_CRB2_MAX_MSG_SIZE : MSGTYPE50_CRB3_MAX_MSG_SIZE;
+ ap_msg->bufsize = MSGTYPE50_CRB3_MAX_MSG_SIZE;
ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
if (!ap_msg->msg)
return -ENOMEM;
- ap_msg->receive = zcrypt_cex2a_receive;
+ ap_msg->receive = zcrypt_msgtype50_receive;
ap_msg->psmid = (((unsigned long)current->pid) << 32) +
atomic_inc_return(&zcrypt_step);
ap_msg->private = &work;
@@ -483,9 +476,9 @@ static long zcrypt_cex2a_modexpo(struct zcrypt_queue *zq,
if (rc == 0) {
rc = ap_msg->rc;
if (rc == 0)
- rc = convert_response_cex2a(zq, ap_msg,
- mex->outputdata,
- mex->outputdatalength);
+ rc = convert_response(zq, ap_msg,
+ mex->outputdata,
+ mex->outputdatalength);
} else {
/* Signal pending. */
ap_cancel_message(zq->queue, ap_msg);
@@ -507,19 +500,18 @@ out:
* CEXxA device to the request distributor
* @crt: pointer to the modexpoc_crt request buffer
*/
-static long zcrypt_cex2a_modexpo_crt(struct zcrypt_queue *zq,
- struct ica_rsa_modexpo_crt *crt,
- struct ap_message *ap_msg)
+static long zcrypt_msgtype50_modexpo_crt(struct zcrypt_queue *zq,
+ struct ica_rsa_modexpo_crt *crt,
+ struct ap_message *ap_msg)
{
struct completion work;
int rc;
- ap_msg->bufsize = (zq->zcard->user_space_type == ZCRYPT_CEX2A) ?
- MSGTYPE50_CRB2_MAX_MSG_SIZE : MSGTYPE50_CRB3_MAX_MSG_SIZE;
+ ap_msg->bufsize = MSGTYPE50_CRB3_MAX_MSG_SIZE;
ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL);
if (!ap_msg->msg)
return -ENOMEM;
- ap_msg->receive = zcrypt_cex2a_receive;
+ ap_msg->receive = zcrypt_msgtype50_receive;
ap_msg->psmid = (((unsigned long)current->pid) << 32) +
atomic_inc_return(&zcrypt_step);
ap_msg->private = &work;
@@ -534,9 +526,9 @@ static long zcrypt_cex2a_modexpo_crt(struct zcrypt_queue *zq,
if (rc == 0) {
rc = ap_msg->rc;
if (rc == 0)
- rc = convert_response_cex2a(zq, ap_msg,
- crt->outputdata,
- crt->outputdatalength);
+ rc = convert_response(zq, ap_msg,
+ crt->outputdata,
+ crt->outputdatalength);
} else {
/* Signal pending. */
ap_cancel_message(zq->queue, ap_msg);
@@ -555,8 +547,8 @@ out:
* The crypto operations for message type 50.
*/
static struct zcrypt_ops zcrypt_msgtype50_ops = {
- .rsa_modexpo = zcrypt_cex2a_modexpo,
- .rsa_modexpo_crt = zcrypt_cex2a_modexpo_crt,
+ .rsa_modexpo = zcrypt_msgtype50_modexpo,
+ .rsa_modexpo_crt = zcrypt_msgtype50_modexpo_crt,
.owner = THIS_MODULE,
.name = MSGTYPE50_NAME,
.variant = MSGTYPE50_VARIANT_DEFAULT,
diff --git a/drivers/s390/crypto/zcrypt_msgtype50.h b/drivers/s390/crypto/zcrypt_msgtype50.h
index eb49f06bed29..323e93b90b12 100644
--- a/drivers/s390/crypto/zcrypt_msgtype50.h
+++ b/drivers/s390/crypto/zcrypt_msgtype50.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
- * Copyright IBM Corp. 2001, 2012
+ * Copyright IBM Corp. 2001, 2023
* Author(s): Robert Burroughs
* Eric Rossman (edrossma@us.ibm.com)
*
@@ -15,7 +15,6 @@
#define MSGTYPE50_NAME "zcrypt_msgtype50"
#define MSGTYPE50_VARIANT_DEFAULT 0
-#define MSGTYPE50_CRB2_MAX_MSG_SIZE 0x390 /* sizeof(struct type50_crb2_msg) */
#define MSGTYPE50_CRB3_MAX_MSG_SIZE 0x710 /* sizeof(struct type50_crb3_msg) */
#define MSGTYPE_ADJUSTMENT 0x08 /* type04 extension (not needed in type50) */
diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c
index e668ff5eb384..3c53abbdc342 100644
--- a/drivers/s390/crypto/zcrypt_msgtype6.c
+++ b/drivers/s390/crypto/zcrypt_msgtype6.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * Copyright IBM Corp. 2001, 2022
+ * Copyright IBM Corp. 2001, 2023
* Author(s): Robert Burroughs
* Eric Rossman (edrossma@us.ibm.com)
*
@@ -42,7 +42,7 @@ struct response_type {
MODULE_AUTHOR("IBM Corporation");
MODULE_DESCRIPTION("Cryptographic Coprocessor (message type 6), " \
- "Copyright IBM Corp. 2001, 2012");
+ "Copyright IBM Corp. 2001, 2023");
MODULE_LICENSE("GPL");
struct function_and_rules_block {
@@ -1348,14 +1348,6 @@ out:
/*
* The crypto operations for a CEXxC card.
*/
-static struct zcrypt_ops zcrypt_msgtype6_norng_ops = {
- .owner = THIS_MODULE,
- .name = MSGTYPE06_NAME,
- .variant = MSGTYPE06_VARIANT_NORNG,
- .rsa_modexpo = zcrypt_msgtype6_modexpo,
- .rsa_modexpo_crt = zcrypt_msgtype6_modexpo_crt,
- .send_cprb = zcrypt_msgtype6_send_cprb,
-};
static struct zcrypt_ops zcrypt_msgtype6_ops = {
.owner = THIS_MODULE,
@@ -1378,14 +1370,12 @@ static struct zcrypt_ops zcrypt_msgtype6_ep11_ops = {
void __init zcrypt_msgtype6_init(void)
{
- zcrypt_msgtype_register(&zcrypt_msgtype6_norng_ops);
zcrypt_msgtype_register(&zcrypt_msgtype6_ops);
zcrypt_msgtype_register(&zcrypt_msgtype6_ep11_ops);
}
void __exit zcrypt_msgtype6_exit(void)
{
- zcrypt_msgtype_unregister(&zcrypt_msgtype6_norng_ops);
zcrypt_msgtype_unregister(&zcrypt_msgtype6_ops);
zcrypt_msgtype_unregister(&zcrypt_msgtype6_ep11_ops);
}
diff --git a/drivers/scsi/raid_class.c b/drivers/scsi/raid_class.c
index 711252e52d8e..95a86e0dfd77 100644
--- a/drivers/scsi/raid_class.c
+++ b/drivers/scsi/raid_class.c
@@ -209,54 +209,6 @@ raid_attr_ro_state(level);
raid_attr_ro_fn(resync);
raid_attr_ro_state_fn(state);
-static void raid_component_release(struct device *dev)
-{
- struct raid_component *rc =
- container_of(dev, struct raid_component, dev);
- dev_printk(KERN_ERR, rc->dev.parent, "COMPONENT RELEASE\n");
- put_device(rc->dev.parent);
- kfree(rc);
-}
-
-int raid_component_add(struct raid_template *r,struct device *raid_dev,
- struct device *component_dev)
-{
- struct device *cdev =
- attribute_container_find_class_device(&r->raid_attrs.ac,
- raid_dev);
- struct raid_component *rc;
- struct raid_data *rd = dev_get_drvdata(cdev);
- int err;
-
- rc = kzalloc(sizeof(*rc), GFP_KERNEL);
- if (!rc)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&rc->node);
- device_initialize(&rc->dev);
- rc->dev.release = raid_component_release;
- rc->dev.parent = get_device(component_dev);
- rc->num = rd->component_count++;
-
- dev_set_name(&rc->dev, "component-%d", rc->num);
- list_add_tail(&rc->node, &rd->component_list);
- rc->dev.class = &raid_class.class;
- err = device_add(&rc->dev);
- if (err)
- goto err_out;
-
- return 0;
-
-err_out:
- put_device(&rc->dev);
- list_del(&rc->node);
- rd->component_count--;
- put_device(component_dev);
- kfree(rc);
- return err;
-}
-EXPORT_SYMBOL(raid_component_add);
-
struct raid_template *
raid_class_attach(struct raid_function_template *ft)
{
diff --git a/drivers/scsi/snic/snic_disc.c b/drivers/scsi/snic/snic_disc.c
index e429ad23c396..4db3ba62fcd3 100644
--- a/drivers/scsi/snic/snic_disc.c
+++ b/drivers/scsi/snic/snic_disc.c
@@ -303,12 +303,11 @@ snic_tgt_create(struct snic *snic, struct snic_tgt_id *tgtid)
"Snic Tgt: device_add, with err = %d\n",
ret);
- put_device(&tgt->dev);
put_device(&snic->shost->shost_gendev);
spin_lock_irqsave(snic->shost->host_lock, flags);
list_del(&tgt->list);
spin_unlock_irqrestore(snic->shost->host_lock, flags);
- kfree(tgt);
+ put_device(&tgt->dev);
tgt = NULL;
return tgt;
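The snic change above follows the standard driver-model rule: once device_initialize() has run, the object's lifetime belongs to the embedded kobject, so error paths must drop the last reference with put_device() (which runs the release callback) rather than kfree(), which would bypass the release logic and can double-free. A generic sketch of the pattern, with made-up names:

#include <linux/device.h>
#include <linux/slab.h>

struct my_obj {                 /* illustrative only */
    struct device dev;
};

static void my_obj_release(struct device *dev)
{
    kfree(container_of(dev, struct my_obj, dev));
}

static struct my_obj *my_obj_create(struct device *parent)
{
    struct my_obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

    if (!o)
        return NULL;
    device_initialize(&o->dev);
    o->dev.parent = parent;
    o->dev.release = my_obj_release;
    dev_set_name(&o->dev, "my_obj");
    if (device_add(&o->dev)) {
        put_device(&o->dev);    /* frees o via my_obj_release() */
        return NULL;
    }
    return o;
}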
diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c
index b3c226eb5292..58746e570d14 100644
--- a/drivers/soc/fsl/qe/qe.c
+++ b/drivers/soc/fsl/qe/qe.c
@@ -524,7 +524,7 @@ int qe_upload_firmware(const struct qe_firmware *firmware)
* saved microcode information and put in the new.
*/
memset(&qe_firmware_info, 0, sizeof(qe_firmware_info));
- strlcpy(qe_firmware_info.id, firmware->id, sizeof(qe_firmware_info.id));
+ strscpy(qe_firmware_info.id, firmware->id, sizeof(qe_firmware_info.id));
qe_firmware_info.extended_modes = be64_to_cpu(firmware->extended_modes);
memcpy(qe_firmware_info.vtraps, firmware->vtraps,
sizeof(firmware->vtraps));
@@ -599,7 +599,7 @@ struct qe_firmware_info *qe_get_firmware_info(void)
/* Copy the data into qe_firmware_info*/
sprop = of_get_property(fw, "id", NULL);
if (sprop)
- strlcpy(qe_firmware_info.id, sprop,
+ strscpy(qe_firmware_info.id, sprop,
sizeof(qe_firmware_info.id));
of_property_read_u64(fw, "extended-modes",
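strscpy() is preferred over strlcpy() because it always NUL-terminates the destination, never has to read the entire (possibly unterminated) source to compute a return value, and reports truncation directly: it returns the number of characters copied, or -E2BIG when the source did not fit. A small fragment showing the semantics:

char buf[8];
ssize_t n;

n = strscpy(buf, "firmware-id-string", sizeof(buf));
if (n == -E2BIG)
    pr_debug("id truncated, buf=\"%s\"\n", buf); /* still NUL-terminated */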
diff --git a/drivers/spi/spi-cadence.c b/drivers/spi/spi-cadence.c
index de8fe3c5becb..9b021390263e 100644
--- a/drivers/spi/spi-cadence.c
+++ b/drivers/spi/spi-cadence.c
@@ -317,12 +317,6 @@ static void cdns_spi_process_fifo(struct cdns_spi *xspi, int ntx, int nrx)
xspi->rx_bytes -= nrx;
while (ntx || nrx) {
- /* When xspi in busy condition, bytes may send failed,
- * then spi control did't work thoroughly, add one byte delay
- */
- if (cdns_spi_read(xspi, CDNS_SPI_ISR) & CDNS_SPI_IXR_TXFULL)
- udelay(10);
-
if (ntx) {
if (xspi->txbuf)
cdns_spi_write(xspi, CDNS_SPI_TXD, *xspi->txbuf++);
@@ -392,6 +386,11 @@ static irqreturn_t cdns_spi_irq(int irq, void *dev_id)
if (xspi->tx_bytes) {
cdns_spi_process_fifo(xspi, trans_cnt, trans_cnt);
} else {
+ /* Fixed delay due to a controller limitation with the
+ * RX_NEMPTY status being reported incorrectly;
+ * Xilinx AR:65885 contains more details
+ */
+ udelay(10);
cdns_spi_process_fifo(xspi, 0, trans_cnt);
cdns_spi_write(xspi, CDNS_SPI_IDR,
CDNS_SPI_IXR_DEFAULT);
@@ -439,12 +438,18 @@ static int cdns_transfer_one(struct spi_controller *ctlr,
cdns_spi_setup_transfer(spi, transfer);
} else {
/* Set TX empty threshold to half of FIFO depth
- * only if TX bytes are more than half FIFO depth.
+ * only if TX bytes are more than FIFO depth.
*/
if (xspi->tx_bytes > xspi->tx_fifo_depth)
cdns_spi_write(xspi, CDNS_SPI_THLD, xspi->tx_fifo_depth >> 1);
}
+ /* When xspi is in a busy condition, bytes may fail to send and
+ * the SPI controller then doesn't work reliably, so add a one-byte delay
+ */
+ if (cdns_spi_read(xspi, CDNS_SPI_ISR) & CDNS_SPI_IXR_TXFULL)
+ udelay(10);
+
cdns_spi_process_fifo(xspi, xspi->tx_fifo_depth, 0);
spi_transfer_delay_exec(transfer);
diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c
index 6d10fa4ab783..7ddf9db776b0 100644
--- a/drivers/spi/spi-stm32.c
+++ b/drivers/spi/spi-stm32.c
@@ -1001,9 +1001,9 @@ static int stm32_spi_prepare_msg(struct spi_controller *ctrl,
if (spi->cfg->set_number_of_data) {
int ret;
- ret = spi_split_transfers_maxsize(ctrl, msg,
- STM32H7_SPI_TSIZE_MAX,
- GFP_KERNEL | GFP_DMA);
+ ret = spi_split_transfers_maxwords(ctrl, msg,
+ STM32H7_SPI_TSIZE_MAX,
+ GFP_KERNEL | GFP_DMA);
if (ret)
return ret;
}
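The switch from spi_split_transfers_maxsize() to spi_split_transfers_maxwords() matters because the STM32H7 TSIZE counter limits the number of words per transfer, not bytes: with 16-bit words, a byte-based limit of STM32H7_SPI_TSIZE_MAX would admit twice as many words as the counter can express. A sketch of the unit arithmetic; the helper name is made up and the per-word byte sizes follow the usual SPI core convention (an assumption here, not stated by this patch):

/* bytes occupied by one SPI word in the transfer buffer */
static inline unsigned int sketch_bytes_per_word(u32 bits_per_word)
{
    if (bits_per_word <= 8)
        return 1;
    if (bits_per_word <= 16)
        return 2;
    return 4;
}
/* e.g. a limit of N words at 16 bits/word spans 2 * N buffer bytes */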
diff --git a/drivers/thermal/intel/intel_tcc_cooling.c b/drivers/thermal/intel/intel_tcc_cooling.c
index e95f799454fe..6c392147e6d1 100644
--- a/drivers/thermal/intel/intel_tcc_cooling.c
+++ b/drivers/thermal/intel/intel_tcc_cooling.c
@@ -60,7 +60,7 @@ static const struct x86_cpu_id tcc_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, NULL),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, NULL),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, NULL),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, NULL),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, NULL),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, NULL),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, NULL),
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index cc2b5e81c620..a59700593d32 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -348,7 +348,8 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip_id)
struct thermal_trip trip;
+ __thermal_zone_get_trip(tz, trip_id, &trip);
+
 /* Ignore disabled trip points */
- if (test_bit(trip_id, &tz->trips_disabled))
+ if (test_bit(trip_id, &tz->trips_disabled) ||
+ trip.temperature == THERMAL_TEMP_INVALID)
 return;
- __thermal_zone_get_trip(tz, trip_id, &trip);
@@ -496,6 +497,25 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
}
EXPORT_SYMBOL_GPL(thermal_zone_device_update);
+/**
+ * thermal_zone_device_exec - Run a callback under the zone lock.
+ * @tz: Thermal zone.
+ * @cb: Callback to run.
+ * @data: Data to pass to the callback.
+ */
+void thermal_zone_device_exec(struct thermal_zone_device *tz,
+ void (*cb)(struct thermal_zone_device *,
+ unsigned long),
+ unsigned long data)
+{
+ mutex_lock(&tz->lock);
+
+ cb(tz, data);
+
+ mutex_unlock(&tz->lock);
+}
+EXPORT_SYMBOL_GPL(thermal_zone_device_exec);
+
static void thermal_zone_device_check(struct work_struct *work)
{
struct thermal_zone_device *tz = container_of(work, struct
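thermal_zone_device_exec() gives callers a way to run zone work under tz->lock without taking the lock directly. A hedged usage sketch; the callback name and body are illustrative only:

static void tz_log_state(struct thermal_zone_device *tz, unsigned long data)
{
    /* tz->lock is held here, so zone state can be read consistently */
    dev_dbg(&tz->device, "zone check, cookie=%lu\n", data);
}

/* ... somewhere in a driver: */
thermal_zone_device_exec(tz, tz_log_state, 0);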
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 17c1bbed734d..04513f9fbfa1 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -54,10 +54,6 @@ int for_each_thermal_cooling_device(int (*cb)(struct thermal_cooling_device *,
int for_each_thermal_governor(int (*cb)(struct thermal_governor *, void *),
void *thermal_governor);
-int __for_each_thermal_trip(struct thermal_zone_device *,
- int (*cb)(struct thermal_trip *, void *),
- void *);
-
struct thermal_zone_device *thermal_zone_get_by_id(int id);
struct thermal_attr {
diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c
index 907f3a4d7bc8..53115cfdfd42 100644
--- a/drivers/thermal/thermal_trip.c
+++ b/drivers/thermal/thermal_trip.c
@@ -9,28 +9,26 @@
*/
#include "thermal_core.h"
-int __for_each_thermal_trip(struct thermal_zone_device *tz,
- int (*cb)(struct thermal_trip *, void *),
- void *data)
+int for_each_thermal_trip(struct thermal_zone_device *tz,
+ int (*cb)(struct thermal_trip *, void *),
+ void *data)
{
int i, ret;
- struct thermal_trip trip;
lockdep_assert_held(&tz->lock);
- for (i = 0; i < tz->num_trips; i++) {
-
- ret = __thermal_zone_get_trip(tz, i, &trip);
- if (ret)
- return ret;
+ if (!tz->trips)
+ return -ENODATA;
- ret = cb(&trip, data);
+ for (i = 0; i < tz->num_trips; i++) {
+ ret = cb(&tz->trips[i], data);
if (ret)
return ret;
}
return 0;
}
+EXPORT_SYMBOL_GPL(for_each_thermal_trip);
int thermal_zone_get_num_trips(struct thermal_zone_device *tz)
{
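for_each_thermal_trip() now walks tz->trips directly and is exported, so callers can iterate the trip table without per-trip copies; the callback aborts the walk by returning non-zero, and tz->lock must already be held (see the lockdep assertion above), e.g. via thermal_zone_device_exec(). A sketch with a made-up callback:

static int count_active_trips(struct thermal_trip *trip, void *data)
{
    int *n = data;

    if (trip->type == THERMAL_TRIP_ACTIVE)
        (*n)++;
    return 0;   /* returning non-zero would stop the walk */
}

/* with tz->lock held: */
int n = 0;

for_each_thermal_trip(tz, count_active_trips, &n);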
diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c
index 6fb0e007af63..386674ead7f0 100644
--- a/drivers/ufs/core/ufs-mcq.c
+++ b/drivers/ufs/core/ufs-mcq.c
@@ -580,7 +580,6 @@ static bool ufshcd_mcq_sqe_search(struct ufs_hba *hba,
{
struct ufshcd_lrb *lrbp = &hba->lrb[task_tag];
struct utp_transfer_req_desc *utrd;
- u32 mask = hwq->max_entries - 1;
__le64 cmd_desc_base_addr;
bool ret = false;
u64 addr, match;
@@ -608,7 +607,10 @@ static bool ufshcd_mcq_sqe_search(struct ufs_hba *hba,
ret = true;
goto out;
}
- sq_head_slot = (sq_head_slot + 1) & mask;
+
+ sq_head_slot++;
+ if (sq_head_slot == hwq->max_entries)
+ sq_head_slot = 0;
}
out:
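The rewritten wrap-around drops the implicit assumption that hwq->max_entries is a power of two; the mask form silently corrupts the slot index for any other queue depth. Side-by-side sketch (slot and q_depth are local to the sketch):

/* old: correct only when q_depth == 2^n */
slot = (slot + 1) & (q_depth - 1);

/* new: correct for any q_depth */
if (++slot == q_depth)
    slot = 0;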
diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c
index 8d6fd4c3324f..c1557d21b027 100644
--- a/drivers/ufs/host/ufs-qcom.c
+++ b/drivers/ufs/host/ufs-qcom.c
@@ -321,7 +321,7 @@ static void ufs_qcom_select_unipro_mode(struct ufs_qcom_host *host)
ufs_qcom_cap_qunipro(host) ? QUNIPRO_SEL : 0,
REG_UFS_CFG1);
- if (host->hw_ver.major == 0x05)
+ if (host->hw_ver.major >= 0x05)
ufshcd_rmwl(host->hba, QUNIPRO_G4_SEL, 0, REG_UFS_CFG0);
/* make sure above configuration is applied before we return */
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 1a16a8bdea60..4f68f6ef3cc1 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -2642,21 +2642,21 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
snoop(&dev->dev, "%s: CONTROL\n", __func__);
ret = proc_control(ps, p);
if (ret >= 0)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
break;
case USBDEVFS_BULK:
snoop(&dev->dev, "%s: BULK\n", __func__);
ret = proc_bulk(ps, p);
if (ret >= 0)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
break;
case USBDEVFS_RESETEP:
snoop(&dev->dev, "%s: RESETEP\n", __func__);
ret = proc_resetep(ps, p);
if (ret >= 0)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
break;
case USBDEVFS_RESET:
@@ -2668,7 +2668,7 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
snoop(&dev->dev, "%s: CLEAR_HALT\n", __func__);
ret = proc_clearhalt(ps, p);
if (ret >= 0)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
break;
case USBDEVFS_GETDRIVER:
@@ -2695,7 +2695,7 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
snoop(&dev->dev, "%s: SUBMITURB\n", __func__);
ret = proc_submiturb(ps, p);
if (ret >= 0)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
break;
#ifdef CONFIG_COMPAT
@@ -2703,14 +2703,14 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
snoop(&dev->dev, "%s: CONTROL32\n", __func__);
ret = proc_control_compat(ps, p);
if (ret >= 0)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
break;
case USBDEVFS_BULK32:
snoop(&dev->dev, "%s: BULK32\n", __func__);
ret = proc_bulk_compat(ps, p);
if (ret >= 0)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
break;
case USBDEVFS_DISCSIGNAL32:
@@ -2722,7 +2722,7 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
snoop(&dev->dev, "%s: SUBMITURB32\n", __func__);
ret = proc_submiturb_compat(ps, p);
if (ret >= 0)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
break;
case USBDEVFS_IOCTL32:
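The conversions above follow the ctime accessor migration: i_ctime is no longer written directly, and inode_set_ctime_current() both updates ctime and returns the timestamp so it can be shared with mtime/atime, as the f_fs.c hunk below also does. The idiom:

struct timespec64 ts = inode_set_ctime_current(inode);

inode->i_atime = ts;
inode->i_mtime = ts;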
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index f41a385a5c42..6e9ef35a43a7 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -1377,7 +1377,7 @@ ffs_sb_make_inode(struct super_block *sb, void *data,
inode = new_inode(sb);
if (inode) {
- struct timespec64 ts = current_time(inode);
+ struct timespec64 ts = inode_set_ctime_current(inode);
inode->i_ino = get_next_ino();
inode->i_mode = perms->mode;
@@ -1385,7 +1385,6 @@ ffs_sb_make_inode(struct super_block *sb, void *data,
inode->i_gid = perms->gid;
inode->i_atime = ts;
inode->i_mtime = ts;
- inode->i_ctime = ts;
inode->i_private = data;
if (fops)
inode->i_fop = fops;
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index 28249d0bf062..ce9e31f3d26b 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -1969,8 +1969,7 @@ gadgetfs_make_inode (struct super_block *sb,
inode->i_mode = mode;
inode->i_uid = make_kuid(&init_user_ns, default_uid);
inode->i_gid = make_kgid(&init_user_ns, default_gid);
- inode->i_atime = inode->i_mtime = inode->i_ctime
- = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_private = data;
inode->i_fop = fops;
}
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index d5d7c402b651..d43153fec18e 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -269,6 +269,13 @@ config XEN_PRIVCMD
disaggregated Xen setups this driver might be needed for other
domains, too.
+config XEN_PRIVCMD_IRQFD
+ bool "Xen irqfd support"
+ depends on XEN_PRIVCMD && XEN_VIRTIO && EVENTFD
+ help
+ Using the irqfd mechanism, a virtio backend running in a daemon can
+ speed up interrupt injection into a guest.
+
config XEN_ACPI_PROCESSOR
tristate "Xen ACPI processor"
depends on XEN && XEN_PV_DOM0 && X86 && ACPI_PROCESSOR && CPU_FREQ
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index f13c3b76ad1e..35659bf70746 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -1044,7 +1044,7 @@ EXPORT_SYMBOL_GPL(gnttab_pages_clear_private);
/**
* gnttab_free_pages - free pages allocated by gnttab_alloc_pages()
- * @nr_pages; number of pages to free
+ * @nr_pages: number of pages to free
* @pages: the pages
*/
void gnttab_free_pages(int nr_pages, struct page **pages)
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index f447cd37cc4c..f00ad5f5f1d4 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -9,11 +9,16 @@
#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
+#include <linux/eventfd.h>
+#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/workqueue.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
@@ -833,6 +838,263 @@ out:
return rc;
}
+#ifdef CONFIG_XEN_PRIVCMD_IRQFD
+/* Irqfd support */
+static struct workqueue_struct *irqfd_cleanup_wq;
+static DEFINE_MUTEX(irqfds_lock);
+static LIST_HEAD(irqfds_list);
+
+struct privcmd_kernel_irqfd {
+ struct xen_dm_op_buf xbufs;
+ domid_t dom;
+ bool error;
+ struct eventfd_ctx *eventfd;
+ struct work_struct shutdown;
+ wait_queue_entry_t wait;
+ struct list_head list;
+ poll_table pt;
+};
+
+static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
+{
+ lockdep_assert_held(&irqfds_lock);
+
+ list_del_init(&kirqfd->list);
+ queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
+}
+
+static void irqfd_shutdown(struct work_struct *work)
+{
+ struct privcmd_kernel_irqfd *kirqfd =
+ container_of(work, struct privcmd_kernel_irqfd, shutdown);
+ u64 cnt;
+
+ eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
+ eventfd_ctx_put(kirqfd->eventfd);
+ kfree(kirqfd);
+}
+
+static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
+{
+ u64 cnt;
+ long rc;
+
+ eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
+
+ xen_preemptible_hcall_begin();
+ rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs);
+ xen_preemptible_hcall_end();
+
+ /* Don't repeat the error message for consecutive failures */
+ if (rc && !kirqfd->error) {
+ pr_err("Failed to configure irq for guest domain: %d\n",
+ kirqfd->dom);
+ }
+
+ kirqfd->error = rc;
+}
+
+static int
+irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
+{
+ struct privcmd_kernel_irqfd *kirqfd =
+ container_of(wait, struct privcmd_kernel_irqfd, wait);
+ __poll_t flags = key_to_poll(key);
+
+ if (flags & EPOLLIN)
+ irqfd_inject(kirqfd);
+
+ if (flags & EPOLLHUP) {
+ mutex_lock(&irqfds_lock);
+ irqfd_deactivate(kirqfd);
+ mutex_unlock(&irqfds_lock);
+ }
+
+ return 0;
+}
+
+static void
+irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct privcmd_kernel_irqfd *kirqfd =
+ container_of(pt, struct privcmd_kernel_irqfd, pt);
+
+ add_wait_queue_priority(wqh, &kirqfd->wait);
+}
+
+static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
+{
+ struct privcmd_kernel_irqfd *kirqfd, *tmp;
+ __poll_t events;
+ struct fd f;
+ void *dm_op;
+ int ret;
+
+ kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL);
+ if (!kirqfd)
+ return -ENOMEM;
+ dm_op = kirqfd + 1;
+
+ if (copy_from_user(dm_op, irqfd->dm_op, irqfd->size)) {
+ ret = -EFAULT;
+ goto error_kfree;
+ }
+
+ kirqfd->xbufs.size = irqfd->size;
+ set_xen_guest_handle(kirqfd->xbufs.h, dm_op);
+ kirqfd->dom = irqfd->dom;
+ INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
+
+ f = fdget(irqfd->fd);
+ if (!f.file) {
+ ret = -EBADF;
+ goto error_kfree;
+ }
+
+ kirqfd->eventfd = eventfd_ctx_fileget(f.file);
+ if (IS_ERR(kirqfd->eventfd)) {
+ ret = PTR_ERR(kirqfd->eventfd);
+ goto error_fd_put;
+ }
+
+ /*
+ * Install our own custom wake-up handling so we are notified via a
+ * callback whenever someone signals the underlying eventfd.
+ */
+ init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
+ init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
+
+ mutex_lock(&irqfds_lock);
+
+ list_for_each_entry(tmp, &irqfds_list, list) {
+ if (kirqfd->eventfd == tmp->eventfd) {
+ ret = -EBUSY;
+ mutex_unlock(&irqfds_lock);
+ goto error_eventfd;
+ }
+ }
+
+ list_add_tail(&kirqfd->list, &irqfds_list);
+ mutex_unlock(&irqfds_lock);
+
+ /*
+ * Check if there was an event already pending on the eventfd before we
+ * registered, and trigger it as if we didn't miss it.
+ */
+ events = vfs_poll(f.file, &kirqfd->pt);
+ if (events & EPOLLIN)
+ irqfd_inject(kirqfd);
+
+ /*
+ * Do not drop the file until the kirqfd is fully initialized, otherwise
+ * we might race against the EPOLLHUP.
+ */
+ fdput(f);
+ return 0;
+
+error_eventfd:
+ eventfd_ctx_put(kirqfd->eventfd);
+
+error_fd_put:
+ fdput(f);
+
+error_kfree:
+ kfree(kirqfd);
+ return ret;
+}
+
+static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
+{
+ struct privcmd_kernel_irqfd *kirqfd;
+ struct eventfd_ctx *eventfd;
+
+ eventfd = eventfd_ctx_fdget(irqfd->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ mutex_lock(&irqfds_lock);
+
+ list_for_each_entry(kirqfd, &irqfds_list, list) {
+ if (kirqfd->eventfd == eventfd) {
+ irqfd_deactivate(kirqfd);
+ break;
+ }
+ }
+
+ mutex_unlock(&irqfds_lock);
+
+ eventfd_ctx_put(eventfd);
+
+ /*
+ * Block until we know all outstanding shutdown jobs have completed so
+ * that we guarantee there will not be any more interrupts once this
+ * deassign function returns.
+ */
+ flush_workqueue(irqfd_cleanup_wq);
+
+ return 0;
+}
+
+static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
+{
+ struct privcmd_data *data = file->private_data;
+ struct privcmd_irqfd irqfd;
+
+ if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
+ return -EFAULT;
+
+ /* No other flags should be set */
+ if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
+ return -EINVAL;
+
+ /* If restriction is in place, check the domid matches */
+ if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
+ return -EPERM;
+
+ if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
+ return privcmd_irqfd_deassign(&irqfd);
+
+ return privcmd_irqfd_assign(&irqfd);
+}
+
+static int privcmd_irqfd_init(void)
+{
+ irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
+ if (!irqfd_cleanup_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void privcmd_irqfd_exit(void)
+{
+ struct privcmd_kernel_irqfd *kirqfd, *tmp;
+
+ mutex_lock(&irqfds_lock);
+
+ list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
+ irqfd_deactivate(kirqfd);
+
+ mutex_unlock(&irqfds_lock);
+
+ destroy_workqueue(irqfd_cleanup_wq);
+}
+#else
+static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int privcmd_irqfd_init(void)
+{
+ return 0;
+}
+
+static inline void privcmd_irqfd_exit(void)
+{
+}
+#endif /* CONFIG_XEN_PRIVCMD_IRQFD */
+
static long privcmd_ioctl(struct file *file,
unsigned int cmd, unsigned long data)
{
@@ -868,6 +1130,10 @@ static long privcmd_ioctl(struct file *file,
ret = privcmd_ioctl_mmap_resource(file, udata);
break;
+ case IOCTL_PRIVCMD_IRQFD:
+ ret = privcmd_ioctl_irqfd(file, udata);
+ break;
+
default:
break;
}
@@ -992,15 +1258,27 @@ static int __init privcmd_init(void)
err = misc_register(&xen_privcmdbuf_dev);
if (err != 0) {
pr_err("Could not register Xen hypercall-buf device\n");
- misc_deregister(&privcmd_dev);
- return err;
+ goto err_privcmdbuf;
+ }
+
+ err = privcmd_irqfd_init();
+ if (err != 0) {
+ pr_err("irqfd init failed\n");
+ goto err_irqfd;
}
return 0;
+
+err_irqfd:
+ misc_deregister(&xen_privcmdbuf_dev);
+err_privcmdbuf:
+ misc_deregister(&privcmd_dev);
+ return err;
}
static void __exit privcmd_exit(void)
{
+ privcmd_irqfd_exit();
misc_deregister(&privcmd_dev);
misc_deregister(&xen_privcmdbuf_dev);
}
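
For readers following the irqfd plumbing above: the intended consumer is a userspace device model, which registers an eventfd so that signalling it makes the kernel replay a prepared dm_op against the target domain. A minimal userspace sketch follows; it assumes /dev/xen/privcmd and that the uapi struct privcmd_irqfd mirrors the kernel-side fields referenced in privcmd_irqfd_assign() (dm_op buffer, size, fd, flags, dom), with the ioctl number and flag taken from the uapi header — verify the layout against include/uapi/xen/privcmd.h.

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/eventfd.h>
    #include <sys/ioctl.h>
    #include <xen/privcmd.h>        /* assumed uapi header location */

    static int irqfd_demo(uint16_t domid, void *dm_op_buf, uint32_t size)
    {
            int privcmd = open("/dev/xen/privcmd", O_RDWR | O_CLOEXEC);
            int efd = eventfd(0, EFD_CLOEXEC);
            struct privcmd_irqfd irqfd = {
                    .dm_op = (uintptr_t)dm_op_buf, /* buffer the kernel replays */
                    .size  = size,
                    .fd    = efd,
                    .dom   = domid,
            };

            if (privcmd < 0 || efd < 0)
                    return -1;

            /* Assign: each eventfd_write(efd, 1) now injects via the dm_op. */
            if (ioctl(privcmd, IOCTL_PRIVCMD_IRQFD, &irqfd))
                    return -1;

            /* ... signal efd from the device model's event loop ... */

            /* Deassign: same struct with the flag set; blocks until the
             * shutdown work has flushed, per privcmd_irqfd_deassign(). */
            irqfd.flags = PRIVCMD_IRQFD_FLAG_DEASSIGN;
            ioctl(privcmd, IOCTL_PRIVCMD_IRQFD, &irqfd);

            close(efd);
            close(privcmd);
            return 0;
    }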
diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c
index 9cb61db67efd..296703939846 100644
--- a/drivers/xen/xen-acpi-processor.c
+++ b/drivers/xen/xen-acpi-processor.c
@@ -473,11 +473,8 @@ static int xen_upload_processor_pm_data(void)
if (!_pr)
continue;
- if (!pr_backup) {
- pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
- if (pr_backup)
- memcpy(pr_backup, _pr, sizeof(struct acpi_processor));
- }
+ if (!pr_backup)
+ pr_backup = kmemdup(_pr, sizeof(*_pr), GFP_KERNEL);
(void)upload_pm_data(_pr);
}
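
The xen-acpi-processor hunk above is the standard kmemdup() cleanup: the helper allocates and copies in one call, and the zeroing done by the removed kzalloc() was redundant anyway, since memcpy() immediately overwrote the whole allocation. The equivalence, as a sketch:

    /* Before: allocate zeroed memory, then copy over all of it. */
    pr_backup = kzalloc(sizeof(*_pr), GFP_KERNEL);
    if (pr_backup)
            memcpy(pr_backup, _pr, sizeof(*_pr));

    /* After: one call, same NULL-on-allocation-failure semantics. */
    pr_backup = kmemdup(_pr, sizeof(*_pr), GFP_KERNEL);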
diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h
index d873abe35bf6..fc1557dfef49 100644
--- a/drivers/xen/xen-pciback/conf_space_quirks.h
+++ b/drivers/xen/xen-pciback/conf_space_quirks.h
@@ -21,8 +21,6 @@ struct xen_pcibk_config_quirk {
int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field
*field);
-int xen_pcibk_config_quirks_remove_field(struct pci_dev *dev, int reg);
-
int xen_pcibk_config_quirks_init(struct pci_dev *dev);
void xen_pcibk_config_field_free(struct config_field *field);
diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h
index 9a64196e831d..f9599ed2f2e2 100644
--- a/drivers/xen/xen-pciback/pciback.h
+++ b/drivers/xen/xen-pciback/pciback.h
@@ -201,6 +201,3 @@ static inline void xen_pcibk_lateeoi(struct xen_pcibk_device *pdev,
int xen_pcibk_xenbus_register(void);
void xen_pcibk_xenbus_unregister(void);
#endif
-
-/* Handles shared IRQs that can to device domain and control domain. */
-void xen_pcibk_irq_handler(struct pci_dev *dev, int reset);
diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
index 3f3836cb7279..fcb335bb7b18 100644
--- a/drivers/xen/xenbus/xenbus_probe_frontend.c
+++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
@@ -429,7 +429,7 @@ static void xenbus_check_frontend(char *class, char *dev)
printk(KERN_DEBUG "XENBUS: frontend %s %s\n",
frontend, xenbus_strstate(fe_state));
backend = xenbus_read(XBT_NIL, frontend, "backend", NULL);
- if (!backend || IS_ERR(backend))
+ if (IS_ERR_OR_NULL(backend))
goto out;
err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state);
if (err == 1)
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index 12e02eb01f59..028a182bcc9e 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -840,8 +840,8 @@ void xs_suspend(void)
{
xs_suspend_enter();
- down_write(&xs_watch_rwsem);
mutex_lock(&xs_response_mutex);
+ down_write(&xs_watch_rwsem);
}
void xs_resume(void)
@@ -866,8 +866,8 @@ void xs_resume(void)
void xs_suspend_cancel(void)
{
- mutex_unlock(&xs_response_mutex);
up_write(&xs_watch_rwsem);
+ mutex_unlock(&xs_response_mutex);
xs_suspend_exit();
}
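
The xenbus_xs.c hunks above are a lock-ordering fix: xs_suspend() now takes xs_response_mutex before xs_watch_rwsem, and xs_suspend_cancel() drops them in the exact reverse order. Keeping a single global acquisition order is the usual way to rule out ABBA deadlocks; a minimal sketch of the invariant, with hypothetical locks:

    static DEFINE_MUTEX(lock_a);
    static DECLARE_RWSEM(lock_b);

    static void suspend_like_path(void)
    {
            mutex_lock(&lock_a);    /* always first ... */
            down_write(&lock_b);    /* ... always second */
    }

    static void cancel_like_path(void)
    {
            up_write(&lock_b);      /* release in reverse: b ... */
            mutex_unlock(&lock_a);  /* ... then a */
    }

If any other path took lock_b before lock_a, two threads could each end up holding one lock while waiting for the other, which is exactly the class of deadlock the reordering avoids.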
diff --git a/drivers/zorro/names.c b/drivers/zorro/names.c
index fa3c83dbe843..077114ccc840 100644
--- a/drivers/zorro/names.c
+++ b/drivers/zorro/names.c
@@ -15,6 +15,7 @@
#include <linux/types.h>
#include <linux/zorro.h>
+#include "zorro.h"
struct zorro_prod_info {
__u16 prod;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 950cf61f118b..0d28ecf668d0 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_blocks = 0;
inode->i_rdev = rdev;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_mapping->a_ops = &v9fs_addr_operations;
inode->i_private = NULL;
@@ -1011,7 +1011,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path,
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
} else if (v9ses->cache & CACHE_WRITEBACK) {
if (S_ISREG(inode->i_mode)) {
@@ -1032,7 +1032,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path,
return PTR_ERR(st);
v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
p9stat_free(st);
kfree(st);
@@ -1152,7 +1152,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
inode->i_atime.tv_sec = stat->atime;
inode->i_mtime.tv_sec = stat->mtime;
- inode->i_ctime.tv_sec = stat->mtime;
+ inode_set_ctime(inode, stat->mtime, 0);
inode->i_uid = v9ses->dfltuid;
inode->i_gid = v9ses->dfltgid;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 14510872ecc3..1312f68965ac 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -450,7 +450,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
} else if (v9ses->cache) {
if (S_ISREG(inode->i_mode)) {
@@ -475,7 +475,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
return PTR_ERR(st);
v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
/* Change block size to what the server returned */
stat->blksize = st->st_blksize;
@@ -645,8 +645,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_atime.tv_nsec = stat->st_atime_nsec;
inode->i_mtime.tv_sec = stat->st_mtime_sec;
inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
- inode->i_ctime.tv_sec = stat->st_ctime_sec;
- inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+ inode_set_ctime(inode, stat->st_ctime_sec,
+ stat->st_ctime_nsec);
inode->i_uid = stat->st_uid;
inode->i_gid = stat->st_gid;
set_nlink(inode, stat->st_nlink);
@@ -668,8 +668,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
}
if (stat->st_result_mask & P9_STATS_CTIME) {
- inode->i_ctime.tv_sec = stat->st_ctime_sec;
- inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+ inode_set_ctime(inode, stat->st_ctime_sec,
+ stat->st_ctime_nsec);
}
if (stat->st_result_mask & P9_STATS_UID)
inode->i_uid = stat->st_uid;
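
The 9p hunks above (and the afs one later in this section) are fallout from the reworked generic_fillattr(), which now takes the statx request_mask so the helper can skip fields the caller never asked for. A hedged sketch of the resulting ->getattr() shape, with filesystem specifics elided:

    static int example_getattr(struct mnt_idmap *idmap, const struct path *path,
                               struct kstat *stat, u32 request_mask,
                               unsigned int query_flags)
    {
            struct inode *inode = d_inode(path->dentry);

            /* Forward request_mask so only the requested fields are filled. */
            generic_fillattr(idmap, request_mask, inode, stat);
            return 0;
    }

(9p passes &nop_mnt_idmap rather than the caller's idmap because of how it handles ownership; that choice is unchanged by this series.)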
diff --git a/fs/Kconfig b/fs/Kconfig
index 18d034ec7953..7da21f563192 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -205,8 +205,8 @@ config TMPFS_XATTR
Extended attributes are name:value pairs associated with inodes by
the kernel or by users (see the attr(5) manual page for details).
- Currently this enables support for the trusted.* and
- security.* namespaces.
+ This enables support for the trusted.*, security.* and user.*
+ namespaces.
You need this for POSIX ACL support on tmpfs.
@@ -233,6 +233,18 @@ config TMPFS_INODE64
If unsure, say N.
+config TMPFS_QUOTA
+ bool "Tmpfs quota support"
+ depends on TMPFS
+ select QUOTA
+ help
+ Quota support allows setting per-user and per-group limits for
+ tmpfs usage. Say Y to enable quota support. Once enabled, you can
+ control user and group quota enforcement with the quota, usrquota
+ and grpquota mount options.
+
+ If unsure, say N.
+
config ARCH_SUPPORTS_HUGETLBFS
def_bool n
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index c3ac613d0975..20963002578a 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -270,7 +270,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
inode->i_mode = adfs_atts2mode(sb, inode);
adfs_adfs2unix_time(&inode->i_mtime, inode);
inode->i_atime = inode->i_mtime;
- inode->i_ctime = inode->i_mtime;
+ inode_set_ctime_to_ts(inode, inode->i_mtime);
if (S_ISDIR(inode->i_mode)) {
inode->i_op = &adfs_dir_inode_operations;
@@ -331,7 +331,7 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
if (ia_valid & ATTR_ATIME)
inode->i_atime = attr->ia_atime;
if (ia_valid & ATTR_CTIME)
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
ADFS_I(inode)->attr = adfs_mode2atts(sb, inode, attr->ia_mode);
inode->i_mode = adfs_atts2mode(sb, inode);
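
Most of the filesystem churn in the rest of this section is the mechanical conversion from direct inode->i_ctime access to the ctime accessor helpers, done in preparation for multigrain timestamps. The mapping, as a sketch:

    /* inode->i_ctime = current_time(inode);              becomes: */
    inode_set_ctime_current(inode);

    /* inode->i_ctime = ts;                               becomes: */
    inode_set_ctime_to_ts(inode, ts);

    /* inode->i_ctime.tv_sec = sec; ...tv_nsec = nsec;    becomes: */
    inode_set_ctime(inode, sec, nsec);

    /* ts = inode->i_ctime;                               becomes: */
    ts = inode_get_ctime(inode);

Each setter returns the timespec64 it stored, which is why combined updates such as dir->i_mtime = inode_set_ctime_current(dir) in the hunks above and below work as one-liners.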
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 29f11e10a7c7..7ba93efc1143 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -60,7 +60,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
mark_buffer_dirty_inode(dir_bh, dir);
affs_brelse(dir_bh);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
inode_inc_iversion(dir);
mark_inode_dirty(dir);
@@ -114,7 +114,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
affs_brelse(bh);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
inode_inc_iversion(dir);
mark_inode_dirty(dir);
@@ -315,7 +315,7 @@ affs_remove_header(struct dentry *dentry)
else
clear_nlink(inode);
affs_unlock_link(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
done:
diff --git a/fs/affs/file.c b/fs/affs/file.c
index e43f2f007ac1..472e2bdd5349 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -15,6 +15,7 @@
#include <linux/uio.h>
#include <linux/blkdev.h>
+#include <linux/mpage.h>
#include "affs.h"
static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
@@ -370,9 +371,10 @@ err_alloc:
return -ENOSPC;
}
-static int affs_writepage(struct page *page, struct writeback_control *wbc)
+static int affs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, affs_get_block, wbc);
+ return mpage_writepages(mapping, wbc, affs_get_block);
}
static int affs_read_folio(struct file *file, struct folio *folio)
@@ -456,10 +458,11 @@ const struct address_space_operations affs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = affs_read_folio,
- .writepage = affs_writepage,
+ .writepages = affs_writepages,
.write_begin = affs_write_begin,
.write_end = affs_write_end,
.direct_IO = affs_direct_IO,
+ .migrate_folio = buffer_migrate_folio,
.bmap = _affs_bmap
};
@@ -835,9 +838,10 @@ const struct address_space_operations affs_aops_ofs = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = affs_read_folio_ofs,
- //.writepage = affs_writepage_ofs,
+ //.writepages = affs_writepages_ofs,
.write_begin = affs_write_begin_ofs,
- .write_end = affs_write_end_ofs
+ .write_end = affs_write_end_ofs,
+ .migrate_folio = filemap_migrate_folio,
};
/* Free any preallocated blocks. */
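
The affs conversion above follows the tree-wide retirement of ->writepage in favour of ->writepages: mpage_writepages() iterates the dirty pages itself and calls back into the filesystem's get_block. For a block-device-backed filesystem the pattern is uniform; a sketch:

    static int example_writepages(struct address_space *mapping,
                                  struct writeback_control *wbc)
    {
            /* example_get_block maps file blocks to on-disk blocks. */
            return mpage_writepages(mapping, wbc, example_get_block);
    }

    static const struct address_space_operations example_aops = {
            .writepages     = example_writepages,
            /* With no ->writepage left for the migration fallback,
             * a ->migrate_folio implementation is wired up explicitly. */
            .migrate_folio  = buffer_migrate_folio,
            /* ... read_folio, write_begin, write_end, etc. ... */
    };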
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 27f77a52c5c8..060746c63151 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -149,13 +149,13 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
break;
}
- inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec
- = (be32_to_cpu(tail->change.days) * 86400LL +
- be32_to_cpu(tail->change.mins) * 60 +
- be32_to_cpu(tail->change.ticks) / 50 +
- AFFS_EPOCH_DELTA) +
- sys_tz.tz_minuteswest * 60;
- inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_atime.tv_nsec = 0;
+ inode->i_mtime.tv_sec = inode->i_atime.tv_sec =
+ inode_set_ctime(inode,
+ (be32_to_cpu(tail->change.days) * 86400LL +
+ be32_to_cpu(tail->change.mins) * 60 +
+ be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA)
+ + sys_tz.tz_minuteswest * 60, 0).tv_sec;
+ inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = 0;
affs_brelse(bh);
unlock_new_inode(inode);
return inode;
@@ -314,7 +314,7 @@ affs_new_inode(struct inode *dir)
inode->i_gid = current_fsgid();
inode->i_ino = block;
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
atomic_set(&AFFS_I(inode)->i_opencnt, 0);
AFFS_I(inode)->i_blkcnt = 0;
AFFS_I(inode)->i_lc = NULL;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d12ccfd2a83d..2fe4a5832fcf 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -43,7 +43,7 @@ affs_get_toupper(struct super_block *sb)
* Note: the dentry argument is the parent dentry.
*/
static inline int
-__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t toupper, bool notruncate)
+__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t fn, bool notruncate)
{
const u8 *name = qstr->name;
unsigned long hash;
@@ -57,7 +57,7 @@ __affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t tou
hash = init_name_hash(dentry);
len = min(qstr->len, AFFSNAMEMAX);
for (; len > 0; name++, len--)
- hash = partial_name_hash(toupper(*name), hash);
+ hash = partial_name_hash(fn(*name), hash);
qstr->hash = end_name_hash(hash);
return 0;
@@ -80,7 +80,7 @@ affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
}
static inline int __affs_compare_dentry(unsigned int len,
- const char *str, const struct qstr *name, toupper_t toupper,
+ const char *str, const struct qstr *name, toupper_t fn,
bool notruncate)
{
const u8 *aname = str;
@@ -106,7 +106,7 @@ static inline int __affs_compare_dentry(unsigned int len,
return 1;
for (; len > 0; len--)
- if (toupper(*aname++) != toupper(*bname++))
+ if (fn(*aname++) != fn(*bname++))
return 1;
return 0;
@@ -135,7 +135,7 @@ affs_intl_compare_dentry(const struct dentry *dentry,
*/
static inline int
-affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
+affs_match(struct dentry *dentry, const u8 *name2, toupper_t fn)
{
const u8 *name = dentry->d_name.name;
int len = dentry->d_name.len;
@@ -148,7 +148,7 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
return 0;
for (name2++; len > 0; len--)
- if (toupper(*name++) != toupper(*name2++))
+ if (fn(*name++) != fn(*name2++))
return 0;
return 1;
}
@@ -156,12 +156,12 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
int
affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len)
{
- toupper_t toupper = affs_get_toupper(sb);
+ toupper_t fn = affs_get_toupper(sb);
u32 hash;
hash = len = min(len, AFFSNAMEMAX);
for (; len > 0; len--)
- hash = (hash * 13 + toupper(*name++)) & 0x7ff;
+ hash = (hash * 13 + fn(*name++)) & 0x7ff;
return hash % AFFS_SB(sb)->s_hashsize;
}
@@ -171,7 +171,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry)
{
struct super_block *sb = dir->i_sb;
struct buffer_head *bh;
- toupper_t toupper = affs_get_toupper(sb);
+ toupper_t fn = affs_get_toupper(sb);
u32 key;
pr_debug("%s(\"%pd\")\n", __func__, dentry);
@@ -189,7 +189,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry)
bh = affs_bread(sb, key);
if (!bh)
return ERR_PTR(-EIO);
- if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, toupper))
+ if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, fn))
return bh;
key = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain);
}
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index d7d9402ff718..95bcbd7654d1 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -88,7 +88,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
set_nlink(inode, 2);
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
- inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_blocks = 0;
inode->i_generation = 0;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 866bab860a88..1c794a1896aa 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -90,7 +90,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
vnode->status = *status;
t = status->mtime_client;
- inode->i_ctime = t;
+ inode_set_ctime_to_ts(inode, t);
inode->i_mtime = t;
inode->i_atime = t;
inode->i_flags |= S_NOATIME;
@@ -206,7 +206,7 @@ static void afs_apply_status(struct afs_operation *op,
t = status->mtime_client;
inode->i_mtime = t;
if (vp->update_ctime)
- inode->i_ctime = op->ctime;
+ inode_set_ctime_to_ts(inode, op->ctime);
if (vnode->status.data_version != status->data_version)
data_changed = true;
@@ -252,7 +252,7 @@ static void afs_apply_status(struct afs_operation *op,
vnode->netfs.remote_i_size = status->size;
if (change_size) {
afs_set_i_size(vnode, status->size);
- inode->i_ctime = t;
+ inode_set_ctime_to_ts(inode, t);
inode->i_atime = t;
}
}
@@ -773,7 +773,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
stat->nlink > 0)
stat->nlink -= 1;
diff --git a/fs/aio.c b/fs/aio.c
index 77e33619de40..b3174da80ff6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1447,13 +1447,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
if (kiocb->ki_flags & IOCB_WRITE) {
struct inode *inode = file_inode(kiocb->ki_filp);
- /*
- * Tell lockdep we inherited freeze protection from submission
- * thread.
- */
if (S_ISREG(inode->i_mode))
- __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
- file_end_write(kiocb->ki_filp);
+ kiocb_end_write(kiocb);
}
iocb->ki_res.res = res;
@@ -1581,17 +1576,8 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
return ret;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret) {
- /*
- * Open-code file_start_write here to grab freeze protection,
- * which will be released by another thread in
- * aio_complete_rw(). Fool lockdep by telling it the lock got
- * released so that it doesn't complain about the held lock when
- * we return to userspace.
- */
- if (S_ISREG(file_inode(file)->i_mode)) {
- sb_start_write(file_inode(file)->i_sb);
- __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
- }
+ if (S_ISREG(file_inode(file)->i_mode))
+ kiocb_start_write(req);
req->ki_flags |= IOCB_WRITE;
aio_rw_done(req, call_write_iter(file, req, &iter));
}
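
The aio hunks above replace the open-coded freeze-protection dance (sb_start_write() plus the __sb_writers_release()/__sb_writers_acquired() lockdep hand-off) with the kiocb_start_write()/kiocb_end_write() helpers, which package exactly that for writes completing in a different context than the one they were submitted from. The resulting pattern:

    /* Submission side, before kicking off the async write: */
    if (S_ISREG(file_inode(file)->i_mode))
            kiocb_start_write(req);

    /* Completion side, possibly on another thread: */
    if (S_ISREG(file_inode(kiocb->ki_filp)->i_mode))
            kiocb_end_write(kiocb);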
diff --git a/fs/attr.c b/fs/attr.c
index d60dc1edb526..a8ae5f6d9b16 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -312,7 +312,7 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
if (ia_valid & ATTR_MTIME)
inode->i_mtime = attr->ia_mtime;
if (ia_valid & ATTR_CTIME)
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
if (!in_group_or_capable(idmap, inode,
@@ -394,9 +394,25 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
return error;
if ((ia_valid & ATTR_MODE)) {
- umode_t amode = attr->ia_mode;
+ /*
+ * Don't allow changing the mode of symlinks:
+ *
+ * (1) The vfs doesn't take the mode of symlinks into account
+ * during permission checking.
+ * (2) This has never worked correctly. Most major filesystems
+ * returned EOPNOTSUPP due to interactions with POSIX ACLs
+ * but still updated the mode of the symlink.
+ * This inconsistency led system call wrapper providers such
+ * as libc to already block changing the mode of symlinks
+ * with EOPNOTSUPP.
+ * (3) To even do this in the first place, one would have to use
+ * specific file descriptors, which takes quite some effort.
+ */
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
/* Flag setting protected by i_mutex */
- if (is_sxid(amode))
+ if (is_sxid(attr->ia_mode))
inode->i_flags &= ~S_NOSEC;
}
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index affa70360b1f..2b49662ed237 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -370,7 +370,7 @@ struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
inode->i_uid = d_inode(sb->s_root)->i_uid;
inode->i_gid = d_inode(sb->s_root)->i_gid;
}
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_ino = get_next_ino();
if (S_ISDIR(mode)) {
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 93046c9dc461..512b9a26c63d 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
return 0;
}
@@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
d_inode(dentry)->i_size = 0;
clear_nlink(d_inode(dentry));
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
spin_lock(&sbi->lookup_lock);
__autofs_add_expiring(dentry);
@@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
inc_nlink(dir);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
return 0;
}
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 54c1f8b8b075..33dd4660d82f 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -32,8 +32,9 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi)
wq->status = -ENOENT; /* Magic is gone - report failure */
kfree(wq->name.name - wq->offset);
wq->name.name = NULL;
- wq->wait_ctr--;
- wake_up_interruptible(&wq->queue);
+ wake_up(&wq->queue);
+ if (!--wq->wait_ctr)
+ kfree(wq);
wq = nwq;
}
fput(sbi->pipe); /* Close the pipe */
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index db649487d58c..83f9566c973b 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -133,8 +133,7 @@ static int bad_inode_fiemap(struct inode *inode,
return -EIO;
}
-static int bad_inode_update_time(struct inode *inode, struct timespec64 *time,
- int flags)
+static int bad_inode_update_time(struct inode *inode, int flags)
{
return -EIO;
}
@@ -209,8 +208,7 @@ void make_bad_inode(struct inode *inode)
remove_inode_hash(inode);
inode->i_mode = S_IFREG;
- inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_op = &bad_inode_ops;
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &bad_file_ops;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index eee9237386e2..9a16a51fbb88 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -363,7 +363,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode->i_mtime.tv_sec =
fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16;
inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */
- inode->i_ctime = inode->i_mtime;
+ inode_set_ctime_to_ts(inode, inode->i_mtime);
inode->i_atime = inode->i_mtime;
befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num);
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 040d5140e426..12b8af04dcb3 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct mnt_idmap *idmap, struct inode *dir,
set_bit(ino, info->si_imap);
info->si_freei--;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_blocks = 0;
inode->i_op = &bfs_file_inops;
inode->i_fop = &bfs_file_operations;
@@ -158,7 +158,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
return err;
}
inc_nlink(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
ihold(inode);
d_instantiate(new, inode);
@@ -187,9 +187,9 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
}
de->ino = 0;
mark_buffer_dirty_inode(bh, dir);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
error = 0;
@@ -240,10 +240,10 @@ static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto end_rename;
}
old_de->ino = 0;
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
+ old_dir->i_mtime = inode_set_ctime_current(old_dir);
mark_inode_dirty(old_dir);
if (new_inode) {
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
inode_dec_link_count(new_inode);
}
mark_buffer_dirty_inode(old_bh, old_dir);
@@ -292,9 +292,9 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino)
pos = (block - sblock) * BFS_BSIZE + off;
if (pos >= dir->i_size) {
dir->i_size += BFS_DIRENT_SIZE;
- dir->i_ctime = current_time(dir);
+ inode_set_ctime_current(dir);
}
- dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
de->ino = cpu_to_le16((u16)ino);
for (i = 0; i < BFS_NAMELEN; i++)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 1926bec2c850..e6a76ae9eb44 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -82,10 +82,9 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
inode->i_blocks = BFS_FILEBLOCKS(di);
inode->i_atime.tv_sec = le32_to_cpu(di->i_atime);
inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime);
- inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime);
+ inode_set_ctime(inode, le32_to_cpu(di->i_ctime), 0);
inode->i_atime.tv_nsec = 0;
inode->i_mtime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
brelse(bh);
unlock_new_inode(inode);
@@ -143,7 +142,7 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
di->i_nlink = cpu_to_le32(inode->i_nlink);
di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
- di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+ di->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
i_sblock = BFS_I(inode)->i_sblock;
di->i_sblock = cpu_to_le32(i_sblock);
di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index bb202ad369d5..e0108d17b085 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -547,8 +547,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
return inode;
}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 66fa9ab2c046..3282adc84d52 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -49,9 +49,11 @@ config BTRFS_FS_POSIX_ACL
If you don't know what Access Control Lists are, say N
config BTRFS_FS_CHECK_INTEGRITY
- bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+ bool "Btrfs with integrity check tool compiled in (DEPRECATED)"
depends on BTRFS_FS
help
+ This feature has been deprecated and will be removed in 6.7.
+
Adds code that examines all block write requests (including
writes of the super block). The goal is to verify that the
state of the filesystem on disk is always consistent, i.e.,
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index ceadfc5d6c66..8cfc8214109c 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_ACCESSORS_H
#define BTRFS_ACCESSORS_H
+#include <linux/stddef.h>
+
struct btrfs_map_token {
struct extent_buffer *eb;
char *kaddr;
@@ -34,13 +36,13 @@ static inline void put_unaligned_le8(u8 val, void *p)
read_extent_buffer(eb, (char *)(result), \
((unsigned long)(ptr)) + \
offsetof(type, member), \
- sizeof(((type *)0)->member)))
+ sizeof_field(type, member)))
#define write_eb_member(eb, ptr, type, member, result) (\
write_extent_buffer(eb, (char *)(result), \
((unsigned long)(ptr)) + \
offsetof(type, member), \
- sizeof(((type *)0)->member)))
+ sizeof_field(type, member)))
#define DECLARE_BTRFS_SETGET_BITS(bits) \
u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
@@ -62,25 +64,25 @@ DECLARE_BTRFS_SETGET_BITS(64)
static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
const type *s) \
{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
return btrfs_get_##bits(eb, s, offsetof(type, member)); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
u##bits val) \
{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
} \
static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
const type *s) \
{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
return btrfs_get_token_##bits(token, s, offsetof(type, member));\
} \
static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
type *s, u##bits val) \
{ \
- static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
}
@@ -111,17 +113,14 @@ static inline void btrfs_set_##name(type *s, u##bits val) \
static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
- static_assert(sizeof(u64) ==
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
- return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
- total_bytes));
+ static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes));
+ return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes));
}
static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s,
u64 val)
{
- static_assert(sizeof(u64) ==
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes));
WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
}
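
The accessors.h cleanup swaps the hand-rolled null-pointer idiom for sizeof_field() from <linux/stddef.h> (hence the new include at the top of the hunk). The macro is the same trick, just named and centralized:

    /* From <linux/stddef.h>: */
    #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))

    /* So these two assertions are equivalent: */
    static_assert(sizeof(u64) == sizeof(((struct btrfs_dev_item *)0)->total_bytes));
    static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes));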
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 79336fa853db..b7d54efb4728 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -3373,7 +3373,6 @@ int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
struct btrfs_key *node_key,
struct btrfs_backref_node *cur)
{
- struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_backref_edge *edge;
struct btrfs_backref_node *exist;
int ret;
@@ -3462,25 +3461,21 @@ int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
ret = handle_direct_tree_backref(cache, &key, cur);
if (ret < 0)
goto out;
- continue;
- } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
- ret = -EINVAL;
- btrfs_print_v0_err(fs_info);
- btrfs_handle_fs_error(fs_info, ret, NULL);
- goto out;
- } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
- continue;
+ } else if (key.type == BTRFS_TREE_BLOCK_REF_KEY) {
+ /*
+ * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref
+ * offset means the root objectid. We need to search
+ * the tree to get its parent bytenr.
+ */
+ ret = handle_indirect_tree_backref(cache, path, &key, node_key,
+ cur);
+ if (ret < 0)
+ goto out;
}
-
/*
- * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
- * means the root objectid. We need to search the tree to get
- * its parent bytenr.
+ * Unrecognized tree backref items (if they can pass the
+ * tree-checker) are ignored.
*/
- ret = handle_indirect_tree_backref(cache, path, &key, node_key,
- cur);
- if (ret < 0)
- goto out;
}
ret = 0;
cur->checked = 1;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 82324c327a50..0cb1dee965a0 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -504,13 +504,20 @@ static void fragment_free_space(struct btrfs_block_group *block_group)
#endif
/*
- * This is only called by btrfs_cache_block_group, since we could have freed
- * extents we need to check the pinned_extents for any extents that can't be
- * used yet since their free space will be released as soon as the transaction
- * commits.
+ * Add a free space range to the in memory free space cache of a block group.
+ * This checks whether the range contains super block locations; any
+ * such locations are skipped and not added to the free space cache.
+ *
+ * @block_group: The target block group.
+ * @start: Start offset of the range.
+ * @end: End offset of the range (exclusive).
+ * @total_added_ret: Optional pointer to return the total amount of space
+ * added to the block group's free space cache.
+ *
+ * Returns 0 on success or < 0 on error.
*/
-int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end,
- u64 *total_added_ret)
+int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
+ u64 end, u64 *total_added_ret)
{
struct btrfs_fs_info *info = block_group->fs_info;
u64 extent_start, extent_end, size;
@@ -520,11 +527,10 @@ int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
*total_added_ret = 0;
while (start < end) {
- ret = find_first_extent_bit(&info->excluded_extents, start,
- &extent_start, &extent_end,
- EXTENT_DIRTY | EXTENT_UPTODATE,
- NULL);
- if (ret)
+ if (!find_first_extent_bit(&info->excluded_extents, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY | EXTENT_UPTODATE,
+ NULL))
break;
if (extent_start <= start) {
@@ -799,8 +805,8 @@ next:
key.type == BTRFS_METADATA_ITEM_KEY) {
u64 space_added;
- ret = add_new_free_space(block_group, last, key.objectid,
- &space_added);
+ ret = btrfs_add_new_free_space(block_group, last,
+ key.objectid, &space_added);
if (ret)
goto out;
total_found += space_added;
@@ -821,14 +827,20 @@ next:
path->slots[0]++;
}
- ret = add_new_free_space(block_group, last,
- block_group->start + block_group->length,
- NULL);
+ ret = btrfs_add_new_free_space(block_group, last,
+ block_group->start + block_group->length,
+ NULL);
out:
btrfs_free_path(path);
return ret;
}
+static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
+{
+ clear_extent_bits(&bg->fs_info->excluded_extents, bg->start,
+ bg->start + bg->length - 1, EXTENT_UPTODATE);
+}
+
static noinline void caching_thread(struct btrfs_work *work)
{
struct btrfs_block_group *block_group;
@@ -2098,8 +2110,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
cache->bytes_super += stripe_len;
- ret = btrfs_add_excluded_extent(fs_info, cache->start,
- stripe_len);
+ ret = set_extent_bit(&fs_info->excluded_extents, cache->start,
+ cache->start + stripe_len - 1,
+ EXTENT_UPTODATE, NULL);
if (ret)
return ret;
}
@@ -2125,8 +2138,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
cache->start + cache->length - logical[nr]);
cache->bytes_super += len;
- ret = btrfs_add_excluded_extent(fs_info, logical[nr],
- len);
+ ret = set_extent_bit(&fs_info->excluded_extents, logical[nr],
+ logical[nr] + len - 1,
+ EXTENT_UPTODATE, NULL);
if (ret) {
kfree(logical);
return ret;
@@ -2319,8 +2333,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
btrfs_free_excluded_extents(cache);
} else if (cache->used == 0) {
cache->cached = BTRFS_CACHE_FINISHED;
- ret = add_new_free_space(cache, cache->start,
- cache->start + cache->length, NULL);
+ ret = btrfs_add_new_free_space(cache, cache->start,
+ cache->start + cache->length, NULL);
btrfs_free_excluded_extents(cache);
if (ret)
goto error;
@@ -2767,7 +2781,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
return ERR_PTR(ret);
}
- ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
+ ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
btrfs_free_excluded_extents(cache);
if (ret) {
btrfs_put_block_group(cache);
@@ -4075,7 +4089,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
if (IS_ERR(ret_bg)) {
ret = PTR_ERR(ret_bg);
- } else if (from_extent_allocation) {
+ } else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) {
/*
* New block group is likely to be used soon. Try to activate
* it now. Failure is OK for now.
@@ -4273,6 +4287,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
+ if (btrfs_is_zoned(info)) {
+ if (info->active_meta_bg) {
+ btrfs_put_block_group(info->active_meta_bg);
+ info->active_meta_bg = NULL;
+ }
+ if (info->active_system_bg) {
+ btrfs_put_block_group(info->active_system_bg);
+ info->active_system_bg = NULL;
+ }
+ }
+
write_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 74b61e663028..2bdbcb834f95 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -291,8 +291,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
-int add_new_free_space(struct btrfs_block_group *block_group,
- u64 start, u64 end, u64 *total_added_ret);
+int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
+ u64 start, u64 end, u64 *total_added_ret);
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d47a927b3504..bda1fdbba666 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -498,12 +498,8 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written, struct writeback_control *wbc);
+ u64 start, u64 end, struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page);
-void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
- struct page *page, u64 start,
- u64 end, bool uptodate);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 6d51db066503..53c1211dd60b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1736,9 +1736,6 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
int over = 0;
unsigned char d_type;
- if (list_empty(ins_list))
- return 0;
-
/*
* Changing the data of the delayed item is impossible. So
* we needn't lock them. And we have held i_mutex of the
@@ -1809,9 +1806,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
inode->i_mtime.tv_nsec);
btrfs_set_stack_timespec_sec(&inode_item->ctime,
- inode->i_ctime.tv_sec);
+ inode_get_ctime(inode).tv_sec);
btrfs_set_stack_timespec_nsec(&inode_item->ctime,
- inode->i_ctime.tv_nsec);
+ inode_get_ctime(inode).tv_nsec);
btrfs_set_stack_timespec_sec(&inode_item->otime,
BTRFS_I(inode)->i_otime.tv_sec);
@@ -1862,8 +1859,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
- inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
- inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
+ inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
+ btrfs_stack_timespec_nsec(&inode_item->ctime));
BTRFS_I(inode)->i_otime.tv_sec =
btrfs_stack_timespec_sec(&inode_item->otime);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5f10965fd72b..fff22ed55c42 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -792,9 +792,9 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
- while (!find_first_extent_bit(&srcdev->alloc_state, start,
- &found_start, &found_end,
- CHUNK_ALLOCATED, &cached_state)) {
+ while (find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
ret = set_extent_bit(&tgtdev->alloc_state, found_start,
found_end, CHUNK_ALLOCATED, NULL);
if (ret)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a9a2c5446c18..0a96ea8c1d3a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -313,21 +313,16 @@ static bool check_tree_block_fsid(struct extent_buffer *eb)
struct btrfs_fs_info *fs_info = eb->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
u8 fsid[BTRFS_FSID_SIZE];
- u8 *metadata_uuid;
read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
BTRFS_FSID_SIZE);
+
/*
- * Checking the incompat flag is only valid for the current fs. For
- * seed devices it's forbidden to have their uuid changed so reading
- * ->fsid in this case is fine
+ * alloc_fs_devices() copies the fsid into metadata_uuid if the
+ * metadata_uuid is unset in the superblock, including for a seed device.
+ * So, we can use fs_devices->metadata_uuid.
*/
- if (btrfs_fs_incompat(fs_info, METADATA_UUID))
- metadata_uuid = fs_devices->metadata_uuid;
- else
- metadata_uuid = fs_devices->fsid;
-
- if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
+ if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
return false;
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
@@ -2384,21 +2379,18 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
- BTRFS_FSID_SIZE)) {
+ if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
- fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
+ sb->fsid, fs_info->fs_devices->fsid);
ret = -EINVAL;
}
- if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
- memcmp(fs_info->fs_devices->metadata_uuid,
- fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
+ if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb),
+ BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
- fs_info->super_copy->metadata_uuid,
- fs_info->fs_devices->metadata_uuid);
+ btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid);
ret = -EINVAL;
}
@@ -2869,6 +2861,56 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
return 0;
}
+static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
+{
+ u64 root_objectid = 0;
+ struct btrfs_root *gang[8];
+ int i = 0;
+ int err = 0;
+ unsigned int ret = 0;
+
+ while (1) {
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang));
+ if (!ret) {
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ break;
+ }
+ root_objectid = gang[ret - 1]->root_key.objectid + 1;
+
+ for (i = 0; i < ret; i++) {
+ /* Avoid grabbing roots in dead_roots. */
+ if (btrfs_root_refs(&gang[i]->root_item) == 0) {
+ gang[i] = NULL;
+ continue;
+ }
+ /* Grab all the search results for later use. */
+ gang[i] = btrfs_grab_root(gang[i]);
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
+ continue;
+ root_objectid = gang[i]->root_key.objectid;
+ err = btrfs_orphan_cleanup(gang[i]);
+ if (err)
+ goto out;
+ btrfs_put_root(gang[i]);
+ }
+ root_objectid++;
+ }
+out:
+ /* Release the uncleaned roots due to error. */
+ for (; i < ret; i++) {
+ if (gang[i])
+ btrfs_put_root(gang[i]);
+ }
+ return err;
+}
+
/*
* Some options only have meaning at mount time and shouldn't persist across
* remounts, or be displayed. Clear these at the end of mount and remount
@@ -3222,7 +3264,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
/* check FS state, whether FS is broken. */
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
- set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+ WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
/*
* In the long term, we'll store the compression type in the super
@@ -3417,6 +3459,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
btrfs_free_zone_cache(fs_info);
+ btrfs_check_active_zone_reservation(fs_info);
+
if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
!btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
@@ -4136,56 +4180,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
btrfs_put_root(root);
}
-int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
-{
- u64 root_objectid = 0;
- struct btrfs_root *gang[8];
- int i = 0;
- int err = 0;
- unsigned int ret = 0;
-
- while (1) {
- spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
- (void **)gang, root_objectid,
- ARRAY_SIZE(gang));
- if (!ret) {
- spin_unlock(&fs_info->fs_roots_radix_lock);
- break;
- }
- root_objectid = gang[ret - 1]->root_key.objectid + 1;
-
- for (i = 0; i < ret; i++) {
- /* Avoid to grab roots in dead_roots */
- if (btrfs_root_refs(&gang[i]->root_item) == 0) {
- gang[i] = NULL;
- continue;
- }
- /* grab all the search result for later use */
- gang[i] = btrfs_grab_root(gang[i]);
- }
- spin_unlock(&fs_info->fs_roots_radix_lock);
-
- for (i = 0; i < ret; i++) {
- if (!gang[i])
- continue;
- root_objectid = gang[i]->root_key.objectid;
- err = btrfs_orphan_cleanup(gang[i]);
- if (err)
- goto out;
- btrfs_put_root(gang[i]);
- }
- root_objectid++;
- }
-out:
- /* release the uncleaned roots due to error */
- for (; i < ret; i++) {
- if (gang[i])
- btrfs_put_root(gang[i]);
- }
- return err;
-}
-
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->tree_root;
@@ -4228,7 +4222,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
u64 found_end;
found = true;
- while (!find_first_extent_bit(&trans->dirty_pages, cur,
+ while (find_first_extent_bit(&trans->dirty_pages, cur,
&found_start, &found_end, EXTENT_DIRTY, &cached)) {
dirty_bytes += found_end + 1 - found_start;
cur = found_end + 1;
@@ -4552,9 +4546,7 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
- struct list_head splice;
-
- INIT_LIST_HEAD(&splice);
+ LIST_HEAD(splice);
spin_lock(&fs_info->ordered_root_lock);
list_splice_init(&fs_info->ordered_roots, &splice);
@@ -4660,9 +4652,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
- struct list_head splice;
-
- INIT_LIST_HEAD(&splice);
+ LIST_HEAD(splice);
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
@@ -4695,9 +4685,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
- struct list_head splice;
-
- INIT_LIST_HEAD(&splice);
+ LIST_HEAD(splice);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
@@ -4716,21 +4704,16 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->delalloc_root_lock);
}
-static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *dirty_pages,
- int mark)
+static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *dirty_pages,
+ int mark)
{
- int ret;
struct extent_buffer *eb;
u64 start = 0;
u64 end;
- while (1) {
- ret = find_first_extent_bit(dirty_pages, start, &start, &end,
- mark, NULL);
- if (ret)
- break;
-
+ while (find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, NULL)) {
clear_extent_bits(dirty_pages, start, end, mark);
while (start <= end) {
eb = find_extent_buffer(fs_info, start);
@@ -4746,16 +4729,13 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
free_extent_buffer_stale(eb);
}
}
-
- return ret;
}
-static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *unpin)
+static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *unpin)
{
u64 start;
u64 end;
- int ret;
while (1) {
struct extent_state *cached_state = NULL;
@@ -4767,9 +4747,8 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
* the same extent range.
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
- ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, &cached_state);
- if (ret) {
+ if (!find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, &cached_state)) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
@@ -4780,8 +4759,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
}
-
- return 0;
}
static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
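
The splice-list hunks above are another small idiom cleanup: an on-stack list head can be declared and initialized in one step with LIST_HEAD(), replacing the declare-then-INIT_LIST_HEAD() two-step:

    /* Before: */
    struct list_head splice;
    INIT_LIST_HEAD(&splice);

    /* After: expands to
     *   struct list_head splice = LIST_HEAD_INIT(splice);
     * i.e. an empty list whose prev/next point at itself. */
    LIST_HEAD(splice);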
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b03767f4d7ed..02b645744a82 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -77,7 +77,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
-int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index a2315a4b8b75..ff8e117a1ace 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -831,15 +831,15 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t
*
* Note: If there are multiple bits set in @bits, any of them will match.
*
- * Return 0 if we find something, and update @start_ret and @end_ret.
- * Return 1 if we found nothing.
+ * Return true if we find something, and update @start_ret and @end_ret.
+ * Return false if we found nothing.
*/
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits,
- struct extent_state **cached_state)
+bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state)
{
struct extent_state *state;
- int ret = 1;
+ bool ret = false;
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
@@ -863,7 +863,7 @@ got_it:
cache_state_if_flags(state, cached_state, 0);
*start_ret = state->start;
*end_ret = state->end;
- ret = 0;
+ ret = true;
}
out:
spin_unlock(&tree->lock);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index fbd3b275ab1c..28c23a23d121 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -182,9 +182,9 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, u32 clear_bits,
struct extent_state **cached_state);
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits,
- struct extent_state **cached_state);
+bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state);
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
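
With find_first_extent_bit() now returning bool ("found something") instead of the old inverted 0-on-success int, call sites across this series flip from while (!find_first_extent_bit(...)) loops to the natural form. The caller pattern after the conversion, sketched:

    u64 start = 0, found_start, found_end;
    struct extent_state *cached = NULL;

    while (find_first_extent_bit(tree, start, &found_start, &found_end,
                                 EXTENT_DIRTY, &cached)) {
            /* process the range [found_start, found_end] ... */
            start = found_end + 1;
    }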
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f396a9afa403..f356f08b55cb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -69,27 +69,6 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
return (cache->flags & bits) == bits;
}
-int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 num_bytes)
-{
- u64 end = start + num_bytes - 1;
- set_extent_bit(&fs_info->excluded_extents, start, end,
- EXTENT_UPTODATE, NULL);
- return 0;
-}
-
-void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
-{
- struct btrfs_fs_info *fs_info = cache->fs_info;
- u64 start, end;
-
- start = cache->start;
- end = start + cache->length - 1;
-
- clear_extent_bits(&fs_info->excluded_extents, start, end,
- EXTENT_UPTODATE);
-}
-
/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
@@ -187,8 +166,10 @@ search_again:
num_refs = btrfs_extent_refs(leaf, ei);
extent_flags = btrfs_extent_flags(leaf, ei);
} else {
- ret = -EINVAL;
- btrfs_print_v0_err(fs_info);
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "unexpected extent item size, has %u expect >= %zu",
+ item_size, sizeof(*ei));
if (trans)
btrfs_abort_transaction(trans, ret);
else
@@ -402,11 +383,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
}
}
+ WARN_ON(1);
btrfs_print_leaf(eb);
btrfs_err(eb->fs_info,
"eb %llu iref 0x%lx invalid extent inline ref type %d",
eb->start, (unsigned long)iref, type);
- WARN_ON(1);
return BTRFS_REF_TYPE_INVALID;
}
@@ -624,12 +605,12 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
ref2 = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_shared_data_ref);
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
- } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
- btrfs_print_v0_err(trans->fs_info);
- btrfs_abort_transaction(trans, -EINVAL);
- return -EINVAL;
} else {
- BUG();
+ btrfs_err(trans->fs_info,
+ "unrecognized backref key (%llu %u %llu)",
+ key.objectid, key.type, key.offset);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ return -EUCLEAN;
}
BUG_ON(num_refs < refs_to_drop);
@@ -660,7 +641,6 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path,
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
if (iref) {
/*
* If type is invalid, we should have bailed out earlier than
@@ -869,6 +849,11 @@ again:
err = -ENOENT;
goto out;
} else if (WARN_ON(ret)) {
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_err(fs_info,
+"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
+ bytenr, num_bytes, parent, root_objectid, owner,
+ offset);
err = -EIO;
goto out;
}
@@ -876,8 +861,10 @@ again:
leaf = path->nodes[0];
item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
- err = -EINVAL;
- btrfs_print_v0_err(fs_info);
+ err = -EUCLEAN;
+ btrfs_err(fs_info,
+ "unexpected extent item size, has %llu expect >= %zu",
+ item_size, sizeof(*ei));
btrfs_abort_transaction(trans, err);
goto out;
}
@@ -1079,13 +1066,13 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
/*
* helper to update/remove inline back ref
*/
-static noinline_for_stack
-void update_inline_extent_backref(struct btrfs_path *path,
+static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_extent_item *ei;
struct btrfs_extent_data_ref *dref = NULL;
struct btrfs_shared_data_ref *sref = NULL;
@@ -1098,18 +1085,33 @@ void update_inline_extent_backref(struct btrfs_path *path,
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
- WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
+ if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
+ struct btrfs_key key;
+ u32 extent_size;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ extent_size = fs_info->nodesize;
+ else
+ extent_size = key.offset;
+ btrfs_print_leaf(leaf);
+ btrfs_err(fs_info,
+ "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
+ key.objectid, extent_size, refs_to_mod, refs);
+ return -EUCLEAN;
+ }
refs += refs_to_mod;
btrfs_set_extent_refs(leaf, ei, refs);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
/*
- * If type is invalid, we should have bailed out after
- * lookup_inline_extent_backref().
+ * Function btrfs_get_extent_inline_ref_type() has already printed
+ * error messages.
*/
- type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
- ASSERT(type != BTRFS_REF_TYPE_INVALID);
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID))
+ return -EUCLEAN;
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -1119,10 +1121,43 @@ void update_inline_extent_backref(struct btrfs_path *path,
refs = btrfs_shared_data_ref_count(leaf, sref);
} else {
refs = 1;
- BUG_ON(refs_to_mod != -1);
+ /*
+ * For tree blocks we can only drop one ref, and tree blocks
+ * should never have refs > 1.
+ *
+ * Furthermore, if we're inserting a new inline backref we won't
+ * reach this path either; that case is handled by
+ * setup_inline_extent_backref().
+ */
+ if (unlikely(refs_to_mod != -1)) {
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ btrfs_print_leaf(leaf);
+ btrfs_err(fs_info,
+ "invalid refs_to_mod for tree block %llu, has %d expect -1",
+ key.objectid, refs_to_mod);
+ return -EUCLEAN;
+ }
}
- BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
+ if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
+ struct btrfs_key key;
+ u32 extent_size;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ extent_size = fs_info->nodesize;
+ else
+ extent_size = key.offset;
+ btrfs_print_leaf(leaf);
+ btrfs_err(fs_info,
+"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
+ (unsigned long)iref, key.objectid, extent_size,
+ refs_to_mod, refs);
+ return -EUCLEAN;
+ }
refs += refs_to_mod;
if (refs > 0) {
@@ -1142,6 +1177,7 @@ void update_inline_extent_backref(struct btrfs_path *path,
btrfs_truncate_item(path, item_size, 1);
}
btrfs_mark_buffer_dirty(leaf);
+ return 0;
}
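
With this change update_inline_extent_backref() reports corruption to its callers instead of tripping WARN_ON()/BUG_ON() in place. A condensed, runnable model of the first underflow guard (types simplified; the kernel performs the equivalent of `refs + refs_to_mod <= 0` on a u64 count):

    #include <errno.h>
    #include <stdio.h>

    /* Model of the guard: a decrement may never drop the count to <= 0 here,
     * because an inline backref still has to be removed afterwards. */
    static int apply_refs_mod(unsigned long long *refs, int refs_to_mod)
    {
        if (refs_to_mod < 0 && *refs <= (unsigned long long)(-refs_to_mod))
            return -EUCLEAN;   /* caller aborts the transaction */
        *refs += refs_to_mod;
        return 0;
    }

    int main(void)
    {
        unsigned long long refs = 1;

        printf("%d\n", apply_refs_mod(&refs, -2));  /* -117 == -EUCLEAN */
        printf("%d\n", apply_refs_mod(&refs, -1));  /* -117 again: 1 <= 1 */
        return 0;
    }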
static noinline_for_stack
@@ -1170,7 +1206,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
bytenr, num_bytes, root_objectid, path->slots[0]);
return -EUCLEAN;
}
- update_inline_extent_backref(path, iref, refs_to_add, extent_op);
+ ret = update_inline_extent_backref(path, iref, refs_to_add, extent_op);
} else if (ret == -ENOENT) {
setup_inline_extent_backref(trans->fs_info, path, iref, parent,
root_objectid, owner, offset,
@@ -1190,7 +1226,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
BUG_ON(!is_data && refs_to_drop != 1);
if (iref)
- update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
+ ret = update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
else if (is_data)
ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
else
@@ -1629,8 +1665,10 @@ again:
item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
- err = -EINVAL;
- btrfs_print_v0_err(fs_info);
+ err = -EUCLEAN;
+ btrfs_err(fs_info,
+ "unexpected extent item size, has %u expect >= %zu",
+ item_size, sizeof(*ei));
btrfs_abort_transaction(trans, err);
goto out;
}
@@ -2751,9 +2789,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
struct extent_state *cached_state = NULL;
mutex_lock(&fs_info->unused_bg_unpin_mutex);
- ret = find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, &cached_state);
- if (ret) {
+ if (!find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, &cached_state)) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
@@ -3059,8 +3096,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
item_size = btrfs_item_size(leaf, extent_slot);
if (unlikely(item_size < sizeof(*ei))) {
- ret = -EINVAL;
- btrfs_print_v0_err(info);
+ ret = -EUCLEAN;
+ btrfs_err(trans->fs_info,
+ "unexpected extent item size, has %u expect >= %zu",
+ item_size, sizeof(*ei));
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3351,11 +3390,38 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
}
enum btrfs_loop_type {
+ /*
+ * Start caching block groups but do not wait for progress or for them
+ * to be done.
+ */
LOOP_CACHING_NOWAIT,
+
+ /*
+ * If the block group isn't cached, wait until it has at least as much
+ * free_space as the space we're waiting for.
+ */
LOOP_CACHING_WAIT,
+
+ /*
+ * Allow allocations to happen from block groups that do not yet have a
+ * size classification.
+ */
LOOP_UNSET_SIZE_CLASS,
+
+ /*
+ * Allocate a chunk and then retry the allocation.
+ */
LOOP_ALLOC_CHUNK,
+
+ /*
+ * Ignore the size class restrictions for this allocation.
+ */
LOOP_WRONG_SIZE_CLASS,
+
+ /*
+ * Ignore the empty size, only try to allocate the number of bytes
+ * needed for this allocation.
+ */
LOOP_NO_EMPTY_SIZE,
};
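
Taken together these phases form an escalation ladder: a failed scan over all block groups advances the allocator to the next, more permissive phase. A hypothetical userspace model of that progression (try_allocate() is a stub standing in for the full block group scan):

    #include <stdio.h>

    enum loop_type { LOOP_CACHING_NOWAIT, LOOP_CACHING_WAIT, LOOP_UNSET_SIZE_CLASS,
                     LOOP_ALLOC_CHUNK, LOOP_WRONG_SIZE_CLASS, LOOP_NO_EMPTY_SIZE };

    /* Stub: pretend the search only succeeds once a new chunk was allocated. */
    static int try_allocate(enum loop_type loop)
    {
        return loop >= LOOP_ALLOC_CHUNK ? 0 : -1;
    }

    int main(void)
    {
        enum loop_type loop = LOOP_CACHING_NOWAIT;

        while (try_allocate(loop) != 0) {
            if (loop == LOOP_NO_EMPTY_SIZE) {
                puts("out of space");  /* -ENOSPC in the kernel */
                return 1;
            }
            loop++;  /* relax one more constraint and rescan */
        }
        printf("allocated in phase %d\n", loop);
        return 0;
    }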
@@ -3427,7 +3493,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
* Helper function for find_free_extent().
*
* Return -ENOENT to inform the caller that we need to fall back to unclustered mode.
- * Return -EAGAIN to inform caller that we need to re-search this block group
* Return >0 to inform the caller that we found nothing.
* Return 0 means we have found a location and set ffe_ctl->found_offset.
*/
@@ -3508,14 +3573,6 @@ refill_cluster:
trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
return 0;
}
- } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
- !ffe_ctl->retry_clustered) {
- spin_unlock(&last_ptr->refill_lock);
-
- ffe_ctl->retry_clustered = true;
- btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
- ffe_ctl->empty_cluster + ffe_ctl->empty_size);
- return -EAGAIN;
}
/*
* At this point we either didn't find a cluster or we weren't able to
@@ -3530,7 +3587,6 @@ refill_cluster:
/*
* Return >0 to inform the caller that we found nothing.
* Return 0 when we found a free extent and set ffe_ctl->found_offset.
- * Return -EAGAIN to inform caller that we need to re-search this block group
*/
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
struct find_free_extent_ctl *ffe_ctl)
@@ -3568,25 +3624,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
ffe_ctl->num_bytes, ffe_ctl->empty_size,
&ffe_ctl->max_extent_size);
-
- /*
- * If we didn't find a chunk, and we haven't failed on this block group
- * before, and this block group is in the middle of caching and we are
- * ok with waiting, then go ahead and wait for progress to be made, and
- * set @retry_unclustered to true.
- *
- * If @retry_unclustered is true then we've already waited on this
- * block group once and should move on to the next block group.
- */
- if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
- ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
- btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
- ffe_ctl->empty_size);
- ffe_ctl->retry_unclustered = true;
- return -EAGAIN;
- } else if (!offset) {
+ if (!offset)
return 1;
- }
ffe_ctl->found_offset = offset;
return 0;
}
@@ -3600,7 +3639,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group,
/* We want to try and use the cluster allocator, so lets look there */
if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
- if (ret >= 0 || ret == -EAGAIN)
+ if (ret >= 0)
return ret;
/* ret == -ENOENT case falls through */
}
@@ -3685,7 +3724,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
}
spin_unlock(&block_group->lock);
- if (!ret && !btrfs_zone_activate(block_group)) {
+ /* Metadata block group is activated at write time. */
+ if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
+ !btrfs_zone_activate(block_group)) {
ret = 1;
/*
* May need to clear fs_info->{treelog,data_reloc}_bg.
@@ -3709,7 +3750,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
fs_info->data_reloc_bg == 0);
if (block_group->ro ||
- test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
+ (!ffe_ctl->for_data_reloc &&
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
ret = 1;
goto out;
}
@@ -3752,8 +3794,26 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
fs_info->treelog_bg = block_group->start;
- if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
- fs_info->data_reloc_bg = block_group->start;
+ if (ffe_ctl->for_data_reloc) {
+ if (!fs_info->data_reloc_bg)
+ fs_info->data_reloc_bg = block_group->start;
+ /*
+ * Do not allow allocations from this block group, unless it is
+ * for data relocation. Compared to increasing the ->ro, setting
+ * the ->zoned_data_reloc_ongoing flag still allows nocow
+ * writers to come in. See btrfs_inc_nocow_writers().
+ *
+ * We need to disable allocations here to avoid allocating a
+ * regular (non-relocation data) extent. With a mix of relocation
+ * extents and regular extents, we can dispatch WRITE commands
+ * (for relocation extents) and ZONE APPEND commands (for
+ * regular extents) at the same time to the same zone, which
+ * easily breaks the write pointer.
+ *
+ * Also, this flag prevents this block group from being zone
+ * finished.
+ */
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
+ }
ffe_ctl->found_offset = start + block_group->alloc_offset;
block_group->alloc_offset += num_bytes;
@@ -3771,24 +3831,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
- if (ret && ffe_ctl->for_data_reloc &&
- fs_info->data_reloc_bg == block_group->start) {
- /*
- * Do not allow further allocations from this block group.
- * Compared to increasing the ->ro, setting the
- * ->zoned_data_reloc_ongoing flag still allows nocow
- * writers to come in. See btrfs_inc_nocow_writers().
- *
- * We need to disable an allocation to avoid an allocation of
- * regular (non-relocation data) extent. With mix of relocation
- * extents and regular extents, we can dispatch WRITE commands
- * (for relocation extents) and ZONE APPEND commands (for
- * regular extents) at the same time to the same zone, which
- * easily break the write pointer.
- */
- set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
+ if (ret && ffe_ctl->for_data_reloc)
fs_info->data_reloc_bg = 0;
- }
spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
@@ -3816,8 +3860,7 @@ static void release_block_group(struct btrfs_block_group *block_group,
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
- ffe_ctl->retry_clustered = false;
- ffe_ctl->retry_unclustered = false;
+ ffe_ctl->retry_uncached = false;
break;
case BTRFS_EXTENT_ALLOC_ZONED:
/* Nothing to do */
@@ -3861,6 +3904,10 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl)
{
+ /* Block group's activeness is not a requirement for METADATA block groups. */
+ if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
+ return 0;
+
/* If we can activate new zone, just allocate a chunk and use it */
if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
return 0;
@@ -3949,15 +3996,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
return 1;
- /*
- * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
- * caching kthreads as we move along
- * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
- * LOOP_UNSET_SIZE_CLASS, allow unset size class
- * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
- * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
- * again
- */
+ /* See the comments for btrfs_loop_type for an explanation of the phases. */
if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
ffe_ctl->index = 0;
/*
@@ -4168,9 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
ffe_ctl->orig_have_caching_bg = false;
ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
ffe_ctl->loop = 0;
- /* For clustered allocation */
- ffe_ctl->retry_clustered = false;
- ffe_ctl->retry_unclustered = false;
+ ffe_ctl->retry_uncached = false;
ffe_ctl->cached = 0;
ffe_ctl->max_extent_size = 0;
ffe_ctl->total_free_space = 0;
@@ -4321,16 +4358,12 @@ have_block_group:
bg_ret = NULL;
ret = do_allocation(block_group, ffe_ctl, &bg_ret);
- if (ret == 0) {
- if (bg_ret && bg_ret != block_group) {
- btrfs_release_block_group(block_group,
- ffe_ctl->delalloc);
- block_group = bg_ret;
- }
- } else if (ret == -EAGAIN) {
- goto have_block_group;
- } else if (ret > 0) {
+ if (ret > 0)
goto loop;
+
+ if (bg_ret && bg_ret != block_group) {
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
+ block_group = bg_ret;
}
/* Checks */
@@ -4371,6 +4404,15 @@ have_block_group:
btrfs_release_block_group(block_group, ffe_ctl->delalloc);
break;
loop:
+ if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
+ !ffe_ctl->retry_uncached) {
+ ffe_ctl->retry_uncached = true;
+ btrfs_wait_block_group_cache_progress(block_group,
+ ffe_ctl->num_bytes +
+ ffe_ctl->empty_cluster +
+ ffe_ctl->empty_size);
+ goto have_block_group;
+ }
release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
cond_resched();
}
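
The two -EAGAIN paths removed above are replaced by this single retry point at the loop label: an uncached block group is waited on and re-searched exactly once, tracked by retry_uncached. A small userspace model of the retry-once policy (structure and field names are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    struct block_group { bool cached; };
    struct ffe_ctl { int loop; bool retry_uncached; };

    /* Wait for caching progress and re-search an uncached block group exactly
     * once (retry_uncached), then move on to the next group. */
    static void search_one_group(struct block_group *bg, struct ffe_ctl *ctl)
    {
        int pass = 0;

    have_block_group:
        pass++;
        /* ... the real extent search would run here and find nothing ... */
        if (!bg->cached && ctl->loop > 0 && !ctl->retry_uncached) {
            ctl->retry_uncached = true;  /* wait for caching, retry once */
            goto have_block_group;
        }
        printf("moving on after %d passes\n", pass);  /* prints 2 */
    }

    int main(void)
    {
        struct block_group bg = { .cached = false };
        struct ffe_ctl ctl = { .loop = 1, .retry_uncached = false };

        search_one_group(&bg, &ctl);
        return 0;
    }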
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 429d5c570061..88c249c37516 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -48,16 +48,11 @@ struct find_free_extent_ctl {
int loop;
/*
- * Whether we're refilling a cluster, if true we need to re-search
- * current block group but don't try to refill the cluster again.
+ * Set to true if we're retrying the allocation on this block group
+ * after waiting for caching progress, this is so that we retry only
+ * once before moving on to another block group.
*/
- bool retry_clustered;
-
- /*
- * Whether we're updating free space cache, if true we need to re-search
- * current block group but don't try updating free space cache again.
- */
- bool retry_unclustered;
+ bool retry_uncached;
/* If current block group is cached */
int cached;
@@ -96,9 +91,6 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 num_bytes);
-void btrfs_free_excluded_extents(struct btrfs_block_group *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count);
void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 90ad3006ef3a..ac3fca5a5e41 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -181,34 +181,9 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
}
}
-void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
-{
- struct address_space *mapping = inode->i_mapping;
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- struct folio *folio;
-
- while (index <= end_index) {
- folio = filemap_get_folio(mapping, index);
- filemap_dirty_folio(mapping, folio);
- folio_account_redirty(folio);
- index += folio_nr_pages(folio);
- folio_put(folio);
- }
-}
-
-/*
- * Process one page for __process_pages_contig().
- *
- * Return >0 if we hit @page == @locked_page.
- * Return 0 if we updated the page status.
- * Return -EGAIN if the we need to try again.
- * (For PAGE_LOCK case but got dirty page or page not belong to mapping)
- */
-static int process_one_page(struct btrfs_fs_info *fs_info,
- struct address_space *mapping,
- struct page *page, struct page *locked_page,
- unsigned long page_ops, u64 start, u64 end)
+static void process_one_page(struct btrfs_fs_info *fs_info,
+ struct page *page, struct page *locked_page,
+ unsigned long page_ops, u64 start, u64 end)
{
u32 len;
@@ -224,94 +199,36 @@ static int process_one_page(struct btrfs_fs_info *fs_info,
if (page_ops & PAGE_END_WRITEBACK)
btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
- if (page == locked_page)
- return 1;
-
- if (page_ops & PAGE_LOCK) {
- int ret;
-
- ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
- if (ret)
- return ret;
- if (!PageDirty(page) || page->mapping != mapping) {
- btrfs_page_end_writer_lock(fs_info, page, start, len);
- return -EAGAIN;
- }
- }
- if (page_ops & PAGE_UNLOCK)
+ if (page != locked_page && (page_ops & PAGE_UNLOCK))
btrfs_page_end_writer_lock(fs_info, page, start, len);
- return 0;
}
-static int __process_pages_contig(struct address_space *mapping,
- struct page *locked_page,
- u64 start, u64 end, unsigned long page_ops,
- u64 *processed_end)
+static void __process_pages_contig(struct address_space *mapping,
+ struct page *locked_page, u64 start, u64 end,
+ unsigned long page_ops)
{
struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
pgoff_t start_index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
pgoff_t index = start_index;
- unsigned long pages_processed = 0;
struct folio_batch fbatch;
- int err = 0;
int i;
- if (page_ops & PAGE_LOCK) {
- ASSERT(page_ops == PAGE_LOCK);
- ASSERT(processed_end && *processed_end == start);
- }
-
folio_batch_init(&fbatch);
while (index <= end_index) {
int found_folios;
found_folios = filemap_get_folios_contig(mapping, &index,
end_index, &fbatch);
-
- if (found_folios == 0) {
- /*
- * Only if we're going to lock these pages, we can find
- * nothing at @index.
- */
- ASSERT(page_ops & PAGE_LOCK);
- err = -EAGAIN;
- goto out;
- }
-
for (i = 0; i < found_folios; i++) {
- int process_ret;
struct folio *folio = fbatch.folios[i];
- process_ret = process_one_page(fs_info, mapping,
- &folio->page, locked_page, page_ops,
- start, end);
- if (process_ret < 0) {
- err = -EAGAIN;
- folio_batch_release(&fbatch);
- goto out;
- }
- pages_processed += folio_nr_pages(folio);
+
+ process_one_page(fs_info, &folio->page, locked_page,
+ page_ops, start, end);
}
folio_batch_release(&fbatch);
cond_resched();
}
-out:
- if (err && processed_end) {
- /*
- * Update @processed_end. I know this is awful since it has
- * two different return value patterns (inclusive vs exclusive).
- *
- * But the exclusive pattern is necessary if @start is 0, or we
- * underflow and check against processed_end won't work as
- * expected.
- */
- if (pages_processed)
- *processed_end = min(end,
- ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
- else
- *processed_end = start;
- }
- return err;
}
static noinline void __unlock_for_delalloc(struct inode *inode,
@@ -326,29 +243,63 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
return;
__process_pages_contig(inode->i_mapping, locked_page, start, end,
- PAGE_UNLOCK, NULL);
+ PAGE_UNLOCK);
}
static noinline int lock_delalloc_pages(struct inode *inode,
struct page *locked_page,
- u64 delalloc_start,
- u64 delalloc_end)
+ u64 start,
+ u64 end)
{
- unsigned long index = delalloc_start >> PAGE_SHIFT;
- unsigned long end_index = delalloc_end >> PAGE_SHIFT;
- u64 processed_end = delalloc_start;
- int ret;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
+ pgoff_t index = start_index;
+ u64 processed_end = start;
+ struct folio_batch fbatch;
- ASSERT(locked_page);
if (index == locked_page->index && index == end_index)
return 0;
- ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
- delalloc_end, PAGE_LOCK, &processed_end);
- if (ret == -EAGAIN && processed_end > delalloc_start)
- __unlock_for_delalloc(inode, locked_page, delalloc_start,
- processed_end);
- return ret;
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ unsigned int found_folios, i;
+
+ found_folios = filemap_get_folios_contig(mapping, &index,
+ end_index, &fbatch);
+ if (found_folios == 0)
+ goto out;
+
+ for (i = 0; i < found_folios; i++) {
+ struct page *page = &fbatch.folios[i]->page;
+ u32 len = end + 1 - start;
+
+ if (page == locked_page)
+ continue;
+
+ if (btrfs_page_start_writer_lock(fs_info, page, start,
+ len))
+ goto out;
+
+ if (!PageDirty(page) || page->mapping != mapping) {
+ btrfs_page_end_writer_lock(fs_info, page, start,
+ len);
+ goto out;
+ }
+
+ processed_end = page_offset(page) + PAGE_SIZE - 1;
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+
+ return 0;
+out:
+ folio_batch_release(&fbatch);
+ if (processed_end > start)
+ __unlock_for_delalloc(inode, locked_page, start, processed_end);
+ return -EAGAIN;
}
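
The rewritten lock_delalloc_pages() keeps the all-or-nothing contract the PAGE_LOCK path used to provide: any failure unlocks the prefix already locked and returns -EAGAIN so find_lock_delalloc_range() can shrink the range and retry. A runnable toy version of that rollback pattern (page locking modeled with a flag array):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define NR_PAGES 8

    /* Stub: pretend page 5 went non-dirty under us, like the check above. */
    static bool try_lock_page(bool *locked, int i)
    {
        if (i == 5)
            return false;
        locked[i] = true;
        return true;
    }

    static int lock_range(bool *locked, int nr)
    {
        for (int i = 0; i < nr; i++) {
            if (!try_lock_page(locked, i)) {
                for (int j = 0; j < i; j++)
                    locked[j] = false;  /* roll back the locked prefix */
                return -EAGAIN;
            }
        }
        return 0;
    }

    int main(void)
    {
        bool locked[NR_PAGES] = { false };

        printf("ret = %d\n", lock_range(locked, NR_PAGES)); /* -11 == -EAGAIN */
        return 0;
    }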
/*
@@ -467,7 +418,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
- start, end, page_ops, NULL);
+ start, end, page_ops);
}
static bool btrfs_verify_page(struct page *page, u64 start)
@@ -497,31 +448,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
btrfs_subpage_end_reader(fs_info, page, start, len);
}
-/* lots and lots of room for performance fixes in the end_bio funcs */
-
-void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
-{
- struct btrfs_inode *inode;
- const bool uptodate = (err == 0);
- int ret = 0;
-
- ASSERT(page && page->mapping);
- inode = BTRFS_I(page->mapping->host);
- btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
-
- if (!uptodate) {
- const struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u32 len;
-
- ASSERT(end + 1 - start <= U32_MAX);
- len = end + 1 - start;
-
- btrfs_page_clear_uptodate(fs_info, page, start, len);
- ret = err < 0 ? err : -EIO;
- mapping_set_error(page->mapping, ret);
- }
-}
-
/*
* after a writepage IO is done, we need to:
* clear the uptodate bits on error
@@ -1243,38 +1169,45 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc)
{
- const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
- u64 delalloc_start = page_offset(page);
+ const u64 page_start = page_offset(page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
+ u64 delalloc_start = page_start;
+ u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
- /* How many pages are started by btrfs_run_delalloc_range() */
- unsigned long nr_written = 0;
- int ret;
- int page_started = 0;
+ int ret = 0;
while (delalloc_start < page_end) {
- u64 delalloc_end = page_end;
- bool found;
-
- found = find_lock_delalloc_range(&inode->vfs_inode, page,
- &delalloc_start,
- &delalloc_end);
- if (!found) {
+ delalloc_end = page_end;
+ if (!find_lock_delalloc_range(&inode->vfs_inode, page,
+ &delalloc_start, &delalloc_end)) {
delalloc_start = delalloc_end + 1;
continue;
}
+
ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
- delalloc_end, &page_started, &nr_written, wbc);
- if (ret)
+ delalloc_end, wbc);
+ if (ret < 0)
return ret;
- /*
- * delalloc_end is already one less than the total length, so
- * we don't subtract one from PAGE_SIZE
- */
- delalloc_to_write += (delalloc_end - delalloc_start +
- PAGE_SIZE) >> PAGE_SHIFT;
delalloc_start = delalloc_end + 1;
}
+
+ /*
+ * delalloc_end is inclusive (one less than the end offset), hence
+ * the +1 when converting the range into a page count.
+ */
+ delalloc_to_write +=
+ DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
+
+ /*
+ * If btrfs_run_delalloc_range() already started I/O and unlocked
+ * the pages, we just need to account for them here.
+ */
+ if (ret == 1) {
+ wbc->nr_to_write -= delalloc_to_write;
+ return 1;
+ }
+
if (wbc->nr_to_write < delalloc_to_write) {
int thresh = 8192;
@@ -1284,16 +1217,6 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
thresh);
}
- /* Did btrfs_run_dealloc_range() already unlock and start the IO? */
- if (page_started) {
- /*
- * We've unlocked the page, so we can't update the mapping's
- * writeback index, just update nr_to_write.
- */
- wbc->nr_to_write -= nr_written;
- return 1;
- }
-
return 0;
}
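
The page accounting above boils down to one inclusive-range-to-page-count conversion; for instance (PAGE_SIZE assumed to be 4096 in this sketch):

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
    #define PAGE_SIZE 4096ULL  /* assumption for the example */

    int main(void)
    {
        unsigned long long page_start = 0;
        unsigned long long delalloc_end = 12287;  /* inclusive end offset */

        /* +1 turns the inclusive end into a length before rounding up. */
        printf("%llu pages\n",
               DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE)); /* 3 */
        return 0;
    }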
@@ -1382,6 +1305,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
bio_ctrl->end_io_func = end_bio_extent_writepage;
while (cur <= end) {
+ u32 len = end - cur + 1;
u64 disk_bytenr;
u64 em_end;
u64 dirty_range_start = cur;
@@ -1389,8 +1313,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
u32 iosize;
if (cur >= i_size) {
- btrfs_writepage_endio_finish_ordered(inode, page, cur,
- end, true);
+ btrfs_mark_ordered_io_finished(inode, page, cur, len,
+ true);
/*
* This range is beyond i_size, thus we don't need to
* bother writing back.
@@ -1399,7 +1323,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
- btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
+ btrfs_page_clear_dirty(fs_info, page, cur, len);
break;
}
@@ -1410,7 +1334,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
continue;
}
- em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
+ em = btrfs_get_extent(inode, NULL, 0, cur, len);
if (IS_ERR(em)) {
ret = PTR_ERR_OR_ZERO(em);
goto out_error;
@@ -1486,7 +1410,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl
struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
const u64 page_start = page_offset(page);
- const u64 page_end = page_start + PAGE_SIZE - 1;
int ret;
int nr = 0;
size_t pg_offset;
@@ -1530,8 +1453,13 @@ done:
set_page_writeback(page);
end_page_writeback(page);
}
- if (ret)
- end_extent_writepage(page, ret, page_start, page_end);
+ if (ret) {
+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start,
+ PAGE_SIZE, !ret);
+ btrfs_page_clear_uptodate(btrfs_sb(inode->i_sb), page,
+ page_start, PAGE_SIZE);
+ mapping_set_error(page->mapping, ret);
+ }
unlock_page(page);
ASSERT(ret <= 0);
return ret;
@@ -1877,11 +1805,10 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
* previous call.
* Return <0 for fatal error.
*/
-static int submit_eb_page(struct page *page, struct writeback_control *wbc,
- struct extent_buffer **eb_context)
+static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
{
+ struct writeback_control *wbc = ctx->wbc;
struct address_space *mapping = page->mapping;
- struct btrfs_block_group *cache = NULL;
struct extent_buffer *eb;
int ret;
@@ -1908,7 +1835,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
return 0;
}
- if (eb == *eb_context) {
+ if (eb == ctx->eb) {
spin_unlock(&mapping->private_lock);
return 0;
}
@@ -1917,34 +1844,25 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
if (!ret)
return 0;
- if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
- /*
- * If for_sync, this hole will be filled with
- * trasnsaction commit.
- */
- if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
- ret = -EAGAIN;
- else
+ ctx->eb = eb;
+
+ ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx);
+ if (ret) {
+ if (ret == -EBUSY)
ret = 0;
free_extent_buffer(eb);
return ret;
}
- *eb_context = eb;
-
if (!lock_extent_buffer_for_io(eb, wbc)) {
- btrfs_revert_meta_write_pointer(cache, eb);
- if (cache)
- btrfs_put_block_group(cache);
free_extent_buffer(eb);
return 0;
}
- if (cache) {
- /*
- * Implies write in zoned mode. Mark the last eb in a block group.
- */
- btrfs_schedule_zone_finish_bg(cache, eb);
- btrfs_put_block_group(cache);
+ /* Implies write in zoned mode. */
+ if (ctx->zoned_bg) {
+ /* Mark the last eb in the block group. */
+ btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb);
+ ctx->zoned_bg->meta_write_pointer += eb->len;
}
write_one_eb(eb, wbc);
free_extent_buffer(eb);
@@ -1954,7 +1872,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_buffer *eb_context = NULL;
+ struct btrfs_eb_write_context ctx = { .wbc = wbc };
struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
int done = 0;
@@ -1996,7 +1914,7 @@ retry:
for (i = 0; i < nr_folios; i++) {
struct folio *folio = fbatch.folios[i];
- ret = submit_eb_page(&folio->page, wbc, &eb_context);
+ ret = submit_eb_page(&folio->page, &ctx);
if (ret == 0)
continue;
if (ret < 0) {
@@ -2057,6 +1975,9 @@ retry:
ret = 0;
if (!ret && BTRFS_FS_ERROR(fs_info))
ret = -EROFS;
+
+ if (ctx.zoned_bg)
+ btrfs_put_block_group(ctx.zoned_bg);
btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
@@ -2150,7 +2071,7 @@ retry:
for (i = 0; i < nr_folios; i++) {
struct folio *folio = fbatch.folios[i];
- done_index = folio->index + folio_nr_pages(folio);
+ done_index = folio_next_index(folio);
/*
* At this point we hold neither the i_pages lock nor
* the page lock: the page may be truncated or
@@ -2233,11 +2154,11 @@ retry:
* already been ran (aka, ordered extent inserted) and all pages are still
* locked.
*/
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- struct writeback_control *wbc)
+void extent_write_locked_range(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, struct writeback_control *wbc,
+ bool pages_dirty)
{
bool found_error = false;
- int first_error = 0;
int ret = 0;
struct address_space *mapping = inode->i_mapping;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2256,18 +2177,16 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
while (cur <= end) {
u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+ u32 cur_len = cur_end + 1 - cur;
struct page *page;
int nr = 0;
page = find_get_page(mapping, cur >> PAGE_SHIFT);
- /*
- * All pages in the range are locked since
- * btrfs_run_delalloc_range(), thus there is no way to clear
- * the page dirty flag.
- */
ASSERT(PageLocked(page));
- ASSERT(PageDirty(page));
- clear_page_dirty_for_io(page);
+ if (pages_dirty && page != locked_page) {
+ ASSERT(PageDirty(page));
+ clear_page_dirty_for_io(page);
+ }
ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl,
i_size, &nr);
@@ -2279,23 +2198,21 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
set_page_writeback(page);
end_page_writeback(page);
}
- if (ret)
- end_extent_writepage(page, ret, cur, cur_end);
- btrfs_page_unlock_writer(fs_info, page, cur, cur_end + 1 - cur);
- if (ret < 0) {
- found_error = true;
- first_error = ret;
+ if (ret) {
+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), page,
+ cur, cur_len, !ret);
+ btrfs_page_clear_uptodate(fs_info, page, cur, cur_len);
+ mapping_set_error(page->mapping, ret);
}
+ btrfs_page_unlock_writer(fs_info, page, cur, cur_len);
+ if (ret < 0)
+ found_error = true;
next_page:
put_page(page);
cur = cur_end + 1;
}
submit_write_bio(&bio_ctrl, found_error ? ret : 0);
-
- if (found_error)
- return first_error;
- return ret;
}
int extent_writepages(struct address_space *mapping,
@@ -3315,8 +3232,8 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
return NULL;
}
WARN_ON(PageDirty(p));
- copy_page(page_address(p), page_address(src->pages[i]));
}
+ copy_extent_buffer_full(new, src);
set_extent_buffer_uptodate(new);
return new;
@@ -3559,6 +3476,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
struct extent_buffer *exists = NULL;
struct page *p;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
int uptodate = 1;
int ret;
@@ -3595,36 +3513,30 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++, index++) {
- struct btrfs_subpage *prealloc = NULL;
+ /*
+ * Preallocate page->private for the subpage case, so that we won't
+ * allocate memory while holding private_lock or the page lock.
+ *
+ * The memory will be freed by attach_extent_buffer_page() or freed
+ * manually if we exit earlier.
+ */
+ if (fs_info->nodesize < PAGE_SIZE) {
+ prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (IS_ERR(prealloc)) {
+ exists = ERR_CAST(prealloc);
+ goto free_eb;
+ }
+ }
+
+ for (i = 0; i < num_pages; i++, index++) {
p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
if (!p) {
exists = ERR_PTR(-ENOMEM);
+ btrfs_free_subpage(prealloc);
goto free_eb;
}
- /*
- * Preallocate page->private for subpage case, so that we won't
- * allocate memory with private_lock hold. The memory will be
- * freed by attach_extent_buffer_page() or freed manually if
- * we exit earlier.
- *
- * Although we have ensured one subpage eb can only have one
- * page, but it may change in the future for 16K page size
- * support, so we still preallocate the memory in the loop.
- */
- if (fs_info->nodesize < PAGE_SIZE) {
- prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
- if (IS_ERR(prealloc)) {
- ret = PTR_ERR(prealloc);
- unlock_page(p);
- put_page(p);
- exists = ERR_PTR(ret);
- goto free_eb;
- }
- }
-
spin_lock(&mapping->private_lock);
exists = grab_extent_buffer(fs_info, p);
if (exists) {
@@ -4210,30 +4122,9 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
}
}
-void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
- const void *srcv)
-{
- char *kaddr;
-
- assert_eb_page_uptodate(eb, eb->pages[0]);
- kaddr = page_address(eb->pages[0]) +
- get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
- chunk_tree_uuid));
- memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
-}
-
-void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
-{
- char *kaddr;
-
- assert_eb_page_uptodate(eb, eb->pages[0]);
- kaddr = page_address(eb->pages[0]) +
- get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
- memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
-}
-
-void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
- unsigned long start, unsigned long len)
+static void __write_extent_buffer(const struct extent_buffer *eb,
+ const void *srcv, unsigned long start,
+ unsigned long len, bool use_memmove)
{
size_t cur;
size_t offset;
@@ -4241,6 +4132,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
char *kaddr;
char *src = (char *)srcv;
unsigned long i = get_eb_page_index(start);
+ /* For unmapped (dummy) ebs, no need to check their uptodate status. */
+ const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
@@ -4251,11 +4144,15 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
while (len > 0) {
page = eb->pages[i];
- assert_eb_page_uptodate(eb, page);
+ if (check_uptodate)
+ assert_eb_page_uptodate(eb, page);
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
- memcpy(kaddr + offset, src, cur);
+ if (use_memmove)
+ memmove(kaddr + offset, src, cur);
+ else
+ memcpy(kaddr + offset, src, cur);
src += cur;
len -= cur;
@@ -4264,55 +4161,54 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
}
}
-void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
- unsigned long len)
+void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
+ unsigned long start, unsigned long len)
{
- size_t cur;
- size_t offset;
- struct page *page;
- char *kaddr;
- unsigned long i = get_eb_page_index(start);
+ return __write_extent_buffer(eb, srcv, start, len, false);
+}
- if (check_eb_range(eb, start, len))
- return;
+static void memset_extent_buffer(const struct extent_buffer *eb, int c,
+ unsigned long start, unsigned long len)
+{
+ unsigned long cur = start;
- offset = get_eb_offset_in_page(eb, start);
+ while (cur < start + len) {
+ unsigned long index = get_eb_page_index(cur);
+ unsigned int offset = get_eb_offset_in_page(eb, cur);
+ unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset);
+ struct page *page = eb->pages[index];
- while (len > 0) {
- page = eb->pages[i];
assert_eb_page_uptodate(eb, page);
+ memset(page_address(page) + offset, c, cur_len);
- cur = min(len, PAGE_SIZE - offset);
- kaddr = page_address(page);
- memset(kaddr + offset, 0, cur);
-
- len -= cur;
- offset = 0;
- i++;
+ cur += cur_len;
}
}
+void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
+{
+ if (check_eb_range(eb, start, len))
+ return;
+ return memset_extent_buffer(eb, 0, start, len);
+}
+
void copy_extent_buffer_full(const struct extent_buffer *dst,
const struct extent_buffer *src)
{
- int i;
- int num_pages;
+ unsigned long cur = 0;
ASSERT(dst->len == src->len);
- if (dst->fs_info->nodesize >= PAGE_SIZE) {
- num_pages = num_extent_pages(dst);
- for (i = 0; i < num_pages; i++)
- copy_page(page_address(dst->pages[i]),
- page_address(src->pages[i]));
- } else {
- size_t src_offset = get_eb_offset_in_page(src, 0);
- size_t dst_offset = get_eb_offset_in_page(dst, 0);
+ while (cur < src->len) {
+ unsigned long index = get_eb_page_index(cur);
+ unsigned long offset = get_eb_offset_in_page(src, cur);
+ unsigned long cur_len = min(src->len, PAGE_SIZE - offset);
+ void *addr = page_address(src->pages[index]) + offset;
+
+ write_extent_buffer(dst, addr, cur, cur_len);
- ASSERT(src->fs_info->nodesize < PAGE_SIZE);
- memcpy(page_address(dst->pages[0]) + dst_offset,
- page_address(src->pages[0]) + src_offset,
- src->len);
+ cur += cur_len;
}
}
@@ -4406,6 +4302,15 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
+static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr)
+{
+ unsigned long index = get_eb_page_index(bytenr);
+
+ if (check_eb_range(eb, bytenr, 1))
+ return NULL;
+ return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr);
+}
+
/*
* Set an area of a bitmap to 1.
*
@@ -4417,35 +4322,28 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len)
{
+ unsigned int first_byte = start + BIT_BYTE(pos);
+ unsigned int last_byte = start + BIT_BYTE(pos + len - 1);
+ const bool same_byte = (first_byte == last_byte);
+ u8 mask = BITMAP_FIRST_BYTE_MASK(pos);
u8 *kaddr;
- struct page *page;
- unsigned long i;
- size_t offset;
- const unsigned int size = pos + len;
- int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
- u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
- eb_bitmap_offset(eb, start, pos, &i, &offset);
- page = eb->pages[i];
- assert_eb_page_uptodate(eb, page);
- kaddr = page_address(page);
+ if (same_byte)
+ mask &= BITMAP_LAST_BYTE_MASK(pos + len);
- while (len >= bits_to_set) {
- kaddr[offset] |= mask_to_set;
- len -= bits_to_set;
- bits_to_set = BITS_PER_BYTE;
- mask_to_set = ~0;
- if (++offset >= PAGE_SIZE && len > 0) {
- offset = 0;
- page = eb->pages[++i];
- assert_eb_page_uptodate(eb, page);
- kaddr = page_address(page);
- }
- }
- if (len) {
- mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
- kaddr[offset] |= mask_to_set;
- }
+ /* Handle the first byte. */
+ kaddr = extent_buffer_get_byte(eb, first_byte);
+ *kaddr |= mask;
+ if (same_byte)
+ return;
+
+ /* Handle the byte aligned part. */
+ ASSERT(first_byte + 1 <= last_byte);
+ memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1);
+
+ /* Handle the last byte. */
+ kaddr = extent_buffer_get_byte(eb, last_byte);
+ *kaddr |= BITMAP_LAST_BYTE_MASK(pos + len);
}
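
The rewrite replaces the old bit-by-bit byte walk with three steps: mask the partial first byte, memset the fully covered middle bytes, mask the partial last byte. The same logic is easy to verify in userspace (the mask macros below mirror the kernel's little-endian byte-mask definitions):

    #include <stdio.h>

    #define BITS_PER_BYTE 8
    #define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
    #define BITMAP_FIRST_BYTE_MASK(start) \
        ((unsigned char)(0xff << ((start) & (BITS_PER_BYTE - 1))))
    #define BITMAP_LAST_BYTE_MASK(nbits) \
        ((unsigned char)(0xff >> (-(nbits) & (BITS_PER_BYTE - 1))))

    static void bitmap_set(unsigned char *map, unsigned long pos, unsigned long len)
    {
        unsigned long first_byte = BIT_BYTE(pos);
        unsigned long last_byte = BIT_BYTE(pos + len - 1);
        unsigned char mask = BITMAP_FIRST_BYTE_MASK(pos);

        if (first_byte == last_byte) {            /* range fits in one byte */
            map[first_byte] |= mask & BITMAP_LAST_BYTE_MASK(pos + len);
            return;
        }
        map[first_byte] |= mask;                  /* partial first byte */
        for (unsigned long i = first_byte + 1; i < last_byte; i++)
            map[i] = 0xff;                        /* fully covered middle */
        map[last_byte] |= BITMAP_LAST_BYTE_MASK(pos + len);  /* last byte */
    }

    int main(void)
    {
        unsigned char map[4] = { 0 };

        bitmap_set(map, 3, 13);  /* set bits 3..15 */
        printf("%02x %02x %02x %02x\n", map[0], map[1], map[2], map[3]);
        /* prints: f8 ff 00 00 */
        return 0;
    }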
@@ -4461,35 +4359,28 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
unsigned long start, unsigned long pos,
unsigned long len)
{
+ unsigned int first_byte = start + BIT_BYTE(pos);
+ unsigned int last_byte = start + BIT_BYTE(pos + len - 1);
+ const bool same_byte = (first_byte == last_byte);
+ u8 mask = BITMAP_FIRST_BYTE_MASK(pos);
u8 *kaddr;
- struct page *page;
- unsigned long i;
- size_t offset;
- const unsigned int size = pos + len;
- int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
- u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
- eb_bitmap_offset(eb, start, pos, &i, &offset);
- page = eb->pages[i];
- assert_eb_page_uptodate(eb, page);
- kaddr = page_address(page);
+ if (same_byte)
+ mask &= BITMAP_LAST_BYTE_MASK(pos + len);
- while (len >= bits_to_clear) {
- kaddr[offset] &= ~mask_to_clear;
- len -= bits_to_clear;
- bits_to_clear = BITS_PER_BYTE;
- mask_to_clear = ~0;
- if (++offset >= PAGE_SIZE && len > 0) {
- offset = 0;
- page = eb->pages[++i];
- assert_eb_page_uptodate(eb, page);
- kaddr = page_address(page);
- }
- }
- if (len) {
- mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
- kaddr[offset] &= ~mask_to_clear;
- }
+ /* Handle the first byte. */
+ kaddr = extent_buffer_get_byte(eb, first_byte);
+ *kaddr &= ~mask;
+ if (same_byte)
+ return;
+
+ /* Handle the byte aligned part. */
+ ASSERT(first_byte + 1 <= last_byte);
+ memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1);
+
+ /* Handle the last byte. */
+ kaddr = extent_buffer_get_byte(eb, last_byte);
+ *kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len);
}
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
@@ -4498,60 +4389,29 @@ static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned
return distance < len;
}
-static void copy_pages(struct page *dst_page, struct page *src_page,
- unsigned long dst_off, unsigned long src_off,
- unsigned long len)
-{
- char *dst_kaddr = page_address(dst_page);
- char *src_kaddr;
- int must_memmove = 0;
-
- if (dst_page != src_page) {
- src_kaddr = page_address(src_page);
- } else {
- src_kaddr = dst_kaddr;
- if (areas_overlap(src_off, dst_off, len))
- must_memmove = 1;
- }
-
- if (must_memmove)
- memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
- else
- memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
-}
-
void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- size_t cur;
- size_t dst_off_in_page;
- size_t src_off_in_page;
- unsigned long dst_i;
- unsigned long src_i;
+ unsigned long cur_off = 0;
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(dst, src_offset, len))
return;
- while (len > 0) {
- dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
- src_off_in_page = get_eb_offset_in_page(dst, src_offset);
-
- dst_i = get_eb_page_index(dst_offset);
- src_i = get_eb_page_index(src_offset);
-
- cur = min(len, (unsigned long)(PAGE_SIZE -
- src_off_in_page));
- cur = min_t(unsigned long, cur,
- (unsigned long)(PAGE_SIZE - dst_off_in_page));
-
- copy_pages(dst->pages[dst_i], dst->pages[src_i],
- dst_off_in_page, src_off_in_page, cur);
-
- src_offset += cur;
- dst_offset += cur;
- len -= cur;
+ while (cur_off < len) {
+ unsigned long cur_src = cur_off + src_offset;
+ unsigned long pg_index = get_eb_page_index(cur_src);
+ unsigned long pg_off = get_eb_offset_in_page(dst, cur_src);
+ unsigned long cur_len = min(src_offset + len - cur_src,
+ PAGE_SIZE - pg_off);
+ void *src_addr = page_address(dst->pages[pg_index]) + pg_off;
+ const bool use_memmove = areas_overlap(src_offset + cur_off,
+ dst_offset + cur_off, cur_len);
+
+ __write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len,
+ use_memmove);
+ cur_off += cur_len;
}
}
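
memcpy_extent_buffer() now decides per chunk whether source and destination may overlap and passes use_memmove down to __write_extent_buffer(), instead of going through the removed copy_pages() helper. The decision rule is just a distance check; a self-contained illustration:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Userspace copy of the overlap test that picks memmove vs memcpy. */
    static bool areas_overlap(unsigned long src, unsigned long dst,
                              unsigned long len)
    {
        unsigned long distance = (src > dst) ? src - dst : dst - src;

        return distance < len;
    }

    int main(void)
    {
        char buf[16] = "abcdefgh";

        if (areas_overlap(0, 2, 6))
            memmove(buf + 2, buf, 6);   /* overlapping: must use memmove */
        else
            memcpy(buf + 2, buf, 6);
        printf("%s\n", buf);            /* prints: ababcdef */
        return 0;
    }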
@@ -4559,23 +4419,26 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- size_t cur;
- size_t dst_off_in_page;
- size_t src_off_in_page;
unsigned long dst_end = dst_offset + len - 1;
unsigned long src_end = src_offset + len - 1;
- unsigned long dst_i;
- unsigned long src_i;
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(dst, src_offset, len))
return;
+
if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
}
+
while (len > 0) {
- dst_i = get_eb_page_index(dst_end);
+ unsigned long src_i;
+ size_t cur;
+ size_t dst_off_in_page;
+ size_t src_off_in_page;
+ void *src_addr;
+ bool use_memmove;
+
src_i = get_eb_page_index(src_end);
dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
@@ -4583,9 +4446,14 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
- copy_pages(dst->pages[dst_i], dst->pages[src_i],
- dst_off_in_page - cur + 1,
- src_off_in_page - cur + 1, cur);
+
+ src_addr = page_address(dst->pages[src_i]) + src_off_in_page -
+ cur + 1;
+ use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1,
+ cur);
+
+ __write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur,
+ use_memmove);
dst_end -= cur;
src_end -= cur;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c5fae3a7d911..68368ba99321 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -40,7 +40,6 @@ enum {
ENUM_BIT(PAGE_START_WRITEBACK),
ENUM_BIT(PAGE_END_WRITEBACK),
ENUM_BIT(PAGE_SET_ORDERED),
- ENUM_BIT(PAGE_LOCK),
};
/*
@@ -94,6 +93,13 @@ struct extent_buffer {
#endif
};
+struct btrfs_eb_write_context {
+ struct writeback_control *wbc;
+ struct extent_buffer *eb;
+ /* Block group @eb resides in. Only used for zoned mode. */
+ struct btrfs_block_group *zoned_bg;
+};
+
/*
* Get the correct offset inside the page of extent buffer.
*
@@ -178,8 +184,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int btrfs_read_folio(struct file *file, struct folio *folio);
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- struct writeback_control *wbc);
+void extent_write_locked_range(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, struct writeback_control *wbc,
+ bool pages_dirty);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
@@ -236,11 +243,24 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dst,
int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
void __user *dst, unsigned long start,
unsigned long len);
-void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
-void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
- const void *src);
void write_extent_buffer(const struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
+
+static inline void write_extent_buffer_chunk_tree_uuid(
+ const struct extent_buffer *eb, const void *chunk_tree_uuid)
+{
+ write_extent_buffer(eb, chunk_tree_uuid,
+ offsetof(struct btrfs_header, chunk_tree_uuid),
+ BTRFS_FSID_SIZE);
+}
+
+static inline void write_extent_buffer_fsid(const struct extent_buffer *eb,
+ const void *fsid)
+{
+ write_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE);
+}
+
void copy_extent_buffer_full(const struct extent_buffer *dst,
const struct extent_buffer *src);
void copy_extent_buffer(const struct extent_buffer *dst,
@@ -266,7 +286,6 @@ void set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
-void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
@@ -277,8 +296,6 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
-void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
-
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 696bf695d8eb..1ce5dd154499 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -597,29 +597,37 @@ fail:
* Each bit represents a sector. Thus caller should ensure @csum_buf passed
* in is large enough to contain all csums.
*/
-int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
- u8 *csum_buf, unsigned long *csum_bitmap,
- bool search_commit)
+int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path,
+ u64 start, u64 end, u8 *csum_buf,
+ unsigned long *csum_bitmap)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
- struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_csum_item *item;
const u64 orig_start = start;
+ bool free_path = false;
int ret;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ free_path = true;
+ }
- if (search_commit) {
- path->skip_locking = 1;
- path->reada = READA_FORWARD;
- path->search_commit_root = 1;
+ /* Check if we can reuse the previous path. */
+ if (path->nodes[0]) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+ key.type == BTRFS_EXTENT_CSUM_KEY &&
+ key.offset <= start)
+ goto search_forward;
+ btrfs_release_path(path);
}
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -656,6 +664,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
}
}
+search_forward:
while (start <= end) {
u64 csum_end;
@@ -712,7 +721,8 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
}
ret = 0;
fail:
- btrfs_free_path(path);
+ if (free_path)
+ btrfs_free_path(path);
return ret;
}
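
btrfs_lookup_csums_bitmap() now lets the caller pass in a btrfs_path so repeated lookups can keep their tree position between calls; the function allocates (and frees) one itself only when given NULL. A stripped-down model of this ownership convention (struct and helper names are illustrative):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct path { int slot; };

    static int lookup(struct path *path)
    {
        bool free_path = false;

        if (!path) {
            path = malloc(sizeof(*path));
            if (!path)
                return -ENOMEM;
            free_path = true;
        }
        /* ... search with path, possibly reusing its previous position ... */
        if (free_path)
            free(path);   /* free only what we allocated ourselves */
        return 0;
    }

    int main(void)
    {
        struct path reusable = { 0 };

        lookup(&reusable);  /* caller-owned: position survives for reuse */
        lookup(NULL);       /* one-shot: allocated and freed internally */
        return 0;
    }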
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 4ec669b69008..04bd2d34efb1 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -57,9 +57,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
-int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
- u8 *csum_buf, unsigned long *csum_bitmap,
- bool search_commit);
+int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path,
+ u64 start, u64 end, u8 *csum_buf,
+ unsigned long *csum_bitmap);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fd03e689a6be..ca46a529d56b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -876,9 +876,9 @@ static int prepare_uptodate_page(struct inode *inode,
return 0;
}
-static unsigned int get_prepare_fgp_flags(bool nowait)
+static fgf_t get_prepare_fgp_flags(bool nowait)
{
- unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+ fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
if (nowait)
fgp_flags |= FGP_NOWAIT;
@@ -910,7 +910,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
int i;
unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
- unsigned int fgp_flags = get_prepare_fgp_flags(nowait);
+ fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
int err = 0;
int faili;
@@ -1106,24 +1106,6 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
-static void update_time_for_write(struct inode *inode)
-{
- struct timespec64 now;
-
- if (IS_NOCMTIME(inode))
- return;
-
- now = current_time(inode);
- if (!timespec64_equal(&inode->i_mtime, &now))
- inode->i_mtime = now;
-
- if (!timespec64_equal(&inode->i_ctime, &now))
- inode->i_ctime = now;
-
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
-}
-
static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
size_t count)
{
@@ -1155,7 +1137,10 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
* need to start yet another transaction to update the inode as we will
* update the inode when we finish writing whatever data we write.
*/
- update_time_for_write(inode);
+ if (!IS_NOCMTIME(inode)) {
+ inode->i_mtime = inode_set_ctime_current(inode);
+ inode_inc_iversion(inode);
+ }
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
@@ -2459,10 +2444,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
*/
inode_inc_iversion(&inode->vfs_inode);
- if (!extent_info || extent_info->update_times) {
- inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
- inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
- }
+ if (!extent_info || extent_info->update_times)
+ inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode);
ret = btrfs_update_inode(trans, root, inode);
if (ret)
@@ -2703,8 +2686,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
ASSERT(trans != NULL);
inode_inc_iversion(inode);
- inode->i_mtime = current_time(inode);
- inode->i_ctime = inode->i_mtime;
+ inode->i_mtime = inode_set_ctime_current(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
@@ -2721,11 +2703,10 @@ out_only_mutex:
* for detecting, at fsync time, if the inode isn't yet in the
* log tree or it's there but not up to date.
*/
- struct timespec64 now = current_time(inode);
+ struct timespec64 now = inode_set_ctime_current(inode);
inode_inc_iversion(inode);
inode->i_mtime = now;
- inode->i_ctime = now;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -2796,7 +2777,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
if (IS_ERR(trans))
return PTR_ERR(trans);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
i_size_write(inode, end);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
@@ -3018,7 +2999,7 @@ static long btrfs_fallocate(struct file *file, int mode,
struct extent_changeset *data_reserved = NULL;
struct falloc_range *range;
struct falloc_range *tmp;
- struct list_head reserve_list;
+ LIST_HEAD(reserve_list);
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
@@ -3110,7 +3091,6 @@ static long btrfs_fallocate(struct file *file, int mode,
btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
/* First, check if we exceed the qgroup limit */
- INIT_LIST_HEAD(&reserve_list);
while (cur_offset < alloc_end) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
alloc_end - cur_offset);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 880800418075..27fad70451aa 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1219,10 +1219,9 @@ static noinline_for_stack int write_pinned_extent_entries(
start = block_group->start;
while (start < block_group->start + block_group->length) {
- ret = find_first_extent_bit(unpin, start,
- &extent_start, &extent_end,
- EXTENT_DIRTY, NULL);
- if (ret)
+ if (!find_first_extent_bit(unpin, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY, NULL))
return 0;
/* This pinned extent is out of our range */
@@ -2705,13 +2704,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
spin_lock(&ctl->tree_lock);
- /* Count initial region as zone_unusable until it gets activated. */
if (!used)
to_free = size;
- else if (initial &&
- test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) &&
- (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
- to_free = 0;
else if (initial)
to_free = block_group->zone_capacity;
else if (offset >= block_group->alloc_offset)
@@ -2739,8 +2733,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
reclaimable_unusable = block_group->zone_unusable -
(block_group->length - block_group->zone_capacity);
/* All the region is now unusable. Mark it as unused and reclaim */
- if (block_group->zone_unusable == block_group->length &&
- block_group->alloc_offset) {
+ if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
} else if (bg_reclaim_threshold &&
reclaimable_unusable >=
@@ -2944,7 +2937,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
btrfs_info(fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
btrfs_info(fs_info,
- "%d blocks of free space at or bigger than bytes is", count);
+ "%d free space entries at or bigger than %llu bytes",
+ count, bytes);
}
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index f169378e2ca6..c0e734082dcc 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1517,8 +1517,10 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
} else if (prev_bit == 1 && bit == 0) {
u64 space_added;
- ret = add_new_free_space(block_group, extent_start,
- offset, &space_added);
+ ret = btrfs_add_new_free_space(block_group,
+ extent_start,
+ offset,
+ &space_added);
if (ret)
goto out;
total_found += space_added;
@@ -1533,7 +1535,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
}
}
if (prev_bit == 1) {
- ret = add_new_free_space(block_group, extent_start, end, NULL);
+ ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL);
if (ret)
goto out;
extent_count++;
@@ -1590,8 +1592,9 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
- ret = add_new_free_space(block_group, key.objectid,
- key.objectid + key.offset, &space_added);
+ ret = btrfs_add_new_free_space(block_group, key.objectid,
+ key.objectid + key.offset,
+ &space_added);
if (ret)
goto out;
total_found += space_added;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 203d2a267828..a523d64d5491 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -46,8 +46,6 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
* Runtime (in-memory) states of filesystem
*/
enum {
- /* Global indicator of serious filesystem errors */
- BTRFS_FS_STATE_ERROR,
/*
* Filesystem is being remounted, allow to skip some operations, like
* defrag
@@ -686,6 +684,12 @@ struct btrfs_fs_info {
bool qgroup_rescan_running;
u8 qgroup_drop_subtree_thres;
+ /*
+ * If this is not 0, a serious filesystem error has happened and this
+ * field holds that error (a negative errno value).
+ */
+ int fs_error;
+
/* Filesystem state */
unsigned long fs_state;
@@ -766,6 +770,9 @@ struct btrfs_fs_info {
u64 data_reloc_bg;
struct mutex zoned_data_reloc_io_lock;
+ struct btrfs_block_group *active_meta_bg;
+ struct btrfs_block_group *active_system_bg;
+
u64 nr_global_roots;
spinlock_t zone_active_bgs_lock;
@@ -962,8 +969,8 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
}
-#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
- &(fs_info)->fs_state)))
+#define BTRFS_FS_ERROR(fs_info) (READ_ONCE((fs_info)->fs_error))
+
#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
&(fs_info)->fs_state)))
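
With this change the global error indicator becomes a plain int that remembers the first recorded errno, and since no lock guards it, the writer and the readers pair WRITE_ONCE() with the READ_ONCE() hidden in the macro above. A minimal sketch of the new contract:

static int fs_error_example(struct btrfs_fs_info *fs_info)
{
	/* Error path (see __btrfs_handle_fs_error() later in this diff):
	 *	WRITE_ONCE(fs_info->fs_error, errno);
	 */

	/* Readers: 0 means healthy, nonzero is the recorded errno. */
	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;
	return 0;
}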
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index aa090b0b5d29..f09fbdc43f0f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -124,11 +124,11 @@ static struct kmem_cache *btrfs_inode_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
-static noinline int cow_file_range(struct btrfs_inode *inode,
- struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock,
- u64 *done_offset);
+
+static noinline int run_delalloc_cow(struct btrfs_inode *inode,
+ struct page *locked_page, u64 start,
+ u64 end, struct writeback_control *wbc,
+ bool pages_dirty);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
@@ -423,11 +423,10 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
while (index <= end_index) {
/*
- * For locked page, we will call end_extent_writepage() on it
- * in run_delalloc_range() for the error handling. That
- * end_extent_writepage() function will call
- * btrfs_mark_ordered_io_finished() to clear page Ordered and
- * run the ordered extent accounting.
+ * For locked page, we will call btrfs_mark_ordered_io_finished()
+ * on it in run_delalloc_range() for the error handling, which
+ * will clear page Ordered and run the ordered extent accounting.
*
* Here we can't just clear the Ordered bit, or
* btrfs_mark_ordered_io_finished() would skip the accounting
@@ -815,24 +814,22 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
}
/*
- * we create compressed extents in two phases. The first
- * phase compresses a range of pages that have already been
- * locked (both pages and state bits are locked).
+ * Work queue callback to start compression on a file and pages.
*
- * This is done inside an ordered work queue, and the compression
- * is spread across many cpus. The actual IO submission is step
- * two, and the ordered work queue takes care of making sure that
- * happens in the same order things were put onto the queue by
- * writepages and friends.
+ * This is done inside an ordered work queue, and the compression is spread
+ * across many cpus. The actual IO submission is step two, and the ordered work
+ * queue takes care of making sure that happens in the same order things were
+ * put onto the queue by writepages and friends.
*
- * If this code finds it can't get good compression, it puts an
- * entry onto the work queue to write the uncompressed bytes. This
- * makes sure that both compressed inodes and uncompressed inodes
- * are written in the same order that the flusher thread sent them
- * down.
+ * If this code finds it can't get good compression, it puts an entry onto the
+ * work queue to write the uncompressed bytes. This makes sure that both
+ * compressed inodes and uncompressed inodes are written in the same order that
+ * the flusher thread sent them down.
*/
-static noinline int compress_file_range(struct async_chunk *async_chunk)
+static void compress_file_range(struct btrfs_work *work)
{
+ struct async_chunk *async_chunk =
+ container_of(work, struct async_chunk, work);
struct btrfs_inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
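
compress_file_range() is now wired up directly as a work-queue callback, so it recovers its context with container_of() from the embedded work item; submit_compressed_extents() and btrfs_writepage_fixup_worker() below get the same treatment. A self-contained sketch of the pattern with hypothetical names:

#include <stddef.h>

struct work { int pending; };

struct async_ctx {
	int payload;
	struct work work;	/* embedded member handed to the queue */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void callback(struct work *w)
{
	/* Map the embedded member back to its enclosing context. */
	struct async_ctx *ctx = container_of(w, struct async_ctx, work);

	ctx->payload++;
}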
@@ -842,19 +839,24 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
u64 actual_end;
u64 i_size;
int ret = 0;
- struct page **pages = NULL;
+ struct page **pages;
unsigned long nr_pages;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
+ unsigned int poff;
int i;
- int will_compress;
int compress_type = fs_info->compress_type;
- int compressed_extents = 0;
- int redirty = 0;
inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
/*
+ * We need to call clear_page_dirty_for_io on each page in the range.
+ * Otherwise applications with the file mmap'd can wander in and change
+ * the page contents while we are compressing them.
+ */
+ extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+
+ /*
* We need to save i_size before now because it could change in between
* us evaluating the size and assigning it. This is because we lock and
* unlock the page in truncate and fallocate, and then modify the i_size
@@ -868,7 +870,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
barrier();
actual_end = min_t(u64, i_size, end + 1);
again:
- will_compress = 0;
+ pages = NULL;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
@@ -912,78 +914,57 @@ again:
ret = 0;
/*
- * we do compression for mount -o compress and when the
- * inode has not been flagged as nocompress. This flag can
- * change at any time if we discover bad compression ratios.
+ * We do compression for mount -o compress and when the inode has not
+ * been flagged as NOCOMPRESS. This flag can change at any time if we
+ * discover bad compression ratios.
*/
- if (inode_need_compress(inode, start, end)) {
- WARN_ON(pages);
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages) {
- /* just bail out to the uncompressed code */
- nr_pages = 0;
- goto cont;
- }
-
- if (inode->defrag_compress)
- compress_type = inode->defrag_compress;
- else if (inode->prop_compress)
- compress_type = inode->prop_compress;
+ if (!inode_need_compress(inode, start, end))
+ goto cleanup_and_bail_uncompressed;
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages) {
/*
- * we need to call clear_page_dirty_for_io on each
- * page in the range. Otherwise applications with the file
- * mmap'd can wander in and change the page contents while
- * we are compressing them.
- *
- * If the compression fails for any reason, we set the pages
- * dirty again later on.
- *
- * Note that the remaining part is redirtied, the start pointer
- * has moved, the end is the original one.
+ * Memory allocation failure is not a fatal error; we can fall
+ * back to the uncompressed code path.
*/
- if (!redirty) {
- extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
- redirty = 1;
- }
+ goto cleanup_and_bail_uncompressed;
+ }
- /* Compression level is applied here and only here */
- ret = btrfs_compress_pages(
- compress_type | (fs_info->compress_level << 4),
- mapping, start,
- pages,
- &nr_pages,
- &total_in,
- &total_compressed);
+ if (inode->defrag_compress)
+ compress_type = inode->defrag_compress;
+ else if (inode->prop_compress)
+ compress_type = inode->prop_compress;
+
+ /* Compression level is applied here. */
+ ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
+ mapping, start, pages, &nr_pages, &total_in,
+ &total_compressed);
+ if (ret)
+ goto mark_incompressible;
- if (!ret) {
- unsigned long offset = offset_in_page(total_compressed);
- struct page *page = pages[nr_pages - 1];
+ /*
+ * Zero the tail end of the last page, as we might be sending it down
+ * to disk.
+ */
+ poff = offset_in_page(total_compressed);
+ if (poff)
+ memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);
- /* zero the tail end of the last page, we might be
- * sending it down to disk
- */
- if (offset)
- memzero_page(page, offset, PAGE_SIZE - offset);
- will_compress = 1;
- }
- }
-cont:
/*
+ * Try to create an inline extent.
+ *
+ * If we didn't compress the entire range, try to create an uncompressed
+ * inline extent, else a compressed one.
+ *
* Check cow_file_range() for why we don't even try to create inline
- * extent for subpage case.
+ * extent for the subpage case.
*/
if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
- /* lets try to make an inline extent */
- if (ret || total_in < actual_end) {
- /* we didn't compress the entire range, try
- * to make an uncompressed inline extent.
- */
- ret = cow_file_range_inline(inode, actual_end,
- 0, BTRFS_COMPRESS_NONE,
- NULL, false);
+ if (total_in < actual_end) {
+ ret = cow_file_range_inline(inode, actual_end, 0,
+ BTRFS_COMPRESS_NONE, NULL,
+ false);
} else {
- /* try making a compressed inline extent */
ret = cow_file_range_inline(inode, actual_end,
total_compressed,
compress_type, pages,
@@ -1013,99 +994,52 @@ cont:
PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
-
- /*
- * Ensure we only free the compressed pages if we have
- * them allocated, as we can still reach here with
- * inode_need_compress() == false.
- */
- if (pages) {
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
- }
- kfree(pages);
- }
- return 0;
+ goto free_pages;
}
}
- if (will_compress) {
- /*
- * we aren't doing an inline extent round the compressed size
- * up to a block size boundary so the allocator does sane
- * things
- */
- total_compressed = ALIGN(total_compressed, blocksize);
+ /*
+ * We aren't doing an inline extent. Round the compressed size up to a
+ * block size boundary so the allocator does sane things.
+ */
+ total_compressed = ALIGN(total_compressed, blocksize);
- /*
- * one last check to make sure the compression is really a
- * win, compare the page count read with the blocks on disk,
- * compression must free at least one sector size
- */
- total_in = round_up(total_in, fs_info->sectorsize);
- if (total_compressed + blocksize <= total_in) {
- compressed_extents++;
+ /*
+ * One last check to make sure the compression is really a win: compare
+ * the page count read with the blocks on disk; compression must free at
+ * least one sector.
+ */
+ total_in = round_up(total_in, fs_info->sectorsize);
+ if (total_compressed + blocksize > total_in)
+ goto mark_incompressible;
- /*
- * The async work queues will take care of doing actual
- * allocation on disk for these compressed pages, and
- * will submit them to the elevator.
- */
- add_async_extent(async_chunk, start, total_in,
- total_compressed, pages, nr_pages,
- compress_type);
-
- if (start + total_in < end) {
- start += total_in;
- pages = NULL;
- cond_resched();
- goto again;
- }
- return compressed_extents;
- }
+ /*
+ * The async work queues will take care of doing actual allocation on
+ * disk for these compressed pages, and will submit the bios.
+ */
+ add_async_extent(async_chunk, start, total_in, total_compressed, pages,
+ nr_pages, compress_type);
+ if (start + total_in < end) {
+ start += total_in;
+ cond_resched();
+ goto again;
}
+ return;
+
+mark_incompressible:
+ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
+ inode->flags |= BTRFS_INODE_NOCOMPRESS;
+cleanup_and_bail_uncompressed:
+ add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
+ BTRFS_COMPRESS_NONE);
+free_pages:
if (pages) {
- /*
- * the compression code ran but failed to make things smaller,
- * free any pages it allocated and our page pointer array
- */
for (i = 0; i < nr_pages; i++) {
WARN_ON(pages[i]->mapping);
put_page(pages[i]);
}
kfree(pages);
- pages = NULL;
- total_compressed = 0;
- nr_pages = 0;
-
- /* flag the file so we don't compress in the future */
- if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
- !(inode->prop_compress)) {
- inode->flags |= BTRFS_INODE_NOCOMPRESS;
- }
- }
-cleanup_and_bail_uncompressed:
- /*
- * No compression, but we still need to write the pages in the file
- * we've been given so far. redirty the locked page if it corresponds
- * to our extent and set things up for the async work queue to run
- * cow_file_range to do the normal delalloc dance.
- */
- if (async_chunk->locked_page &&
- (page_offset(async_chunk->locked_page) >= start &&
- page_offset(async_chunk->locked_page)) <= end) {
- __set_page_dirty_nobuffers(async_chunk->locked_page);
- /* unlocked later on in the async handlers */
}
-
- if (redirty)
- extent_range_redirty_for_io(&inode->vfs_inode, start, end);
- add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
- BTRFS_COMPRESS_NONE);
- compressed_extents++;
-
- return compressed_extents;
}
static void free_async_extent_pages(struct async_extent *async_extent)
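
The reworked win test rounds total_in up to a sector boundary and requires the compressed size plus one block to still fit underneath it, i.e. compression must save at least one full sector. A worked example with illustrative numbers (a 4096-byte block and sector size assumed):

static bool compression_wins_example(void)
{
	unsigned long blocksize = 4096;
	unsigned long total_in = 8000;		/* uncompressed bytes read */
	unsigned long total_compressed = 4500;	/* bytes after compression */

	total_in = round_up(total_in, 4096);	/* 8000 -> 8192 */
	/* 4500 + 4096 = 8596 > 8192: less than one sector saved, so this
	 * range goes down the mark_incompressible path. At 4096 or fewer
	 * compressed bytes the test would pass.
	 */
	return total_compressed + blocksize <= total_in;
}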
@@ -1124,14 +1058,12 @@ static void free_async_extent_pages(struct async_extent *async_extent)
async_extent->pages = NULL;
}
-static int submit_uncompressed_range(struct btrfs_inode *inode,
- struct async_extent *async_extent,
- struct page *locked_page)
+static void submit_uncompressed_range(struct btrfs_inode *inode,
+ struct async_extent *async_extent,
+ struct page *locked_page)
{
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
- unsigned long nr_written = 0;
- int page_started = 0;
int ret;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
@@ -1140,45 +1072,33 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
.no_cgroup_owner = 1,
};
- /*
- * Call cow_file_range() to run the delalloc range directly, since we
- * won't go to NOCOW or async path again.
- *
- * Also we call cow_file_range() with @unlock_page == 0, so that we
- * can directly submit them without interruption.
- */
- ret = cow_file_range(inode, locked_page, start, end, &page_started,
- &nr_written, 0, NULL);
- /* Inline extent inserted, page gets unlocked and everything is done */
- if (page_started)
- return 0;
-
+ wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
+ ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
+ wbc_detach_inode(&wbc);
if (ret < 0) {
btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
if (locked_page) {
const u64 page_start = page_offset(locked_page);
- const u64 page_end = page_start + PAGE_SIZE - 1;
set_page_writeback(locked_page);
end_page_writeback(locked_page);
- end_extent_writepage(locked_page, ret, page_start, page_end);
+ btrfs_mark_ordered_io_finished(inode, locked_page,
+ page_start, PAGE_SIZE,
+ !ret);
+ btrfs_page_clear_uptodate(inode->root->fs_info,
+ locked_page, page_start,
+ PAGE_SIZE);
+ mapping_set_error(locked_page->mapping, ret);
unlock_page(locked_page);
}
- return ret;
}
-
- /* All pages will be unlocked, including @locked_page */
- wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
- ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc);
- wbc_detach_inode(&wbc);
- return ret;
}
-static int submit_one_async_extent(struct btrfs_inode *inode,
- struct async_chunk *async_chunk,
- struct async_extent *async_extent,
- u64 *alloc_hint)
+static void submit_one_async_extent(struct async_chunk *async_chunk,
+ struct async_extent *async_extent,
+ u64 *alloc_hint)
{
+ struct btrfs_inode *inode = async_chunk->inode;
struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1206,9 +1126,8 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
}
lock_extent(io_tree, start, end, NULL);
- /* We have fall back to uncompressed write */
- if (!async_extent->pages) {
- ret = submit_uncompressed_range(inode, async_extent, locked_page);
+ if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
+ submit_uncompressed_range(inode, async_extent, locked_page);
goto done;
}
@@ -1217,7 +1136,6 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
async_extent->compressed_size,
0, *alloc_hint, &ins, 1, 1);
if (ret) {
- free_async_extent_pages(async_extent);
/*
* Here we used to try again by going back to non-compressed
* path for ENOSPC. But we can't reserve space even for
@@ -1272,7 +1190,7 @@ done:
if (async_chunk->blkcg_css)
kthread_associate_blkcg(NULL);
kfree(async_extent);
- return ret;
+ return;
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1286,39 +1204,13 @@ out_free:
PAGE_UNLOCK | PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
free_async_extent_pages(async_extent);
- goto done;
-}
-
-/*
- * Phase two of compressed writeback. This is the ordered portion of the code,
- * which only gets called in the order the work was queued. We walk all the
- * async extents created by compress_file_range and send them down to the disk.
- */
-static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
-{
- struct btrfs_inode *inode = async_chunk->inode;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct async_extent *async_extent;
- u64 alloc_hint = 0;
- int ret = 0;
-
- while (!list_empty(&async_chunk->extents)) {
- u64 extent_start;
- u64 ram_size;
-
- async_extent = list_entry(async_chunk->extents.next,
- struct async_extent, list);
- list_del(&async_extent->list);
- extent_start = async_extent->start;
- ram_size = async_extent->ram_size;
-
- ret = submit_one_async_extent(inode, async_chunk, async_extent,
- &alloc_hint);
- btrfs_debug(fs_info,
+ if (async_chunk->blkcg_css)
+ kthread_associate_blkcg(NULL);
+ btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
- inode->root->root_key.objectid,
- btrfs_ino(inode), extent_start, ram_size, ret);
- }
+ root->root_key.objectid, btrfs_ino(inode), start,
+ async_extent->ram_size, ret);
+ kfree(async_extent);
}
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
@@ -1362,25 +1254,18 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* locked_page is the page that writepage had locked already. We use
* it to make sure we don't do extra locks or unlocks.
*
- * *page_started is set to one if we unlock locked_page and do everything
- * required to start IO on it. It may be clean and already done with
- * IO when we return.
- *
- * When unlock == 1, we unlock the pages in successfully allocated regions.
- * When unlock == 0, we leave them locked for writing them out.
+ * When this function fails, it unlocks all pages except @locked_page.
*
- * However, we unlock all the pages except @locked_page in case of failure.
+ * When this function successfully creates an inline extent, it returns 1 and
+ * unlocks all pages including locked_page and starts I/O on them.
+ * (In reality inline extents are limited to a single page, so locked_page is
+ * the only page handled anyway).
*
- * In summary, page locking state will be as follow:
+ * When this function succeeds and creates a normal extent, the page locking
+ * status depends on the passed-in flags:
*
- * - page_started == 1 (return value)
- * - All the pages are unlocked. IO is started.
- * - Note that this can happen only on success
- * - unlock == 1
- * - All the pages except @locked_page are unlocked in any case
- * - unlock == 0
- * - On success, all the pages are locked for writing out them
- * - On failure, all the pages except @locked_page are unlocked
+ * - If @keep_locked is set, all pages are kept locked.
+ * - Else all pages except for @locked_page are unlocked.
*
* When a failure happens in the second or later iteration of the
* while-loop, the ordered extents created in previous iterations are kept
@@ -1389,10 +1274,9 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* example.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
- struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock,
- u64 *done_offset)
+ struct page *locked_page, u64 start, u64 end,
+ u64 *done_offset,
+ bool keep_locked, bool no_inline)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1431,7 +1315,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* This means we can trigger inline extent even if we didn't want to.
* So here we skip inline extent creation completely.
*/
- if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
end + 1);
@@ -1451,9 +1335,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
- *nr_written = *nr_written +
- (end - start + PAGE_SIZE) / PAGE_SIZE;
- *page_started = 1;
/*
* locked_page is locked by the caller of
* writepage_delalloc(), not locked by
@@ -1463,11 +1344,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* as it doesn't have any subpage::writers recorded.
*
* Here we manually unlock the page, since the caller
- * can't use page_started to determine if it's an
- * inline extent or a compressed extent.
+ * can't determine if it's an inline extent or a
+ * compressed extent.
*/
unlock_page(locked_page);
- goto out;
+ ret = 1;
+ goto done;
} else if (ret < 0) {
goto out_unlock;
}
@@ -1498,6 +1380,31 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
+ if (ret == -EAGAIN) {
+ /*
+ * btrfs_reserve_extent only returns -EAGAIN for zoned
+ * file systems, which is an indication that there are
+ * no active zones to allocate from at the moment.
+ *
+ * If this is the first loop iteration, wait for at
+ * least one zone to finish before retrying the
+ * allocation. Otherwise ask the caller to write out
+ * the already allocated blocks before coming back to
+ * us, or return -ENOSPC if it can't handle retries.
+ */
+ ASSERT(btrfs_is_zoned(fs_info));
+ if (start == orig_start) {
+ wait_on_bit_io(&inode->root->fs_info->flags,
+ BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ continue;
+ }
+ if (done_offset) {
+ *done_offset = start - 1;
+ return 0;
+ }
+ ret = -ENOSPC;
+ }
if (ret < 0)
goto out_unlock;
cur_alloc_size = ins.offset;
@@ -1558,7 +1465,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* Do set the Ordered (Private2) bit so we know this page was
* properly setup for writepage.
*/
- page_ops = unlock ? PAGE_UNLOCK : 0;
+ page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
@@ -1581,7 +1488,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret)
goto out_unlock;
}
-out:
+done:
+ if (done_offset)
+ *done_offset = end;
return ret;
out_drop_extent_cache:
@@ -1591,21 +1500,6 @@ out_reserve:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
/*
- * If done_offset is non-NULL and ret == -EAGAIN, we expect the
- * caller to write out the successfully allocated region and retry.
- */
- if (done_offset && ret == -EAGAIN) {
- if (orig_start < start)
- *done_offset = start - 1;
- else
- *done_offset = start;
- return ret;
- } else if (ret == -EAGAIN) {
- /* Convert to -ENOSPC since the caller cannot retry. */
- ret = -ENOSPC;
- }
-
- /*
* Now, we have three regions to clean up:
*
* |-------(1)----|---(2)---|-------------(3)----------|
@@ -1627,10 +1521,10 @@ out_unlock:
* EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
* function.
*
- * However, in case of unlock == 0, we still need to unlock the pages
+ * However, in case of @keep_locked, we still need to unlock the pages
* (except @locked_page) to ensure all the pages are unlocked.
*/
- if (!unlock && orig_start < start) {
+ if (keep_locked && orig_start < start) {
if (!locked_page)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
@@ -1671,43 +1565,28 @@ out_unlock:
}
/*
- * work queue call back to started compression on a file and pages
- */
-static noinline void async_cow_start(struct btrfs_work *work)
-{
- struct async_chunk *async_chunk;
- int compressed_extents;
-
- async_chunk = container_of(work, struct async_chunk, work);
-
- compressed_extents = compress_file_range(async_chunk);
- if (compressed_extents == 0) {
- btrfs_add_delayed_iput(async_chunk->inode);
- async_chunk->inode = NULL;
- }
-}
-
-/*
- * work queue call back to submit previously compressed pages
+ * Phase two of compressed writeback. This is the ordered portion of the code,
+ * which only gets called in the order the work was queued. We walk all the
+ * async extents created by compress_file_range and send them down to the disk.
*/
-static noinline void async_cow_submit(struct btrfs_work *work)
+static noinline void submit_compressed_extents(struct btrfs_work *work)
{
struct async_chunk *async_chunk = container_of(work, struct async_chunk,
work);
struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
+ struct async_extent *async_extent;
unsigned long nr_pages;
+ u64 alloc_hint = 0;
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /*
- * ->inode could be NULL if async_chunk_start has failed to compress,
- * in which case we don't have anything to submit, yet we need to
- * always adjust ->async_delalloc_pages as its paired with the init
- * happening in run_delalloc_compressed
- */
- if (async_chunk->inode)
- submit_compressed_extents(async_chunk);
+ while (!list_empty(&async_chunk->extents)) {
+ async_extent = list_entry(async_chunk->extents.next,
+ struct async_extent, list);
+ list_del(&async_extent->list);
+ submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
+ }
/* atomic_sub_return implies a barrier */
if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
@@ -1721,8 +1600,7 @@ static noinline void async_cow_free(struct btrfs_work *work)
struct async_cow *async_cow;
async_chunk = container_of(work, struct async_chunk, work);
- if (async_chunk->inode)
- btrfs_add_delayed_iput(async_chunk->inode);
+ btrfs_add_delayed_iput(async_chunk->inode);
if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
@@ -1732,10 +1610,8 @@ static noinline void async_cow_free(struct btrfs_work *work)
}
static bool run_delalloc_compressed(struct btrfs_inode *inode,
- struct writeback_control *wbc,
- struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written)
+ struct page *locked_page, u64 start,
+ u64 end, struct writeback_control *wbc)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
@@ -1809,65 +1685,42 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
async_chunk[i].blkcg_css = NULL;
}
- btrfs_init_work(&async_chunk[i].work, async_cow_start,
- async_cow_submit, async_cow_free);
+ btrfs_init_work(&async_chunk[i].work, compress_file_range,
+ submit_compressed_extents, async_cow_free);
nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
atomic_add(nr_pages, &fs_info->async_delalloc_pages);
btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
- *nr_written += nr_pages;
start = cur_end + 1;
}
- *page_started = 1;
return true;
}
-static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
- struct page *locked_page, u64 start,
- u64 end, int *page_started,
- unsigned long *nr_written,
- struct writeback_control *wbc)
+/*
+ * Run the delalloc range from start to end, and write back any dirty pages
+ * covered by the range.
+ */
+static noinline int run_delalloc_cow(struct btrfs_inode *inode,
+ struct page *locked_page, u64 start,
+ u64 end, struct writeback_control *wbc,
+ bool pages_dirty)
{
u64 done_offset = end;
int ret;
- bool locked_page_done = false;
while (start <= end) {
- ret = cow_file_range(inode, locked_page, start, end, page_started,
- nr_written, 0, &done_offset);
- if (ret && ret != -EAGAIN)
+ ret = cow_file_range(inode, locked_page, start, end, &done_offset,
+ true, false);
+ if (ret)
return ret;
-
- if (*page_started) {
- ASSERT(ret == 0);
- return 0;
- }
-
- if (ret == 0)
- done_offset = end;
-
- if (done_offset == start) {
- wait_on_bit_io(&inode->root->fs_info->flags,
- BTRFS_FS_NEED_ZONE_FINISH,
- TASK_UNINTERRUPTIBLE);
- continue;
- }
-
- if (!locked_page_done) {
- __set_page_dirty_nobuffers(locked_page);
- account_page_redirty(locked_page);
- }
- locked_page_done = true;
- extent_write_locked_range(&inode->vfs_inode, start, done_offset,
- wbc);
+ extent_write_locked_range(&inode->vfs_inode, locked_page, start,
+ done_offset, wbc, pages_dirty);
start = done_offset + 1;
}
- *page_started = 1;
-
- return 0;
+ return 1;
}
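
run_delalloc_cow() makes forward progress in chunks: each pass COWs what it can, writes that region back, and resumes just past done_offset, so a zoned allocation stall only delays the remainder of the range. A compact sketch of the chunking pattern, where cow_chunk() and flush_range() are hypothetical stand-ins for cow_file_range() and extent_write_locked_range():

while (start <= end) {
	u64 done = end;
	int ret = cow_chunk(start, end, &done);	/* may stop short of end */

	if (ret)
		return ret;
	flush_range(start, done);	/* write back the finished part */
	start = done + 1;		/* resume after what was flushed */
}
return 1;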
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
@@ -1894,8 +1747,7 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
}
static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
- const u64 start, const u64 end,
- int *page_started, unsigned long *nr_written)
+ const u64 start, const u64 end)
{
const bool is_space_ino = btrfs_is_free_space_inode(inode);
const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
@@ -1903,6 +1755,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
struct extent_io_tree *io_tree = &inode->io_tree;
u64 range_start = start;
u64 count;
+ int ret;
/*
* If EXTENT_NORESERVE is set it means that when the buffered write was
@@ -1955,8 +1808,14 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
NULL);
}
- return cow_file_range(inode, locked_page, start, end, page_started,
- nr_written, 1, NULL);
+ /*
+ * Don't try to create inline extents, as a mix of inline extent that
+ * is written out and unlocked directly and a normal NOCOW extent
+ * doesn't work.
+ */
+ ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
+ ASSERT(ret != 1);
+ return ret;
}
struct can_nocow_file_extent_args {
@@ -2105,9 +1964,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
*/
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct page *locked_page,
- const u64 start, const u64 end,
- int *page_started,
- unsigned long *nr_written)
+ const u64 start, const u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@@ -2117,25 +1974,26 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
int ret;
bool check_prev = true;
u64 ino = btrfs_ino(inode);
- struct btrfs_block_group *bg;
- bool nocow = false;
struct can_nocow_file_extent_args nocow_args = { 0 };
+ /*
+ * Normally on a zoned device we're only doing COW writes, but
+ * relocation on a zoned filesystem serializes I/O, so we're only
+ * writing sequentially and can end up here as well.
+ */
+ ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
+
path = btrfs_alloc_path();
if (!path) {
- extent_clear_unlock_delalloc(inode, start, end, locked_page,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, PAGE_UNLOCK |
- PAGE_START_WRITEBACK |
- PAGE_END_WRITEBACK);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto error;
}
nocow_args.end = end;
nocow_args.writeback_path = true;
while (1) {
+ struct btrfs_block_group *nocow_bg = NULL;
struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
@@ -2146,8 +2004,6 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
int extent_type;
bool is_prealloc;
- nocow = false;
-
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0);
if (ret < 0)
@@ -2172,11 +2028,8 @@ next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
+ if (ret < 0)
goto error;
- }
if (ret > 0)
break;
leaf = path->nodes[0];
@@ -2209,7 +2062,7 @@ next_slot:
if (found_key.offset > cur_offset) {
extent_end = found_key.offset;
extent_type = 0;
- goto out_check;
+ goto must_cow;
}
/*
@@ -2239,24 +2092,22 @@ next_slot:
nocow_args.start = cur_offset;
ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
- if (ret < 0) {
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
+ if (ret < 0)
goto error;
- } else if (ret == 0) {
- goto out_check;
- }
+ if (ret == 0)
+ goto must_cow;
ret = 0;
- bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
- if (bg)
- nocow = true;
-out_check:
- /*
- * If nocow is false then record the beginning of the range
- * that needs to be COWed
- */
- if (!nocow) {
+ nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
+ if (!nocow_bg) {
+must_cow:
+ /*
+ * If we can't perform NOCOW writeback for the range,
+ * then record the beginning of the range that needs to
+ * be COWed. It will be written out before the next
+ * NOCOW range if we find one, or when exiting this
+ * loop.
+ */
if (cow_start == (u64)-1)
cow_start = cur_offset;
cur_offset = extent_end;
@@ -2275,11 +2126,12 @@ out_check:
*/
if (cow_start != (u64)-1) {
ret = fallback_to_cow(inode, locked_page,
- cow_start, found_key.offset - 1,
- page_started, nr_written);
- if (ret)
- goto error;
+ cow_start, found_key.offset - 1);
cow_start = (u64)-1;
+ if (ret) {
+ btrfs_dec_nocow_writers(nocow_bg);
+ goto error;
+ }
}
nocow_end = cur_offset + nocow_args.num_bytes - 1;
@@ -2296,6 +2148,7 @@ out_check:
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
+ btrfs_dec_nocow_writers(nocow_bg);
ret = PTR_ERR(em);
goto error;
}
@@ -2309,6 +2162,7 @@ out_check:
? (1 << BTRFS_ORDERED_PREALLOC)
: (1 << BTRFS_ORDERED_NOCOW),
BTRFS_COMPRESS_NONE);
+ btrfs_dec_nocow_writers(nocow_bg);
if (IS_ERR(ordered)) {
if (is_prealloc) {
btrfs_drop_extent_map_range(inode, cur_offset,
@@ -2318,11 +2172,6 @@ out_check:
goto error;
}
- if (nocow) {
- btrfs_dec_nocow_writers(bg);
- nocow = false;
- }
-
if (btrfs_is_data_reloc_root(root))
/*
* Error handled later, as we must prevent
@@ -2357,17 +2206,24 @@ out_check:
if (cow_start != (u64)-1) {
cur_offset = end;
- ret = fallback_to_cow(inode, locked_page, cow_start, end,
- page_started, nr_written);
+ ret = fallback_to_cow(inode, locked_page, cow_start, end);
+ cow_start = (u64)-1;
if (ret)
goto error;
}
-error:
- if (nocow)
- btrfs_dec_nocow_writers(bg);
+ btrfs_free_path(path);
+ return 0;
- if (ret && cur_offset < end)
+error:
+ /*
+ * If an error happened while a COW region is outstanding, cur_offset
+ * needs to be reset to cow_start to ensure the COW region is unlocked
+ * as well.
+ */
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ if (cur_offset < end)
extent_clear_unlock_delalloc(inode, cur_offset, end,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DEFRAG |
@@ -2395,49 +2251,37 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
* being touched for the first time.
*/
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
- u64 start, u64 end, int *page_started, unsigned long *nr_written,
- struct writeback_control *wbc)
+ u64 start, u64 end, struct writeback_control *wbc)
{
- int ret = 0;
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
+ int ret;
/*
- * The range must cover part of the @locked_page, or the returned
- * @page_started can confuse the caller.
+ * The range must cover part of the @locked_page, or a return of 1
+ * can confuse the caller.
*/
ASSERT(!(end <= page_offset(locked_page) ||
start >= page_offset(locked_page) + PAGE_SIZE));
if (should_nocow(inode, start, end)) {
- /*
- * Normally on a zoned device we're only doing COW writes, but
- * in case of relocation on a zoned filesystem we have taken
- * precaution, that we're only writing sequentially. It's safe
- * to use run_delalloc_nocow() here, like for regular
- * preallocated inodes.
- */
- ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
- ret = run_delalloc_nocow(inode, locked_page, start, end,
- page_started, nr_written);
+ ret = run_delalloc_nocow(inode, locked_page, start, end);
goto out;
}
if (btrfs_inode_can_compress(inode) &&
inode_need_compress(inode, start, end) &&
- run_delalloc_compressed(inode, wbc, locked_page, start,
- end, page_started, nr_written))
- goto out;
+ run_delalloc_compressed(inode, locked_page, start, end, wbc))
+ return 1;
if (zoned)
- ret = run_delalloc_zoned(inode, locked_page, start, end,
- page_started, nr_written, wbc);
+ ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
+ true);
else
- ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1, NULL);
+ ret = cow_file_range(inode, locked_page, start, end, NULL,
+ false, false);
out:
- ASSERT(ret <= 0);
- if (ret)
+ if (ret < 0)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
end - start + 1);
return ret;
@@ -2840,23 +2684,19 @@ struct btrfs_writepage_fixup {
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
- struct btrfs_writepage_fixup *fixup;
+ struct btrfs_writepage_fixup *fixup =
+ container_of(work, struct btrfs_writepage_fixup, work);
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
- struct page *page;
- struct btrfs_inode *inode;
- u64 page_start;
- u64 page_end;
+ struct page *page = fixup->page;
+ struct btrfs_inode *inode = fixup->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 page_start = page_offset(page);
+ u64 page_end = page_offset(page) + PAGE_SIZE - 1;
int ret = 0;
bool free_delalloc_space = true;
- fixup = container_of(work, struct btrfs_writepage_fixup, work);
- page = fixup->page;
- inode = fixup->inode;
- page_start = page_offset(page);
- page_end = page_offset(page) + PAGE_SIZE - 1;
-
/*
* This is similar to page_mkwrite, we need to reserve the space before
* we take the page lock.
@@ -2949,10 +2789,12 @@ out_page:
* to reflect the errors and clean the page.
*/
mapping_set_error(page->mapping, ret);
- end_extent_writepage(page, ret, page_start, page_end);
+ btrfs_mark_ordered_io_finished(inode, page, page_start,
+ PAGE_SIZE, !ret);
+ btrfs_page_clear_uptodate(fs_info, page, page_start, PAGE_SIZE);
clear_page_dirty_for_io(page);
}
- btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
+ btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -3359,6 +3201,13 @@ out:
btrfs_free_reserved_extent(fs_info,
ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes, 1);
+ /*
+ * Actually free the qgroup rsv which was released when
+ * the ordered extent was created.
+ */
+ btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
+ ordered_extent->qgroup_rsv,
+ BTRFS_QGROUP_RSV_DATA);
}
}
@@ -3384,15 +3233,6 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
return btrfs_finish_one_ordered(ordered);
}
-void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
- struct page *page, u64 start,
- u64 end, bool uptodate)
-{
- trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
-
- btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate);
-}
-
/*
* Verify the checksum for a single sector without any extra action that depend
* on the type of I/O.
@@ -3662,9 +3502,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
*/
if (found_key.offset == last_objectid) {
+ /*
+ * We found the same inode as before. This means we were
+ * not able to remove its items via eviction triggered
+ * by an iput(). A transaction abort may have happened,
+ * due to -ENOSPC for example, so try to grab the error
+ * that led to the transaction abort, if any.
+ */
btrfs_err(fs_info,
"Error removing orphan entry, stopping orphan cleanup");
- ret = -EINVAL;
+ ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
goto out;
}
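
The `?:` above is the GNU "elvis" extension: `a ?: b` evaluates to a when a is nonzero and to b otherwise, evaluating a only once. Orphan cleanup therefore reports the recorded filesystem error when one exists and falls back to -EINVAL:

static int orphan_errno_example(struct btrfs_fs_info *fs_info)
{
	/* GNU extension: a ?: b  ==  a ? a : b, with a evaluated once. */
	return BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
}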
@@ -3917,8 +3764,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
- inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
- inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
+ inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
+ btrfs_timespec_nsec(leaf, &inode_item->ctime));
BTRFS_I(inode)->i_otime.tv_sec =
btrfs_timespec_sec(leaf, &inode_item->otime);
@@ -4089,9 +3936,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
inode->i_mtime.tv_nsec);
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode->i_ctime.tv_sec);
+ inode_get_ctime(inode).tv_sec);
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode->i_ctime.tv_nsec);
+ inode_get_ctime(inode).tv_nsec);
btrfs_set_token_timespec_sec(&token, &item->otime,
BTRFS_I(inode)->i_otime.tv_sec);
@@ -4289,9 +4136,8 @@ err:
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
- dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime;
- dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime;
+ inode_set_ctime_current(&inode->vfs_inode);
+ dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
@@ -4464,8 +4310,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
inode_inc_iversion(&dir->vfs_inode);
- dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode);
- dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime;
+ dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
ret = btrfs_update_inode_fallback(trans, root, dir);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -5115,8 +4960,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (newsize != oldsize) {
inode_inc_iversion(inode);
if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
- inode->i_mtime = current_time(inode);
- inode->i_ctime = inode->i_mtime;
+ inode->i_mtime = inode_set_ctime_current(inode);
}
}
@@ -5738,11 +5582,11 @@ struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root
return btrfs_iget_path(s, ino, root, NULL);
}
-static struct inode *new_simple_dir(struct super_block *s,
+static struct inode *new_simple_dir(struct inode *dir,
struct btrfs_key *key,
struct btrfs_root *root)
{
- struct inode *inode = new_inode(s);
+ struct inode *inode = new_inode(dir->i_sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -5760,10 +5604,11 @@ static struct inode *new_simple_dir(struct super_block *s,
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- inode->i_mtime = current_time(inode);
- inode->i_atime = inode->i_mtime;
- inode->i_ctime = inode->i_mtime;
+ inode->i_mtime = inode_set_ctime_current(inode);
+ inode->i_atime = dir->i_atime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
+ inode->i_uid = dir->i_uid;
+ inode->i_gid = dir->i_gid;
return inode;
}
@@ -5822,7 +5667,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (ret != -ENOENT)
inode = ERR_PTR(ret);
else
- inode = new_simple_dir(dir->i_sb, &location, root);
+ inode = new_simple_dir(dir, &location, root);
} else {
inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
btrfs_put_root(sub_root);
@@ -6007,8 +5852,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
struct btrfs_key found_key;
struct btrfs_path *path;
void *addr;
- struct list_head ins_list;
- struct list_head del_list;
+ LIST_HEAD(ins_list);
+ LIST_HEAD(del_list);
int ret;
char *name_ptr;
int name_len;
@@ -6027,8 +5872,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
addr = private->filldir_buf;
path->reada = READA_FORWARD;
- INIT_LIST_HEAD(&ins_list);
- INIT_LIST_HEAD(&del_list);
put = btrfs_readdir_get_delayed_items(inode, private->last_index,
&ins_list, &del_list);
@@ -6165,8 +6008,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
* This is a copy of file_update_time. We need this so we can return error on
* ENOSPC for updating the inode in the case of file write and mmap writes.
*/
-static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
- int flags)
+static int btrfs_update_time(struct inode *inode, int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
bool dirty = flags & ~S_VERSION;
@@ -6174,14 +6016,7 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
if (btrfs_root_readonly(root))
return -EROFS;
- if (flags & S_VERSION)
- dirty |= inode_maybe_inc_iversion(inode, dirty);
- if (flags & S_CTIME)
- inode->i_ctime = *now;
- if (flags & S_MTIME)
- inode->i_mtime = *now;
- if (flags & S_ATIME)
- inode->i_atime = *now;
+ dirty = inode_update_timestamps(inode, flags);
return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
}
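
btrfs_update_time() now delegates to the VFS helper inode_update_timestamps(), which applies the S_ATIME/S_MTIME/S_CTIME/S_VERSION updates requested in flags and returns the set it actually changed, preserving the meaning of dirty from the hand-rolled version. A hedged sketch of the resulting flow (dirty_example() is a hypothetical stand-in for btrfs_dirty_inode()):

static int update_time_example(struct inode *inode, int flags)
{
	int dirty = inode_update_timestamps(inode, flags);

	/* Only write the inode back if a timestamp really changed. */
	return dirty ? dirty_example(inode) : 0;
}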
@@ -6429,9 +6264,8 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
goto discard;
}
- inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
inode->i_atime = inode->i_mtime;
- inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
/*
@@ -6596,12 +6430,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
* log replay procedure is responsible for setting them to their correct
* values (the ones it had when the fsync was done).
*/
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
- struct timespec64 now = current_time(&parent_inode->vfs_inode);
+ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
+ parent_inode->vfs_inode.i_mtime =
+ inode_set_ctime_current(&parent_inode->vfs_inode);
- parent_inode->vfs_inode.i_mtime = now;
- parent_inode->vfs_inode.i_ctime = now;
- }
ret = btrfs_update_inode(trans, root, parent_inode);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -6741,7 +6573,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
BTRFS_I(inode)->dir_index = 0ULL;
inc_nlink(inode);
inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
@@ -8807,7 +8639,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
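
generic_fillattr() gained a request_mask parameter in this series so the VFS can see which STATX_* fields the caller actually asked for. A sketch of a getattr implementation against the new signature, mirroring the call above (the surrounding function body is illustrative):

static int getattr_example(struct mnt_idmap *idmap, const struct path *path,
			   struct kstat *stat, u32 request_mask,
			   unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);

	generic_fillattr(idmap, request_mask, inode, stat);
	return 0;
}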
@@ -8831,7 +8663,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
- struct timespec64 ctime = current_time(old_inode);
struct btrfs_rename_ctx old_rename_ctx;
struct btrfs_rename_ctx new_rename_ctx;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
@@ -8962,12 +8793,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
inode_inc_iversion(new_inode);
- old_dir->i_mtime = ctime;
- old_dir->i_ctime = ctime;
- new_dir->i_mtime = ctime;
- new_dir->i_ctime = ctime;
- old_inode->i_ctime = ctime;
- new_inode->i_ctime = ctime;
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
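
simple_rename_timestamp() collapses the timestamp updates a rename needs (ctime on both inodes involved, ctime and mtime on both parent directories) into a single helper call, replacing the six assignments removed above. An approximate sketch of its behavior, not the libfs source:

static void rename_timestamp_sketch(struct inode *old_dir, struct dentry *old_d,
				    struct inode *new_dir, struct dentry *new_d)
{
	old_dir->i_mtime = inode_set_ctime_current(old_dir);
	if (new_dir != old_dir)
		new_dir->i_mtime = inode_set_ctime_current(new_dir);
	inode_set_ctime_current(d_inode(old_d));
	if (d_inode(new_d))
		inode_set_ctime_current(d_inode(new_d));
}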
@@ -9231,11 +9057,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
- old_dir->i_mtime = current_time(old_dir);
- old_dir->i_ctime = old_dir->i_mtime;
- new_dir->i_mtime = old_dir->i_mtime;
- new_dir->i_ctime = old_dir->i_mtime;
- old_inode->i_ctime = old_dir->i_mtime;
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
@@ -9257,7 +9079,6 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (new_inode) {
inode_inc_iversion(new_inode);
- new_inode->i_ctime = current_time(new_inode);
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
@@ -9390,14 +9211,11 @@ static int start_delalloc_inodes(struct btrfs_root *root,
struct btrfs_inode *binode;
struct inode *inode;
struct btrfs_delalloc_work *work, *next;
- struct list_head works;
- struct list_head splice;
+ LIST_HEAD(works);
+ LIST_HEAD(splice);
int ret = 0;
bool full_flush = wbc->nr_to_write == LONG_MAX;
- INIT_LIST_HEAD(&works);
- INIT_LIST_HEAD(&splice);
-
mutex_lock(&root->delalloc_mutex);
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
@@ -9485,14 +9303,12 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
.range_end = LLONG_MAX,
};
struct btrfs_root *root;
- struct list_head splice;
+ LIST_HEAD(splice);
int ret;
if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
- INIT_LIST_HEAD(&splice);
-
mutex_lock(&fs_info->delalloc_root_mutex);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
@@ -9797,7 +9613,7 @@ next:
*alloc_hint = ins.objectid + ins.offset;
inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(actual_len > inode->i_size) &&
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a895d105464b..a18ee7b5a166 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -384,7 +384,7 @@ update_flags:
binode->flags = binode_flags;
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out_end_trans:
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 23fc11af498a..7695decc7243 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -10,14 +10,13 @@
#ifdef CONFIG_PRINTK
#define STATE_STRING_PREFACE ": state "
-#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
+#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1)
/*
* Characters to print to indicate error conditions or uncommon filesystem state.
* RO is not an error.
*/
static const char fs_state_chars[] = {
- [BTRFS_FS_STATE_ERROR] = 'E',
[BTRFS_FS_STATE_REMOUNTING] = 'M',
[BTRFS_FS_STATE_RO] = 0,
[BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
@@ -37,6 +36,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
curr += sizeof(STATE_STRING_PREFACE) - 1;
+ if (BTRFS_FS_ERROR(info)) {
+ *curr++ = 'E';
+ states_printed = true;
+ }
+
for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
@@ -155,7 +159,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* Today we only save the error info to memory. Long term we'll also
* send it down to the disk.
*/
- set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+ WRITE_ONCE(fs_info->fs_error, errno);
/* Don't go through full error handling during mount. */
if (!(sb->s_flags & SB_BORN))
@@ -252,12 +256,6 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt,
}
#endif
-void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info)
-{
- btrfs_err(fs_info,
-"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel");
-}
-
#if BITS_PER_LONG == 32
void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
{
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index deedc1a168e2..1ae6f8e23e07 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -181,8 +181,6 @@ do { \
#define ASSERT(expr) (void)(expr)
#endif
-void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info);
-
__printf(5, 6)
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a629532283bc..b46ab348e8e5 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -410,6 +410,10 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
unsigned long flags;
u64 cur = file_offset;
+ trace_btrfs_writepage_end_io_hook(inode, file_offset,
+ file_offset + num_bytes - 1,
+ uptodate);
+
spin_lock_irqsave(&tree->lock, flags);
while (cur < file_offset + num_bytes) {
u64 entry_end;
@@ -736,11 +740,9 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len)
{
struct btrfs_root *root;
- struct list_head splice;
+ LIST_HEAD(splice);
u64 done;
- INIT_LIST_HEAD(&splice);
-
mutex_lock(&fs_info->ordered_operations_mutex);
spin_lock(&fs_info->ordered_root_lock);
list_splice_init(&fs_info->ordered_roots, &splice);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index aa06d9ca911d..0c93439e929f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -95,8 +95,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type
int ref_index = 0;
if (unlikely(item_size < sizeof(*ei))) {
- btrfs_print_v0_err(eb->fs_info);
- btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
+ btrfs_err(eb->fs_info,
+ "unexpected extent item size, has %u expect >= %zu",
+ item_size, sizeof(*ei));
+ btrfs_handle_fs_error(eb->fs_info, -EUCLEAN, NULL);
}
ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
@@ -291,10 +293,6 @@ void btrfs_print_leaf(const struct extent_buffer *l)
btrfs_file_extent_num_bytes(l, fi),
btrfs_file_extent_ram_bytes(l, fi));
break;
- case BTRFS_EXTENT_REF_V0_KEY:
- btrfs_print_v0_err(fs_info);
- btrfs_handle_fs_error(fs_info, -EINVAL, NULL);
- break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
bi = btrfs_item_ptr(l, i,
struct btrfs_block_group_item);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 2637d6b157ff..b99230db3c82 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3590,15 +3590,16 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
* going to clear all tracking information for a clean start.
*/
- trans = btrfs_join_transaction(fs_info->fs_root);
- if (IS_ERR(trans)) {
+ trans = btrfs_attach_transaction_barrier(fs_info->fs_root);
+ if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
return PTR_ERR(trans);
- }
- ret = btrfs_commit_transaction(trans);
- if (ret) {
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
- return ret;
+ } else if (trans != ERR_PTR(-ENOENT)) {
+ ret = btrfs_commit_transaction(trans);
+ if (ret) {
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ return ret;
+ }
}
qgroup_rescan_zero_tracking(fs_info);
@@ -3757,9 +3758,11 @@ static int try_flush_qgroup(struct btrfs_root *root)
goto out;
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
- trans = btrfs_join_transaction(root);
+ trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ if (ret == -ENOENT)
+ ret = 0;
goto out;
}
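
Both hunks above move from btrfs_join_transaction() to btrfs_attach_transaction_barrier(), which returns -ENOENT when no transaction is running; these callers now treat that as "nothing to commit" rather than an error. A sketch of the pattern, not a verbatim copy of either caller:

	static int commit_current_transaction_sketch(struct btrfs_root *root)
	{
		struct btrfs_trans_handle *trans;

		trans = btrfs_attach_transaction_barrier(root);
		if (IS_ERR(trans)) {
			/* No running transaction: nothing to commit. */
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			return PTR_ERR(trans);
		}
		return btrfs_commit_transaction(trans);
	}
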
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0249ea52bb80..3e014b9370a3 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -584,8 +584,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
return 0;
- if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
- last->operation == BTRFS_RBIO_READ_REBUILD)
+ if (last->operation == BTRFS_RBIO_READ_REBUILD)
return 0;
return 1;
@@ -784,10 +783,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
spin_unlock(&rbio->bio_list_lock);
spin_unlock(&h->lock);
- if (next->operation == BTRFS_RBIO_READ_REBUILD)
- start_async_work(next, recover_rbio_work_locked);
- else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
- steal_rbio(rbio, next);
+ if (next->operation == BTRFS_RBIO_READ_REBUILD) {
start_async_work(next, recover_rbio_work_locked);
} else if (next->operation == BTRFS_RBIO_WRITE) {
steal_rbio(rbio, next);
@@ -1517,11 +1513,11 @@ static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
while ((bio = bio_list_pop(bio_list))) {
bio->bi_end_io = raid_wait_read_end_io;
- if (trace_raid56_scrub_read_recover_enabled()) {
+ if (trace_raid56_read_enabled()) {
struct raid56_bio_trace_info trace_info = { 0 };
bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
+ trace_raid56_read(rbio, bio, &trace_info);
}
submit_bio(bio);
}
@@ -1698,8 +1694,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
* If we're rebuilding a read, we have to use pages from the
* bio list if possible.
*/
- if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
} else {
sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
@@ -1763,8 +1758,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
* If we're rebuilding a read, we have to use pages from the
* bio list if possible.
*/
- if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
} else {
sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
@@ -1897,8 +1891,7 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)
goto out;
}
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
spin_lock(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
spin_unlock(&rbio->bio_list_lock);
@@ -2112,8 +2105,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio)
goto error;
}
- ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
- rbio->csum_buf, rbio->csum_bitmap, false);
+ ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
+ rbio->csum_buf, rbio->csum_bitmap);
if (ret < 0)
goto error;
if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
@@ -2198,11 +2191,11 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio,
while ((bio = bio_list_pop(bio_list))) {
bio->bi_end_io = raid_wait_write_end_io;
- if (trace_raid56_write_stripe_enabled()) {
+ if (trace_raid56_write_enabled()) {
struct raid56_bio_trace_info trace_info = { 0 };
bio_get_trace_info(rbio, bio, &trace_info);
- trace_raid56_write_stripe(rbio, bio, &trace_info);
+ trace_raid56_write(rbio, bio, &trace_info);
}
submit_bio(bio);
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 0e84c9c9293f..45e6ff78316f 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -14,7 +14,6 @@ enum btrfs_rbio_ops {
BTRFS_RBIO_WRITE,
BTRFS_RBIO_READ_REBUILD,
BTRFS_RBIO_PARITY_SCRUB,
- BTRFS_RBIO_REBUILD_MISSING,
};
struct btrfs_raid_bio {
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 0474bbe39da7..65d2bd6910f2 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -30,8 +30,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
inode_inc_iversion(inode);
if (!no_time_update) {
- inode->i_mtime = current_time(inode);
- inode->i_ctime = inode->i_mtime;
+ inode->i_mtime = inode_set_ctime_current(inode);
}
/*
* We round up to the block size at eof when determining which
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 46c3c1d57266..9951a0caf5bb 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3006,9 +3006,6 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
if (!page)
return -ENOMEM;
}
- ret = set_page_extent_mapped(page);
- if (ret < 0)
- goto release_page;
if (PageReadahead(page))
page_cache_async_readahead(inode->i_mapping, ra, NULL,
@@ -3024,6 +3021,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
}
}
+ /*
+ * We could have lost page private when we dropped the lock to read the
+ * page above, so call set_page_extent_mapped() here to make sure all
+ * of the subpage blocksize state we need is in place.
+ */
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto release_page;
+
page_start = page_offset(page);
page_end = page_start + PAGE_SIZE - 1;
@@ -3250,12 +3256,13 @@ static int add_tree_block(struct reloc_control *rc,
if (type == BTRFS_TREE_BLOCK_REF_KEY)
owner = btrfs_extent_inline_ref_offset(eb, iref);
}
- } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
- btrfs_print_v0_err(eb->fs_info);
- btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
- return -EINVAL;
} else {
- BUG();
+ btrfs_print_leaf(eb);
+ btrfs_err(rc->block_group->fs_info,
+ "unrecognized tree backref at tree block %llu slot %u",
+ eb->start, path->slots[0]);
+ btrfs_release_path(path);
+ return -EUCLEAN;
}
btrfs_release_path(path);
@@ -3498,6 +3505,8 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
last = rc->block_group->start + rc->block_group->length;
while (1) {
+ bool block_found;
+
cond_resched();
if (rc->search_start >= last) {
ret = 1;
@@ -3548,11 +3557,11 @@ next:
goto next;
}
- ret = find_first_extent_bit(&rc->processed_blocks,
- key.objectid, &start, &end,
- EXTENT_DIRTY, NULL);
+ block_found = find_first_extent_bit(&rc->processed_blocks,
+ key.objectid, &start, &end,
+ EXTENT_DIRTY, NULL);
- if (ret == 0 && start <= key.objectid) {
+ if (block_found && start <= key.objectid) {
btrfs_release_path(path);
rc->search_start = end + 1;
} else {
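
The block_found conversion above (and the matching while-loop changes in fs/btrfs/transaction.c later in this diff) reflects find_first_extent_bit() switching from an int return where 0 meant "found" to a bool where true means "found". A sketch of the new calling convention, assuming that signature:

	/* Sketch: walk every EXTENT_DIRTY range under the bool convention. */
	static void walk_dirty_ranges_sketch(struct extent_io_tree *tree)
	{
		u64 start = 0;
		u64 end;

		while (find_first_extent_bit(tree, start, &start, &end,
					     EXTENT_DIRTY, NULL)) {
			/* [start, end] is a dirty range; process it here. */
			start = end + 1;
		}
	}
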
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 7289f5bff397..b877203f1dc5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -43,9 +43,20 @@ struct scrub_ctx;
/*
* The following value only influences the performance.
*
- * This determines the batch size for stripe submitted in one go.
+ * This determines how many stripes will be submitted in one go,
+ * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
*/
-#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */
+#define SCRUB_STRIPES_PER_GROUP 8
+
+/*
+ * How many groups we have for each sctx.
+ *
+ * This would be 8M per device, the same value as the old scrub in-flight bios
+ * size limit.
+ */
+#define SCRUB_GROUPS_PER_SCTX 16
+
+#define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)
/*
* The following value times PAGE_SIZE needs to be large enough to match the
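
Worked out, the constants above give one group of SCRUB_STRIPES_PER_GROUP * BTRFS_STRIPE_LEN = 8 * 64KiB = 512KiB submitted in one go, and SCRUB_GROUPS_PER_SCTX such groups per context, i.e. 16 * 512KiB = 8MiB (128 stripe slots) in flight per device. A sketch of the sizing math, assuming BTRFS_STRIPE_LEN is 64KiB:

	#include <linux/sizes.h>

	#define SKETCH_STRIPE_LEN		SZ_64K
	#define SKETCH_STRIPES_PER_GROUP	8
	#define SKETCH_GROUPS_PER_SCTX		16

	/* 8 * 64K = 512K of IO per submitted group. */
	#define SKETCH_GROUP_BYTES	(SKETCH_STRIPES_PER_GROUP * SKETCH_STRIPE_LEN)
	/* 16 * 512K = 8M in flight per device, i.e. 128 stripe slots per sctx. */
	#define SKETCH_SCTX_BYTES	(SKETCH_GROUPS_PER_SCTX * SKETCH_GROUP_BYTES)
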
@@ -172,9 +183,11 @@ struct scrub_stripe {
};
struct scrub_ctx {
- struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX];
+ struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES];
struct scrub_stripe *raid56_data_stripes;
struct btrfs_fs_info *fs_info;
+ struct btrfs_path extent_path;
+ struct btrfs_path csum_path;
int first_free;
int cur_stripe;
atomic_t cancel_req;
@@ -315,10 +328,10 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (!sctx)
return;
- for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
+ for (i = 0; i < SCRUB_TOTAL_STRIPES; i++)
release_scrub_stripe(&sctx->stripes[i]);
- kfree(sctx);
+ kvfree(sctx);
}
static void scrub_put_ctx(struct scrub_ctx *sctx)
@@ -333,13 +346,20 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
struct scrub_ctx *sctx;
int i;
- sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
+ /*
+ * Since sctx has an inline array of 128 stripes, it can easily go
+ * beyond 64K. Use kvzalloc().
+ */
+ sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL);
if (!sctx)
goto nomem;
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->fs_info = fs_info;
- for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
+ sctx->extent_path.search_commit_root = 1;
+ sctx->extent_path.skip_locking = 1;
+ sctx->csum_path.search_commit_root = 1;
+ sctx->csum_path.skip_locking = 1;
+ for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
int ret;
ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
@@ -970,6 +990,9 @@ skip:
spin_unlock(&sctx->stat_lock);
}
+static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
+ unsigned long write_bitmap, bool dev_replace);
+
/*
* The main entrance for all read related scrub work, including:
*
@@ -978,13 +1001,16 @@ skip:
* - Go through the remaining mirrors and try to read as large blocksize as
* possible
* - Go through all mirrors (including the failed mirror) sector-by-sector
+ * - Submit writeback for repaired sectors
*
- * Writeback does not happen here, it needs extra synchronization.
+ * Writeback for dev-replace does not happen here, it needs extra
+ * synchronization for zoned devices.
*/
static void scrub_stripe_read_repair_worker(struct work_struct *work)
{
struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
- struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct scrub_ctx *sctx = stripe->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
stripe->bg->length);
int mirror;
@@ -1049,7 +1075,23 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
goto out;
}
out:
- scrub_stripe_report_errors(stripe->sctx, stripe);
+ /*
+ * Submit the repaired sectors. For zoned case, we cannot do repair
+ * in-place, but queue the bg to be relocated.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start);
+ } else if (!sctx->readonly) {
+ unsigned long repaired;
+
+ bitmap_andnot(&repaired, &stripe->init_error_bitmap,
+ &stripe->error_bitmap, stripe->nr_sectors);
+ scrub_write_sectors(sctx, stripe, repaired, false);
+ wait_scrub_stripe_io(stripe);
+ }
+
+ scrub_stripe_report_errors(sctx, stripe);
set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
wake_up(&stripe->repair_wait);
}
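
The repaired bitmap above is computed as "sectors that were bad in the initial pass but are no longer flagged after repair". A small self-contained illustration of that bitmap_andnot() step:

	#include <linux/bitmap.h>

	/* Sketch: derive which sectors were repaired by the read/repair pass. */
	static void compute_repaired_sketch(unsigned long *repaired,
					    const unsigned long *init_error,
					    const unsigned long *error,
					    unsigned int nr_sectors)
	{
		/* repaired = init_error & ~error: bad at the start, good now. */
		bitmap_andnot(repaired, init_error, error, nr_sectors);
	}
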
@@ -1262,7 +1304,6 @@ static int get_raid56_logic_offset(u64 physical, int num,
/* Work out the disk rotation on this stripe-set */
rot = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
/* calculate which stripe this data locates */
rot += i;
stripe_index = rot % map->num_stripes;
@@ -1468,6 +1509,8 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
* Return <0 for error.
*/
static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
+ struct btrfs_path *extent_path,
+ struct btrfs_path *csum_path,
struct btrfs_device *dev, u64 physical,
int mirror_num, u64 logical_start,
u32 logical_len,
@@ -1477,7 +1520,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
const u64 logical_end = logical_start + logical_len;
- struct btrfs_path path = { 0 };
u64 cur_logical = logical_start;
u64 stripe_end;
u64 extent_start;
@@ -1493,14 +1535,13 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
/* The range must be inside the bg. */
ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
- path.search_commit_root = 1;
- path.skip_locking = 1;
-
- ret = find_first_extent_item(extent_root, &path, logical_start, logical_len);
+ ret = find_first_extent_item(extent_root, extent_path, logical_start,
+ logical_len);
/* Either error or not found. */
if (ret)
goto out;
- get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
+ get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
+ &extent_gen);
if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
stripe->nr_meta_extents++;
if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
@@ -1528,7 +1569,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
/* Fill the extent info for the remaining sectors. */
while (cur_logical <= stripe_end) {
- ret = find_first_extent_item(extent_root, &path, cur_logical,
+ ret = find_first_extent_item(extent_root, extent_path, cur_logical,
stripe_end - cur_logical + 1);
if (ret < 0)
goto out;
@@ -1536,7 +1577,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
ret = 0;
break;
}
- get_extent_info(&path, &extent_start, &extent_len,
+ get_extent_info(extent_path, &extent_start, &extent_len,
&extent_flags, &extent_gen);
if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
stripe->nr_meta_extents++;
@@ -1561,9 +1602,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
*/
ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
- ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical,
- stripe_end, stripe->csums,
- &csum_bitmap, true);
+ ret = btrfs_lookup_csums_bitmap(csum_root, csum_path,
+ stripe->logical, stripe_end,
+ stripe->csums, &csum_bitmap);
if (ret < 0)
goto out;
if (ret > 0)
@@ -1576,7 +1617,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
}
set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
out:
- btrfs_release_path(&path);
return ret;
}
@@ -1654,6 +1694,28 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
return false;
}
+static void submit_initial_group_read(struct scrub_ctx *sctx,
+ unsigned int first_slot,
+ unsigned int nr_stripes)
+{
+ struct blk_plug plug;
+
+ ASSERT(first_slot < SCRUB_TOTAL_STRIPES);
+ ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES);
+
+ scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
+ btrfs_stripe_nr_to_offset(nr_stripes));
+ blk_start_plug(&plug);
+ for (int i = 0; i < nr_stripes; i++) {
+ struct scrub_stripe *stripe = &sctx->stripes[first_slot + i];
+
+ /* Those stripes should be initialized. */
+ ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
+ scrub_submit_initial_read(sctx, stripe);
+ }
+ blk_finish_plug(&plug);
+}
+
static int flush_scrub_stripes(struct scrub_ctx *sctx)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -1666,11 +1728,11 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));
- scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
- btrfs_stripe_nr_to_offset(nr_stripes));
- for (int i = 0; i < nr_stripes; i++) {
- stripe = &sctx->stripes[i];
- scrub_submit_initial_read(sctx, stripe);
+ /* Submit the stripes which are populated but not submitted. */
+ if (nr_stripes % SCRUB_STRIPES_PER_GROUP) {
+ const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP);
+
+ submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot);
}
for (int i = 0; i < nr_stripes; i++) {
@@ -1680,32 +1742,6 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
}
- /*
- * Submit the repaired sectors. For zoned case, we cannot do repair
- * in-place, but queue the bg to be relocated.
- */
- if (btrfs_is_zoned(fs_info)) {
- for (int i = 0; i < nr_stripes; i++) {
- stripe = &sctx->stripes[i];
-
- if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) {
- btrfs_repair_one_zone(fs_info,
- sctx->stripes[0].bg->start);
- break;
- }
- }
- } else if (!sctx->readonly) {
- for (int i = 0; i < nr_stripes; i++) {
- unsigned long repaired;
-
- stripe = &sctx->stripes[i];
-
- bitmap_andnot(&repaired, &stripe->init_error_bitmap,
- &stripe->error_bitmap, stripe->nr_sectors);
- scrub_write_sectors(sctx, stripe, repaired, false);
- }
- }
-
/* Submit for dev-replace. */
if (sctx->is_dev_replace) {
/*
@@ -1750,28 +1786,40 @@ static void raid56_scrub_wait_endio(struct bio *bio)
static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
struct btrfs_device *dev, int mirror_num,
- u64 logical, u32 length, u64 physical)
+ u64 logical, u32 length, u64 physical,
+ u64 *found_logical_ret)
{
struct scrub_stripe *stripe;
int ret;
- /* No available slot, submit all stripes and wait for them. */
- if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) {
- ret = flush_scrub_stripes(sctx);
- if (ret < 0)
- return ret;
- }
+ /*
+ * There should always be at least one slot left, as the caller filling
+ * the last slot flushes them all.
+ */
+ ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);
stripe = &sctx->stripes[sctx->cur_stripe];
-
- /* We can queue one stripe using the remaining slot. */
scrub_reset_stripe(stripe);
- ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num,
- logical, length, stripe);
+ ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
+ &sctx->csum_path, dev, physical,
+ mirror_num, logical, length, stripe);
/* Either >0 as no more extents or <0 for error. */
if (ret)
return ret;
+ if (found_logical_ret)
+ *found_logical_ret = stripe->logical;
sctx->cur_stripe++;
+
+ /* We filled one group, submit it. */
+ if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
+ const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP;
+
+ submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP);
+ }
+
+ /* Last slot used, flush them all. */
+ if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES)
+ return flush_scrub_stripes(sctx);
return 0;
}
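
The queueing logic above now submits from two places: a group read every time 8 consecutive slots fill up, and a full flush when the last of the 128 slots is used. A sketch of that cadence, where submit_group() and flush_all() are hypothetical stand-ins for submit_initial_group_read() and flush_scrub_stripes():

	struct scrub_ctx_sketch {
		int cur_stripe;	/* next free slot, 0..SCRUB_TOTAL_STRIPES */
	};

	static void submit_group(struct scrub_ctx_sketch *sctx, int first, int nr)
	{
		/* Hypothetical: submit reads for slots [first, first + nr). */
	}

	static int flush_all(struct scrub_ctx_sketch *sctx)
	{
		/* Hypothetical: submit the partial tail group, wait, reset. */
		sctx->cur_stripe = 0;
		return 0;
	}

	static int after_queue_one_sketch(struct scrub_ctx_sketch *sctx)
	{
		sctx->cur_stripe++;

		/* A full group of 8 stripes was just filled: submit it as a batch. */
		if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0)
			submit_group(sctx, sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP,
				     SCRUB_STRIPES_PER_GROUP);

		/* The last slot was used: flush everything before queueing more. */
		if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES)
			return flush_all(sctx);
		return 0;
	}
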
@@ -1785,6 +1833,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_raid_bio *rbio;
struct btrfs_io_context *bioc = NULL;
+ struct btrfs_path extent_path = { 0 };
+ struct btrfs_path csum_path = { 0 };
struct bio *bio;
struct scrub_stripe *stripe;
bool all_empty = true;
@@ -1795,6 +1845,16 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
ASSERT(sctx->raid56_data_stripes);
+ /*
+ * For data stripe search, we cannot reuse the same extent/csum paths,
+ * as the data stripe bytenr may be smaller than the previous extent's.
+ * Thus we have to use our own extent/csum paths.
+ */
+ extent_path.search_commit_root = 1;
+ extent_path.skip_locking = 1;
+ csum_path.search_commit_root = 1;
+ csum_path.skip_locking = 1;
+
for (int i = 0; i < data_stripes; i++) {
int stripe_index;
int rot;
@@ -1809,7 +1869,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
scrub_reset_stripe(stripe);
set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
- ret = scrub_find_fill_first_stripe(bg,
+ ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path,
map->stripes[stripe_index].dev, physical, 1,
full_stripe_start + btrfs_stripe_nr_to_offset(i),
BTRFS_STRIPE_LEN, stripe);
@@ -1854,24 +1914,6 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
/* For now, no zoned support for RAID56. */
ASSERT(!btrfs_is_zoned(sctx->fs_info));
- /* Writeback for the repaired sectors. */
- for (int i = 0; i < data_stripes; i++) {
- unsigned long repaired;
-
- stripe = &sctx->raid56_data_stripes[i];
-
- bitmap_andnot(&repaired, &stripe->init_error_bitmap,
- &stripe->error_bitmap, stripe->nr_sectors);
- scrub_write_sectors(sctx, stripe, repaired, false);
- }
-
- /* Wait for the above writebacks to finish. */
- for (int i = 0; i < data_stripes; i++) {
- stripe = &sctx->raid56_data_stripes[i];
-
- wait_scrub_stripe_io(stripe);
- }
-
/*
* Now all data stripes are properly verified. Check if we have any
* unrepaired, if so abort immediately or we could further corrupt the
@@ -1937,6 +1979,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
bio_put(bio);
btrfs_bio_counter_dec(fs_info);
+ btrfs_release_path(&extent_path);
+ btrfs_release_path(&csum_path);
out:
return ret;
}
@@ -1958,18 +2002,15 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
const u64 logical_end = logical_start + logical_length;
- /* An artificial limit, inherit from old scrub behavior */
- struct btrfs_path path = { 0 };
u64 cur_logical = logical_start;
int ret;
/* The range must be inside the bg */
ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
- path.search_commit_root = 1;
- path.skip_locking = 1;
/* Go through each extent items inside the logical range */
while (cur_logical < logical_end) {
+ u64 found_logical;
u64 cur_physical = physical + cur_logical - logical_start;
/* Canceled? */
@@ -1994,7 +2035,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
cur_logical, logical_end - cur_logical,
- cur_physical);
+ cur_physical, &found_logical);
if (ret > 0) {
/* No more extent, just update the accounting */
sctx->stat.last_physical = physical + logical_length;
@@ -2004,14 +2045,11 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
if (ret < 0)
break;
- ASSERT(sctx->cur_stripe > 0);
- cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical
- + BTRFS_STRIPE_LEN;
+ cur_logical = found_logical + BTRFS_STRIPE_LEN;
/* Don't hold CPU for too long time */
cond_resched();
}
- btrfs_release_path(&path);
return ret;
}
@@ -2109,6 +2147,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 stripe_logical;
int stop_loop = 0;
+ /* The extent_path should be released by now. */
+ ASSERT(sctx->extent_path.nodes[0] == NULL);
+
scrub_blocked_if_needed(fs_info);
if (sctx->is_dev_replace &&
@@ -2227,6 +2268,9 @@ out:
ret2 = flush_scrub_stripes(sctx);
if (!ret)
ret = ret2;
+ btrfs_release_path(&sctx->extent_path);
+ btrfs_release_path(&sctx->csum_path);
+
if (sctx->raid56_data_stripes) {
for (int i = 0; i < nr_data_stripes(map); i++)
release_scrub_stripe(&sctx->raid56_data_stripes[i]);
@@ -2711,8 +2755,7 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
/*
* get a reference count on fs_info->scrub_workers. start worker if necessary
*/
-static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
- int is_dev_replace)
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
{
struct workqueue_struct *scrub_workers = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
@@ -2722,10 +2765,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
return 0;
- if (is_dev_replace)
- scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags);
- else
- scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
+ scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
if (!scrub_workers)
return -ENOMEM;
@@ -2777,7 +2817,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (IS_ERR(sctx))
return PTR_ERR(sctx);
- ret = scrub_workers_get(fs_info, is_dev_replace);
+ ret = scrub_workers_get(fs_info);
if (ret)
goto out_free_ctx;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 8bfd44750efe..3a566150c531 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3685,7 +3685,7 @@ static void tail_append_pending_moves(struct send_ctx *sctx,
static int apply_children_dir_moves(struct send_ctx *sctx)
{
struct pending_dir_move *pm;
- struct list_head stack;
+ LIST_HEAD(stack);
u64 parent_ino = sctx->cur_ino;
int ret = 0;
@@ -3693,7 +3693,6 @@ static int apply_children_dir_moves(struct send_ctx *sctx)
if (!pm)
return 0;
- INIT_LIST_HEAD(&stack);
tail_append_pending_moves(sctx, pm, &stack);
while (!list_empty(&stack)) {
@@ -4165,7 +4164,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
int ret = 0;
struct recorded_ref *cur;
struct recorded_ref *cur2;
- struct list_head check_dirs;
+ LIST_HEAD(check_dirs);
struct fs_path *valid_path = NULL;
u64 ow_inode = 0;
u64 ow_gen;
@@ -4184,7 +4183,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* which is always '..'
*/
BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
- INIT_LIST_HEAD(&check_dirs);
valid_path = fs_path_alloc();
if (!valid_path) {
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 75e7fa337e66..d7e8cd4f140c 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -389,11 +389,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
return 0;
used = btrfs_space_info_used(space_info, true);
- if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) &&
- (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
- avail = 0;
- else
- avail = calc_available_free_space(fs_info, space_info, flush);
+ avail = calc_available_free_space(fs_info, space_info, flush);
if (used + bytes < space_info->total_bytes + avail)
return 1;
@@ -510,6 +506,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
int dump_block_groups)
{
struct btrfs_block_group *cache;
+ u64 total_avail = 0;
int index = 0;
spin_lock(&info->lock);
@@ -523,18 +520,27 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
down_read(&info->groups_sem);
again:
list_for_each_entry(cache, &info->block_groups[index], list) {
+ u64 avail;
+
spin_lock(&cache->lock);
+ avail = cache->length - cache->used - cache->pinned -
+ cache->reserved - cache->delalloc_bytes -
+ cache->bytes_super - cache->zone_unusable;
btrfs_info(fs_info,
- "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s",
- cache->start, cache->length, cache->used, cache->pinned,
- cache->reserved, cache->zone_unusable,
- cache->ro ? "[readonly]" : "");
+"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s",
+ cache->start, cache->length, cache->used, cache->pinned,
+ cache->reserved, cache->delalloc_bytes,
+ cache->bytes_super, cache->zone_unusable,
+ avail, cache->ro ? "[readonly]" : "");
spin_unlock(&cache->lock);
btrfs_dump_free_space(cache, bytes);
+ total_avail += avail;
}
if (++index < BTRFS_NR_RAID_TYPES)
goto again;
up_read(&info->groups_sem);
+
+ btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail);
}
static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
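
The per-block-group "available" figure above subtracts every form of committed or unusable space from the group length. As a worked sketch of that accounting:

	/* Sketch: available bytes in a block group, mirroring the dump above. */
	static u64 bg_available_sketch(u64 length, u64 used, u64 pinned,
				       u64 reserved, u64 delalloc_bytes,
				       u64 bytes_super, u64 zone_unusable)
	{
		return length - used - pinned - reserved - delalloc_bytes -
		       bytes_super - zone_unusable;
	}
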
@@ -715,9 +721,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
else
nr = -1;
- trans = btrfs_join_transaction(root);
+ trans = btrfs_join_transaction_nostart(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ if (ret == -ENOENT)
+ ret = 0;
break;
}
ret = btrfs_run_delayed_items_nr(trans, nr);
@@ -733,9 +741,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELAYED_REFS_NR:
case FLUSH_DELAYED_REFS:
- trans = btrfs_join_transaction(root);
+ trans = btrfs_join_transaction_nostart(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ if (ret == -ENOENT)
+ ret = 0;
break;
}
if (state == FLUSH_DELAYED_REFS_NR)
@@ -747,18 +757,6 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case ALLOC_CHUNK:
case ALLOC_CHUNK_FORCE:
- /*
- * For metadata space on zoned filesystem, reaching here means we
- * don't have enough space left in active_total_bytes. Try to
- * activate a block group first, because we may have inactive
- * block group already allocated.
- */
- ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false);
- if (ret < 0)
- break;
- else if (ret == 1)
- break;
-
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -770,22 +768,6 @@ static void flush_space(struct btrfs_fs_info *fs_info,
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
- /*
- * For metadata space on zoned filesystem, allocating a new chunk
- * is not enough. We still need to activate the block * group.
- * Active the newly allocated block group by (maybe) finishing
- * a block group.
- */
- if (ret == 1) {
- ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
- /*
- * Revert to the original ret regardless we could finish
- * one block group or not.
- */
- if (ret >= 0)
- ret = 1;
- }
-
if (ret > 0 || ret == -ENOSPC)
ret = 0;
break;
@@ -800,9 +782,18 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case COMMIT_TRANS:
ASSERT(current->journal_info == NULL);
- trans = btrfs_join_transaction(root);
+ /*
+ * We don't want to start a new transaction, just attach to the
+ * current one or wait it fully commits in case its commit is
+ * happening at the moment. Note: we don't use a nostart join
+ * because that does not wait for a transaction to fully commit
+ * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED).
+ */
+ trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ if (ret == -ENOENT)
+ ret = 0;
break;
}
ret = btrfs_commit_transaction(trans);
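
Restating the distinction the comment above draws, as a reference comment (semantics paraphrased from this series, not a formal API description):

	/*
	 * btrfs_join_transaction()           - join the running transaction,
	 *                                      starting one if none is running.
	 * btrfs_join_transaction_nostart()   - join only if one is running and
	 *                                      its commit has not yet reached
	 *                                      the "doing" phase; -ENOENT
	 *                                      otherwise.
	 * btrfs_attach_transaction_barrier() - attach to the current
	 *                                      transaction, or wait for an
	 *                                      ongoing commit to fully finish;
	 *                                      -ENOENT if none is running.
	 */
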
@@ -1408,8 +1399,18 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
}
}
- /* Attempt to steal from the global rsv if we can. */
- if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
+ /*
+ * Attempt to steal from the global rsv if we can, unless the fs was
+ * turned into error mode due to a transaction abort when flushing space
+ * above. In that case fail with the abort error instead of returning
+ * success to the caller even if we could steal from the global rsv -
+ * this is just to have the caller fail immediately instead of later
+ * when trying to modify the fs, making it easier to debug -ENOSPC
+ * problems.
+ */
+ if (BTRFS_FS_ERROR(fs_info)) {
+ ticket->error = BTRFS_FS_ERROR(fs_info);
+ remove_ticket(space_info, ticket);
+ } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
ticket->error = -ENOSPC;
remove_ticket(space_info, ticket);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f1dd172d8d5b..09bfe68d2ea3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -709,12 +709,16 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
case Opt_check_integrity_including_extent_data:
+ btrfs_warn(info,
+ "integrity checker is deprecated and will be removed in 6.7");
btrfs_info(info,
"enabling check integrity including extent data");
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity:
+ btrfs_warn(info,
+ "integrity checker is deprecated and will be removed in 6.7");
btrfs_info(info, "enabling check integrity");
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
@@ -727,6 +731,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
goto out;
}
info->check_integrity_print_mask = intarg;
+ btrfs_warn(info,
+ "integrity checker is deprecated and will be removed in 6.7");
btrfs_info(info, "check_integrity_print_mask 0x%x",
info->check_integrity_print_mask);
break;
@@ -2144,7 +2150,7 @@ static struct file_system_type btrfs_fs_type = {
.name = "btrfs",
.mount = btrfs_mount,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_MGTIME,
};
static struct file_system_type btrfs_root_fs_type = {
@@ -2152,7 +2158,8 @@ static struct file_system_type btrfs_root_fs_type = {
.name = "btrfs",
.mount = btrfs_mount_root,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA |
+ FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("btrfs");
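
Several hunks in this diff (reflink.c above, transaction.c and tree-log.c below) pair the FS_MGTIME flag added here with the new VFS ctime accessors for multigrain timestamps. A sketch of the accessor usage, assuming those helpers:

	#include <linux/fs.h>

	/* Sketch: update mtime and ctime together via the new helper. */
	static void touch_times_sketch(struct inode *inode)
	{
		/* Sets ctime to the current time and returns it, keeping mtime equal. */
		inode->i_mtime = inode_set_ctime_current(inode);
	}

	/* Reading ctime now goes through an accessor instead of i_ctime. */
	static time64_t ctime_sec_sketch(const struct inode *inode)
	{
		return inode_get_ctime(inode).tv_sec;
	}
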
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 25294e624851..b1d1ac25237b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -414,6 +414,12 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
BTRFS_ATTR(static_feature, supported_sectorsizes,
supported_sectorsizes_show);
+static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL));
+}
+BTRFS_ATTR(static_feature, acl, acl_show);
+
/*
* Features which only depend on kernel version.
*
@@ -421,6 +427,7 @@ BTRFS_ATTR(static_feature, supported_sectorsizes,
* btrfs_supported_feature_attrs.
*/
static struct attribute *btrfs_supported_static_feature_attrs[] = {
+ BTRFS_ATTR_PTR(static_feature, acl),
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
BTRFS_ATTR_PTR(static_feature, send_stream_version),
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index f6bc6d738555..1cc86af97dc6 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -319,86 +319,139 @@ out:
return ret;
}
-static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb,
- unsigned long len)
+static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb)
{
unsigned long i;
- for (i = 0; i < len * BITS_PER_BYTE; i++) {
+ for (i = 0; i < eb->len * BITS_PER_BYTE; i++) {
int bit, bit1;
bit = !!test_bit(i, bitmap);
bit1 = !!extent_buffer_test_bit(eb, 0, i);
if (bit1 != bit) {
- test_err("bits do not match");
+ u8 has;
+ u8 expect;
+
+ read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1);
+ expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE));
+
+ test_err(
+ "bits do not match, start byte 0 bit %lu, byte %lu has 0x%02x expect 0x%02x",
+ i, i / BITS_PER_BYTE, has, expect);
return -EINVAL;
}
bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
i % BITS_PER_BYTE);
if (bit1 != bit) {
- test_err("offset bits do not match");
+ u8 has;
+ u8 expect;
+
+ read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1);
+ expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE));
+
+ test_err(
+ "bits do not match, start byte %lu bit %lu, byte %lu has 0x%02x expect 0x%02x",
+ i / BITS_PER_BYTE, i % BITS_PER_BYTE,
+ i / BITS_PER_BYTE, has, expect);
return -EINVAL;
}
}
return 0;
}
-static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
- unsigned long len)
+static int test_bitmap_set(const char *name, unsigned long *bitmap,
+ struct extent_buffer *eb,
+ unsigned long byte_start, unsigned long bit_start,
+ unsigned long bit_len)
+{
+ int ret;
+
+ bitmap_set(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len);
+ extent_buffer_bitmap_set(eb, byte_start, bit_start, bit_len);
+ ret = check_eb_bitmap(bitmap, eb);
+ if (ret < 0)
+ test_err("%s test failed", name);
+ return ret;
+}
+
+static int test_bitmap_clear(const char *name, unsigned long *bitmap,
+ struct extent_buffer *eb,
+ unsigned long byte_start, unsigned long bit_start,
+ unsigned long bit_len)
+{
+ int ret;
+
+ bitmap_clear(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len);
+ extent_buffer_bitmap_clear(eb, byte_start, bit_start, bit_len);
+ ret = check_eb_bitmap(bitmap, eb);
+ if (ret < 0)
+ test_err("%s test failed", name);
+ return ret;
+}
+static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb)
{
unsigned long i, j;
+ unsigned long byte_len = eb->len;
u32 x;
int ret;
- memset(bitmap, 0, len);
- memzero_extent_buffer(eb, 0, len);
- if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
- test_err("bitmap was not zeroed");
- return -EINVAL;
- }
+ ret = test_bitmap_clear("clear all run 1", bitmap, eb, 0, 0,
+ byte_len * BITS_PER_BYTE);
+ if (ret < 0)
+ return ret;
- bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
- extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
- ret = check_eb_bitmap(bitmap, eb, len);
- if (ret) {
- test_err("setting all bits failed");
+ ret = test_bitmap_set("set all", bitmap, eb, 0, 0, byte_len * BITS_PER_BYTE);
+ if (ret < 0)
return ret;
- }
- bitmap_clear(bitmap, 0, len * BITS_PER_BYTE);
- extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
- ret = check_eb_bitmap(bitmap, eb, len);
- if (ret) {
- test_err("clearing all bits failed");
+ ret = test_bitmap_clear("clear all run 2", bitmap, eb, 0, 0,
+ byte_len * BITS_PER_BYTE);
+ if (ret < 0)
+ return ret;
+
+ ret = test_bitmap_set("same byte set", bitmap, eb, 0, 2, 4);
+ if (ret < 0)
+ return ret;
+
+ ret = test_bitmap_clear("same byte partial clear", bitmap, eb, 0, 4, 1);
+ if (ret < 0)
+ return ret;
+
+ ret = test_bitmap_set("cross byte set", bitmap, eb, 2, 4, 8);
+ if (ret < 0)
+ return ret;
+
+ ret = test_bitmap_set("cross multi byte set", bitmap, eb, 4, 4, 24);
+ if (ret < 0)
+ return ret;
+
+ ret = test_bitmap_clear("cross byte clear", bitmap, eb, 2, 6, 4);
+ if (ret < 0)
+ return ret;
+
+ ret = test_bitmap_clear("cross multi byte clear", bitmap, eb, 4, 6, 20);
+ if (ret < 0)
return ret;
- }
/* Straddling pages test */
- if (len > PAGE_SIZE) {
- bitmap_set(bitmap,
- (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
- sizeof(long) * BITS_PER_BYTE);
- extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0,
- sizeof(long) * BITS_PER_BYTE);
- ret = check_eb_bitmap(bitmap, eb, len);
- if (ret) {
- test_err("setting straddling pages failed");
+ if (byte_len > PAGE_SIZE) {
+ ret = test_bitmap_set("cross page set", bitmap, eb,
+ PAGE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (ret < 0)
+ return ret;
+
+ ret = test_bitmap_set("cross page set all", bitmap, eb, 0, 0,
+ byte_len * BITS_PER_BYTE);
+ if (ret < 0)
return ret;
- }
- bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
- bitmap_clear(bitmap,
- (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
- sizeof(long) * BITS_PER_BYTE);
- extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
- extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0,
+ ret = test_bitmap_clear("cross page clear", bitmap, eb,
+ PAGE_SIZE - sizeof(long) / 2, 0,
sizeof(long) * BITS_PER_BYTE);
- ret = check_eb_bitmap(bitmap, eb, len);
- if (ret) {
- test_err("clearing straddling pages failed");
+ if (ret < 0)
return ret;
- }
}
/*
@@ -406,9 +459,12 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
* something repetitive that could miss some hypothetical off-by-n bug.
*/
x = 0;
- bitmap_clear(bitmap, 0, len * BITS_PER_BYTE);
- extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
- for (i = 0; i < len * BITS_PER_BYTE / 32; i++) {
+ ret = test_bitmap_clear("clear all run 3", bitmap, eb, 0, 0,
+ byte_len * BITS_PER_BYTE);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < byte_len * BITS_PER_BYTE / 32; i++) {
x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffU;
for (j = 0; j < 32; j++) {
if (x & (1U << j)) {
@@ -418,7 +474,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
}
}
- ret = check_eb_bitmap(bitmap, eb, len);
+ ret = check_eb_bitmap(bitmap, eb);
if (ret) {
test_err("random bit pattern failed");
return ret;
@@ -456,7 +512,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
goto out;
}
- ret = __test_eb_bitmaps(bitmap, eb, nodesize);
+ ret = __test_eb_bitmaps(bitmap, eb);
if (ret)
goto out;
@@ -473,7 +529,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
goto out;
}
- ret = __test_eb_bitmaps(bitmap, eb, nodesize);
+ ret = __test_eb_bitmaps(bitmap, eb);
out:
free_extent_buffer(eb);
kfree(bitmap);
@@ -592,6 +648,146 @@ out:
return ret;
}
+static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory,
+ const char *test_name)
+{
+ for (int i = 0; i < eb->len; i++) {
+ struct page *page = eb->pages[i >> PAGE_SHIFT];
+ void *addr = page_address(page) + offset_in_page(i);
+
+ if (memcmp(addr, memory + i, 1) != 0) {
+ test_err("%s failed", test_name);
+ test_err("eb and memory diffs at byte %u, eb has 0x%02x memory has 0x%02x",
+ i, *(u8 *)addr, *(u8 *)(memory + i));
+ return;
+ }
+ }
+}
+
+static int verify_eb_and_memory(struct extent_buffer *eb, void *memory,
+ const char *test_name)
+{
+ for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) {
+ void *eb_addr = page_address(eb->pages[i]);
+
+ if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) {
+ dump_eb_and_memory_contents(eb, memory, test_name);
+ return -EUCLEAN;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Init both memory and extent buffer contents to the same randomly generated
+ * contents.
+ */
+static void init_eb_and_memory(struct extent_buffer *eb, void *memory)
+{
+ get_random_bytes(memory, eb->len);
+ write_extent_buffer(eb, memory, 0, eb->len);
+}
+
+static int test_eb_mem_ops(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_fs_info *fs_info;
+ struct extent_buffer *eb = NULL;
+ void *memory = NULL;
+ int ret;
+
+ test_msg("running extent buffer memory operation tests");
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
+
+ memory = kvzalloc(nodesize, GFP_KERNEL);
+ if (!memory) {
+ test_err("failed to allocate memory");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize);
+ if (!eb) {
+ test_std_err(TEST_ALLOC_EXTENT_BUFFER);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ init_eb_and_memory(eb, memory);
+ ret = verify_eb_and_memory(eb, memory, "full eb write");
+ if (ret < 0)
+ goto out;
+
+ memcpy(memory, memory + 16, 16);
+ memcpy_extent_buffer(eb, 0, 16, 16);
+ ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 1");
+ if (ret < 0)
+ goto out;
+
+ memcpy(memory, memory + 2048, 16);
+ memcpy_extent_buffer(eb, 0, 2048, 16);
+ ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 2");
+ if (ret < 0)
+ goto out;
+ memcpy(memory, memory + 2048, 2048);
+ memcpy_extent_buffer(eb, 0, 2048, 2048);
+ ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 3");
+ if (ret < 0)
+ goto out;
+
+ memmove(memory + 512, memory + 256, 512);
+ memmove_extent_buffer(eb, 512, 256, 512);
+ ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 1");
+ if (ret < 0)
+ goto out;
+
+ memmove(memory + 2048, memory + 512, 2048);
+ memmove_extent_buffer(eb, 2048, 512, 2048);
+ ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 2");
+ if (ret < 0)
+ goto out;
+ memmove(memory + 512, memory + 2048, 2048);
+ memmove_extent_buffer(eb, 512, 2048, 2048);
+ ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 3");
+ if (ret < 0)
+ goto out;
+
+ if (nodesize > PAGE_SIZE) {
+ memcpy(memory, memory + 4096 - 128, 256);
+ memcpy_extent_buffer(eb, 0, 4096 - 128, 256);
+ ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 1");
+ if (ret < 0)
+ goto out;
+
+ memcpy(memory + 4096 - 128, memory + 4096 + 128, 256);
+ memcpy_extent_buffer(eb, 4096 - 128, 4096 + 128, 256);
+ ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 2");
+ if (ret < 0)
+ goto out;
+
+ memmove(memory + 4096 - 128, memory + 4096 - 64, 256);
+ memmove_extent_buffer(eb, 4096 - 128, 4096 - 64, 256);
+ ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 1");
+ if (ret < 0)
+ goto out;
+
+ memmove(memory + 4096 - 64, memory + 4096 - 128, 256);
+ memmove_extent_buffer(eb, 4096 - 64, 4096 - 128, 256);
+ ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 2");
+ if (ret < 0)
+ goto out;
+ }
+out:
+ free_extent_buffer(eb);
+ kvfree(memory);
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
+
int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
{
int ret;
@@ -607,6 +803,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
goto out;
ret = test_eb_bitmaps(sectorsize, nodesize);
+ if (ret)
+ goto out;
+
+ ret = test_eb_mem_ops(sectorsize, nodesize);
out:
return ret;
}
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index ed0f36ae5346..29bdd08b241f 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -6,6 +6,7 @@
#include <linux/types.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../btrfs_inode.h"
#include "../volumes.h"
#include "../disk-io.h"
#include "../block-group.h"
@@ -442,6 +443,406 @@ static int test_case_4(struct btrfs_fs_info *fs_info,
return ret;
}
+static int add_compressed_extent(struct extent_map_tree *em_tree,
+ u64 start, u64 len, u64 block_start)
+{
+ struct extent_map *em;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ em->start = start;
+ em->len = len;
+ em->block_start = block_start;
+ em->block_len = SZ_4K;
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ write_unlock(&em_tree->lock);
+ free_extent_map(em);
+ if (ret < 0) {
+ test_err("cannot add extent map [%llu, %llu)", start, start + len);
+ return ret;
+ }
+
+ return 0;
+}
+
+struct extent_range {
+ u64 start;
+ u64 len;
+};
+
+/* The valid states of the tree after every drop, as described below. */
+struct extent_range valid_ranges[][7] = {
+ {
+ { .start = 0, .len = SZ_8K }, /* [0, 8K) */
+ { .start = SZ_4K * 3, .len = SZ_4K * 3}, /* [12k, 24k) */
+ { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */
+ { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */
+ { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */
+ },
+ {
+ { .start = 0, .len = SZ_8K }, /* [0, 8K) */
+ { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */
+ { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */
+ { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */
+ { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */
+ },
+ {
+ { .start = 0, .len = SZ_8K }, /* [0, 8K) */
+ { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */
+ { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */
+ { .start = SZ_32K, .len = SZ_4K}, /* [32k, 36k) */
+ { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */
+ { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */
+ },
+ {
+ { .start = 0, .len = SZ_8K}, /* [0, 8K) */
+ { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */
+ { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */
+ }
+};
+
+static int validate_range(struct extent_map_tree *em_tree, int index)
+{
+ struct rb_node *n;
+ int i;
+
+ for (i = 0, n = rb_first_cached(&em_tree->map);
+ valid_ranges[index][i].len && n;
+ i++, n = rb_next(n)) {
+ struct extent_map *entry = rb_entry(n, struct extent_map, rb_node);
+
+ if (entry->start != valid_ranges[index][i].start) {
+ test_err("mapping has start %llu expected %llu",
+ entry->start, valid_ranges[index][i].start);
+ return -EINVAL;
+ }
+
+ if (entry->len != valid_ranges[index][i].len) {
+ test_err("mapping has len %llu expected %llu",
+ entry->len, valid_ranges[index][i].len);
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * We exited because we don't have any more entries in the extent_map
+ * but we still expect more valid entries.
+ */
+ if (valid_ranges[index][i].len) {
+ test_err("missing an entry");
+ return -EINVAL;
+ }
+
+ /* We exited the loop but still have entries in the extent map. */
+ if (n) {
+ test_err("we have a left over entry in the extent map we didn't expect");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Test scenario:
+ *
+ * Test the various edge cases of btrfs_drop_extent_map_range, create the
+ * following ranges
+ *
+ * [0, 12k)[12k, 24k)[24k, 36k)[36k, 40k)[40k,64k)
+ *
+ * And then we'll drop:
+ *
+ * [8k, 12k) - test the single front split
+ * [12k, 20k) - test the single back split
+ * [28k, 32k) - test the double split
+ * [32k, 64k) - test whole em dropping
+ *
+ * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from
+ * merging the em's.
+ */
+static int test_case_5(void)
+{
+ struct extent_map_tree *em_tree;
+ struct inode *inode;
+ u64 start, end;
+ int ret;
+
+ test_msg("Running btrfs_drop_extent_map_range tests");
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_std_err(TEST_ALLOC_INODE);
+ return -ENOMEM;
+ }
+
+ em_tree = &BTRFS_I(inode)->extent_tree;
+
+ /* [0, 12k) */
+ ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0);
+ if (ret) {
+ test_err("cannot add extent range [0, 12K)");
+ goto out;
+ }
+
+ /* [12k, 24k) */
+ ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
+ if (ret) {
+ test_err("cannot add extent range [12k, 24k)");
+ goto out;
+ }
+
+ /* [24k, 36k) */
+ ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
+ if (ret) {
+ test_err("cannot add extent range [24k, 36k)");
+ goto out;
+ }
+
+ /* [36k, 40k) */
+ ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
+ if (ret) {
+ test_err("cannot add extent range [36k, 40k)");
+ goto out;
+ }
+
+ /* [40k, 64k) */
+ ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
+ if (ret) {
+ test_err("cannot add extent range [40k, 64k)");
+ goto out;
+ }
+
+ /* Drop [8k, 12k) */
+ start = SZ_8K;
+ end = (3 * SZ_4K) - 1;
+ btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
+ ret = validate_range(&BTRFS_I(inode)->extent_tree, 0);
+ if (ret)
+ goto out;
+
+ /* Drop [12k, 20k) */
+ start = SZ_4K * 3;
+ end = SZ_16K + SZ_4K - 1;
+ btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
+ ret = validate_range(&BTRFS_I(inode)->extent_tree, 1);
+ if (ret)
+ goto out;
+
+ /* Drop [28k, 32k) */
+ start = SZ_32K - SZ_4K;
+ end = SZ_32K - 1;
+ btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
+ ret = validate_range(&BTRFS_I(inode)->extent_tree, 2);
+ if (ret)
+ goto out;
+
+ /* Drop [32k, 64k) */
+ start = SZ_32K;
+ end = SZ_64K - 1;
+ btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false);
+ ret = validate_range(&BTRFS_I(inode)->extent_tree, 3);
+ if (ret)
+ goto out;
+out:
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Test the btrfs_add_extent_mapping helper which will attempt to create an em
+ * for areas between two existing ems. Validate it doesn't do this when there
+ * are two unmerged em's side by side.
+ */
+static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree)
+{
+ struct extent_map *em = NULL;
+ int ret;
+
+ ret = add_compressed_extent(em_tree, 0, SZ_4K, 0);
+ if (ret)
+ goto out;
+
+ ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0);
+ if (ret)
+ goto out;
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ em->start = SZ_4K;
+ em->len = SZ_4K;
+ em->block_start = SZ_16K;
+ em->block_len = SZ_16K;
+ write_lock(&em_tree->lock);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K);
+ write_unlock(&em_tree->lock);
+
+ if (ret != 0) {
+ test_err("got an error when adding our em: %d", ret);
+ goto out;
+ }
+
+ ret = -EINVAL;
+ if (em->start != 0) {
+ test_err("unexpected em->start at %llu, wanted 0", em->start);
+ goto out;
+ }
+ if (em->len != SZ_4K) {
+ test_err("unexpected em->len %llu, expected 4K", em->len);
+ goto out;
+ }
+ ret = 0;
+out:
+ free_extent_map(em);
+ free_extent_map_tree(em_tree);
+ return ret;
+}
+
+/*
+ * Regression test for btrfs_drop_extent_map_range. Calling with skip_pinned ==
+ * true would mess up the start/end calculations and subsequent splits would be
+ * incorrect.
+ */
+static int test_case_7(void)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct inode *inode;
+ int ret;
+
+ test_msg("Running btrfs_drop_extent_cache with pinned");
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_std_err(TEST_ALLOC_INODE);
+ return -ENOMEM;
+ }
+
+ em_tree = &BTRFS_I(inode)->extent_tree;
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* [0, 16K), pinned */
+ em->start = 0;
+ em->len = SZ_16K;
+ em->block_start = 0;
+ em->block_len = SZ_4K;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("couldn't add extent map");
+ goto out;
+ }
+ free_extent_map(em);
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* [32K, 48K), not pinned */
+ em->start = SZ_32K;
+ em->len = SZ_16K;
+ em->block_start = SZ_32K;
+ em->block_len = SZ_16K;
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 0);
+ write_unlock(&em_tree->lock);
+ if (ret < 0) {
+ test_err("couldn't add extent map");
+ goto out;
+ }
+ free_extent_map(em);
+
+ /*
+ * Drop [0, 36K). This should skip the pinned [0, 16K) extent and then
+ * split the [32K, 48K) extent.
+ */
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true);
+
+ /* Make sure our extent maps look sane. */
+ ret = -EINVAL;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, 0, SZ_16K);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ test_err("didn't find an em at 0 as expected");
+ goto out;
+ }
+
+ if (em->start != 0) {
+ test_err("em->start is %llu, expected 0", em->start);
+ goto out;
+ }
+
+ if (em->len != SZ_16K) {
+ test_err("em->len is %llu, expected 16K", em->len);
+ goto out;
+ }
+
+ free_extent_map(em);
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K);
+ read_unlock(&em_tree->lock);
+ if (em) {
+ test_err("found an em when we weren't expecting one");
+ goto out;
+ }
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ test_err("didn't find an em at 32K as expected");
+ goto out;
+ }
+
+ if (em->start != (36 * SZ_1K)) {
+ test_err("em->start is %llu, expected 36K", em->start);
+ goto out;
+ }
+
+ if (em->len != (12 * SZ_1K)) {
+ test_err("em->len is %llu, expected 12K", em->len);
+ goto out;
+ }
+
+ free_extent_map(em);
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1);
+ read_unlock(&em_tree->lock);
+ if (em) {
+ test_err("found an unexpected em above 48K");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ free_extent_map(em);
+ iput(inode);
+ return ret;
+}
+
struct rmap_test_vector {
u64 raid_type;
u64 physical_start;
@@ -619,6 +1020,17 @@ int btrfs_test_extent_map(void)
if (ret)
goto out;
ret = test_case_4(fs_info, em_tree);
+ if (ret)
+ goto out;
+ ret = test_case_5();
+ if (ret)
+ goto out;
+ ret = test_case_6(fs_info, em_tree);
+ if (ret)
+ goto out;
+ ret = test_case_7();
+ if (ret)
+ goto out;
test_msg("running rmap tests");
for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 91b6c2fdc420..874e4394df86 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -292,10 +292,11 @@ loop:
spin_unlock(&fs_info->trans_lock);
/*
- * If we are ATTACH, we just want to catch the current transaction,
- * and commit it. If there is no transaction, just return ENOENT.
+ * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the
+ * current transaction, and commit it. If there is no transaction, just
+ * return ENOENT.
*/
- if (type == TRANS_ATTACH)
+ if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART)
return -ENOENT;
/*
@@ -591,8 +592,13 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
u64 delayed_refs_bytes = 0;
qgroup_reserved = num_items * fs_info->nodesize;
- ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
- enforce_qgroups);
+ /*
+ * Use prealloc for now, as there might be a currently running
+ * transaction that could free this reserved space prematurely
+ * by committing.
+ */
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved,
+ enforce_qgroups, false);
if (ret)
return ERR_PTR(ret);
@@ -705,6 +711,14 @@ again:
h->reloc_reserved = reloc_reserved;
}
+ /*
+ * Now that we have found a transaction to be a part of, convert the
+ * qgroup reservation from prealloc to pertrans. A different transaction
+ * can't race in and free our pertrans out from under us.
+ */
+ if (qgroup_reserved)
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
+
got_it:
if (!current->journal_info)
current->journal_info = h;
@@ -752,7 +766,7 @@ alloc_fail:
btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
num_bytes, NULL);
reserve_fail:
- btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
return ERR_PTR(ret);
}
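The point of the change above is ordering: the qgroup reservation is taken as PREALLOC before any transaction handle exists, and is converted to PERTRANS only once the handle has joined a transaction, so a racing commit cannot free it prematurely. A condensed sketch of that flow using the function names from this patch (illustrative only; the real start_transaction() interleaves block reservations and flushing):

	/* Sketch, not buildable stand-alone. */
	static struct btrfs_trans_handle *start_trans_sketch(struct btrfs_root *root,
						unsigned int num_items,
						bool enforce_qgroups)
	{
		u64 qgroup_reserved = num_items * root->fs_info->nodesize;
		struct btrfs_trans_handle *h;
		int ret;

		/* 1. Reserve as PREALLOC: a racing commit cannot free this. */
		ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved,
							 enforce_qgroups, false);
		if (ret)
			return ERR_PTR(ret);

		h = btrfs_join_transaction(root);	/* attach to a transaction */
		if (IS_ERR(h)) {
			btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
			return h;
		}

		/* 2. The handle now pins the transaction: convert to PERTRANS. */
		if (qgroup_reserved)
			btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
		return h;
	}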
@@ -785,7 +799,10 @@ struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *
/*
* Similar to regular join but it never starts a transaction when none is
- * running or after waiting for the current one to finish.
+ * running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED.
+ * This is similar to btrfs_attach_transaction() but it allows the join to
+ * happen if the transaction commit already started but it's not yet in the
+ * "doing" phase (the state is < TRANS_STATE_COMMIT_DOING).
*/
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
{
@@ -1060,8 +1077,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
u64 start = 0;
u64 end;
- while (!find_first_extent_bit(dirty_pages, start, &start, &end,
- mark, &cached_state)) {
+ while (find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, &cached_state)) {
bool wait_writeback = false;
err = convert_extent_bit(dirty_pages, start, end,
@@ -1114,8 +1131,8 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
u64 start = 0;
u64 end;
- while (!find_first_extent_bit(dirty_pages, start, &start, &end,
- EXTENT_NEED_WAIT, &cached_state)) {
+ while (find_first_extent_bit(dirty_pages, start, &start, &end,
+ EXTENT_NEED_WAIT, &cached_state)) {
/*
* Ignore -ENOMEM errors returned by clear_extent_bit().
* When committing the transaction, we'll remove any entries
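Both loops flip their condition because find_first_extent_bit() here changes from an int return (0 on success) to a boolean that is true when a matching range was found. A hedged sketch of the resulting iteration idiom (cached_state management elided):

	/* Sketch: walk every marked range in an io tree under the new
	 * boolean-return convention (true == a range was found). */
	static void walk_marked_ranges(struct extent_io_tree *tree, u32 mark)
	{
		struct extent_state *cached_state = NULL;
		u64 start = 0;
		u64 end;

		while (find_first_extent_bit(tree, start, &start, &end,
					     mark, &cached_state)) {
			/* process [start, end] here */
			start = end + 1;	/* continue after this range */
		}
	}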
@@ -1837,8 +1854,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
fname.disk_name.len * 2);
- parent_inode->i_mtime = current_time(parent_inode);
- parent_inode->i_ctime = parent_inode->i_mtime;
+ parent_inode->i_mtime = inode_set_ctime_current(parent_inode);
ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
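This one-liner works because inode_set_ctime_current() both stores the new ctime and returns it as a timespec64, so the same value can be assigned to i_mtime in one statement; inode_get_ctime() and inode_set_ctime_to_ts() round out the accessor family used throughout this diff. A short sketch of the pattern:

	/* Sketch of the ctime accessor pattern used across these hunks. */
	static void touch_times(struct inode *dir, struct inode *inode)
	{
		/* Set mtime and ctime to the same "now" in one statement. */
		dir->i_mtime = inode_set_ctime_current(dir);

		/* Propagate a ctime from another inode. */
		inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
	}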
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 365a1cc0a3c3..d1e46b839519 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4148,9 +4148,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
inode->i_mtime.tv_nsec);
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode->i_ctime.tv_sec);
+ inode_get_ctime(inode).tv_sec);
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode->i_ctime.tv_nsec);
+ inode_get_ctime(inode).tv_nsec);
/*
* We do not need to set the nbytes field, in fact during a fast fsync
@@ -4841,13 +4841,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_extent *tmp;
struct extent_map *em, *n;
- struct list_head extents;
+ LIST_HEAD(extents);
struct extent_map_tree *tree = &inode->extent_tree;
int ret = 0;
int num = 0;
- INIT_LIST_HEAD(&extents);
-
write_lock(&tree->lock);
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
@@ -6794,8 +6792,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
while (true) {
struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
+ struct extent_buffer *leaf;
+ int slot;
struct btrfs_key search_key;
struct inode *inode;
u64 ino;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6aa9bf3661ac..9621455edebc 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -681,6 +681,14 @@ error_free_page:
return -EINVAL;
}
+u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
+{
+ bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+
+ return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
+}
+
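The new helper centralizes a branch that each call site used to open-code; a hedged usage sketch mirroring the device_list_add() change below:

	/* Sketch: pick the on-disk identity UUID for a scanned device. */
	static void copy_metadata_uuid(struct btrfs_fs_devices *fs_devices,
				       struct btrfs_super_block *disk_super)
	{
		/* metadata_uuid when INCOMPAT_METADATA_UUID is set, else fsid */
		memcpy(fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(disk_super),
		       BTRFS_FSID_SIZE);
	}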
/*
* Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
* being created with a disk that has already completed its fsid change. Such
@@ -833,15 +841,8 @@ static noinline struct btrfs_device *device_list_add(const char *path,
found_transid > fs_devices->latest_generation) {
memcpy(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE);
-
- if (has_metadata_uuid)
- memcpy(fs_devices->metadata_uuid,
- disk_super->metadata_uuid,
- BTRFS_FSID_SIZE);
- else
- memcpy(fs_devices->metadata_uuid,
- disk_super->fsid, BTRFS_FSID_SIZE);
-
+ memcpy(fs_devices->metadata_uuid,
+ btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
fs_devices->fsid_change = false;
}
}
@@ -851,8 +852,9 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (fs_devices->opened) {
btrfs_err(NULL,
- "device %s belongs to fsid %pU, and the fs is already mounted",
- path, fs_devices->fsid);
+"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
+ path, fs_devices->fsid, current->comm,
+ task_pid_nr(current));
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EBUSY);
}
@@ -1424,9 +1426,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
lockdep_assert_held(&device->fs_info->chunk_mutex);
- if (!find_first_extent_bit(&device->alloc_state, *start,
- &physical_start, &physical_end,
- CHUNK_ALLOCATED, NULL)) {
+ if (find_first_extent_bit(&device->alloc_state, *start,
+ &physical_start, &physical_end,
+ CHUNK_ALLOCATED, NULL)) {
if (in_range(physical_start, *start, len) ||
in_range(*start, physical_start,
@@ -1438,18 +1440,18 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
return false;
}
-static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
+static u64 dev_extent_search_start(struct btrfs_device *device)
{
switch (device->fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
- return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
+ return BTRFS_DEVICE_RANGE_RESERVED;
case BTRFS_CHUNK_ALLOC_ZONED:
/*
* We don't care about the starting region like the regular
* allocator, because we use/reserve the first two zones for
* superblock logging anyway.
*/
- return ALIGN(start, device->zone_info->zone_size);
+ return 0;
default:
BUG();
}
@@ -1581,15 +1583,15 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
* correct usable device space, as device extent freed in current transaction
* is not reported as available.
*/
-static int find_free_dev_extent_start(struct btrfs_device *device,
- u64 num_bytes, u64 search_start, u64 *start,
- u64 *len)
+static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *len)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
+ u64 search_start;
u64 hole_size;
u64 max_hole_start;
u64 max_hole_size;
@@ -1599,7 +1601,7 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
int slot;
struct extent_buffer *l;
- search_start = dev_extent_search_start(device, search_start);
+ search_start = dev_extent_search_start(device);
WARN_ON(device->zone_info &&
!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
@@ -1725,13 +1727,6 @@ out:
return ret;
}
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
- u64 *start, u64 *len)
-{
- /* FIXME use last free of some kind */
- return find_free_dev_extent_start(device, num_bytes, 0, start, len);
-}
-
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 start, u64 *dev_extent_len)
@@ -1917,15 +1912,13 @@ out:
static void update_dev_time(const char *device_path)
{
struct path path;
- struct timespec64 now;
int ret;
ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
if (ret)
return;
- now = current_time(d_inode(path.dentry));
- inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME | S_VERSION);
+ inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
path_put(&path);
}
@@ -6219,6 +6212,45 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *
stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
}
+/*
+ * Map one logical range to one or more physical ranges.
+ *
+ * @length: (Mandatory) mapped length of this run.
+ * One logical range can be split into different segments
+ * due to factors like zones and RAID0/5/6/10 stripe
+ * boundaries.
+ *
+ * @bioc_ret:	(Mandatory) returned btrfs_io_context structure,
+ *		which has one or more physical ranges (btrfs_io_stripe)
+ * recorded inside.
+ * Caller should call btrfs_put_bioc() to free it after use.
+ *
+ * @smap: (Optional) single physical range optimization.
+ *		If the map request can be fulfilled by one single
+ *		physical range, and this parameter is not NULL,
+ *		then @bioc_ret will be NULL, and @smap will be
+ *		updated.
+ *
+ * @mirror_num_ret: (Mandatory) returned mirror number if the original
+ * value is 0.
+ *
+ *		Mirror number 0 means to choose any live mirror.
+ *
+ * For non-RAID56 profiles, non-zero mirror_num means
+ * the Nth mirror. (e.g. mirror_num 1 means the first
+ * copy).
+ *
+ * For RAID56 profile, mirror 1 means rebuild from P and
+ * the remaining data stripes.
+ *
+ *		For RAID6 profile, mirror > 2 means marking another
+ *		data/P stripe as bad and rebuilding from the remaining
+ *		stripes.
+ *
+ * @need_raid_map: (Used only by the integrity checker) whether the map wants
+ *		a full stripe map (including all data and P/Q stripes)
+ *		for RAID56. Should always be 1 except for the integrity checker.
+ */
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
@@ -6393,9 +6425,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* I/O context structure.
*/
if (smap && num_alloc_stripes == 1 &&
- !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
- (op == BTRFS_MAP_READ || !dev_replace_is_ongoing ||
- !dev_replace->tgtdev)) {
+ !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
if (mirror_num_ret)
*mirror_num_ret = mirror_num;
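With the dev-replace clause dropped, the single-stripe fast path hinges on one predicate; a hedged restatement as a standalone helper (hypothetical name, extracted purely for illustration):

	/* Sketch: when can the caller's on-stack btrfs_io_stripe be used
	 * instead of allocating a full btrfs_io_context? */
	static bool can_use_single_stripe(const struct map_lookup *map,
					  int num_alloc_stripes, int mirror_num,
					  const struct btrfs_io_stripe *smap)
	{
		if (!smap || num_alloc_stripes != 1)
			return false;
		/* RAID56 rebuild reads (mirror_num > 1) need the full context. */
		if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
			return false;
		return true;
	}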
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index b8c51f16ba86..2128a032c3b7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -650,8 +650,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
- u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats);
@@ -749,5 +747,6 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
+u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index fc4b20c2688a..96828a13dd43 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -264,7 +264,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
goto out;
inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -407,7 +407,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
ret = btrfs_set_prop(trans, inode, name, value, size, flags);
if (!ret) {
inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 72b90bc19a19..09bc325d075d 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -65,6 +65,9 @@
#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
+static void wait_eb_writebacks(struct btrfs_block_group *block_group);
+static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);
+
static inline bool sb_zone_is_full(const struct blk_zone *zone)
{
return (zone->cond == BLK_ZONE_COND_FULL) ||
@@ -465,8 +468,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
* use the cache.
*/
if (populate_cache && bdev_is_zoned(device->bdev)) {
- zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
- zone_info->nr_zones);
+ zone_info->zone_cache = vcalloc(zone_info->nr_zones,
+ sizeof(struct blk_zone));
if (!zone_info->zone_cache) {
btrfs_err_in_rcu(device->fs_info,
"zoned: failed to allocate zone cache for %s",
@@ -1583,19 +1586,9 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
return;
WARN_ON(cache->bytes_super != 0);
-
- /* Check for block groups never get activated */
- if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) &&
- cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) &&
- !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) &&
- cache->alloc_offset == 0) {
- unusable = cache->length;
- free = 0;
- } else {
- unusable = (cache->alloc_offset - cache->used) +
- (cache->length - cache->zone_capacity);
- free = cache->zone_capacity - cache->alloc_offset;
- }
+ unusable = (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ free = cache->zone_capacity - cache->alloc_offset;
/* We only need ->free_space in ALLOC_SEQ block groups */
cache->cached = BTRFS_CACHE_FINISHED;
@@ -1707,10 +1700,21 @@ void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
{
struct btrfs_inode *inode = BTRFS_I(ordered->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_sum *sum =
- list_first_entry(&ordered->list, typeof(*sum), list);
- u64 logical = sum->logical;
- u64 len = sum->len;
+ struct btrfs_ordered_sum *sum;
+ u64 logical, len;
+
+ /*
+	 * A write to a pre-allocated region is for data relocation, and so
+	 * it should use a WRITE operation. No split/rewrite is necessary.
+ */
+ if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
+ return;
+
+	/* The list can only be empty in the pre-alloc case handled above. */
+	ASSERT(!list_empty(&ordered->list));
+ sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list);
+ logical = sum->logical;
+ len = sum->len;
while (len < ordered->disk_num_bytes) {
sum = list_next_entry(sum, list);
@@ -1747,41 +1751,121 @@ out:
}
}
-bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb,
- struct btrfs_block_group **cache_ret)
+static bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
+ struct btrfs_block_group **active_bg)
{
- struct btrfs_block_group *cache;
- bool ret = true;
+ const struct writeback_control *wbc = ctx->wbc;
+ struct btrfs_block_group *block_group = ctx->zoned_bg;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
- if (!btrfs_is_zoned(fs_info))
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
return true;
- cache = btrfs_lookup_block_group(fs_info, eb->start);
- if (!cache)
- return true;
+ if (fs_info->treelog_bg == block_group->start) {
+ if (!btrfs_zone_activate(block_group)) {
+ int ret_fin = btrfs_zone_finish_one_bg(fs_info);
- if (cache->meta_write_pointer != eb->start) {
- btrfs_put_block_group(cache);
- cache = NULL;
- ret = false;
- } else {
- cache->meta_write_pointer = eb->start + eb->len;
- }
+ if (ret_fin != 1 || !btrfs_zone_activate(block_group))
+ return false;
+ }
+ } else if (*active_bg != block_group) {
+ struct btrfs_block_group *tgt = *active_bg;
- *cache_ret = cache;
+ /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
+ lockdep_assert_held(&fs_info->zoned_meta_io_lock);
- return ret;
+ if (tgt) {
+ /*
+ * If there is an unsent IO left in the allocated area,
+			 * we cannot wait for it, as doing so may deadlock.
+ */
+ if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) {
+ if (wbc->sync_mode == WB_SYNC_NONE ||
+ (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync))
+ return false;
+ }
+
+ /* Pivot active metadata/system block group. */
+ btrfs_zoned_meta_io_unlock(fs_info);
+ wait_eb_writebacks(tgt);
+ do_zone_finish(tgt, true);
+ btrfs_zoned_meta_io_lock(fs_info);
+ if (*active_bg == tgt) {
+ btrfs_put_block_group(tgt);
+ *active_bg = NULL;
+ }
+ }
+ if (!btrfs_zone_activate(block_group))
+ return false;
+ if (*active_bg != block_group) {
+ ASSERT(*active_bg == NULL);
+ *active_bg = block_group;
+ btrfs_get_block_group(block_group);
+ }
+ }
+
+ return true;
}
-void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
- struct extent_buffer *eb)
+/*
+ * Check if @ctx->eb is aligned to the write pointer.
+ *
+ * Return:
+ * 0: @ctx->eb is at the write pointer. You can write it.
+ * -EAGAIN: There is a hole. The caller should handle the case.
+ * -EBUSY: There is a hole, but the caller can just bail out.
+ */
+int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct btrfs_eb_write_context *ctx)
{
- if (!btrfs_is_zoned(eb->fs_info) || !cache)
- return;
+ const struct writeback_control *wbc = ctx->wbc;
+ const struct extent_buffer *eb = ctx->eb;
+ struct btrfs_block_group *block_group = ctx->zoned_bg;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ if (block_group) {
+ if (block_group->start > eb->start ||
+ block_group->start + block_group->length <= eb->start) {
+ btrfs_put_block_group(block_group);
+ block_group = NULL;
+ ctx->zoned_bg = NULL;
+ }
+ }
+
+ if (!block_group) {
+ block_group = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!block_group)
+ return 0;
+ ctx->zoned_bg = block_group;
+ }
+
+ if (block_group->meta_write_pointer == eb->start) {
+ struct btrfs_block_group **tgt;
+
+ if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
+ return 0;
- ASSERT(cache->meta_write_pointer == eb->start + eb->len);
- cache->meta_write_pointer = eb->start;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ tgt = &fs_info->active_system_bg;
+ else
+ tgt = &fs_info->active_meta_bg;
+ if (check_bg_is_active(ctx, tgt))
+ return 0;
+ }
+
+ /*
+	 * Since we may release fs_info->zoned_meta_io_lock, someone may already
+	 * have started writing this eb. In that case, we can just bail out.
+ */
+ if (block_group->meta_write_pointer > eb->start)
+ return -EBUSY;
+
+	/* If for_sync, this hole will be filled by a transaction commit. */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+ return -EAGAIN;
+ return -EBUSY;
}
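Callers of the reworked helper now dispatch on three outcomes; a hedged sketch of the expected shape (write_one_eb_sketch() is a hypothetical stand-in for the actual writeback step):

	/* Sketch: dispatch on btrfs_check_meta_write_pointer() results. */
	static int try_write_one_eb(struct btrfs_fs_info *fs_info,
				    struct btrfs_eb_write_context *ctx)
	{
		int ret = btrfs_check_meta_write_pointer(fs_info, ctx);

		if (ret == 0)
			return write_one_eb_sketch(ctx);	/* aligned: write it */
		if (ret == -EBUSY)
			return 0;	/* someone else is writing it; bail quietly */
		return ret;		/* -EAGAIN: the caller retries later */
	}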
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
@@ -1879,10 +1963,10 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_space_info *space_info = block_group->space_info;
struct map_lookup *map;
struct btrfs_device *device;
u64 physical;
+ const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
bool ret;
int i;
@@ -1891,7 +1975,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
map = block_group->physical_map;
- spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
ret = true;
@@ -1904,30 +1987,44 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
+ spin_lock(&fs_info->zone_active_bgs_lock);
for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_zoned_device_info *zinfo;
+ int reserved = 0;
+
device = map->stripes[i].dev;
physical = map->stripes[i].physical;
+ zinfo = device->zone_info;
- if (device->zone_info->max_active_zones == 0)
+ if (zinfo->max_active_zones == 0)
continue;
+ if (is_data)
+ reserved = zinfo->reserved_active_zones;
+ /*
+ * For the data block group, leave active zones for one
+ * metadata block group and one system block group.
+ */
+ if (atomic_read(&zinfo->active_zones_left) <= reserved) {
+ ret = false;
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ goto out_unlock;
+ }
+
if (!btrfs_dev_set_active_zone(device, physical)) {
/* Cannot activate the zone */
ret = false;
+ spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
+ if (!is_data)
+ zinfo->reserved_active_zones--;
}
+ spin_unlock(&fs_info->zone_active_bgs_lock);
/* Successfully activated all the zones */
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
- WARN_ON(block_group->alloc_offset != 0);
- if (block_group->zone_unusable == block_group->length) {
- block_group->zone_unusable = block_group->length - block_group->zone_capacity;
- space_info->bytes_zone_unusable -= block_group->zone_capacity;
- }
spin_unlock(&block_group->lock);
- btrfs_try_granting_tickets(fs_info, space_info);
- spin_unlock(&space_info->lock);
/* For the active block group list */
btrfs_get_block_group(block_group);
@@ -1940,7 +2037,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
out_unlock:
spin_unlock(&block_group->lock);
- spin_unlock(&space_info->lock);
return ret;
}
@@ -2006,6 +2102,10 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
* and block_group->meta_write_pointer for metadata.
*/
if (!fully_written) {
+ if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ return -EAGAIN;
+ }
spin_unlock(&block_group->lock);
ret = btrfs_inc_block_group_ro(block_group, false);
@@ -2034,7 +2134,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
return 0;
}
- if (block_group->reserved) {
+ if (block_group->reserved ||
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+ &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
btrfs_dec_block_group_ro(block_group);
return -EAGAIN;
@@ -2043,6 +2145,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
block_group->alloc_offset = block_group->zone_capacity;
+ if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
+ block_group->meta_write_pointer = block_group->start +
+ block_group->zone_capacity;
block_group->free_space_ctl->free_space = 0;
btrfs_clear_treelog_bg(block_group);
btrfs_clear_data_reloc_bg(block_group);
@@ -2052,18 +2157,21 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
- if (device->zone_info->max_active_zones == 0)
+ if (zinfo->max_active_zones == 0)
continue;
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
- device->zone_info->zone_size >> SECTOR_SHIFT,
+ zinfo->zone_size >> SECTOR_SHIFT,
GFP_NOFS);
if (ret)
return ret;
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ zinfo->reserved_active_zones++;
btrfs_dev_clear_active_zone(device, physical);
}
@@ -2102,8 +2210,10 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
/* Check if there is a device with active zones left */
mutex_lock(&fs_info->chunk_mutex);
+ spin_lock(&fs_info->zone_active_bgs_lock);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ int reserved = 0;
if (!device->bdev)
continue;
@@ -2113,17 +2223,21 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
break;
}
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ reserved = zinfo->reserved_active_zones;
+
switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case 0: /* single */
- ret = (atomic_read(&zinfo->active_zones_left) >= 1);
+ ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved));
break;
case BTRFS_BLOCK_GROUP_DUP:
- ret = (atomic_read(&zinfo->active_zones_left) >= 2);
+ ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved));
break;
}
if (ret)
break;
}
+ spin_unlock(&fs_info->zone_active_bgs_lock);
mutex_unlock(&fs_info->chunk_mutex);
if (!ret)
@@ -2265,7 +2379,10 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica
/* All relocation extents are written. */
if (block_group->start + block_group->alloc_offset == logical + length) {
- /* Now, release this block group for further allocations. */
+ /*
+ * Now, release this block group for further allocations and
+ * zone finish.
+ */
clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
&block_group->runtime_flags);
}
@@ -2289,7 +2406,8 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->alloc_offset == 0 ||
- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
continue;
}
@@ -2365,3 +2483,55 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
return 0;
}
+
+/*
+ * Reserve zones for one metadata block group, one tree-log block group, and one
+ * system block group.
+ */
+void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_block_group *block_group;
+ struct btrfs_device *device;
+	/* Reserve zones for normal SINGLE metadata and tree-log block groups. */
+ unsigned int metadata_reserve = 2;
+ /* Reserve a zone for SINGLE system block group. */
+ unsigned int system_reserve = 1;
+
+ if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
+ return;
+
+ /*
+ * This function is called from the mount context. So, there is no
+ * parallel process touching the bits. No need for read_seqretry().
+ */
+ if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
+ metadata_reserve = 4;
+ if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
+ system_reserve = 2;
+
+ /* Apply the reservation on all the devices. */
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->bdev)
+ continue;
+
+ device->zone_info->reserved_active_zones =
+ metadata_reserve + system_reserve;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ /* Release reservation for currently active block groups. */
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+ struct map_lookup *map = block_group->physical_map;
+
+ if (!(block_group->flags &
+ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
+ continue;
+
+ for (int i = 0; i < map->num_stripes; i++)
+ map->stripes[i].dev->zone_info->reserved_active_zones--;
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+}
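The reservation counts follow from the redundancy profile: each SINGLE block group pins one active zone per device and DUP pins two, with metadata additionally reserving a tree-log block group. A hedged restatement of the arithmetic:

	/* Sketch: active zones reserved per device at mount time. */
	static unsigned int zones_to_reserve(bool meta_dup, bool sys_dup)
	{
		/* metadata + tree-log block groups: 2 zones, 4 if DUP */
		unsigned int metadata_reserve = meta_dup ? 4 : 2;
		/* system block group: 1 zone, 2 if DUP */
		unsigned int system_reserve = sys_dup ? 2 : 1;

		return metadata_reserve + system_reserve;
	}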
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 27322b926038..b9cec523b778 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -22,6 +22,11 @@ struct btrfs_zoned_device_info {
u8 zone_size_shift;
u32 nr_zones;
unsigned int max_active_zones;
+ /*
+ * Reserved active zones for one metadata and one system block group.
+ * It can vary per-device depending on the allocation status.
+ */
+ int reserved_active_zones;
atomic_t active_zones_left;
unsigned long *seq_zones;
unsigned long *empty_zones;
@@ -58,11 +63,8 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb);
bool btrfs_use_zone_append(struct btrfs_bio *bbio);
void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
-bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb,
- struct btrfs_block_group **cache_ret);
-void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
- struct extent_buffer *eb);
+int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct btrfs_eb_write_context *ctx);
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length);
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
u64 physical_start, u64 physical_pos);
@@ -81,6 +83,7 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, bool do_finish);
+void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -189,17 +192,10 @@ static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
{
}
-static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb,
- struct btrfs_block_group **cache_ret)
-{
- return true;
-}
-
-static inline void btrfs_revert_meta_write_pointer(
- struct btrfs_block_group *cache,
- struct extent_buffer *eb)
+static inline int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct btrfs_eb_write_context *ctx)
{
+ return 0;
}
static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device,
@@ -262,6 +258,8 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
return 0;
}
+static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/fs/buffer.c b/fs/buffer.c
index bd091329026c..084a6ade108a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -49,6 +49,7 @@
#include <trace/events/block.h>
#include <linux/fscrypt.h>
#include <linux/fsverity.h>
+#include <linux/sched/isolation.h>
#include "internal.h"
@@ -1225,19 +1226,14 @@ EXPORT_SYMBOL(mark_buffer_dirty);
void mark_buffer_write_io_error(struct buffer_head *bh)
{
- struct super_block *sb;
-
set_buffer_write_io_error(bh);
/* FIXME: do we need to set this in both places? */
if (bh->b_folio && bh->b_folio->mapping)
mapping_set_error(bh->b_folio->mapping, -EIO);
- if (bh->b_assoc_map)
+ if (bh->b_assoc_map) {
mapping_set_error(bh->b_assoc_map, -EIO);
- rcu_read_lock();
- sb = READ_ONCE(bh->b_bdev->bd_super);
- if (sb)
- errseq_set(&sb->s_wb_err, -EIO);
- rcu_read_unlock();
+ errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO);
+ }
}
EXPORT_SYMBOL(mark_buffer_write_io_error);
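With bd_super gone, the error now travels through the superblock of the inode owning the associated mapping. errseq_t is the kernel's sequence-stamped error cursor; a hedged sketch of the record/observe pattern it enables:

	/* Sketch: report a writeback error once per observer. */
	static void record_wb_err(errseq_t *wb_err)
	{
		errseq_set(wb_err, -EIO);	/* stamp the error with a sequence */
	}

	static int observe_wb_err(errseq_t *wb_err, errseq_t *cursor)
	{
		/* Returns the error only if it is new since *cursor, then
		 * advances the cursor so it is not reported twice. */
		return errseq_check_and_advance(wb_err, cursor);
	}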
@@ -1352,7 +1348,7 @@ static void bh_lru_install(struct buffer_head *bh)
* failing page migration.
* Skip putting upcoming bh into bh_lru until migration is done.
*/
- if (lru_cache_disabled()) {
+ if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
bh_lru_unlock();
return;
}
@@ -1382,6 +1378,10 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
check_irqs_on();
bh_lru_lock();
+ if (cpu_is_isolated(smp_processor_id())) {
+ bh_lru_unlock();
+ return NULL;
+ }
for (i = 0; i < BH_LRU_SIZE; i++) {
struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 175a25fcade8..009d23cd435b 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -259,9 +259,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
_enter("%ld", ret);
- /* Tell lockdep we inherited freeze protection from submission thread */
- __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
- __sb_end_write(inode->i_sb, SB_FREEZE_WRITE);
+ kiocb_end_write(iocb);
if (ret < 0)
trace_cachefiles_io_error(object, inode, ret,
@@ -286,7 +284,6 @@ int __cachefiles_write(struct cachefiles_object *object,
{
struct cachefiles_cache *cache;
struct cachefiles_kiocb *ki;
- struct inode *inode;
unsigned int old_nofs;
ssize_t ret;
size_t len = iov_iter_count(iter);
@@ -322,19 +319,12 @@ int __cachefiles_write(struct cachefiles_object *object,
ki->iocb.ki_complete = cachefiles_write_complete;
atomic_long_add(ki->b_writing, &cache->b_writing);
- /* Open-code file_start_write here to grab freeze protection, which
- * will be released by another thread in aio_complete_rw(). Fool
- * lockdep by telling it the lock got released so that it doesn't
- * complain about the held lock when we return to userspace.
- */
- inode = file_inode(file);
- __sb_start_write(inode->i_sb, SB_FREEZE_WRITE);
- __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+ kiocb_start_write(&ki->iocb);
get_file(ki->iocb.ki_filp);
cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
- trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len);
+ trace_cachefiles_write(object, file_inode(file), ki->iocb.ki_pos, len);
old_nofs = memalloc_nofs_save();
ret = cachefiles_inject_write_error();
if (ret == 0)
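kiocb_start_write()/kiocb_end_write() package the open-coded dance deleted above: take superblock freeze protection on the submitting thread, appease lockdep, and drop the protection on whichever thread completes the I/O. A hedged sketch of the async write shape (names other than the kiocb helpers are hypothetical):

	/* Sketch: freeze protection across an async write; completion may
	 * run on a different thread than submission. */
	static void my_write_complete(struct kiocb *iocb, long ret)
	{
		kiocb_end_write(iocb);		/* drop freeze protection */
		/* handle ret, drop references, etc. */
	}

	static ssize_t my_submit_write(struct kiocb *iocb, struct iov_iter *iter)
	{
		iocb->ki_complete = my_write_complete;
		kiocb_start_write(iocb);	/* grab freeze protection */
		return vfs_iocb_iter_write(iocb->ki_filp, iocb, iter);
	}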
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 6945a938d396..c91b293267d7 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -93,7 +93,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
char *value = NULL;
struct iattr newattrs;
struct inode *inode = d_inode(dentry);
- struct timespec64 old_ctime = inode->i_ctime;
+ struct timespec64 old_ctime = inode_get_ctime(inode);
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
if (ceph_snap(inode) != CEPH_NOSNAP) {
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index e2bb0d0072da..09cd6d334604 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1400,7 +1400,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
arg->mtime = inode->i_mtime;
arg->atime = inode->i_atime;
- arg->ctime = inode->i_ctime;
+ arg->ctime = inode_get_ctime(inode);
arg->btime = ci->i_btime;
arg->change_attr = inode_peek_iversion_raw(inode);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8e5f41d45283..fd05d68e2990 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -100,7 +100,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
inode->i_mtime = parent->i_mtime;
- inode->i_ctime = parent->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(parent));
inode->i_atime = parent->i_atime;
ci->i_rbytes = 0;
ci->i_btime = ceph_inode(parent)->i_btime;
@@ -688,6 +688,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
struct timespec64 *mtime, struct timespec64 *atime)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct timespec64 ictime = inode_get_ctime(inode);
int warn = 0;
if (issued & (CEPH_CAP_FILE_EXCL|
@@ -696,11 +697,11 @@ void ceph_fill_file_time(struct inode *inode, int issued,
CEPH_CAP_AUTH_EXCL|
CEPH_CAP_XATTR_EXCL)) {
if (ci->i_version == 0 ||
- timespec64_compare(ctime, &inode->i_ctime) > 0) {
+ timespec64_compare(ctime, &ictime) > 0) {
dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
- inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+ ictime.tv_sec, ictime.tv_nsec,
ctime->tv_sec, ctime->tv_nsec);
- inode->i_ctime = *ctime;
+ inode_set_ctime_to_ts(inode, *ctime);
}
if (ci->i_version == 0 ||
ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
@@ -738,7 +739,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
} else {
/* we have no write|excl caps; whatever the MDS says is true */
if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
- inode->i_ctime = *ctime;
+ inode_set_ctime_to_ts(inode, *ctime);
inode->i_mtime = *mtime;
inode->i_atime = *atime;
ci->i_time_warp_seq = time_warp_seq;
@@ -2166,7 +2167,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
- inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+ inode_get_ctime(inode).tv_sec,
+ inode_get_ctime(inode).tv_nsec,
attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
only ? "ctime only" : "ignored");
if (only) {
@@ -2191,7 +2193,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
inode_inc_iversion_raw(inode);
}
@@ -2465,7 +2467,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
return err;
}
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->ino = ceph_present_inode(inode);
/*
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 343d738448dc..c9920ade15f5 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -660,7 +660,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->size = i_size_read(inode);
capsnap->mtime = inode->i_mtime;
capsnap->atime = inode->i_atime;
- capsnap->ctime = inode->i_ctime;
+ capsnap->ctime = inode_get_ctime(inode);
capsnap->btime = ci->i_btime;
capsnap->change_attr = inode_peek_iversion_raw(inode);
capsnap->time_warp_seq = ci->i_time_warp_seq;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 806183959c47..1cbd84cc82a8 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1238,7 +1238,7 @@ retry:
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
}
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 903ca8fa4b9b..ae023853a98f 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -127,7 +127,8 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
if (attr->va_mtime.tv_sec != -1)
inode->i_mtime = coda_to_timespec64(attr->va_mtime);
if (attr->va_ctime.tv_sec != -1)
- inode->i_ctime = coda_to_timespec64(attr->va_ctime);
+ inode_set_ctime_to_ts(inode,
+ coda_to_timespec64(attr->va_ctime));
}
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 1b960de2bf39..cb512b10473b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -111,7 +111,7 @@ static inline void coda_dir_update_mtime(struct inode *dir)
/* optimistically we can also act as if our nose bleeds. The
* granularity of the mtime is coarse anyways so we might actually be
* right most of the time. Note: we only do this for directories. */
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
#endif
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 12b26bd13564..42346618b4ed 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -84,7 +84,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
- coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode);
+ coda_inode->i_mtime = inode_set_ctime_current(coda_inode);
inode_unlock(coda_inode);
file_end_write(host_file);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index d661e6cf17ac..0c7c2528791e 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -256,7 +256,8 @@ int coda_getattr(struct mnt_idmap *idmap, const struct path *path,
{
int err = coda_revalidate_inode(d_inode(path->dentry));
if (!err)
- generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask,
+ d_inode(path->dentry), stat);
return err;
}
@@ -269,7 +270,7 @@ int coda_setattr(struct mnt_idmap *idmap, struct dentry *de,
memset(&vattr, 0, sizeof(vattr));
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
coda_iattr_to_vattr(iattr, &vattr);
vattr.va_type = C_VNON; /* cannot set type */
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 1c15edbe70ff..fbdcb3582926 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -88,8 +88,7 @@ int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime =
- inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
@@ -99,7 +98,7 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
inode->i_gid = iattr->ia_gid;
inode->i_atime = iattr->ia_atime;
inode->i_mtime = iattr->ia_mtime;
- inode->i_ctime = iattr->ia_ctime;
+ inode_set_ctime_to_ts(inode, iattr->ia_ctime);
}
struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
@@ -172,7 +171,7 @@ struct inode *configfs_create(struct dentry *dentry, umode_t mode)
return ERR_PTR(-ENOMEM);
p_inode = d_inode(dentry->d_parent);
- p_inode->i_mtime = p_inode->i_ctime = current_time(p_inode);
+ p_inode->i_mtime = inode_set_ctime_current(p_inode);
configfs_set_inode_lock_class(sd, inode);
return inode;
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 27c6597aa1be..5ee7d7bbb361 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -133,7 +133,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
}
/* Struct copy intentional */
- inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
+ inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
+ zerotime);
/* inode->i_nlink is left 1 - arguably wrong for directories,
but it's the best we can do without reading the directory
contents. 1 yields the right result in GNU find, even
@@ -485,12 +486,16 @@ static void cramfs_kill_sb(struct super_block *sb)
{
struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+ generic_shutdown_super(sb);
+
if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) {
if (sbi && sbi->mtd_point_size)
mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size);
- kill_mtd_super(sb);
+ put_mtd_device(sb->s_mtd);
+ sb->s_mtd = NULL;
} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
- kill_block_super(sb);
+ sync_blockdev(sb->s_bdev);
+ blkdev_put(sb->s_bdev, sb);
}
kfree(sbi);
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 52e6d5fdab6b..25ac74d30bff 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1664,7 +1664,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
if (dentry == _data && dentry->d_lockref.count == 1)
return D_WALK_CONTINUE;
- printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
+ WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} "
" still in use (%d) [unmount of %s %s]\n",
dentry,
dentry->d_inode ?
@@ -1673,7 +1673,6 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
dentry->d_lockref.count,
dentry->d_sb->s_type->name,
dentry->d_sb->s_id);
- WARN_ON(1);
return D_WALK_CONTINUE;
}
@@ -3247,8 +3246,6 @@ void d_genocide(struct dentry *parent)
d_walk(parent, parent, d_genocide_kill);
}
-EXPORT_SYMBOL(d_genocide);
-
void d_tmpfile(struct file *file, struct inode *inode)
{
struct dentry *dentry = file->f_path.dentry;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3f81f73c241a..83e57e9f9fa0 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -72,8 +72,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime =
- inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
return inode;
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index fe3db0eda8e4..299c295a27a0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -338,7 +338,7 @@ static int mknod_ptmx(struct super_block *sb)
}
inode->i_ino = 2;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
mode = S_IFCHR|opts->ptmxmode;
init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
@@ -451,7 +451,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
if (!inode)
goto fail;
inode->i_ino = 1;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
@@ -534,12 +534,12 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx)
/**
* devpts_pty_new -- create a new inode in /dev/pts/
- * @ptmx_inode: inode of the master
- * @device: major+minor of the node to be created
+ * @fsi: Filesystem info for this instance.
* @index: used as a name of the node
* @priv: what's given back by devpts_get_priv
*
- * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
+ * The dentry for the created inode is returned.
+ * Remove it from /dev/pts/ with devpts_pty_kill().
*/
struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
{
@@ -560,7 +560,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
inode->i_ino = index + 3;
inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index));
sprintf(s, "%d", index);
@@ -580,7 +580,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
/**
* devpts_get_priv -- get private data for a slave
- * @pts_inode: inode of the slave
+ * @dentry: dentry of the slave
*
* Returns whatever was passed as priv in devpts_pty_new for a given inode.
*/
@@ -593,7 +593,7 @@ void *devpts_get_priv(struct dentry *dentry)
/**
* devpts_pty_kill -- remove inode form /dev/pts/
- * @inode: inode of the slave to be removed
+ * @dentry: dentry of the slave to be removed
*
* This is an inverse operation of devpts_pty_new.
*/
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c16f0d660cb7..03bd55069d86 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -441,10 +441,10 @@ int ecryptfs_encrypt_page(struct page *page)
}
lower_offset = lower_offset_for_page(crypt_stat, page);
- enc_extent_virt = kmap(enc_extent_page);
+ enc_extent_virt = kmap_local_page(enc_extent_page);
rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
PAGE_SIZE);
- kunmap(enc_extent_page);
+ kunmap_local(enc_extent_virt);
if (rc < 0) {
ecryptfs_printk(KERN_ERR,
"Error attempting to write lower page; rc = [%d]\n",
@@ -490,10 +490,10 @@ int ecryptfs_decrypt_page(struct page *page)
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
lower_offset = lower_offset_for_page(crypt_stat, page);
- page_virt = kmap(page);
+ page_virt = kmap_local_page(page);
rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE,
ecryptfs_inode);
- kunmap(page);
+ kunmap_local(page_virt);
if (rc < 0) {
ecryptfs_printk(KERN_ERR,
"Error attempting to read lower page; rc = [%d]\n",
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 83274915ba6d..992d9c7e64ae 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -148,7 +148,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
}
fsstack_copy_attr_times(dir, lower_dir);
set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
out_unlock:
dput(lower_dentry);
inode_unlock(lower_dir);
@@ -982,7 +982,7 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap,
mount_crypt_stat = &ecryptfs_superblock_to_private(
dentry->d_sb)->mount_crypt_stat;
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
char *target;
size_t targetsiz;
@@ -1011,7 +1011,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask,
+ d_inode(dentry), stat);
stat->blocks = lower_stat.blocks;
}
return rc;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 373c3e5747e6..e2483acc4366 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -125,7 +125,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
/* This is a header extent */
char *page_virt;
- page_virt = kmap_atomic(page);
+ page_virt = kmap_local_page(page);
memset(page_virt, 0, PAGE_SIZE);
/* TODO: Support more than one header extent */
if (view_extent_num == 0) {
@@ -138,7 +138,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
crypt_stat,
&written);
}
- kunmap_atomic(page_virt);
+ kunmap_local(page_virt);
flush_dcache_page(page);
if (rc) {
printk(KERN_ERR "%s: Error reading xattr "
@@ -255,7 +255,6 @@ out:
* @mapping: The eCryptfs object
* @pos: The file offset at which to start writing
* @len: Length of the write
- * @flags: Various flags
* @pagep: Pointer to return the page
* @fsdata: Pointer to return fs data (unused)
*
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 60bdcaddcbe5..3458f153a588 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -64,11 +64,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT)
+ offset_in_page);
- virt = kmap(page_for_lower);
+ virt = kmap_local_page(page_for_lower);
rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
if (rc > 0)
rc = 0;
- kunmap(page_for_lower);
+ kunmap_local(virt);
return rc;
}
@@ -140,7 +140,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
ecryptfs_page_idx, rc);
goto out;
}
- ecryptfs_page_virt = kmap_atomic(ecryptfs_page);
+ ecryptfs_page_virt = kmap_local_page(ecryptfs_page);
/*
* pos: where we're now writing, offset: where the request was
@@ -163,7 +163,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
(data + data_offset), num_bytes);
data_offset += num_bytes;
}
- kunmap_atomic(ecryptfs_page_virt);
+ kunmap_local(ecryptfs_page_virt);
flush_dcache_page(ecryptfs_page);
SetPageUptodate(ecryptfs_page);
unlock_page(ecryptfs_page);
@@ -253,11 +253,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
int rc;
offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page);
- virt = kmap(page_for_ecryptfs);
+ virt = kmap_local_page(page_for_ecryptfs);
rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
if (rc > 0)
rc = 0;
- kunmap(page_for_ecryptfs);
+ kunmap_local(virt);
flush_dcache_page(page_for_ecryptfs);
return rc;
}
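kmap_local_page() replaces both kmap() and kmap_atomic() in these hunks: the mappings are cheap and thread-local, but nested mappings must be released in reverse (LIFO) order, which is why the erofs code later in this diff juggles its unmap order explicitly. A minimal sketch:

	/* Sketch: LIFO discipline for nested local mappings. */
	static void copy_page_sketch(struct page *dst, struct page *src)
	{
		void *d = kmap_local_page(dst);	/* mapped first */
		void *s = kmap_local_page(src);	/* mapped second */

		memcpy(d, s, PAGE_SIZE);

		kunmap_local(s);		/* unmap in reverse order */
		kunmap_local(d);
	}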
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index d57ee15874f9..59b52718a3a2 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -51,7 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file,
} else {
inode_lock(inode);
i_size_write(inode, datasize + sizeof(attributes));
- inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
inode_unlock(inode);
}
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index b973a2c03dde..db9231f0e77b 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -25,7 +25,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
switch (mode & S_IFMT) {
case S_IFREG:
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 3ba94bb005a6..3789d22ba501 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -105,8 +105,8 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
inode->i_size = be32_to_cpu(efs_inode->di_size);
inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime);
inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime);
- inode->i_ctime.tv_sec = be32_to_cpu(efs_inode->di_ctime);
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode, be32_to_cpu(efs_inode->di_ctime), 0);
+ inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
/* this is the number of blocks in the file */
if (inode->i_size == 0) {
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index f259d92c9720..f6dc961e6c2b 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -38,6 +38,7 @@ config EROFS_FS_DEBUG
config EROFS_FS_XATTR
bool "EROFS extended attributes"
depends on EROFS_FS
+ select XXHASH
default y
help
Extended attributes are name:value pairs associated with inodes by
@@ -99,6 +100,21 @@ config EROFS_FS_ZIP_LZMA
If unsure, say N.
+config EROFS_FS_ZIP_DEFLATE
+ bool "EROFS DEFLATE compressed data support"
+ depends on EROFS_FS_ZIP
+ select ZLIB_INFLATE
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing DEFLATE compressed data. It gives better compression
+	  ratios than the default LZ4 format, at the cost of more CPU
+ overhead.
+
+	  DEFLATE support is an experimental feature for now, so most
+	  file systems will still be readable without selecting this option.
+
+ If unsure, say N.
+
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support"
depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index a3a98fc3e481..994d0b9deddf 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,4 +5,5 @@ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
+erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index b1b846504027..349c3316ae6b 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -94,4 +94,6 @@ extern const struct z_erofs_decompressor erofs_decompressors[];
/* prototypes for specific algorithms */
int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool);
+int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
#endif
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index cfad1eac7fd9..332ec5f74002 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -379,4 +379,10 @@ const struct z_erofs_decompressor erofs_decompressors[] = {
.name = "lzma"
},
#endif
+#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
+ [Z_EROFS_COMPRESSION_DEFLATE] = {
+ .decompress = z_erofs_deflate_decompress,
+ .name = "deflate"
+ },
+#endif
};
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
new file mode 100644
index 000000000000..19e5bdeb30b6
--- /dev/null
+++ b/fs/erofs/decompressor_deflate.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/module.h>
+#include <linux/zlib.h>
+#include "compress.h"
+
+struct z_erofs_deflate {
+ struct z_erofs_deflate *next;
+ struct z_stream_s z;
+ u8 bounce[PAGE_SIZE];
+};
+
+static DEFINE_SPINLOCK(z_erofs_deflate_lock);
+static unsigned int z_erofs_deflate_nstrms, z_erofs_deflate_avail_strms;
+static struct z_erofs_deflate *z_erofs_deflate_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq);
+
+module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444);
+
+void z_erofs_deflate_exit(void)
+{
+ /* there should be no running fs instance */
+ while (z_erofs_deflate_avail_strms) {
+ struct z_erofs_deflate *strm;
+
+ spin_lock(&z_erofs_deflate_lock);
+ strm = z_erofs_deflate_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_deflate_lock);
+ continue;
+ }
+ z_erofs_deflate_head = NULL;
+ spin_unlock(&z_erofs_deflate_lock);
+
+ while (strm) {
+ struct z_erofs_deflate *n = strm->next;
+
+ vfree(strm->z.workspace);
+ kfree(strm);
+ --z_erofs_deflate_avail_strms;
+ strm = n;
+ }
+ }
+}
+
+int __init z_erofs_deflate_init(void)
+{
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_deflate_nstrms)
+ z_erofs_deflate_nstrms = num_possible_cpus();
+
+ for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms;
+ ++z_erofs_deflate_avail_strms) {
+ struct z_erofs_deflate *strm;
+
+ strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+ if (!strm)
+ goto out_failed;
+
+ /* XXX: in-kernel zlib cannot shrink windowbits currently */
+ strm->z.workspace = vmalloc(zlib_inflate_workspacesize());
+ if (!strm->z.workspace) {
+ kfree(strm);
+ goto out_failed;
+ }
+
+ spin_lock(&z_erofs_deflate_lock);
+ strm->next = z_erofs_deflate_head;
+ z_erofs_deflate_head = strm;
+ spin_unlock(&z_erofs_deflate_lock);
+ }
+ return 0;
+
+out_failed:
+ pr_err("failed to allocate zlib workspace\n");
+ z_erofs_deflate_exit();
+ return -ENOMEM;
+}
+
+int z_erofs_load_deflate_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_deflate_cfgs *dfl, int size)
+{
+ if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) {
+ erofs_err(sb, "invalid deflate cfgs, size=%u", size);
+ return -EINVAL;
+ }
+
+ if (dfl->windowbits > MAX_WBITS) {
+ erofs_err(sb, "unsupported windowbits %u", dfl->windowbits);
+ return -EOPNOTSUPP;
+ }
+
+ erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!");
+ return 0;
+}
+
+int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_in =
+ PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ struct super_block *sb = rq->sb;
+ unsigned int insz, outsz, pofs;
+ struct z_erofs_deflate *strm;
+ u8 *kin, *kout = NULL;
+ bool bounced = false;
+ int no = -1, ni = 0, j = 0, zerr, err;
+
+ /* 1. get the exact DEFLATE compressed size */
+ kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ sb->s_blocksize - rq->pageofs_in));
+ if (err) {
+ kunmap_local(kin);
+ return err;
+ }
+
+ /* 2. get an available DEFLATE context */
+again:
+ spin_lock(&z_erofs_deflate_lock);
+ strm = z_erofs_deflate_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_deflate_lock);
+ wait_event(z_erofs_deflate_wq, READ_ONCE(z_erofs_deflate_head));
+ goto again;
+ }
+ z_erofs_deflate_head = strm->next;
+ spin_unlock(&z_erofs_deflate_lock);
+
+ /* 3. multi-call decompress */
+ insz = rq->inputsize;
+ outsz = rq->outputsize;
+ zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS);
+ if (zerr != Z_OK) {
+ err = -EIO;
+ goto failed_zinit;
+ }
+
+ pofs = rq->pageofs_out;
+ strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in);
+ insz -= strm->z.avail_in;
+ strm->z.next_in = kin + rq->pageofs_in;
+ strm->z.avail_out = 0;
+
+ while (1) {
+ if (!strm->z.avail_out) {
+ if (++no >= nrpages_out || !outsz) {
+ erofs_err(sb, "insufficient space for decompressed data");
+ err = -EFSCORRUPTED;
+ break;
+ }
+
+ if (kout)
+ kunmap_local(kout);
+ strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
+ outsz -= strm->z.avail_out;
+ if (!rq->out[no]) {
+ rq->out[no] = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(rq->out[no],
+ Z_EROFS_SHORTLIVED_PAGE);
+ }
+ kout = kmap_local_page(rq->out[no]);
+ strm->z.next_out = kout + pofs;
+ pofs = 0;
+ }
+
+ if (!strm->z.avail_in && insz) {
+ if (++ni >= nrpages_in) {
+ erofs_err(sb, "invalid compressed data");
+ err = -EFSCORRUPTED;
+ break;
+ }
+
+ if (kout) { /* kmap_local mappings must be unmapped in reverse order */
+ j = strm->z.next_out - kout;
+ kunmap_local(kout);
+ }
+ kunmap_local(kin);
+ strm->z.avail_in = min_t(u32, insz, PAGE_SIZE);
+ insz -= strm->z.avail_in;
+ kin = kmap_local_page(rq->in[ni]);
+ strm->z.next_in = kin;
+ bounced = false;
+ if (kout) {
+ kout = kmap_local_page(rq->out[no]);
+ strm->z.next_out = kout + j;
+ }
+ }
+
+ /*
+ * Handle overlapping: use the bounce buffer if the compressed
+ * data is still being processed; otherwise use short-lived
+ * pages from the on-stack pagepool, which are shared within
+ * the same request, so that not _all_ inplace I/O pages need
+ * to be doubled.
+ */
+ if (!bounced && rq->out[no] == rq->in[ni]) {
+ memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in);
+ strm->z.next_in = strm->bounce;
+ bounced = true;
+ }
+
+ for (j = ni + 1; j < nrpages_in; ++j) {
+ struct page *tmppage;
+
+ if (rq->out[no] != rq->in[j])
+ continue;
+
+ DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
+ rq->in[j]));
+ tmppage = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
+ }
+
+ zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH);
+ if (zerr != Z_OK || !(outsz + strm->z.avail_out)) {
+ if (zerr == Z_OK && rq->partial_decoding)
+ break;
+ if (zerr == Z_STREAM_END && !outsz)
+ break;
+ erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
+ zerr, rq->inputsize, rq->outputsize);
+ err = -EFSCORRUPTED;
+ break;
+ }
+ }
+
+ if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
+ err = -EIO;
+ if (kout)
+ kunmap_local(kout);
+failed_zinit:
+ kunmap_local(kin);
+ /* 4. push back DEFLATE stream context to the global list */
+ spin_lock(&z_erofs_deflate_lock);
+ strm->next = z_erofs_deflate_head;
+ z_erofs_deflate_head = strm;
+ spin_unlock(&z_erofs_deflate_lock);
+ wake_up(&z_erofs_deflate_wq);
+ return err;
+}
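
A userspace analogue of the context pool implemented by steps 2 and 4 above — a fixed set of streams on a free list, with takers sleeping until one is returned — using a mutex and condition variable in place of the spinlock and waitqueue. All names here are illustrative, not part of the patch; build with -lpthread.

/* Sketch of the DEFLATE stream-pool pattern from decompressor_deflate.c. */
#include <pthread.h>
#include <stdlib.h>

struct stream {
	struct stream *next;
	/* per-stream decompression state would live here */
};

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pool_cond = PTHREAD_COND_INITIALIZER;
static struct stream *pool_head;

static void pool_init(unsigned int n)
{
	while (n--) {
		struct stream *s = calloc(1, sizeof(*s));

		if (!s)
			abort();
		s->next = pool_head;
		pool_head = s;
	}
}

static struct stream *stream_get(void)
{
	struct stream *s;

	pthread_mutex_lock(&pool_lock);
	while (!pool_head)	/* like wait_event() on z_erofs_deflate_wq */
		pthread_cond_wait(&pool_cond, &pool_lock);
	s = pool_head;
	pool_head = s->next;
	pthread_mutex_unlock(&pool_lock);
	return s;
}

static void stream_put(struct stream *s)
{
	pthread_mutex_lock(&pool_lock);
	s->next = pool_head;		/* push back, as in step 4 above */
	pool_head = s;
	pthread_mutex_unlock(&pool_lock);
	pthread_cond_signal(&pool_cond);	/* like wake_up() */
}

int main(void)
{
	pool_init(2);
	stream_put(stream_get());
	return 0;
}

Pre-allocating every zlib workspace at module init (sized by num_possible_cpus() unless the deflate_streams parameter overrides it) bounds memory use and keeps vmalloc() failures off the read path.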
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 2c7b16e340fe..a03ec70ba6f2 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -13,6 +13,7 @@
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
+#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
@@ -81,7 +82,8 @@ struct erofs_super_block {
__u8 xattr_prefix_count; /* # of long xattr name prefixes */
__le32 xattr_prefix_start; /* start of long xattr prefixes */
__le64 packed_nid; /* nid of the special packed inode */
- __u8 reserved2[24];
+ __u8 xattr_filter_reserved; /* reserved for xattr name filter */
+ __u8 reserved2[23];
};
/*
@@ -200,7 +202,7 @@ struct erofs_inode_extended {
* for read-only fs, no need to introduce h_refcount
*/
struct erofs_xattr_ibody_header {
- __le32 h_reserved;
+ __le32 h_name_filter; /* bit value 1 indicates not-present */
__u8 h_shared_count;
__u8 h_reserved2[7];
__le32 h_shared_xattrs[]; /* shared xattr id array */
@@ -221,6 +223,10 @@ struct erofs_xattr_ibody_header {
#define EROFS_XATTR_LONG_PREFIX 0x80
#define EROFS_XATTR_LONG_PREFIX_MASK 0x7f
+#define EROFS_XATTR_FILTER_BITS 32
+#define EROFS_XATTR_FILTER_DEFAULT UINT32_MAX
+#define EROFS_XATTR_FILTER_SEED 0x25BBE08F
+
/* xattr entry (for both inline & shared xattrs) */
struct erofs_xattr_entry {
__u8 e_name_len; /* length of name */
@@ -289,6 +295,7 @@ struct erofs_dirent {
enum {
Z_EROFS_COMPRESSION_LZ4 = 0,
Z_EROFS_COMPRESSION_LZMA = 1,
+ Z_EROFS_COMPRESSION_DEFLATE = 2,
Z_EROFS_COMPRESSION_MAX
};
#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
@@ -309,6 +316,12 @@ struct z_erofs_lzma_cfgs {
#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
+/* 6 bytes (+ length field = 8 bytes) */
+struct z_erofs_deflate_cfgs {
+ u8 windowbits; /* 8..15 for DEFLATE */
+ u8 reserved[5];
+} __packed;
+
/*
* bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
* e.g. for 4k logical cluster size, 4B if compacted 2B is off;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index e12592727a54..edc8ec7581b8 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -105,8 +105,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
set_nlink(inode, le32_to_cpu(die->i_nlink));
/* extended inode has its own timestamp */
- inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime);
- inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec);
+ inode_set_ctime(inode, le64_to_cpu(die->i_mtime),
+ le32_to_cpu(die->i_mtime_nsec));
inode->i_size = le64_to_cpu(die->i_size);
@@ -148,8 +148,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
set_nlink(inode, le16_to_cpu(dic->i_nlink));
/* use build time for compact inodes */
- inode->i_ctime.tv_sec = sbi->build_time;
- inode->i_ctime.tv_nsec = sbi->build_time_nsec;
+ inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec);
inode->i_size = le32_to_cpu(dic->i_size);
if (erofs_inode_is_data_compressed(vi->datalayout))
@@ -176,10 +175,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->chunkbits = sb->s_blocksize_bits +
(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
}
- inode->i_mtime.tv_sec = inode->i_ctime.tv_sec;
- inode->i_atime.tv_sec = inode->i_ctime.tv_sec;
- inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
- inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
+ inode->i_mtime = inode->i_atime = inode_get_ctime(inode);
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
@@ -373,7 +369,7 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
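
The i_ctime hunks here are part of a tree-wide conversion to the new VFS ctime accessors, which recurs in the exfat, ext2 and ext4 hunks below. A userspace model of the API shape — purely illustrative; the real helpers live in include/linux/fs.h:

/* Model of the VFS ctime accessor pattern: the setter returns the
 * stored value so call sites can chain assignments. Types and names
 * here only mimic the kernel API shape. */
#include <time.h>

struct inode {
	struct timespec i_atime;
	struct timespec i_mtime;
	struct timespec __i_ctime;	/* accessed via helpers only */
};

static struct timespec inode_set_ctime_to_ts(struct inode *inode,
					     struct timespec ts)
{
	inode->__i_ctime = ts;
	return ts;
}

static struct timespec inode_set_ctime_current(struct inode *inode)
{
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	return inode_set_ctime_to_ts(inode, now);
}

int main(void)
{
	struct inode inode = {0};

	/* the idiom the conversions in this series switch to */
	inode.i_mtime = inode_set_ctime_current(&inode);
	return 0;
}

Having the setter return the new value is what enables the one-liners such as inode->i_mtime = inode_set_ctime_current(inode) seen throughout these conversions.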
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 36e32fa542f0..4ff88d0dd980 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -151,6 +151,7 @@ struct erofs_sb_info {
u32 xattr_prefix_start;
u8 xattr_prefix_count;
struct erofs_xattr_prefix_item *xattr_prefixes;
+ unsigned int xattr_filter_reserved;
#endif
u16 device_id_mask; /* valid bits of device id to be used */
@@ -251,6 +252,7 @@ EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
+EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
/* atomic flag definitions */
#define EROFS_I_EA_INITED_BIT 0
@@ -270,6 +272,7 @@ struct erofs_inode {
unsigned char inode_isize;
unsigned int xattr_isize;
+ unsigned int xattr_name_filter;
unsigned int xattr_shared_count;
unsigned int *xattr_shared_xattrs;
@@ -519,6 +522,26 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
+#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
+int __init z_erofs_deflate_init(void);
+void z_erofs_deflate_exit(void);
+int z_erofs_load_deflate_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_deflate_cfgs *dfl, int size);
+#else
+static inline int z_erofs_deflate_init(void) { return 0; }
+static inline int z_erofs_deflate_exit(void) { return 0; }
+static inline int z_erofs_load_deflate_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_deflate_cfgs *dfl, int size) {
+ if (dfl) {
+ erofs_err(sb, "deflate algorithm isn't enabled");
+ return -EINVAL;
+ }
+ return 0;
+}
+#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */
+
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
void erofs_fscache_unregister_fs(struct super_block *sb);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 566f68ddfa36..44a24d573f1f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -19,10 +19,8 @@
#include <trace/events/erofs.h>
static struct kmem_cache *erofs_inode_cachep __read_mostly;
-struct file_system_type erofs_fs_type;
-void _erofs_err(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -32,12 +30,11 @@ void _erofs_err(struct super_block *sb, const char *function,
vaf.fmt = fmt;
vaf.va = &args;
- pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf);
+ pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
va_end(args);
}
-void _erofs_info(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -102,11 +99,9 @@ static void erofs_free_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
- /* be careful of RCU symlink path */
if (inode->i_op == &erofs_fast_symlink_iops)
kfree(inode->i_link);
kfree(vi->xattr_shared_xattrs);
-
kmem_cache_free(erofs_inode_cachep, vi);
}
@@ -119,8 +114,7 @@ static bool check_layout_compatibility(struct super_block *sb,
/* check if current kernel meets all mandatory requirements */
if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) {
- erofs_err(sb,
- "unidentified incompatible feature %x, please upgrade kernel version",
+ erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
feature & ~EROFS_ALL_FEATURE_INCOMPAT);
return false;
}
@@ -201,6 +195,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
case Z_EROFS_COMPRESSION_LZMA:
ret = z_erofs_load_lzma_config(sb, dsb, data, size);
break;
+ case Z_EROFS_COMPRESSION_DEFLATE:
+ ret = z_erofs_load_deflate_config(sb, dsb, data, size);
+ break;
default:
DBG_BUGON(1);
ret = -EFAULT;
@@ -388,6 +385,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start);
sbi->xattr_prefix_count = dsb->xattr_prefix_count;
+ sbi->xattr_filter_reserved = dsb->xattr_filter_reserved;
#endif
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
sbi->root_nid = le16_to_cpu(dsb->root_nid);
@@ -420,16 +418,11 @@ static int erofs_read_superblock(struct super_block *sb)
if (erofs_is_fscache_mode(sb))
erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
- if (erofs_sb_has_fragments(sbi))
- erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!");
- if (erofs_sb_has_dedupe(sbi))
- erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!");
out:
erofs_put_metabuf(&buf);
return ret;
}
-/* set up default EROFS parameters */
static void erofs_default_options(struct erofs_fs_context *ctx)
{
#ifdef CONFIG_EROFS_FS_ZIP
@@ -731,7 +724,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
xa_init(&sbi->managed_pslots);
#endif
- /* get the root inode */
inode = erofs_iget(sb, ROOT_NID(sbi));
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -748,7 +740,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
erofs_shrinker_register(sb);
- /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */
if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
sbi->packed_inode = erofs_iget(sb, sbi->packed_nid);
if (IS_ERR(sbi->packed_inode)) {
@@ -881,10 +872,6 @@ static int erofs_init_fs_context(struct fs_context *fc)
return 0;
}
-/*
- * could be triggered after deactivate_locked_super()
- * is called, thus including umount and failed to initialize.
- */
static void erofs_kill_sb(struct super_block *sb)
{
struct erofs_sb_info *sbi;
@@ -913,7 +900,6 @@ static void erofs_kill_sb(struct super_block *sb)
sb->s_fs_info = NULL;
}
-/* called when ->s_root is non-NULL */
static void erofs_put_super(struct super_block *sb)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
@@ -950,9 +936,9 @@ static int __init erofs_module_init(void)
erofs_check_ondisk_layout_definitions();
erofs_inode_cachep = kmem_cache_create("erofs_inode",
- sizeof(struct erofs_inode), 0,
- SLAB_RECLAIM_ACCOUNT,
- erofs_inode_init_once);
+ sizeof(struct erofs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ erofs_inode_init_once);
if (!erofs_inode_cachep)
return -ENOMEM;
@@ -964,6 +950,10 @@ static int __init erofs_module_init(void)
if (err)
goto lzma_err;
+ err = z_erofs_deflate_init();
+ if (err)
+ goto deflate_err;
+
erofs_pcpubuf_init();
err = z_erofs_init_zip_subsystem();
if (err)
@@ -984,6 +974,8 @@ fs_err:
sysfs_err:
z_erofs_exit_zip_subsystem();
zip_err:
+ z_erofs_deflate_exit();
+deflate_err:
z_erofs_lzma_exit();
lzma_err:
erofs_exit_shrinker();
@@ -1001,13 +993,13 @@ static void __exit erofs_module_exit(void)
erofs_exit_sysfs();
z_erofs_exit_zip_subsystem();
+ z_erofs_deflate_exit();
z_erofs_lzma_exit();
erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
erofs_pcpubuf_exit();
}
-/* get filesystem statistics */
static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 40178b6e0688..09d341675e89 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021-2022, Alibaba Cloud
*/
#include <linux/security.h>
+#include <linux/xxhash.h>
#include "xattr.h"
struct erofs_xattr_iter {
@@ -87,6 +88,7 @@ static int erofs_init_inode_xattrs(struct inode *inode)
}
ih = it.kaddr + erofs_blkoff(sb, it.pos);
+ vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter);
vi->xattr_shared_count = ih->h_shared_count;
vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
sizeof(uint), GFP_KERNEL);
@@ -392,7 +394,10 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
void *buffer, size_t buffer_size)
{
int ret;
+ unsigned int hashbit;
struct erofs_xattr_iter it;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
if (!name)
return -EINVAL;
@@ -401,6 +406,15 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
if (ret)
return ret;
+ /* a non-zero reserved flag means the on-disk filter format has changed */
+ if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) {
+ hashbit = xxh32(name, strlen(name),
+ EROFS_XATTR_FILTER_SEED + index);
+ hashbit &= EROFS_XATTR_FILTER_BITS - 1;
+ if (vi->xattr_name_filter & (1U << hashbit))
+ return -ENOATTR;
+ }
+
it.index = index;
it.name = (struct qstr)QSTR_INIT(name, strlen(name));
if (it.name.len > EROFS_NAME_LEN)
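
The probe added above is an inverted one-bit-per-name Bloom filter: xxh32 over the attribute name (seeded with the name index) selects one of 32 bits, and a set bit means the attribute is definitely absent — EROFS_XATTR_FILTER_DEFAULT is all ones, so an inode without xattrs rejects every lookup before touching its xattr area. A userspace sketch of the same computation, assuming libxxhash for XXH32() (the helper name is made up; the constants mirror the patch). Build with -lxxhash.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <xxhash.h>

#define EROFS_XATTR_FILTER_BITS	32
#define EROFS_XATTR_FILTER_SEED	0x25BBE08F

/* true if "name" (xattr index "index") is definitely absent per the
 * per-inode 32-bit filter, where a 1 bit means not-present */
static bool xattr_filtered_out(uint32_t name_filter,
			       unsigned int index, const char *name)
{
	unsigned int hashbit;

	hashbit = XXH32(name, strlen(name),
			EROFS_XATTR_FILTER_SEED + index);
	hashbit &= EROFS_XATTR_FILTER_BITS - 1;
	return name_filter & (1U << hashbit);
}

int main(void)
{
	/* all-ones filter (no xattrs): every lookup is rejected */
	printf("%d\n", xattr_filtered_out(UINT32_MAX, 0, "user.foo"));
	return 0;
}

A clear bit is only a "maybe", so the worst case falls back to the ordinary linear scan; a set bit is always authoritative.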
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index de4f12152b62..036f610e044b 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -143,22 +143,17 @@ static inline void z_erofs_onlinepage_split(struct page *page)
atomic_inc((atomic_t *)&page->private);
}
-static inline void z_erofs_page_mark_eio(struct page *page)
+static void z_erofs_onlinepage_endio(struct page *page, int err)
{
- int orig;
+ int orig, v;
+
+ DBG_BUGON(!PagePrivate(page));
do {
orig = atomic_read((atomic_t *)&page->private);
- } while (atomic_cmpxchg((atomic_t *)&page->private, orig,
- orig | Z_EROFS_PAGE_EIO) != orig);
-}
-
-static inline void z_erofs_onlinepage_endio(struct page *page)
-{
- unsigned int v;
+ v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0);
+ } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);
- DBG_BUGON(!PagePrivate(page));
- v = atomic_dec_return((atomic_t *)&page->private);
if (!(v & ~Z_EROFS_PAGE_EIO)) {
set_page_private(page, 0);
ClearPagePrivate(page);
@@ -507,19 +502,17 @@ enum z_erofs_pclustermode {
*/
Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
/*
- * The current collection has been linked with the owned chain, and
- * could also be linked with the remaining collections, which means
- * if the processing page is the tail page of the collection, thus
- * the current collection can safely use the whole page (since
- * the previous collection is under control) for in-place I/O, as
- * illustrated below:
- * ________________________________________________________________
- * | tail (partial) page | head (partial) page |
- * | (of the current cl) | (of the previous collection) |
- * | | |
- * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________|
+ * The pcluster was just linked to a decompression chain by us. It can
+ * also be linked with the remaining pclusters, which means if the
+ * processing page is the tail page of a pcluster, this pcluster can
+ * safely use the whole page (since the previous pcluster is within the
+ * same chain) for in-place I/O, as illustrated below:
+ * ___________________________________________________
+ * | tail (partial) page | head (partial) page |
+ * | (of the current pcl) | (of the previous pcl) |
+ * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
*
- * [ (*) the above page can be used as inplace I/O. ]
+ * [ (*) the page above can be used as inplace I/O. ]
*/
Z_EROFS_PCLUSTER_FOLLOWED,
};
@@ -535,8 +528,6 @@ struct z_erofs_decompress_frontend {
z_erofs_next_pcluster_t owned_head;
enum z_erofs_pclustermode mode;
- /* used for applying cache strategy on the fly */
- bool backmost;
erofs_off_t headoffset;
/* a pointer used to pick up inplace I/O pages */
@@ -545,7 +536,7 @@ struct z_erofs_decompress_frontend {
#define DECOMPRESS_FRONTEND_INIT(__i) { \
.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
+ .mode = Z_EROFS_PCLUSTER_FOLLOWED }
static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
{
@@ -554,7 +545,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
return false;
- if (fe->backmost)
+ if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
return true;
if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
@@ -851,9 +842,11 @@ err_out:
return err;
}
-static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
+static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
+ struct super_block *sb = fe->inode->i_sb;
+ erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
struct erofs_workgroup *grp = NULL;
int ret;
@@ -863,8 +856,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
if (!(map->m_flags & EROFS_MAP_META)) {
- grp = erofs_find_workgroup(fe->inode->i_sb,
- map->m_pa >> PAGE_SHIFT);
+ grp = erofs_find_workgroup(sb, blknr);
} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
DBG_BUGON(1);
return -EFSCORRUPTED;
@@ -883,9 +875,26 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
} else if (ret) {
return ret;
}
+
z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
- /* since file-backed online pages are traversed in reverse order */
+ if (!z_erofs_is_inline_pcluster(fe->pcl)) {
+ /* bind cache first when cached decompression is preferred */
+ z_erofs_bind_cache(fe);
+ } else {
+ void *mptr;
+
+ mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
+ if (IS_ERR(mptr)) {
+ ret = PTR_ERR(mptr);
+ erofs_err(sb, "failed to get inline data %d", ret);
+ return ret;
+ }
+ get_page(map->buf.page);
+ WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
+ }
+ /* file-backed inplace I/O pages are traversed in reverse order */
fe->icur = z_erofs_pclusterpages(fe->pcl);
return 0;
}
@@ -908,12 +917,12 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
-static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
+static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
{
struct z_erofs_pcluster *pcl = fe->pcl;
if (!pcl)
- return false;
+ return;
z_erofs_bvec_iter_end(&fe->biter);
mutex_unlock(&pcl->lock);
@@ -929,37 +938,29 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
erofs_workgroup_put(&pcl->obj);
fe->pcl = NULL;
- return true;
}
-static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
- struct page *page, unsigned int pageofs,
- unsigned int len)
+static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
+ unsigned int cur, unsigned int end, erofs_off_t pos)
{
- struct super_block *sb = inode->i_sb;
- struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode;
+ struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- u8 *src, *dst;
- unsigned int i, cnt;
+ unsigned int cnt;
+ u8 *src;
if (!packed_inode)
return -EFSCORRUPTED;
buf.inode = packed_inode;
- pos += EROFS_I(inode)->z_fragmentoff;
- for (i = 0; i < len; i += cnt) {
- cnt = min_t(unsigned int, len - i,
+ for (; cur < end; cur += cnt, pos += cnt) {
+ cnt = min_t(unsigned int, end - cur,
sb->s_blocksize - erofs_blkoff(sb, pos));
src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
}
-
- dst = kmap_local_page(page);
- memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt);
- kunmap_local(dst);
- pos += cnt;
+ memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt);
}
erofs_put_metabuf(&buf);
return 0;
@@ -972,94 +973,60 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct erofs_map_blocks *const map = &fe->map;
const loff_t offset = page_offset(page);
bool tight = true, exclusive;
- unsigned int cur, end, spiltted;
+ unsigned int cur, end, len, split;
int err = 0;
- /* register locked file pages as online pages in pack */
z_erofs_onlinepage_init(page);
- spiltted = 0;
+ split = 0;
end = PAGE_SIZE;
repeat:
- cur = end - 1;
-
- if (offset + cur < map->m_la ||
- offset + cur >= map->m_la + map->m_llen) {
- if (z_erofs_collector_end(fe))
- fe->backmost = false;
- map->m_la = offset + cur;
+ if (offset + end - 1 < map->m_la ||
+ offset + end - 1 >= map->m_la + map->m_llen) {
+ z_erofs_pcluster_end(fe);
+ map->m_la = offset + end - 1;
map->m_llen = 0;
err = z_erofs_map_blocks_iter(inode, map, 0);
if (err)
goto out;
- } else {
- if (fe->pcl)
- goto hitted;
- /* didn't get a valid pcluster previously (very rare) */
}
- if (!(map->m_flags & EROFS_MAP_MAPPED) ||
- map->m_flags & EROFS_MAP_FRAGMENT)
- goto hitted;
-
- err = z_erofs_collector_begin(fe);
- if (err)
- goto out;
-
- if (z_erofs_is_inline_pcluster(fe->pcl)) {
- void *mp;
-
- mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
- erofs_blknr(inode->i_sb, map->m_pa),
- EROFS_NO_KMAP);
- if (IS_ERR(mp)) {
- err = PTR_ERR(mp);
- erofs_err(inode->i_sb,
- "failed to get inline page, err %d", err);
- goto out;
- }
- get_page(fe->map.buf.page);
- WRITE_ONCE(fe->pcl->compressed_bvecs[0].page,
- fe->map.buf.page);
- fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
- } else {
- /* bind cache first when cached decompression is preferred */
- z_erofs_bind_cache(fe);
- }
-hitted:
- /*
- * Ensure the current partial page belongs to this submit chain rather
- * than other concurrent submit chains or the noio(bypass) chain since
- * those chains are handled asynchronously thus the page cannot be used
- * for inplace I/O or bvpage (should be processed in a strict order.)
- */
- tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
+ cur = offset > map->m_la ? 0 : map->m_la - offset;
+ /* bump split parts first to avoid several separate cases */
+ ++split;
- cur = end - min_t(erofs_off_t, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
zero_user_segment(page, cur, end);
+ tight = false;
goto next_part;
}
+
if (map->m_flags & EROFS_MAP_FRAGMENT) {
- unsigned int pageofs, skip, len;
+ erofs_off_t fpos = offset + cur - map->m_la;
- if (offset > map->m_la) {
- pageofs = 0;
- skip = offset - map->m_la;
- } else {
- pageofs = map->m_la & ~PAGE_MASK;
- skip = 0;
- }
- len = min_t(unsigned int, map->m_llen - skip, end - cur);
- err = z_erofs_read_fragment(inode, skip, page, pageofs, len);
+ len = min_t(unsigned int, map->m_llen - fpos, end - cur);
+ err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
+ EROFS_I(inode)->z_fragmentoff + fpos);
if (err)
goto out;
- ++spiltted;
tight = false;
goto next_part;
}
- exclusive = (!cur && (!spiltted || tight));
+ if (!fe->pcl) {
+ err = z_erofs_pcluster_begin(fe);
+ if (err)
+ goto out;
+ }
+
+ /*
+ * Ensure the current partial page belongs to this submit chain rather
+ * than other concurrent submit chains or the noio(bypass) chain, since
+ * those chains are handled asynchronously and thus the page cannot be
+ * used for inplace I/O or bvpage (which should be processed in strict
+ * order.)
+ */
+ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
+ exclusive = (!cur && ((split <= 1) || tight));
if (cur)
tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
@@ -1072,8 +1039,6 @@ hitted:
goto out;
z_erofs_onlinepage_split(page);
- /* bump up the number of spiltted parts of a page */
- ++spiltted;
if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
fe->pcl->multibases = true;
if (fe->pcl->length < offset + end - map->m_la) {
@@ -1094,9 +1059,7 @@ next_part:
goto repeat;
out:
- if (err)
- z_erofs_page_mark_eio(page);
- z_erofs_onlinepage_endio(page);
+ z_erofs_onlinepage_endio(page, err);
return err;
}
@@ -1199,9 +1162,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
cur += len;
}
kunmap_local(dst);
- if (err)
- z_erofs_page_mark_eio(bvi->bvec.page);
- z_erofs_onlinepage_endio(bvi->bvec.page);
+ z_erofs_onlinepage_endio(bvi->bvec.page, err);
list_del(p);
kfree(bvi);
}
@@ -1372,9 +1333,7 @@ out:
/* recycle all individual short-lived pages */
if (z_erofs_put_shortlivedpage(be->pagepool, page))
continue;
- if (err)
- z_erofs_page_mark_eio(page);
- z_erofs_onlinepage_endio(page);
+ z_erofs_onlinepage_endio(page, err);
}
if (be->decompressed_pages != be->onstack_pages)
@@ -1410,7 +1369,10 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
owned = READ_ONCE(be.pcl->next);
z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
- erofs_workgroup_put(&be.pcl->obj);
+ if (z_erofs_is_inline_pcluster(be.pcl))
+ z_erofs_free_pcluster(be.pcl);
+ else
+ erofs_workgroup_put(&be.pcl->obj);
}
}
@@ -1848,15 +1810,10 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
if (page) {
- if (PageUptodate(page)) {
+ if (PageUptodate(page))
unlock_page(page);
- } else {
- err = z_erofs_do_read_page(f, page);
- if (err)
- erofs_err(inode->i_sb,
- "readmore error at page %lu @ nid %llu",
- index, EROFS_I(inode)->nid);
- }
+ else
+ (void)z_erofs_do_read_page(f, page);
put_page(page);
}
@@ -1868,25 +1825,25 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *const inode = page->mapping->host;
+ struct inode *const inode = folio->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
int err;
- trace_erofs_readpage(page, false);
- f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
+ trace_erofs_read_folio(folio, false);
+ f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
z_erofs_pcluster_readmore(&f, NULL, true);
- err = z_erofs_do_read_page(&f, page);
+ err = z_erofs_do_read_page(&f, &folio->page);
z_erofs_pcluster_readmore(&f, NULL, false);
- (void)z_erofs_collector_end(&f);
+ z_erofs_pcluster_end(&f);
/* if some compressed cluster ready, need submit them anyway */
z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
- if (err)
- erofs_err(inode->i_sb, "failed to read, err [%d]", err);
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
+ err, folio->index, EROFS_I(inode)->nid);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
@@ -1898,38 +1855,35 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *head = NULL, *page;
- unsigned int nr_pages;
+ struct folio *head = NULL, *folio;
+ unsigned int nr_folios;
+ int err;
f.headoffset = readahead_pos(rac);
z_erofs_pcluster_readmore(&f, rac, true);
- nr_pages = readahead_count(rac);
- trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ nr_folios = readahead_count(rac);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);
- while ((page = readahead_page(rac))) {
- set_page_private(page, (unsigned long)head);
- head = page;
+ while ((folio = readahead_folio(rac))) {
+ folio->private = head;
+ head = folio;
}
+ /* traverse in reverse order for best metadata I/O performance */
while (head) {
- struct page *page = head;
- int err;
-
- /* traversal in reverse order */
- head = (void *)page_private(page);
+ folio = head;
+ head = folio_get_private(folio);
- err = z_erofs_do_read_page(&f, page);
- if (err)
- erofs_err(inode->i_sb,
- "readahead error at page %lu @ nid %llu",
- page->index, EROFS_I(inode)->nid);
- put_page(page);
+ err = z_erofs_do_read_page(&f, &folio->page);
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
+ folio->index, EROFS_I(inode)->nid);
}
z_erofs_pcluster_readmore(&f, rac, false);
- (void)z_erofs_collector_end(&f);
+ z_erofs_pcluster_end(&f);
- z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true);
+ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
}
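
The readahead rework stores a back-pointer in each folio's ->private to build an intrusive LIFO chain, then pops the chain so folios are processed in reverse file order (which the comment above credits with the best metadata I/O behaviour). A toy userspace version of that traversal pattern, with a plain struct standing in for struct folio:

/* Push each item onto an intrusive LIFO via its private pointer,
 * then pop: processing runs in reverse arrival order. */
#include <stdio.h>

struct item {
	unsigned long index;
	void *private;		/* plays the role of folio->private */
};

int main(void)
{
	struct item items[4] = { {0}, {1}, {2}, {3} }, *head = NULL;

	for (int i = 0; i < 4; i++) {	/* readahead order: 0,1,2,3 */
		items[i].private = head;
		head = &items[i];
	}
	while (head) {			/* processed: 3,2,1,0 */
		struct item *it = head;

		head = it->private;
		printf("processing index %lu\n", it->index);
	}
	return 0;
}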
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 1909ddafd9c7..7b55111fd533 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -561,8 +561,9 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
- map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
- map->m_llen >= i_blocksize(inode))) {
+ (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA ||
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) &&
+ map->m_llen >= i_blocksize(inode))) {
err = z_erofs_get_extent_decompressedlen(&m);
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8aa36cd37351..33a918f9566c 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -189,7 +189,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
lockdep_assert_held(&ctx->wqh.lock);
- *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+ *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count;
ctx->count -= *cnt;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);
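
This one-liner guards semaphore-mode reads against a zero counter: previously *cnt was unconditionally 1 under EFD_SEMAPHORE, so a caller reaching this exported helper with ctx->count == 0 (the blocking read path never does) would underflow the counter. Normal read(2) semantics are unchanged, as this userspace check illustrates:

/* EFD_SEMAPHORE demo: each read consumes one token; with EFD_NONBLOCK,
 * reading an empty counter fails with EAGAIN instead of blocking. */
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = eventfd(2, EFD_SEMAPHORE | EFD_NONBLOCK);

	if (fd < 0)
		err(1, "eventfd");
	while (read(fd, &val, sizeof(val)) == sizeof(val))
		printf("got %llu token\n", (unsigned long long)val); /* 1, 1 */
	perror("read");		/* EAGAIN: counter empty, no underflow */
	close(fd);
	return 0;
}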
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4b1b3362f697..1d9a71a0c4c1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -975,15 +975,11 @@ again:
static int ep_alloc(struct eventpoll **pep)
{
- int error;
- struct user_struct *user;
struct eventpoll *ep;
- user = get_current_user();
- error = -ENOMEM;
ep = kzalloc(sizeof(*ep), GFP_KERNEL);
if (unlikely(!ep))
- goto free_uid;
+ return -ENOMEM;
mutex_init(&ep->mtx);
rwlock_init(&ep->lock);
@@ -992,16 +988,12 @@ static int ep_alloc(struct eventpoll **pep)
INIT_LIST_HEAD(&ep->rdllist);
ep->rbr = RB_ROOT_CACHED;
ep->ovflist = EP_UNACTIVE_PTR;
- ep->user = user;
+ ep->user = get_current_user();
refcount_set(&ep->refcount, 1);
*pep = ep;
return 0;
-
-free_uid:
- free_uid(user);
- return error;
}
/*
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 729ada9e26e8..f55498e5c23d 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -273,8 +273,6 @@ struct exfat_sb_info {
spinlock_t inode_hash_lock;
struct hlist_head inode_hashtable[EXFAT_HASH_SIZE];
-
- struct rcu_head rcu;
};
#define EXFAT_CACHE_VALID 0
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 3cbd270e0cba..32395ef686a2 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -22,7 +22,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
if (err)
return err;
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
if (!IS_SYNC(inode))
@@ -232,7 +232,7 @@ int exfat_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_backing_inode(path->dentry);
struct exfat_inode_info *ei = EXFAT_I(inode);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
exfat_truncate_atime(&stat->atime);
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = ei->i_crtime.tv_sec;
@@ -290,7 +290,7 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (attr->ia_valid & ATTR_SIZE)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
setattr_copy(&nop_mnt_idmap, inode, attr);
exfat_truncate_atime(&inode->i_atime);
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 481dd338f2b8..13329baeafbc 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -355,7 +355,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
if (to > i_size_read(inode)) {
truncate_pagecache(inode, i_size_read(inode));
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
exfat_truncate(inode);
}
}
@@ -398,7 +398,7 @@ static int exfat_write_end(struct file *file, struct address_space *mapping,
exfat_write_failed(mapping, pos+len);
if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) {
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ei->attr |= ATTR_ARCHIVE;
mark_inode_dirty(inode);
}
@@ -577,7 +577,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
inode->i_mtime = info->mtime;
- inode->i_ctime = info->mtime;
+ inode_set_ctime_to_ts(inode, info->mtime);
ei->i_crtime = info->crtime;
inode->i_atime = info->atime;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index e0ff9d156f6f..1b9f587f6cca 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -569,7 +569,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(dir);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -582,8 +582,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = inode->i_ctime =
- EXFAT_I(inode)->i_crtime = current_time(inode);
+ inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
exfat_truncate_atime(&inode->i_atime);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
@@ -817,7 +816,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_atime = current_time(dir);
+ dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
exfat_truncate_atime(&dir->i_atime);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
@@ -825,7 +824,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
mark_inode_dirty(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
exfat_truncate_atime(&inode->i_atime);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
@@ -852,7 +851,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(dir);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -866,8 +865,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = inode->i_ctime =
- EXFAT_I(inode)->i_crtime = current_time(inode);
+ inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
exfat_truncate_atime(&inode->i_atime);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
@@ -979,7 +977,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_atime = current_time(dir);
+ dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
exfat_truncate_atime(&dir->i_atime);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
@@ -988,7 +986,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
exfat_truncate_atime(&inode->i_atime);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
@@ -1312,8 +1310,8 @@ static int exfat_rename(struct mnt_idmap *idmap,
goto unlock;
inode_inc_iversion(new_dir);
- new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime =
- EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
exfat_truncate_atime(&new_dir->i_atime);
if (IS_DIRSYNC(new_dir))
exfat_sync_inode(new_dir);
@@ -1336,7 +1334,6 @@ static int exfat_rename(struct mnt_idmap *idmap,
}
inode_inc_iversion(old_dir);
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
if (IS_DIRSYNC(old_dir))
exfat_sync_inode(old_dir);
else
@@ -1354,8 +1351,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
exfat_warn(sb, "abnormal access to an inode dropped");
WARN_ON(new_inode->i_nlink == 0);
}
- new_inode->i_ctime = EXFAT_I(new_inode)->i_crtime =
- current_time(new_inode);
+ EXFAT_I(new_inode)->i_crtime = current_time(new_inode);
}
unlock:
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 8c32460e031e..2778bd9b631e 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -31,16 +31,6 @@ static void exfat_free_iocharset(struct exfat_sb_info *sbi)
kfree(sbi->options.iocharset);
}
-static void exfat_delayed_free(struct rcu_head *p)
-{
- struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu);
-
- unload_nls(sbi->nls_io);
- exfat_free_iocharset(sbi);
- exfat_free_upcase_table(sbi);
- kfree(sbi);
-}
-
static void exfat_put_super(struct super_block *sb)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -50,7 +40,8 @@ static void exfat_put_super(struct super_block *sb)
brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
- call_rcu(&sbi->rcu, exfat_delayed_free);
+ unload_nls(sbi->nls_io);
+ exfat_free_upcase_table(sbi);
}
static int exfat_sync_fs(struct super_block *sb, int wait)
@@ -379,8 +370,7 @@ static int exfat_read_root(struct inode *inode)
ei->i_size_ondisk = i_size_read(inode);
exfat_save_attr(inode, ATTR_SUBDIR);
- inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
- current_time(inode);
+ inode->i_mtime = inode->i_atime = ei->i_crtime = inode_set_ctime_current(inode);
exfat_truncate_atime(&inode->i_atime);
return 0;
}
@@ -710,9 +700,6 @@ free_table:
check_nls_io:
unload_nls(sbi->nls_io);
- exfat_free_iocharset(sbi);
- sb->s_fs_info = NULL;
- kfree(sbi);
return err;
}
@@ -721,14 +708,18 @@ static int exfat_get_tree(struct fs_context *fc)
return get_tree_bdev(fc, exfat_fill_super);
}
+static void exfat_free_sbi(struct exfat_sb_info *sbi)
+{
+ exfat_free_iocharset(sbi);
+ kfree(sbi);
+}
+
static void exfat_free(struct fs_context *fc)
{
struct exfat_sb_info *sbi = fc->s_fs_info;
- if (sbi) {
- exfat_free_iocharset(sbi);
- kfree(sbi);
- }
+ if (sbi)
+ exfat_free_sbi(sbi);
}
static int exfat_reconfigure(struct fs_context *fc)
@@ -773,12 +764,21 @@ static int exfat_init_fs_context(struct fs_context *fc)
return 0;
}
+static void exfat_kill_sb(struct super_block *sb)
+{
+ struct exfat_sb_info *sbi = sb->s_fs_info;
+
+ kill_block_super(sb);
+ if (sbi)
+ exfat_free_sbi(sbi);
+}
+
static struct file_system_type exfat_fs_type = {
.owner = THIS_MODULE,
.name = "exfat",
.init_fs_context = exfat_init_fs_context,
.parameters = exfat_parameters,
- .kill_sb = kill_block_super,
+ .kill_sb = exfat_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 82b17d7fc93f..7e54c31589c7 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -237,7 +237,7 @@ ext2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
error = __ext2_set_acl(inode, acl, type);
if (!error && update_mode) {
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
return error;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 42db804794bd..b335f17f682f 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -468,7 +468,7 @@ int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
ext2_set_de_type(de, inode);
ext2_commit_chunk(page, pos, len);
if (update_times)
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
return ext2_handle_dirsync(dir);
@@ -555,7 +555,7 @@ got_it:
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type (de, inode);
ext2_commit_chunk(page, pos, rec_len);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
err = ext2_handle_dirsync(dir);
@@ -606,7 +606,7 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page)
pde->rec_len = ext2_rec_len_to_disk(to - from);
dir->inode = 0;
ext2_commit_chunk(page, pos, to - from);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(inode);
return ext2_handle_dirsync(inode);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index a4e1d7a9c544..124df89689e1 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -549,7 +549,7 @@ got:
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_flags =
ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 75983215c7a1..acbab27fe957 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -595,7 +595,7 @@ static void ext2_splice_branch(struct inode *inode,
if (where->bh)
mark_buffer_dirty_inode(where->bh, inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
@@ -1287,7 +1287,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
__ext2_truncate_blocks(inode, newsize);
filemap_invalidate_unlock(inode->i_mapping);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (inode_needs_sync(inode)) {
sync_mapping_buffers(inode->i_mapping);
sync_inode_metadata(inode, 1);
@@ -1409,9 +1409,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
- inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
+ inode_set_ctime(inode, (signed)le32_to_cpu(raw_inode->i_ctime), 0);
inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
+ inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
* This is needed because nfsd might try to access dead inodes
@@ -1541,7 +1541,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
raw_inode->i_size = cpu_to_le32(inode->i_size);
raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+ raw_inode->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
@@ -1628,7 +1628,7 @@ int ext2_getattr(struct mnt_idmap *idmap, const struct path *path,
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index cc87d413eb43..44e04484e570 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -44,7 +44,7 @@ int ext2_fileattr_set(struct mnt_idmap *idmap,
(fa->flags & EXT2_FL_USER_MODIFIABLE);
ext2_set_inode_flags(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return 0;
@@ -77,7 +77,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
inode_lock(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode->i_generation = generation;
inode_unlock(inode);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 937dd8f60f96..059517068adc 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -211,7 +211,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
if (err)
return err;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -291,7 +291,7 @@ static int ext2_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
err = 0;
out:
@@ -367,7 +367,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
ext2_put_page(new_page, new_de);
if (err)
goto out_dir;
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
@@ -383,7 +383,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
mark_inode_dirty(old_inode);
err = ext2_delete_entry(old_de, old_page);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 2959afc7541c..aaf3e3e88cb2 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1572,7 +1572,7 @@ out:
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 8906ba479aaf..1c9187188d68 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -773,7 +773,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
/* Update the inode. */
EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (IS_SYNC(inode)) {
error = sync_inode_metadata(inode, 1);
/* In case sync failed due to ENOSPC the inode was actually
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 27fcbddfb148..3bffe862f954 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -259,7 +259,7 @@ retry:
error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */);
if (!error && update_mode) {
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
error = ext4_mark_inode_dirty(handle, inode);
}
out_stop:
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0a2d55faa095..1e2259d9967d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -868,64 +868,70 @@ struct ext4_inode {
* affected filesystem before 2242.
*/
-static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
+static inline __le32 ext4_encode_extra_time(struct timespec64 ts)
{
- u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK;
- return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS));
+ u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK;
+ return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS));
}
-static inline void ext4_decode_extra_time(struct timespec64 *time,
- __le32 extra)
+static inline struct timespec64 ext4_decode_extra_time(__le32 base,
+ __le32 extra)
{
+ struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) };
+
if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK)))
- time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
+ ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ return ts;
}
-#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
+#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \
do { \
- if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\
- (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
- (raw_inode)->xtime ## _extra = \
- ext4_encode_extra_time(&(inode)->xtime); \
- } \
- else \
- (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \
+ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \
+ (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \
+ (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \
+ } else \
+ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \
} while (0)
-#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
-do { \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
- (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
- (raw_inode)->xtime ## _extra = \
- ext4_encode_extra_time(&(einode)->xtime); \
-} while (0)
+#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime)
+
+#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))
+
+#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+ EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \
+ raw_inode, (einode)->xtime)
+
+#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \
+ (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \
+ ext4_decode_extra_time((raw_inode)->xtime, \
+ (raw_inode)->xtime ## _extra) : \
+ (struct timespec64) { \
+ .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \
+ })
#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
do { \
- (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \
- if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \
- ext4_decode_extra_time(&(inode)->xtime, \
- raw_inode->xtime ## _extra); \
- } \
- else \
- (inode)->xtime.tv_nsec = 0; \
+ (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \
} while (0)
+#define EXT4_INODE_GET_CTIME(inode, raw_inode) \
+do { \
+ inode_set_ctime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \
+} while (0)
-#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
-do { \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
- (einode)->xtime.tv_sec = \
- (signed)le32_to_cpu((raw_inode)->xtime); \
- else \
- (einode)->xtime.tv_sec = 0; \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
- ext4_decode_extra_time(&(einode)->xtime, \
- raw_inode->xtime ## _extra); \
- else \
- (einode)->xtime.tv_nsec = 0; \
+#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
+do { \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+ (einode)->xtime = \
+ EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \
+ raw_inode); \
+ else \
+ (einode)->xtime = (struct timespec64){0, 0}; \
} while (0)
#define i_disk_version osd1.linux1.l_i_version
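
The timestamp macro refactor above is behaviour-preserving: on disk, the low EXT4_EPOCH_BITS of the extra field still extend the signed 32-bit seconds and the upper 30 bits still carry nanoseconds. A self-contained round trip of that encoding, mirroring ext4_encode_extra_time()/ext4_decode_extra_time() (host-endian for brevity, so the cpu_to_le32()/le32_to_cpu() steps are elided):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EXT4_EPOCH_BITS	2
#define EXT4_EPOCH_MASK	((1U << EXT4_EPOCH_BITS) - 1)
#define EXT4_NSEC_MASK	(~0U << EXT4_EPOCH_BITS)

int main(void)
{
	int64_t sec = 0x100000000LL + 12345;	/* a post-2038 timestamp */
	uint32_t nsec = 999999999;

	/* encode, as in ext4_encode_extra_time() */
	uint32_t base = (uint32_t)sec;
	uint32_t extra = (((sec - (int32_t)base) >> 32) & EXT4_EPOCH_MASK) |
			 (nsec << EXT4_EPOCH_BITS);

	/* decode, as in ext4_decode_extra_time() */
	int64_t dsec = (int32_t)base;
	if (extra & EXT4_EPOCH_MASK)
		dsec += (int64_t)(extra & EXT4_EPOCH_MASK) << 32;
	uint32_t dnsec = (extra & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;

	assert(dsec == sec && dnsec == nsec);
	printf("sec=%lld nsec=%u survived the round trip\n",
	       (long long)dsec, dnsec);
	return 0;
}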
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 77f318ec8abb..b38d59581411 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -234,8 +234,7 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
might_sleep();
- if (bh->b_bdev->bd_super)
- ext4_check_bdev_write_error(bh->b_bdev->bd_super);
+ ext4_check_bdev_write_error(sb);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e4115d338f10..202c76996b62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4476,12 +4476,12 @@ retry:
map.m_lblk += ret;
map.m_len = len = len - ret;
epos = (loff_t)map.m_lblk << inode->i_blkbits;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (new_size) {
if (epos > new_size)
epos = new_size;
if (ext4_update_inode_size(inode, epos) & 0x1)
- inode->i_mtime = inode->i_ctime;
+ inode->i_mtime = inode_get_ctime(inode);
}
ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -4617,7 +4617,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags);
@@ -4642,7 +4642,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
goto out_mutex;
}
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
@@ -5378,7 +5378,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ret = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -5488,7 +5488,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
goto out_stop;
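Every conversion in this file relies on the same property: inode_set_ctime_current() returns the timestamp it just stored, so the old two-step assignment through i_ctime collapses into one line whenever mtime needs the identical value. A hedged sketch of the idiom (demo_touch_cm is illustrative, not part of the patch):

static void demo_touch_cm(struct inode *inode)
{
	/* Stamp ctime and reuse the returned timespec64 for mtime so
	 * both fields carry the exact same value. */
	inode->i_mtime = inode_set_ctime_current(inode);
	mark_inode_dirty_sync(inode);
}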
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 754f961cd9fd..48abef5f23e7 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1250,7 +1250,7 @@ got:
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
ei->i_crtime = inode->i_mtime;
memset(ei->i_data, 0, sizeof(ei->i_data));
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index a4b7e4bc32d4..003861037374 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1037,7 +1037,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
return 1;
@@ -1991,7 +1991,7 @@ out:
ext4_orphan_del(handle, inode);
if (err == 0) {
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
err = ext4_mark_inode_dirty(handle, inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
index 7935ea6cf92c..f0c0fd507fbc 100644
--- a/fs/ext4/inode-test.c
+++ b/fs/ext4/inode-test.c
@@ -245,9 +245,9 @@ static void inode_test_xtimestamp_decoding(struct kunit *test)
struct timestamp_expectation *test_param =
(struct timestamp_expectation *)(test->param_value);
- timestamp.tv_sec = get_32bit_time(test_param);
- ext4_decode_extra_time(&timestamp,
- cpu_to_le32(test_param->extra_bits));
+ timestamp = ext4_decode_extra_time(
+ cpu_to_le32(get_32bit_time(test_param)),
+ cpu_to_le32(test_param->extra_bits));
KUNIT_EXPECT_EQ_MSG(test,
test_param->expected.tv_sec,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 43775a6ca505..6683076ecb2f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3986,7 +3986,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ret2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret2))
ret = ret2;
@@ -4146,7 +4146,7 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
err2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(err2 && !err))
err = err2;
@@ -4249,7 +4249,7 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode
}
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
- EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_SET_CTIME(inode, raw_inode);
EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
@@ -4858,7 +4858,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
}
- EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_GET_CTIME(inode, raw_inode);
EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
@@ -4981,7 +4981,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
- EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_SET_CTIME(inode, raw_inode);
EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
ext4_inode_csum_set(inode, raw_inode, ei);
@@ -5376,10 +5376,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
* Update c/mtime on truncate up, ext4_truncate() will
* update c/mtime in shrink case below
*/
- if (!shrink) {
- inode->i_mtime = current_time(inode);
- inode->i_ctime = inode->i_mtime;
- }
+ if (!shrink)
+ inode->i_mtime = inode_set_ctime_current(inode);
if (shrink)
ext4_fc_track_range(handle, inode,
@@ -5537,7 +5535,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
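generic_fillattr() gains a request_mask parameter throughout this series so the helper knows which statx fields the caller asked for; with FS_MGTIME (set on ext4 later in this diff) that lets the multigrain-timestamp work decide when a fine-grained ctime is actually needed. A minimal ->getattr sketch against the new signature (demo_getattr is an illustrative name):

static int demo_getattr(struct mnt_idmap *idmap, const struct path *path,
			struct kstat *stat, u32 request_mask,
			unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);

	/* Pass the mask straight through; generic_fillattr() can skip
	 * work for fields the caller did not request. */
	generic_fillattr(idmap, request_mask, inode, stat);
	return 0;
}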
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 331859511f80..b0349f451863 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -449,7 +449,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
diff = size - size_bl;
swap_inode_data(inode, inode_bl);
- inode->i_ctime = inode_bl->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
+ inode_set_ctime_current(inode_bl);
inode_inc_iversion(inode);
inode->i_generation = get_random_u32();
@@ -663,7 +664,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
ext4_set_inode_flags(inode, false);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
@@ -774,7 +775,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
}
EXT4_I(inode)->i_projid = kprojid;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
out_dirty:
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
@@ -1266,7 +1267,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0caf6c730ce3..933ad03f4f58 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2203,7 +2203,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
err2 = ext4_mark_inode_dirty(handle, dir);
@@ -3197,7 +3197,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
if (retval)
goto end_rmdir;
@@ -3271,7 +3272,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto out_handle;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
@@ -3286,7 +3287,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
if (dentry && !retval)
ext4_fc_track_unlink(handle, dentry);
@@ -3463,7 +3464,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ext4_inc_count(inode);
ihold(inode);
@@ -3641,8 +3642,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
inode_inc_iversion(ent->dir);
- ent->dir->i_ctime = ent->dir->i_mtime =
- current_time(ent->dir);
+ ent->dir->i_mtime = inode_set_ctime_current(ent->dir);
retval = ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
@@ -3941,7 +3941,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old.inode->i_ctime = current_time(old.inode);
+ inode_set_ctime_current(old.inode);
retval = ext4_mark_inode_dirty(handle, old.inode);
if (unlikely(retval))
goto end_rename;
@@ -3955,9 +3955,9 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (new.inode) {
ext4_dec_count(new.inode);
- new.inode->i_ctime = current_time(new.inode);
+ inode_set_ctime_current(new.inode);
}
- old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir);
+ old.dir->i_mtime = inode_set_ctime_current(old.dir);
ext4_update_dx_flag(old.dir);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
@@ -4053,7 +4053,6 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
};
u8 new_file_type;
int retval;
- struct timespec64 ctime;
if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
!projid_eq(EXT4_I(new_dir)->i_projid,
@@ -4147,9 +4146,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- ctime = current_time(old.inode);
- old.inode->i_ctime = ctime;
- new.inode->i_ctime = ctime;
+ inode_set_ctime_current(old.inode);
+ inode_set_ctime_current(new.inode);
retval = ext4_mark_inode_dirty(handle, old.inode);
if (unlikely(retval))
goto end_rename;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c94ebf704616..73547d2334fd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -93,6 +93,7 @@ static int ext4_get_tree(struct fs_context *fc);
static int ext4_reconfigure(struct fs_context *fc);
static void ext4_fc_free(struct fs_context *fc);
static int ext4_init_fs_context(struct fs_context *fc);
+static void ext4_kill_sb(struct super_block *sb);
static const struct fs_parameter_spec ext4_param_specs[];
/*
@@ -135,12 +136,12 @@ static struct file_system_type ext2_fs_type = {
.name = "ext2",
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
- .kill_sb = kill_block_super,
+ .kill_sb = ext4_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
-#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif
@@ -151,12 +152,12 @@ static struct file_system_type ext3_fs_type = {
.name = "ext3",
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
- .kill_sb = kill_block_super,
+ .kill_sb = ext4_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
-#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type)
static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
@@ -1096,15 +1097,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
*/
}
-static void ext4_bdev_mark_dead(struct block_device *bdev)
-{
- ext4_force_shutdown(bdev->bd_holder, EXT4_GOING_FLAGS_NOLOGFLUSH);
-}
-
-static const struct blk_holder_ops ext4_holder_ops = {
- .mark_dead = ext4_bdev_mark_dead,
-};
-
/*
* Open the external journal device
*/
@@ -1113,7 +1105,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
struct block_device *bdev;
bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
- &ext4_holder_ops);
+ &fs_holder_ops);
if (IS_ERR(bdev))
goto fail;
return bdev;
@@ -1125,25 +1117,6 @@ fail:
return NULL;
}
-/*
- * Release the journal device
- */
-static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
-{
- struct block_device *bdev;
- bdev = sbi->s_journal_bdev;
- if (bdev) {
- /*
- * Invalidate the journal device's buffers. We don't want them
- * floating about in memory - the physical journal device may
- * hotswapped, and it breaks the `ro-after' testing code.
- */
- invalidate_bdev(bdev);
- blkdev_put(bdev, sbi->s_sb);
- sbi->s_journal_bdev = NULL;
- }
-}
-
static inline struct inode *orphan_list_entry(struct list_head *l)
{
return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
@@ -1339,8 +1312,13 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
if (sbi->s_journal_bdev) {
+ /*
+ * Invalidate the journal device's buffers. We don't want them
+ * floating about in memory - the physical journal device may
+ * be hotswapped, and it breaks the `ro-after' testing code.
+ */
sync_blockdev(sbi->s_journal_bdev);
- ext4_blkdev_remove(sbi);
+ invalidate_bdev(sbi->s_journal_bdev);
}
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
@@ -5572,7 +5550,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
spin_lock_init(&sbi->s_bdev_wb_lock);
errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
&sbi->s_bdev_wb_err);
- sb->s_bdev->bd_super = sb;
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
@@ -5664,9 +5641,11 @@ failed_mount:
kfree(get_qf_name(sb, sbi, i));
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
- /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
brelse(sbi->s_sbh);
- ext4_blkdev_remove(sbi);
+ if (sbi->s_journal_bdev) {
+ invalidate_bdev(sbi->s_journal_bdev);
+ blkdev_put(sbi->s_journal_bdev, sb);
+ }
out_fail:
invalidate_bdev(sb->s_bdev);
sb->s_fs_info = NULL;
@@ -5854,7 +5833,10 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
return NULL;
+ /* see get_tree_bdev() for why this is needed and safe */
+ up_write(&sb->s_umount);
bdev = ext4_blkdev_get(j_dev, sb);
+ down_write(&sb->s_umount);
if (bdev == NULL)
return NULL;
@@ -7103,7 +7085,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
}
EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out_unlock:
@@ -7273,13 +7255,24 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
return 1;
}
+static void ext4_kill_sb(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL;
+
+ kill_block_super(sb);
+
+ if (journal_bdev)
+ blkdev_put(journal_bdev, sb);
+}
+
static struct file_system_type ext4_fs_type = {
.owner = THIS_MODULE,
.name = "ext4",
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .kill_sb = ext4_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("ext4");
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 05151d61b00b..281e1bfbbe3e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -356,13 +356,13 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
{
- return ((u64)ea_inode->i_ctime.tv_sec << 32) |
+ return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) |
(u32) inode_peek_iversion_raw(ea_inode);
}
static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
{
- ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
+ inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0);
inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff);
}
@@ -2473,7 +2473,7 @@ retry_inode:
}
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
if (!value)
no_expand = 0;
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 236d890f560b..0f7df9c11af3 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1045,7 +1045,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
struct address_space *mapping = cc->inode->i_mapping;
struct page *page;
sector_t last_block_in_bio;
- unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
+ fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
pgoff_t start_idx = start_idx_of_cluster(cc);
int i, ret;
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index d635c58cf5a3..8aa29fe2e87b 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -455,7 +455,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
de->file_type = fs_umode_to_ftype(inode->i_mode);
set_page_dirty(page);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
f2fs_mark_inode_dirty_sync(dir, false);
f2fs_put_page(page, 1);
}
@@ -609,7 +609,7 @@ void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode,
f2fs_i_links_write(dir, true);
clear_inode_flag(inode, FI_NEW_INODE);
}
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
f2fs_mark_inode_dirty_sync(dir, false);
if (F2FS_I(dir)->i_current_depth != current_depth)
@@ -858,7 +858,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
if (S_ISDIR(inode->i_mode))
f2fs_i_links_write(dir, false);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
f2fs_i_links_write(inode, false);
if (S_ISDIR(inode->i_mode)) {
@@ -919,7 +919,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
}
f2fs_put_page(page, 1);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c7cb2177b252..613132339d72 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2736,7 +2736,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
static inline struct page *f2fs_pagecache_get_page(
struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp_mask)
+ fgf_t fgp_flags, gfp_t gfp_mask)
{
if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET))
return NULL;
@@ -3303,9 +3303,11 @@ static inline void clear_file(struct inode *inode, int type)
static inline bool f2fs_is_time_consistent(struct inode *inode)
{
+ struct timespec64 ctime = inode_get_ctime(inode);
+
if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime))
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ctime))
return false;
if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
return false;
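Since inode_get_ctime() returns the timestamp by value, call sites that need a pointer, like timespec64_equal() here, copy it into a local first. Sketch of the idiom (demo_ctime_matches is illustrative):

static bool demo_ctime_matches(struct inode *inode,
			       const struct timespec64 *disk_time)
{
	struct timespec64 ctime = inode_get_ctime(inode);

	/* The local copy gives timespec64_equal() the address it needs. */
	return timespec64_equal(disk_time, &ctime);
}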
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 093039dee992..35886a52edfb 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -794,7 +794,7 @@ int f2fs_truncate(struct inode *inode)
if (err)
return err;
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, false);
return 0;
}
@@ -882,7 +882,7 @@ int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path,
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
/* we need to show initial sectors used for inline_data/dentries */
if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
@@ -905,7 +905,7 @@ static void __setattr_copy(struct mnt_idmap *idmap,
if (ia_valid & ATTR_MTIME)
inode->i_mtime = attr->ia_mtime;
if (ia_valid & ATTR_CTIME)
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
@@ -1008,7 +1008,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return err;
spin_lock(&F2FS_I(inode)->i_size_lock);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
F2FS_I(inode)->last_disk_size = i_size_read(inode);
spin_unlock(&F2FS_I(inode)->i_size_lock);
}
@@ -1835,7 +1835,7 @@ static long f2fs_fallocate(struct file *file, int mode,
}
if (!ret) {
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, false);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
@@ -1937,7 +1937,7 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
else
clear_inode_flag(inode, FI_PROJ_INHERIT);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
f2fs_set_inode_flags(inode);
f2fs_mark_inode_dirty_sync(inode, true);
return 0;
@@ -2874,10 +2874,10 @@ out_src:
if (ret)
goto out_unlock;
- src->i_mtime = src->i_ctime = current_time(src);
+ src->i_mtime = inode_set_ctime_current(src);
f2fs_mark_inode_dirty_sync(src, false);
if (src != dst) {
- dst->i_mtime = dst->i_ctime = current_time(dst);
+ dst->i_mtime = inode_set_ctime_current(dst);
f2fs_mark_inode_dirty_sync(dst, false);
}
f2fs_update_time(sbi, REQ_TIME);
@@ -3073,7 +3073,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
goto out_unlock;
fi->i_projid = kprojid;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, true);
out_unlock:
f2fs_unlock_op(sbi);
@@ -3511,7 +3511,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
}
set_inode_flag(inode, FI_COMPRESS_RELEASED);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, true);
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -3710,7 +3710,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
if (ret >= 0) {
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, true);
}
unlock_inode:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 01effd3fcb6c..a1ca394bc327 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -2181,12 +2181,14 @@ out_drop_write:
if (err)
return err;
- err = freeze_super(sbi->sb);
+ err = freeze_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
if (err)
return err;
if (f2fs_readonly(sbi->sb)) {
- thaw_super(sbi->sb);
+ err = thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
+ if (err)
+ return err;
return -EROFS;
}
@@ -2240,6 +2242,6 @@ recover_out:
out_err:
f2fs_up_write(&sbi->cp_global_sem);
f2fs_up_write(&sbi->gc_lock);
- thaw_super(sbi->sb);
+ thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
return err;
}
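freeze_super() and thaw_super() now take a holder flag so userspace- and kernel-initiated freezes are accounted separately and must be paired with the same holder; note that thaw can now fail, which the hunk above propagates. A hedged sketch of the paired usage (demo_with_quiesced_sb is illustrative):

static int demo_with_quiesced_sb(struct super_block *sb)
{
	int err = freeze_super(sb, FREEZE_HOLDER_USERSPACE);

	if (err)
		return err;
	/* ... operate on the quiesced filesystem ... */
	return thaw_super(sb, FREEZE_HOLDER_USERSPACE);	/* same holder */
}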
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 4638fee16a91..88fc9208ffa7 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -698,7 +698,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
set_page_dirty(page);
f2fs_put_page(page, 1);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 09e986b050c6..c1c2ba9f28e5 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -403,7 +403,7 @@ static void init_idisk_time(struct inode *inode)
struct f2fs_inode_info *fi = F2FS_I(inode);
fi->i_disk_time[0] = inode->i_atime;
- fi->i_disk_time[1] = inode->i_ctime;
+ fi->i_disk_time[1] = inode_get_ctime(inode);
fi->i_disk_time[2] = inode->i_mtime;
}
@@ -434,10 +434,10 @@ static int do_read_inode(struct inode *inode)
inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1);
inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
- inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime);
+ inode_set_ctime(inode, le64_to_cpu(ri->i_ctime),
+ le32_to_cpu(ri->i_ctime_nsec));
inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
- inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
inode->i_generation = le32_to_cpu(ri->i_generation);
if (S_ISDIR(inode->i_mode))
@@ -714,10 +714,10 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
set_raw_inline(inode, ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ ri->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
if (S_ISDIR(inode->i_mode))
ri->i_current_depth =
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index bee0568888da..193b22a2d6bf 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -243,7 +243,7 @@ static struct inode *f2fs_new_inode(struct mnt_idmap *idmap,
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
F2FS_I(inode)->i_crtime = inode->i_mtime;
inode->i_generation = get_random_u32();
@@ -420,7 +420,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
f2fs_balance_fs(sbi, true);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ihold(inode);
set_inode_flag(inode, FI_INC_LINK);
@@ -1052,7 +1052,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
new_page = NULL;
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
f2fs_i_links_write(new_inode, false);
@@ -1086,7 +1086,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
f2fs_i_pino_write(old_inode, new_dir->i_ino);
f2fs_up_write(&F2FS_I(old_inode)->i_sem);
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
f2fs_mark_inode_dirty_sync(old_inode, false);
f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
@@ -1251,7 +1251,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_pino_write(old_inode, new_dir->i_ino);
f2fs_up_write(&F2FS_I(old_inode)->i_sem);
- old_dir->i_ctime = current_time(old_dir);
+ inode_set_ctime_current(old_dir);
if (old_nlink) {
f2fs_down_write(&F2FS_I(old_dir)->i_sem);
f2fs_i_links_write(old_dir, old_nlink > 0);
@@ -1270,7 +1270,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_pino_write(new_inode, old_dir->i_ino);
f2fs_up_write(&F2FS_I(new_inode)->i_sem);
- new_dir->i_ctime = current_time(new_dir);
+ inode_set_ctime_current(new_dir);
if (new_nlink) {
f2fs_down_write(&F2FS_I(new_dir)->i_sem);
f2fs_i_links_write(new_dir, new_nlink > 0);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 4e7d4ceeb084..b8637e88d94f 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -321,10 +321,10 @@ static int recover_inode(struct inode *inode, struct page *page)
f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
- inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
+ inode_set_ctime(inode, le64_to_cpu(raw->i_ctime),
+ le32_to_cpu(raw->i_ctime_nsec));
inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
- inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
F2FS_I(inode)->i_advise = raw->i_advise;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index ca31163da00a..aa1f9a3a8037 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1561,7 +1561,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
int i;
for (i = 0; i < sbi->s_ndevs; i++) {
- blkdev_put(FDEV(i).bdev, sbi->sb->s_type);
+ blkdev_put(FDEV(i).bdev, sbi->sb);
#ifdef CONFIG_BLK_DEV_ZONED
kvfree(FDEV(i).blkz_seq);
#endif
@@ -2703,7 +2703,7 @@ retry:
if (len == towrite)
return err;
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, false);
return len - towrite;
}
@@ -4198,7 +4198,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
/* Single zoned block device mount */
FDEV(0).bdev =
blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, mode,
- sbi->sb->s_type, NULL);
+ sbi->sb, NULL);
} else {
/* Multi-device mount */
memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
@@ -4217,8 +4217,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
sbi->log_blocks_per_seg) - 1;
}
FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, mode,
- sbi->sb->s_type,
- NULL);
+ sbi->sb, NULL);
}
if (IS_ERR(FDEV(i).bdev))
return PTR_ERR(FDEV(i).bdev);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 476b186b90a6..4ae93e1df421 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -764,7 +764,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
same:
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
inode->i_mode = F2FS_I(inode)->i_acl_mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
clear_inode_flag(inode, FI_ACL_MODE);
}
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e3b690b48e3e..66cf4778cf3b 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -460,8 +460,7 @@ extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi,
const struct timespec64 *ts);
extern int fat_truncate_time(struct inode *inode, struct timespec64 *now,
int flags);
-extern int fat_update_time(struct inode *inode, struct timespec64 *now,
- int flags);
+extern int fat_update_time(struct inode *inode, int flags);
extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
int fat_cache_init(void);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 456477946dd9..e887e9ab7472 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -401,7 +401,7 @@ int fat_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_inode(path->dentry);
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
stat->blksize = sbi->cluster_size;
if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d99b8549ec8f..cdd39b6020f3 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -562,7 +562,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
& ~((loff_t)sbi->cluster_size - 1)) >> 9;
fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
- inode->i_ctime = inode->i_mtime;
+ inode_set_ctime_to_ts(inode, inode->i_mtime);
if (sbi->options.isvfat) {
fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime,
@@ -1407,8 +1407,7 @@ static int fat_read_root(struct inode *inode)
MSDOS_I(inode)->mmu_private = inode->i_size;
fat_save_attrs(inode, ATTR_DIR);
- inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
- inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0;
+ inode->i_mtime = inode->i_atime = inode_set_ctime(inode, 0, 0);
set_nlink(inode, fat_subdirs(inode)+2);
return 0;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 7e5d6ae305f2..f2304a1054aa 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -332,13 +332,14 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
* but ctime updates are ignored.
*/
if (flags & S_MTIME)
- inode->i_mtime = inode->i_ctime = fat_truncate_mtime(sbi, now);
+ inode->i_mtime = inode_set_ctime_to_ts(inode,
+ fat_truncate_mtime(sbi, now));
return 0;
}
EXPORT_SYMBOL_GPL(fat_truncate_time);
-int fat_update_time(struct inode *inode, struct timespec64 *now, int flags)
+int fat_update_time(struct inode *inode, int flags)
{
int dirty_flags = 0;
@@ -346,16 +347,13 @@ int fat_update_time(struct inode *inode, struct timespec64 *now, int flags)
return 0;
if (flags & (S_ATIME | S_CTIME | S_MTIME)) {
- fat_truncate_time(inode, now, flags);
+ fat_truncate_time(inode, NULL, flags);
if (inode->i_sb->s_flags & SB_LAZYTIME)
dirty_flags |= I_DIRTY_TIME;
else
dirty_flags |= I_DIRTY_SYNC;
}
- if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false))
- dirty_flags |= I_DIRTY_SYNC;
-
__mark_inode_dirty(inode, dirty_flags);
return 0;
}
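This is the slimmed-down ->update_time contract: the timespec argument is gone, and the current time is sampled inside the helpers (fat_truncate_time() above, generic_update_time() elsewhere in the series) rather than passed in by the caller. FAT also stops handling S_VERSION, which the VFS now owns. A minimal conforming implementation, modelled on the gfs2 hunk later in this diff:

static int demo_update_time(struct inode *inode, int flags)
{
	/* generic_update_time() samples the clock itself and marks the
	 * inode dirty as appropriate for lazytime. */
	generic_update_time(inode, flags);
	return 0;
}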
diff --git a/fs/fcntl.c b/fs/fcntl.c
index b622be119706..e871009f6c88 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -34,7 +34,7 @@
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
-static int setfl(int fd, struct file * filp, unsigned long arg)
+static int setfl(int fd, struct file * filp, unsigned int arg)
{
struct inode * inode = file_inode(filp);
int error = 0;
@@ -112,11 +112,11 @@ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
}
EXPORT_SYMBOL(__f_setown);
-int f_setown(struct file *filp, unsigned long arg, int force)
+int f_setown(struct file *filp, int who, int force)
{
enum pid_type type;
struct pid *pid = NULL;
- int who = arg, ret = 0;
+ int ret = 0;
type = PIDTYPE_TGID;
if (who < 0) {
@@ -317,28 +317,29 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
void __user *argp = (void __user *)arg;
+ int argi = (int)arg;
struct flock flock;
long err = -EINVAL;
switch (cmd) {
case F_DUPFD:
- err = f_dupfd(arg, filp, 0);
+ err = f_dupfd(argi, filp, 0);
break;
case F_DUPFD_CLOEXEC:
- err = f_dupfd(arg, filp, O_CLOEXEC);
+ err = f_dupfd(argi, filp, O_CLOEXEC);
break;
case F_GETFD:
err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
break;
case F_SETFD:
err = 0;
- set_close_on_exec(fd, arg & FD_CLOEXEC);
+ set_close_on_exec(fd, argi & FD_CLOEXEC);
break;
case F_GETFL:
err = filp->f_flags;
break;
case F_SETFL:
- err = setfl(fd, filp, arg);
+ err = setfl(fd, filp, argi);
break;
#if BITS_PER_LONG != 32
/* 32-bit arches must use fcntl64() */
@@ -375,7 +376,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
force_successful_syscall_return();
break;
case F_SETOWN:
- err = f_setown(filp, arg, 1);
+ err = f_setown(filp, argi, 1);
break;
case F_GETOWN_EX:
err = f_getown_ex(filp, arg);
@@ -391,28 +392,28 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
break;
case F_SETSIG:
/* arg == 0 restores default behaviour. */
- if (!valid_signal(arg)) {
+ if (!valid_signal(argi)) {
break;
}
err = 0;
- filp->f_owner.signum = arg;
+ filp->f_owner.signum = argi;
break;
case F_GETLEASE:
err = fcntl_getlease(filp);
break;
case F_SETLEASE:
- err = fcntl_setlease(fd, filp, arg);
+ err = fcntl_setlease(fd, filp, argi);
break;
case F_NOTIFY:
- err = fcntl_dirnotify(fd, filp, arg);
+ err = fcntl_dirnotify(fd, filp, argi);
break;
case F_SETPIPE_SZ:
case F_GETPIPE_SZ:
- err = pipe_fcntl(filp, cmd, arg);
+ err = pipe_fcntl(filp, cmd, argi);
break;
case F_ADD_SEALS:
case F_GET_SEALS:
- err = memfd_fcntl(filp, cmd, arg);
+ err = memfd_fcntl(filp, cmd, argi);
break;
case F_GET_RW_HINT:
case F_SET_RW_HINT:
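The argi narrowing above exists because several fcntl commands interpret the unsigned long argument as a signed int; F_SETOWN in particular treats negative values as a process group. The cast now happens once, visibly, instead of implicitly at each helper's parameter. A standalone userspace illustration of the sign semantics:

#include <stdio.h>

int main(void)
{
	unsigned long arg = (unsigned long)-4096;	/* as passed to fcntl() */
	int who = (int)arg;				/* narrowed once */

	/* Negative 'who' selects a process group, as in f_setown(). */
	printf("who = %d -> signal %s %d\n", who,
	       who < 0 ? "pgrp" : "pid", who < 0 ? -who : who);
	return 0;
}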
diff --git a/fs/file.c b/fs/file.c
index 3fd003a8604f..3e4a4dfa38fc 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -668,7 +668,7 @@ EXPORT_SYMBOL(close_fd); /* for ksys_close() */
/**
* last_fd - return last valid index into fd table
- * @cur_fds: files struct
+ * @fdt: File descriptor table.
*
* Context: Either rcu read lock or files_lock must be held.
*
@@ -693,29 +693,30 @@ static inline void __range_cloexec(struct files_struct *cur_fds,
spin_unlock(&cur_fds->file_lock);
}
-static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
+static inline void __range_close(struct files_struct *files, unsigned int fd,
unsigned int max_fd)
{
+ struct file *file;
unsigned n;
- rcu_read_lock();
- n = last_fd(files_fdtable(cur_fds));
- rcu_read_unlock();
+ spin_lock(&files->file_lock);
+ n = last_fd(files_fdtable(files));
max_fd = min(max_fd, n);
- while (fd <= max_fd) {
- struct file *file;
-
- spin_lock(&cur_fds->file_lock);
- file = pick_file(cur_fds, fd++);
- spin_unlock(&cur_fds->file_lock);
-
+ for (; fd <= max_fd; fd++) {
+ file = pick_file(files, fd);
if (file) {
- /* found a valid file to close */
- filp_close(file, cur_fds);
+ spin_unlock(&files->file_lock);
+ filp_close(file, files);
cond_resched();
+ spin_lock(&files->file_lock);
+ } else if (need_resched()) {
+ spin_unlock(&files->file_lock);
+ cond_resched();
+ spin_lock(&files->file_lock);
}
}
+ spin_unlock(&files->file_lock);
}
/**
@@ -723,6 +724,7 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
+ * @flags: CLOSE_RANGE flags.
*
* This closes a range of file descriptors. All file descriptors
* from @fd up to and including @max_fd are closed.
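The __range_close() rewrite inverts the old locking pattern: rather than taking file_lock once per descriptor, the lock is held across iterations and dropped only around filp_close(), which can sleep, or for an explicit reschedule. A schematic of the loop shape with hypothetical helpers (grab_under_lock() and do_sleeping_work() stand in for pick_file() and filp_close()):

	spin_lock(&lock);
	for (; fd <= max_fd; fd++) {
		item = grab_under_lock(fd);		/* hypothetical */
		if (item) {
			spin_unlock(&lock);
			do_sleeping_work(item);		/* hypothetical */
			cond_resched();
			spin_lock(&lock);
		} else if (need_resched()) {
			spin_unlock(&lock);
			cond_resched();
			spin_lock(&lock);
		}
	}
	spin_unlock(&lock);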
diff --git a/fs/file_table.c b/fs/file_table.c
index fc7d677ff5ad..ee21b3da9d08 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -461,11 +461,8 @@ void fput(struct file *file)
*/
void __fput_sync(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count)) {
- struct task_struct *task = current;
- BUG_ON(!(task->flags & PF_KTHREAD));
+ if (atomic_long_dec_and_test(&file->f_count))
__fput(file);
- }
}
EXPORT_SYMBOL(fput);
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ceb6a12649ba..ac5d43b164b5 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -110,10 +110,9 @@ static inline void dip2vip_cpy(struct vxfs_sb_info *sbi,
inode->i_size = vip->vii_size;
inode->i_atime.tv_sec = vip->vii_atime;
- inode->i_ctime.tv_sec = vip->vii_ctime;
+ inode_set_ctime(inode, vip->vii_ctime, 0);
inode->i_mtime.tv_sec = vip->vii_mtime;
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
inode->i_mtime.tv_nsec = 0;
inode->i_blocks = vip->vii_blocks;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index aca4b4811394..969ce991b0b0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1953,9 +1953,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
struct inode *inode = wb_inode(wb->b_io.prev);
struct super_block *sb = inode->i_sb;
- if (!trylock_super(sb)) {
+ if (!super_trylock_shared(sb)) {
/*
- * trylock_super() may fail consistently due to
+ * super_trylock_shared() may fail consistently due to
* s_umount being grabbed by someone else. Don't use
* requeue_io() to avoid busy retrying the inode/sb.
*/
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 851214d1d013..a0ad7a0c4680 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -162,6 +162,10 @@ EXPORT_SYMBOL(vfs_parse_fs_param);
/**
* vfs_parse_fs_string - Convenience function to just parse a string.
+ * @fc: Filesystem context.
+ * @key: Parameter name.
+ * @value: Default value.
+ * @v_size: Maximum number of bytes in the value.
*/
int vfs_parse_fs_string(struct fs_context *fc, const char *key,
const char *value, size_t v_size)
@@ -189,7 +193,7 @@ EXPORT_SYMBOL(vfs_parse_fs_string);
/**
* generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
- * @ctx: The superblock configuration to fill in.
+ * @fc: The superblock configuration to fill in.
* @data: The data to parse
*
* Parse a blob of data that's in key[=val][,key[=val]]* form. This can be
@@ -315,10 +319,31 @@ struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
}
EXPORT_SYMBOL(fs_context_for_reconfigure);
+/**
+ * fs_context_for_submount: allocate a new fs_context for a submount
+ * @type: file_system_type of the new context
+ * @reference: reference dentry from which to copy relevant info
+ *
+ * Allocate a new fs_context suitable for a submount. This also ensures that
+ * the fc->security object is inherited from @reference (if needed).
+ */
struct fs_context *fs_context_for_submount(struct file_system_type *type,
struct dentry *reference)
{
- return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
+ struct fs_context *fc;
+ int ret;
+
+ fc = alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
+ if (IS_ERR(fc))
+ return fc;
+
+ ret = security_fs_context_submount(fc, reference->d_sb);
+ if (ret) {
+ put_fs_context(fc);
+ return ERR_PTR(ret);
+ }
+
+ return fc;
}
EXPORT_SYMBOL(fs_context_for_submount);
@@ -333,7 +358,7 @@ void fc_drop_locked(struct fs_context *fc)
static void legacy_fs_context_free(struct fs_context *fc);
/**
- * vfs_dup_fc_config: Duplicate a filesystem context.
+ * vfs_dup_fs_context - Duplicate a filesystem context.
* @src_fc: The context to copy.
*/
struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
@@ -379,7 +404,9 @@ EXPORT_SYMBOL(vfs_dup_fs_context);
/**
* logfc - Log a message to a filesystem context
- * @fc: The filesystem context to log to.
+ * @log: The filesystem context to log to, or NULL to use printk.
+ * @prefix: A string to prefix the output with, or NULL.
+ * @level: 'w' for a warning, 'e' for an error. Anything else is a notice.
* @fmt: The format of the buffer.
*/
void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...)
@@ -692,6 +719,7 @@ void vfs_clean_context(struct fs_context *fc)
security_free_mnt_opts(&fc->security);
kfree(fc->source);
fc->source = NULL;
+ fc->exclusive = false;
fc->purpose = FS_CONTEXT_FOR_RECONFIGURE;
fc->phase = FS_CONTEXT_AWAITING_RECONF;
diff --git a/fs/fsopen.c b/fs/fsopen.c
index fc9d2d9fd234..ce03f6521c88 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -209,6 +209,72 @@ err:
return ret;
}
+static int vfs_cmd_create(struct fs_context *fc, bool exclusive)
+{
+ struct super_block *sb;
+ int ret;
+
+ if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
+ return -EBUSY;
+
+ if (!mount_capable(fc))
+ return -EPERM;
+
+ /* require the new mount API */
+ if (exclusive && fc->ops == &legacy_fs_context_ops)
+ return -EOPNOTSUPP;
+
+ fc->phase = FS_CONTEXT_CREATING;
+ fc->exclusive = exclusive;
+
+ ret = vfs_get_tree(fc);
+ if (ret) {
+ fc->phase = FS_CONTEXT_FAILED;
+ return ret;
+ }
+
+ sb = fc->root->d_sb;
+ ret = security_sb_kern_mount(sb);
+ if (unlikely(ret)) {
+ fc_drop_locked(fc);
+ fc->phase = FS_CONTEXT_FAILED;
+ return ret;
+ }
+
+ /* vfs_get_tree() callchains will have grabbed @s_umount */
+ up_write(&sb->s_umount);
+ fc->phase = FS_CONTEXT_AWAITING_MOUNT;
+ return 0;
+}
+
+static int vfs_cmd_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb;
+ int ret;
+
+ if (fc->phase != FS_CONTEXT_RECONF_PARAMS)
+ return -EBUSY;
+
+ fc->phase = FS_CONTEXT_RECONFIGURING;
+
+ sb = fc->root->d_sb;
+ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
+ fc->phase = FS_CONTEXT_FAILED;
+ return -EPERM;
+ }
+
+ down_write(&sb->s_umount);
+ ret = reconfigure_super(fc);
+ up_write(&sb->s_umount);
+ if (ret) {
+ fc->phase = FS_CONTEXT_FAILED;
+ return ret;
+ }
+
+ vfs_clean_context(fc);
+ return 0;
+}
+
/*
* Check the state and apply the configuration. Note that this function is
* allowed to 'steal' the value by setting param->xxx to NULL before returning.
@@ -216,7 +282,6 @@ err:
static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
struct fs_parameter *param)
{
- struct super_block *sb;
int ret;
ret = finish_clean_context(fc);
@@ -224,39 +289,11 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
return ret;
switch (cmd) {
case FSCONFIG_CMD_CREATE:
- if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
- return -EBUSY;
- if (!mount_capable(fc))
- return -EPERM;
- fc->phase = FS_CONTEXT_CREATING;
- ret = vfs_get_tree(fc);
- if (ret)
- break;
- sb = fc->root->d_sb;
- ret = security_sb_kern_mount(sb);
- if (unlikely(ret)) {
- fc_drop_locked(fc);
- break;
- }
- up_write(&sb->s_umount);
- fc->phase = FS_CONTEXT_AWAITING_MOUNT;
- return 0;
+ return vfs_cmd_create(fc, false);
+ case FSCONFIG_CMD_CREATE_EXCL:
+ return vfs_cmd_create(fc, true);
case FSCONFIG_CMD_RECONFIGURE:
- if (fc->phase != FS_CONTEXT_RECONF_PARAMS)
- return -EBUSY;
- fc->phase = FS_CONTEXT_RECONFIGURING;
- sb = fc->root->d_sb;
- if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
- ret = -EPERM;
- break;
- }
- down_write(&sb->s_umount);
- ret = reconfigure_super(fc);
- up_write(&sb->s_umount);
- if (ret)
- break;
- vfs_clean_context(fc);
- return 0;
+ return vfs_cmd_reconfigure(fc);
default:
if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
fc->phase != FS_CONTEXT_RECONF_PARAMS)
@@ -264,8 +301,6 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
return vfs_parse_fs_param(fc, param);
}
- fc->phase = FS_CONTEXT_FAILED;
- return ret;
}
/**
@@ -353,6 +388,7 @@ SYSCALL_DEFINE5(fsconfig,
return -EINVAL;
break;
case FSCONFIG_CMD_CREATE:
+ case FSCONFIG_CMD_CREATE_EXCL:
case FSCONFIG_CMD_RECONFIGURE:
if (_key || _value || aux)
return -EINVAL;
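FSCONFIG_CMD_CREATE_EXCL is the userspace-visible half of this refactor: it demands a freshly created superblock and is meant to fail with EBUSY if a matching one already exists, and it is refused outright (-EOPNOTSUPP) for filesystems still on the legacy context ops. A userspace sketch, assuming a glibc new enough to wrap fsopen()/fsconfig() and treating both the fallback command value and /dev/sda1 as illustrative:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/mount.h>			/* fsopen()/fsconfig(), glibc >= 2.36 */

#ifndef FSCONFIG_CMD_CREATE_EXCL
#define FSCONFIG_CMD_CREATE_EXCL 8	/* assumed uapi value if headers are old */
#endif

int main(void)
{
	int fsfd = fsopen("ext4", FSOPEN_CLOEXEC);

	if (fsfd < 0)
		return 1;
	fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0);
	/* Expected to fail with EBUSY if the superblock already exists. */
	if (fsconfig(fsfd, FSCONFIG_CMD_CREATE_EXCL, NULL, NULL, 0) < 0)
		perror("FSCONFIG_CMD_CREATE_EXCL");
	close(fsfd);
	return 0;
}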
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 247ef4f76761..ab62e4624256 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -235,7 +235,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
inode->i_mode = mode;
inode->i_uid = fc->user_id;
inode->i_gid = fc->group_id;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
/* setting ->i_op to NULL is not allowed */
if (iop)
inode->i_op = iop;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f67bef9d83c4..881524b9a55a 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -933,7 +933,7 @@ void fuse_flush_time_update(struct inode *inode)
static void fuse_update_ctime_in_cache(struct inode *inode)
{
if (!IS_NOCMTIME(inode)) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty_sync(inode);
fuse_flush_time_update(inode);
}
@@ -1222,7 +1222,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
forget_all_cached_acls(inode);
err = fuse_do_getattr(inode, stat, file);
} else if (stat) {
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->mode = fi->orig_i_mode;
stat->ino = fi->orig_ino;
}
@@ -1715,8 +1715,8 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
inarg.mtimensec = inode->i_mtime.tv_nsec;
if (fm->fc->minor >= 23) {
inarg.valid |= FATTR_CTIME;
- inarg.ctime = inode->i_ctime.tv_sec;
- inarg.ctimensec = inode->i_ctime.tv_nsec;
+ inarg.ctime = inode_get_ctime(inode).tv_sec;
+ inarg.ctimensec = inode_get_ctime(inode).tv_nsec;
}
if (ff) {
inarg.valid |= FATTR_FH;
@@ -1857,7 +1857,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if (attr->ia_valid & ATTR_MTIME)
inode->i_mtime = attr->ia_mtime;
if (attr->ia_valid & ATTR_CTIME)
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
/* FIXME: clear I_DIRTY_SYNC? */
}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f19d748890f0..549358ffea8b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -194,8 +194,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_mtime.tv_nsec = attr->mtimensec;
}
if (!(cache_mask & STATX_CTIME)) {
- inode->i_ctime.tv_sec = attr->ctime;
- inode->i_ctime.tv_nsec = attr->ctimensec;
+ inode_set_ctime(inode, attr->ctime, attr->ctimensec);
}
if (attr->blksize != 0)
@@ -259,8 +258,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
attr->mtimensec = inode->i_mtime.tv_nsec;
}
if (cache_mask & STATX_CTIME) {
- attr->ctime = inode->i_ctime.tv_sec;
- attr->ctimensec = inode->i_ctime.tv_nsec;
+ attr->ctime = inode_get_ctime(inode).tv_sec;
+ attr->ctimensec = inode_get_ctime(inode).tv_nsec;
}
if ((attr_version != 0 && fi->attr_version > attr_version) ||
@@ -318,8 +317,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
inode->i_size = attr->size;
inode->i_mtime.tv_sec = attr->mtime;
inode->i_mtime.tv_nsec = attr->mtimensec;
- inode->i_ctime.tv_sec = attr->ctime;
- inode->i_ctime.tv_nsec = attr->ctimensec;
+ inode_set_ctime(inode, attr->ctime, attr->ctimensec);
if (S_ISREG(inode->i_mode)) {
fuse_init_common(inode);
fuse_init_file_inode(inode, attr->flags);
@@ -1401,16 +1399,18 @@ EXPORT_SYMBOL_GPL(fuse_dev_free);
static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
const struct fuse_inode *fi)
{
+ struct timespec64 ctime = inode_get_ctime(&fi->inode);
+
*attr = (struct fuse_attr){
.ino = fi->inode.i_ino,
.size = fi->inode.i_size,
.blocks = fi->inode.i_blocks,
.atime = fi->inode.i_atime.tv_sec,
.mtime = fi->inode.i_mtime.tv_sec,
- .ctime = fi->inode.i_ctime.tv_sec,
+ .ctime = ctime.tv_sec,
.atimensec = fi->inode.i_atime.tv_nsec,
.mtimensec = fi->inode.i_mtime.tv_nsec,
- .ctimensec = fi->inode.i_ctime.tv_nsec,
+ .ctimensec = ctime.tv_nsec,
.mode = fi->inode.i_mode,
.nlink = fi->inode.i_nlink,
.uid = fi->inode.i_uid.val,
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index a392aa0f041d..443640e6fb9c 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -142,7 +142,7 @@ int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
ret = __gfs2_set_acl(inode, acl, type);
if (!ret && mode != inode->i_mode) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode->i_mode = mode;
mark_inode_dirty(inode);
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index ae49256b7c8c..9c4b26aec580 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -747,7 +747,7 @@ static const struct address_space_operations gfs2_aops = {
.writepages = gfs2_writepages,
.read_folio = gfs2_read_folio,
.readahead = gfs2_readahead,
- .dirty_folio = filemap_dirty_folio,
+ .dirty_folio = iomap_dirty_folio,
.release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.bmap = gfs2_bmap,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 8d611fbcf0bd..f62366be7587 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -971,7 +971,7 @@ gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
if (status)
return ERR_PTR(status);
- folio = iomap_get_folio(iter, pos);
+ folio = iomap_get_folio(iter, pos, len);
if (IS_ERR(folio))
gfs2_trans_end(sdp);
return folio;
@@ -1386,7 +1386,7 @@ static int trunc_start(struct inode *inode, u64 newsize)
ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
i_size_write(inode, newsize);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_dinode_out(ip, dibh->b_data);
if (journaled)
@@ -1583,8 +1583,7 @@ out_unlock:
/* Every transaction boundary, we rewrite the dinode
to keep its di_blocks current in case of failure. */
- ip->i_inode.i_mtime = ip->i_inode.i_ctime =
- current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1950,7 +1949,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
gfs2_statfs_change(sdp, 0, +btotal, 0);
gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
ip->i_inode.i_gid);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
up_write(&ip->i_rw_mutex);
@@ -1993,7 +1992,7 @@ static int trunc_end(struct gfs2_inode *ip)
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
gfs2_ordered_del_inode(ip);
}
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
gfs2_trans_add_meta(ip->i_gl, dibh);
@@ -2094,7 +2093,7 @@ static int do_grow(struct inode *inode, u64 size)
goto do_end_trans;
truncate_setsize(inode, size);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 54a6d17b8c25..1a2afa88f8be 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
if (ip->i_inode.i_size < offset + size)
i_size_write(&ip->i_inode, offset + size);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -227,7 +227,7 @@ out:
if (ip->i_inode.i_size < offset + copied)
i_size_write(&ip->i_inode, offset + copied);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
@@ -1814,7 +1814,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
gfs2_inum_out(nip, dent);
dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip));
- tv = current_time(&ip->i_inode);
+ tv = inode_set_ctime_current(&ip->i_inode);
if (ip->i_diskflags & GFS2_DIF_EXHASH) {
leaf = (struct gfs2_leaf *)bh->b_data;
be16_add_cpu(&leaf->lf_entries, 1);
@@ -1825,7 +1825,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
da->bh = NULL;
brelse(bh);
ip->i_entries++;
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv;
+ ip->i_inode.i_mtime = tv;
if (S_ISDIR(nip->i_inode.i_mode))
inc_nlink(&ip->i_inode);
mark_inode_dirty(inode);
@@ -1876,7 +1876,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
const struct qstr *name = &dentry->d_name;
struct gfs2_dirent *dent, *prev = NULL;
struct buffer_head *bh;
- struct timespec64 tv = current_time(&dip->i_inode);
+ struct timespec64 tv;
/* Returns _either_ the entry (if its first in block) or the
previous entry otherwise */
@@ -1896,6 +1896,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
}
dirent_del(dip, bh, prev, dent);
+ tv = inode_set_ctime_current(&dip->i_inode);
if (dip->i_diskflags & GFS2_DIF_EXHASH) {
struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
u16 entries = be16_to_cpu(leaf->lf_entries);
@@ -1910,7 +1911,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
if (!dip->i_entries)
gfs2_consist_inode(dip);
dip->i_entries--;
- dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
+ dip->i_inode.i_mtime = tv;
if (d_is_dir(dentry))
drop_nlink(&dip->i_inode);
mark_inode_dirty(&dip->i_inode);
@@ -1951,7 +1952,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
dent->de_type = cpu_to_be16(new_type);
brelse(bh);
- dip->i_inode.i_mtime = dip->i_inode.i_ctime = current_time(&dip->i_inode);
+ dip->i_inode.i_mtime = inode_set_ctime_current(&dip->i_inode);
mark_inode_dirty_sync(&dip->i_inode);
return 0;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b43fa8b8fc05..766186c80682 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -260,7 +260,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask)
error = gfs2_meta_inode_buffer(ip, &bh);
if (error)
goto out_trans_end;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
gfs2_trans_add_meta(ip->i_gl, bh);
ip->i_diskflags = new_flags;
gfs2_dinode_out(ip, bh->b_data);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 54319328b16b..aecdac3cfbe1 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -437,8 +437,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
inode->i_atime = atime;
inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
- inode->i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
- inode->i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
+ inode_set_ctime(inode, be64_to_cpu(str->di_ctime),
+ be32_to_cpu(str->di_ctime_nsec));
ip->i_goal = be64_to_cpu(str->di_goal_meta);
ip->i_generation = be64_to_cpu(str->di_generation);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 17c994a0c0d0..a21ac41d6669 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -690,7 +690,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
inode->i_rdev = dev;
inode->i_size = size;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
munge_mode_uid_gid(dip, inode);
check_and_update_goal(dip);
ip->i_goal = dip->i_goal;
@@ -1029,7 +1029,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
gfs2_trans_add_meta(ip->i_gl, dibh);
inc_nlink(&ip->i_inode);
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
ihold(inode);
d_instantiate(dentry, inode);
mark_inode_dirty(inode);
@@ -1114,7 +1114,7 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip,
return error;
ip->i_entries = 0;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (S_ISDIR(inode->i_mode))
clear_nlink(inode);
else
@@ -1371,7 +1371,7 @@ static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip,
if (dir_rename)
return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
mark_inode_dirty_sync(&ip->i_inode);
return 0;
}
@@ -2071,7 +2071,7 @@ static int gfs2_getattr(struct mnt_idmap *idmap,
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (gfs2_holder_initialized(&gh))
gfs2_glock_dq_uninit(&gh);
@@ -2139,8 +2139,7 @@ loff_t gfs2_seek_hole(struct file *file, loff_t offset)
return vfs_setpos(file, ret, inode->i_sb->s_maxbytes);
}
-static int gfs2_update_time(struct inode *inode, struct timespec64 *time,
- int flags)
+static int gfs2_update_time(struct inode *inode, int flags)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_glock *gl = ip->i_gl;
@@ -2155,7 +2154,8 @@ static int gfs2_update_time(struct inode *inode, struct timespec64 *time,
if (error)
return error;
}
- return generic_update_time(inode, time, flags);
+ generic_update_time(inode, flags);
+ return 0;
}
static const struct inode_operations gfs2_file_iops = {
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 704192b73605..aa5fd06d47bc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -871,7 +871,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
size = loc + sizeof(struct gfs2_quota);
if (size > inode->i_size)
i_size_write(inode, size);
- inode->i_mtime = inode->i_atime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
set_bit(QDF_REFRESH, &qd->qd_flags);
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 9f4d5d6549ee..2f701335e8ee 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -412,7 +412,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode));
str->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
- str->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
+ str->di_ctime = cpu_to_be64(inode_get_ctime(inode).tv_sec);
str->di_goal_meta = cpu_to_be64(ip->i_goal);
str->di_goal_data = cpu_to_be64(ip->i_goal);
@@ -429,7 +429,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_eattr = cpu_to_be64(ip->i_eattr);
str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
- str->di_ctime_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
+ str->di_ctime_nsec = cpu_to_be32(inode_get_ctime(inode).tv_nsec);
}
/**
@@ -689,7 +689,7 @@ static int gfs2_freeze_locally(struct gfs2_sbd *sdp)
struct super_block *sb = sdp->sd_vfs;
int error;
- error = freeze_super(sb);
+ error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
if (error)
return error;
@@ -697,7 +697,9 @@ static int gfs2_freeze_locally(struct gfs2_sbd *sdp)
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
GFS2_LFC_FREEZE_GO_SYNC);
if (gfs2_withdrawn(sdp)) {
- thaw_super(sb);
+ error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+ if (error)
+ return error;
return -EIO;
}
}
@@ -712,7 +714,7 @@ static int gfs2_do_thaw(struct gfs2_sbd *sdp)
error = gfs2_freeze_lock_shared(sdp);
if (error)
goto fail;
- error = thaw_super(sb);
+ error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
if (!error)
return 0;
@@ -761,7 +763,7 @@ out:
*
*/
-static int gfs2_freeze_super(struct super_block *sb)
+static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
int error;
@@ -816,7 +818,7 @@ out:
*
*/
-static int gfs2_thaw_super(struct super_block *sb)
+static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
int error;
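The new freeze API is easiest to see from the caller side; a minimal sketch, assuming a hypothetical helper (FREEZE_HOLDER_USERSPACE comes from this series):

/*
 * Sketch only, hypothetical foo_freeze_briefly(): freeze_super() and
 * thaw_super() now take an enum freeze_holder, so the VFS can tell a
 * userspace-initiated freeze (ioctl/sysfs) apart from kernel-internal
 * ones and refuse mismatched thaw requests.
 */
static int foo_freeze_briefly(struct super_block *sb)
{
        int error;

        error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
        if (error)
                return error;
        /* ... work that requires a quiesced filesystem ... */
        return thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}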
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 2dfbe2f188dd..c60bc7f628e1 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -168,10 +168,10 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
switch (n) {
case 0:
- error = thaw_super(sdp->sd_vfs);
+ error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE);
break;
case 1:
- error = freeze_super(sdp->sd_vfs);
+ error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE);
break;
default:
return -EINVAL;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 93b36d026bb4..4fea70c0fe3d 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -311,7 +311,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
ea->ea_num_ptrs = 0;
}
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(sdp);
@@ -763,7 +763,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (error)
goto out_end_trans;
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
out_end_trans:
@@ -888,7 +888,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
if (es->es_el)
ea_set_remove_stuffed(ip, es->es_el);
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(GFS2_SB(&ip->i_inode));
@@ -1106,7 +1106,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
ea->ea_type = GFS2_EATYPE_UNUSED;
}
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(GFS2_SB(&ip->i_inode));
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index d365bf0b8c77..632c226a3972 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -133,7 +133,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
goto err1;
dir->i_size++;
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
hfs_find_exit(&fd);
return 0;
@@ -269,7 +269,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
}
dir->i_size--;
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
res = 0;
out:
@@ -337,7 +337,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
dst_dir->i_size++;
- dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir);
+ dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
mark_inode_dirty(dst_dir);
/* finally remove the old entry */
@@ -349,7 +349,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
src_dir->i_size--;
- src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir);
+ src_dir->i_mtime = inode_set_ctime_current(src_dir);
mark_inode_dirty(src_dir);
type = entry.type;
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 3e1e3dcf0b48..b75c26045df4 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -263,7 +263,7 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
if (res)
return res;
clear_nlink(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
hfs_delete_inode(inode);
mark_inode_dirty(inode);
return 0;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 441d7fc952e3..ee349b72cfb3 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -200,7 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
HFS_I(inode)->flags = 0;
HFS_I(inode)->rsrc_inode = NULL;
HFS_I(inode)->fs_blocks = 0;
@@ -355,8 +355,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
inode->i_mode |= S_IWUGO;
inode->i_mode &= ~hsb->s_file_umask;
inode->i_mode |= S_IFREG;
- inode->i_ctime = inode->i_atime = inode->i_mtime =
- hfs_m_to_utime(rec->file.MdDat);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
+ hfs_m_to_utime(rec->file.MdDat));
inode->i_op = &hfs_file_inode_operations;
inode->i_fop = &hfs_file_operations;
inode->i_mapping->a_ops = &hfs_aops;
@@ -366,8 +366,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
inode->i_size = be16_to_cpu(rec->dir.Val) + 2;
HFS_I(inode)->fs_blocks = 0;
inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask);
- inode->i_ctime = inode->i_atime = inode->i_mtime =
- hfs_m_to_utime(rec->dir.MdDat);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
+ hfs_m_to_utime(rec->dir.MdDat));
inode->i_op = &hfs_dir_inode_operations;
inode->i_fop = &hfs_dir_operations;
break;
@@ -654,8 +654,7 @@ int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
truncate_setsize(inode, attr->ia_size);
hfs_file_truncate(inode);
- inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
setattr_copy(&nop_mnt_idmap, inode, attr);
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 2875961fdc10..dc27d418fbcd 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -28,7 +28,9 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags)
/* fix up inode on a timezone change */
diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest;
if (diff) {
- inode->i_ctime.tv_sec += diff;
+ struct timespec64 ctime = inode_get_ctime(inode);
+
+ inode_set_ctime(inode, ctime.tv_sec + diff, ctime.tv_nsec);
inode->i_atime.tv_sec += diff;
inode->i_mtime.tv_sec += diff;
HFS_I(inode)->tz_secondswest += diff;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 35472cba750e..e71ae2537eaa 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -312,7 +312,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
dir->i_size++;
if (S_ISDIR(inode->i_mode))
hfsplus_subfolders_inc(dir);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
hfs_find_exit(&fd);
@@ -417,7 +417,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(dir);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) {
@@ -494,7 +494,7 @@ int hfsplus_rename_cat(u32 cnid,
dst_dir->i_size++;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_inc(dst_dir);
- dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir);
+ dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
/* finally remove the old entry */
err = hfsplus_cat_build_key(sb, src_fd.search_key,
@@ -511,7 +511,7 @@ int hfsplus_rename_cat(u32 cnid,
src_dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(src_dir);
- src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir);
+ src_dir->i_mtime = inode_set_ctime_current(src_dir);
/* remove old thread entry */
hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 56fb5f1312e7..f5c4b3e31a1c 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -346,7 +346,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
inc_nlink(inode);
hfsplus_instantiate(dst_dentry, inode, cnid);
ihold(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
sbi->file_count++;
hfsplus_mark_mdb_dirty(dst_dir->i_sb);
@@ -405,7 +405,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
hfsplus_delete_inode(inode);
} else
sbi->file_count--;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
out:
mutex_unlock(&sbi->vh_mutex);
@@ -426,7 +426,7 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
if (res)
goto out;
clear_nlink(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
hfsplus_delete_inode(inode);
mark_inode_dirty(inode);
out:
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 7d1a675e037d..c65c8c4b03dd 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -267,7 +267,7 @@ static int hfsplus_setattr(struct mnt_idmap *idmap,
}
truncate_setsize(inode, attr->ia_size);
hfsplus_file_truncate(inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
}
setattr_copy(&nop_mnt_idmap, inode, attr);
@@ -298,7 +298,7 @@ int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP;
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
@@ -392,7 +392,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
inode->i_ino = sbi->next_cnid++;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
hip = HFSPLUS_I(inode);
INIT_LIST_HEAD(&hip->open_dir_list);
@@ -523,7 +523,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
inode->i_size = 2 + be32_to_cpu(folder->valence);
inode->i_atime = hfsp_mt2ut(folder->access_date);
inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
- inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
+ inode_set_ctime_to_ts(inode,
+ hfsp_mt2ut(folder->attribute_mod_date));
HFSPLUS_I(inode)->create_date = folder->create_date;
HFSPLUS_I(inode)->fs_blocks = 0;
if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
@@ -564,7 +565,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
}
inode->i_atime = hfsp_mt2ut(file->access_date);
inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
- inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
+ inode_set_ctime_to_ts(inode,
+ hfsp_mt2ut(file->attribute_mod_date));
HFSPLUS_I(inode)->create_date = file->create_date;
} else {
pr_err("bad catalog entry used to create inode\n");
@@ -609,7 +611,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
hfsplus_cat_set_perms(inode, &folder->permissions);
folder->access_date = hfsp_ut2mt(inode->i_atime);
folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
- folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
+ folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
folder->valence = cpu_to_be32(inode->i_size - 2);
if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
folder->subfolders =
@@ -644,7 +646,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
file->access_date = hfsp_ut2mt(inode->i_atime);
file->content_mod_date = hfsp_ut2mt(inode->i_mtime);
- file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
+ file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_file));
}
@@ -700,7 +702,7 @@ int hfsplus_fileattr_set(struct mnt_idmap *idmap,
else
hip->userflags &= ~HFSPLUS_FLG_NODUMP;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return 0;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 46387090eb76..dc5a5cea5fae 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -517,8 +517,7 @@ static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st)
(struct timespec64){ st->atime.tv_sec, st->atime.tv_nsec };
ino->i_mtime =
(struct timespec64){ st->mtime.tv_sec, st->mtime.tv_nsec };
- ino->i_ctime =
- (struct timespec64){ st->ctime.tv_sec, st->ctime.tv_nsec };
+ inode_set_ctime(ino, st->ctime.tv_sec, st->ctime.tv_nsec);
ino->i_size = st->size;
ino->i_blocks = st->blocks;
return 0;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index f32f15669996..f36566d61215 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -277,10 +277,10 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in
* inode.
*/
- if (!result->i_ctime.tv_sec) {
- if (!(result->i_ctime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date))))
- result->i_ctime.tv_sec = 1;
- result->i_ctime.tv_nsec = 0;
+ if (!inode_get_ctime(result).tv_sec) {
+ time64_t csec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date));
+
+ inode_set_ctime(result, csec ? csec : 1, 0);
result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date));
result->i_mtime.tv_nsec = 0;
result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date));
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index e50e92a42432..479166378bae 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -36,7 +36,7 @@ void hpfs_init_inode(struct inode *i)
hpfs_inode->i_rddir_off = NULL;
hpfs_inode->i_dirty = 0;
- i->i_ctime.tv_sec = i->i_ctime.tv_nsec = 0;
+ inode_set_ctime(i, 0, 0);
i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0;
i->i_atime.tv_sec = i->i_atime.tv_nsec = 0;
}
@@ -232,7 +232,7 @@ void hpfs_write_inode_nolock(struct inode *i)
if (de) {
de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
- de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec));
+ de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
de->read_only = !(i->i_mode & 0222);
de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size);
hpfs_mark_4buffers_dirty(&qbh);
@@ -242,7 +242,7 @@ void hpfs_write_inode_nolock(struct inode *i)
if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) {
de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
- de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec));
+ de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
de->read_only = !(i->i_mode & 0222);
de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0);
de->file_size = cpu_to_le32(0);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 69fb40b2c99a..f4eb8d6f5989 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -13,10 +13,9 @@ static void hpfs_update_directory_times(struct inode *dir)
{
time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb));
if (t == dir->i_mtime.tv_sec &&
- t == dir->i_ctime.tv_sec)
+ t == inode_get_ctime(dir).tv_sec)
return;
- dir->i_mtime.tv_sec = dir->i_ctime.tv_sec = t;
- dir->i_mtime.tv_nsec = dir->i_ctime.tv_nsec = 0;
+ dir->i_mtime = inode_set_ctime(dir, t, 0);
hpfs_write_inode_nolock(dir);
}
@@ -59,10 +58,8 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
result->i_ino = fno;
hpfs_i(result)->i_parent_dir = dir->i_ino;
hpfs_i(result)->i_dno = dno;
- result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
- result->i_ctime.tv_nsec = 0;
- result->i_mtime.tv_nsec = 0;
- result->i_atime.tv_nsec = 0;
+ result->i_mtime = result->i_atime =
+ inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
hpfs_i(result)->i_ea_size = 0;
result->i_mode |= S_IFDIR;
result->i_op = &hpfs_dir_iops;
@@ -167,10 +164,8 @@ static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir,
result->i_fop = &hpfs_file_ops;
set_nlink(result, 1);
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
- result->i_ctime.tv_nsec = 0;
- result->i_mtime.tv_nsec = 0;
- result->i_atime.tv_nsec = 0;
+ result->i_mtime = result->i_atime =
+ inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
hpfs_i(result)->i_ea_size = 0;
if (dee.read_only)
result->i_mode &= ~0222;
@@ -250,10 +245,8 @@ static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
hpfs_init_inode(result);
result->i_ino = fno;
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
- result->i_ctime.tv_nsec = 0;
- result->i_mtime.tv_nsec = 0;
- result->i_atime.tv_nsec = 0;
+ result->i_mtime = result->i_atime =
+ inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
hpfs_i(result)->i_ea_size = 0;
result->i_uid = current_fsuid();
result->i_gid = current_fsgid();
@@ -326,10 +319,8 @@ static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
result->i_ino = fno;
hpfs_init_inode(result);
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
- result->i_ctime.tv_nsec = 0;
- result->i_mtime.tv_nsec = 0;
- result->i_atime.tv_nsec = 0;
+ result->i_mtime = result->i_atime =
+ inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
hpfs_i(result)->i_ea_size = 0;
result->i_mode = S_IFLNK | 0777;
result->i_uid = current_fsuid();
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 1cb89595b875..758a51564124 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -729,8 +729,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
root->i_atime.tv_nsec = 0;
root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date));
root->i_mtime.tv_nsec = 0;
- root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date));
- root->i_ctime.tv_nsec = 0;
+ inode_set_ctime(root,
+ local_to_gmt(s, le32_to_cpu(de->creation_date)),
+ 0);
hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size);
hpfs_i(root)->i_parent_dir = root->i_ino;
if (root->i_size == -1)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7b17ccfa039d..93d3bcfd4fc8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -887,7 +887,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
out:
inode_unlock(inode);
return error;
@@ -935,7 +935,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
inode->i_mode = S_IFDIR | ctx->mode;
inode->i_uid = ctx->uid;
inode->i_gid = ctx->gid;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_op = &hugetlbfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -979,7 +979,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_mapping->private_data = resv_map;
info->seals = F_SEAL_SEAL;
switch (mode & S_IFMT) {
@@ -1022,7 +1022,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
if (!inode)
return -ENOSPC;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
d_instantiate(dentry, inode);
dget(dentry);/* Extra count - pin the dentry in core */
return 0;
@@ -1054,7 +1054,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
if (!inode)
return -ENOSPC;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
d_tmpfile(file, inode);
return finish_open_simple(file, 0);
}
@@ -1076,7 +1076,7 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap,
} else
iput(inode);
}
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
return error;
}
diff --git a/fs/inode.c b/fs/inode.c
index 67611a360031..35fd688168c5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -751,16 +751,11 @@ EXPORT_SYMBOL_GPL(evict_inodes);
/**
* invalidate_inodes - attempt to free all inodes on a superblock
* @sb: superblock to operate on
- * @kill_dirty: flag to guide handling of dirty inodes
*
- * Attempts to free all inodes for a given superblock. If there were any
- * busy inodes return a non-zero value, else zero.
- * If @kill_dirty is set, discard dirty inodes too, otherwise treat
- * them as busy.
+ * Attempts to free all inodes (including dirty inodes) for a given superblock.
*/
-int invalidate_inodes(struct super_block *sb, bool kill_dirty)
+void invalidate_inodes(struct super_block *sb)
{
- int busy = 0;
struct inode *inode, *next;
LIST_HEAD(dispose);
@@ -772,14 +767,8 @@ again:
spin_unlock(&inode->i_lock);
continue;
}
- if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
- spin_unlock(&inode->i_lock);
- busy = 1;
- continue;
- }
if (atomic_read(&inode->i_count)) {
spin_unlock(&inode->i_lock);
- busy = 1;
continue;
}
@@ -797,8 +786,6 @@ again:
spin_unlock(&sb->s_inode_list_lock);
dispose_list(&dispose);
-
- return busy;
}
/*
@@ -1850,6 +1837,7 @@ EXPORT_SYMBOL(bmap);
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
struct timespec64 now)
{
+ struct timespec64 ctime;
if (!(mnt->mnt_flags & MNT_RELATIME))
return 1;
@@ -1861,7 +1849,8 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
/*
* Is ctime younger than or equal to atime? If yes, update atime:
*/
- if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
+ ctime = inode_get_ctime(inode);
+ if (timespec64_compare(&ctime, &inode->i_atime) >= 0)
return 1;
/*
@@ -1876,29 +1865,76 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
return 0;
}
-int generic_update_time(struct inode *inode, struct timespec64 *time, int flags)
+/**
+ * inode_update_timestamps - update the timestamps on the inode
+ * @inode: inode to be updated
+ * @flags: S_* flags that needed to be updated
+ *
+ * The update_time function is called when an inode's timestamps need to be
+ * updated for a read or write operation. This function handles updating the
+ * actual timestamps. It's up to the caller to ensure that the inode is marked
+ * dirty appropriately.
+ *
+ * In the case where any of S_MTIME, S_CTIME, or S_VERSION needs to be updated,
+ * attempt to update all three of them. S_ATIME updates can be handled
+ * independently of the rest.
+ *
+ * Returns a set of S_* flags indicating which values changed.
+ */
+int inode_update_timestamps(struct inode *inode, int flags)
{
- int dirty_flags = 0;
+ int updated = 0;
+ struct timespec64 now;
+
+ if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
+ struct timespec64 ctime = inode_get_ctime(inode);
- if (flags & (S_ATIME | S_CTIME | S_MTIME)) {
- if (flags & S_ATIME)
- inode->i_atime = *time;
- if (flags & S_CTIME)
- inode->i_ctime = *time;
- if (flags & S_MTIME)
- inode->i_mtime = *time;
-
- if (inode->i_sb->s_flags & SB_LAZYTIME)
- dirty_flags |= I_DIRTY_TIME;
- else
- dirty_flags |= I_DIRTY_SYNC;
+ now = inode_set_ctime_current(inode);
+ if (!timespec64_equal(&now, &ctime))
+ updated |= S_CTIME;
+ if (!timespec64_equal(&now, &inode->i_mtime)) {
+ inode->i_mtime = now;
+ updated |= S_MTIME;
+ }
+ if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
+ updated |= S_VERSION;
+ } else {
+ now = current_time(inode);
}
- if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false))
- dirty_flags |= I_DIRTY_SYNC;
+ if (flags & S_ATIME) {
+ if (!timespec64_equal(&now, &inode->i_atime)) {
+ inode->i_atime = now;
+ updated |= S_ATIME;
+ }
+ }
+ return updated;
+}
+EXPORT_SYMBOL(inode_update_timestamps);
+/**
+ * generic_update_time - update the timestamps on the inode
+ * @inode: inode to be updated
+ * @flags: S_* flags that needed to be updated
+ *
+ * The update_time function is called when an inode's timestamps need to be
+ * updated for a read or write operation. In the case where any of S_MTIME,
+ * S_CTIME, or S_VERSION needs to be updated, we attempt to update all three
+ * of them. S_ATIME updates can be handled independently of the rest.
+ *
+ * Returns an S_* mask indicating which fields were updated.
+ */
+int generic_update_time(struct inode *inode, int flags)
+{
+ int updated = inode_update_timestamps(inode, flags);
+ int dirty_flags = 0;
+
+ if (updated & (S_ATIME|S_MTIME|S_CTIME))
+ dirty_flags = inode->i_sb->s_flags & SB_LAZYTIME ? I_DIRTY_TIME : I_DIRTY_SYNC;
+ if (updated & S_VERSION)
+ dirty_flags |= I_DIRTY_SYNC;
__mark_inode_dirty(inode, dirty_flags);
- return 0;
+ return updated;
}
EXPORT_SYMBOL(generic_update_time);
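Filesystems with a custom ->update_time now follow the shape of gfs2_update_time() above; a hedged sketch for a hypothetical foo_fs:

/*
 * Sketch, hypothetical foo_fs: ->update_time no longer receives a
 * timespec64; generic_update_time() fetches the current time itself
 * via inode_update_timestamps() and returns the S_* mask of fields
 * that actually changed.
 */
static int foo_update_time(struct inode *inode, int flags)
{
        /* take fs-private locks here if timestamps live on disk */
        generic_update_time(inode, flags);
        return 0;
}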
@@ -1906,11 +1942,12 @@ EXPORT_SYMBOL(generic_update_time);
* This does the actual work of updating an inode's time or version. The caller
* must have called mnt_want_write() before calling this.
*/
-int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
+int inode_update_time(struct inode *inode, int flags)
{
if (inode->i_op->update_time)
- return inode->i_op->update_time(inode, time, flags);
- return generic_update_time(inode, time, flags);
+ return inode->i_op->update_time(inode, flags);
+ generic_update_time(inode, flags);
+ return 0;
}
EXPORT_SYMBOL(inode_update_time);
@@ -1962,7 +1999,6 @@ void touch_atime(const struct path *path)
{
struct vfsmount *mnt = path->mnt;
struct inode *inode = d_inode(path->dentry);
- struct timespec64 now;
if (!atime_needs_update(path, inode))
return;
@@ -1981,8 +2017,7 @@ void touch_atime(const struct path *path)
* We may also fail on filesystems that have the ability to make parts
* of the fs read only, e.g. subvolumes in Btrfs.
*/
- now = current_time(inode);
- inode_update_time(inode, &now, S_ATIME);
+ inode_update_time(inode, S_ATIME);
__mnt_drop_write(mnt);
skip_update:
sb_end_write(inode->i_sb);
@@ -2067,18 +2102,63 @@ int file_remove_privs(struct file *file)
}
EXPORT_SYMBOL(file_remove_privs);
-static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
+/**
+ * current_mgtime - Return FS time (possibly fine-grained)
+ * @inode: inode.
+ *
+ * Return the current time truncated to the time granularity supported by
+ * the fs, as suitable for a ctime/mtime change. If the ctime is flagged
+ * as having been QUERIED, get a fine-grained timestamp.
+ */
+struct timespec64 current_mgtime(struct inode *inode)
+{
+ struct timespec64 now, ctime;
+ atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec;
+ long nsec = atomic_long_read(pnsec);
+
+ if (nsec & I_CTIME_QUERIED) {
+ ktime_get_real_ts64(&now);
+ return timestamp_truncate(now, inode);
+ }
+
+ ktime_get_coarse_real_ts64(&now);
+ now = timestamp_truncate(now, inode);
+
+ /*
+ * If we've recently fetched a fine-grained timestamp
+ * then the coarse-grained one may still be earlier than the
+ * existing ctime. Just keep the existing value if so.
+ */
+ ctime = inode_get_ctime(inode);
+ if (timespec64_compare(&ctime, &now) > 0)
+ now = ctime;
+
+ return now;
+}
+EXPORT_SYMBOL(current_mgtime);
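The choice current_mgtime() makes between clock sources can be summarized by this illustrative helper (foo_pick_ts() is hypothetical; the ktime accessors are existing kernel APIs):

/*
 * Illustration: coarse timestamps are cheap but only tick per jiffy.
 * A fine-grained read is needed only when the current ctime has been
 * QUERIED, i.e. someone observed it and could otherwise miss a change.
 */
static struct timespec64 foo_pick_ts(struct inode *inode, bool queried)
{
        struct timespec64 now;

        if (queried)
                ktime_get_real_ts64(&now);          /* fine-grained */
        else
                ktime_get_coarse_real_ts64(&now);   /* coarse */
        return timestamp_truncate(now, inode);
}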
+
+static struct timespec64 current_ctime(struct inode *inode)
+{
+ if (is_mgtime(inode))
+ return current_mgtime(inode);
+ return current_time(inode);
+}
+
+static int inode_needs_update_time(struct inode *inode)
{
int sync_it = 0;
+ struct timespec64 now = current_ctime(inode);
+ struct timespec64 ctime;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
- if (!timespec64_equal(&inode->i_mtime, now))
+ if (!timespec64_equal(&inode->i_mtime, &now))
sync_it = S_MTIME;
- if (!timespec64_equal(&inode->i_ctime, now))
+ ctime = inode_get_ctime(inode);
+ if (!timespec64_equal(&ctime, &now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2087,15 +2167,14 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
return sync_it;
}
-static int __file_update_time(struct file *file, struct timespec64 *now,
- int sync_mode)
+static int __file_update_time(struct file *file, int sync_mode)
{
int ret = 0;
struct inode *inode = file_inode(file);
/* try to update time settings */
if (!__mnt_want_write_file(file)) {
- ret = inode_update_time(inode, now, sync_mode);
+ ret = inode_update_time(inode, sync_mode);
__mnt_drop_write_file(file);
}
@@ -2120,13 +2199,12 @@ int file_update_time(struct file *file)
{
int ret;
struct inode *inode = file_inode(file);
- struct timespec64 now = current_time(inode);
- ret = inode_needs_update_time(inode, &now);
+ ret = inode_needs_update_time(inode);
if (ret <= 0)
return ret;
- return __file_update_time(file, &now, ret);
+ return __file_update_time(file, ret);
}
EXPORT_SYMBOL(file_update_time);
@@ -2149,7 +2227,6 @@ static int file_modified_flags(struct file *file, int flags)
{
int ret;
struct inode *inode = file_inode(file);
- struct timespec64 now = current_time(inode);
/*
* Clear the security bits if the process is not being run by root.
@@ -2162,13 +2239,13 @@ static int file_modified_flags(struct file *file, int flags)
if (unlikely(file->f_mode & FMODE_NOCMTIME))
return 0;
- ret = inode_needs_update_time(inode, &now);
+ ret = inode_needs_update_time(inode);
if (ret <= 0)
return ret;
if (flags & IOCB_NOWAIT)
return -EAGAIN;
- return __file_update_time(file, &now, ret);
+ return __file_update_time(file, ret);
}
/**
@@ -2488,15 +2565,59 @@ struct timespec64 current_time(struct inode *inode)
struct timespec64 now;
ktime_get_coarse_real_ts64(&now);
+ return timestamp_truncate(now, inode);
+}
+EXPORT_SYMBOL(current_time);
- if (unlikely(!inode->i_sb)) {
- WARN(1, "current_time() called with uninitialized super_block in the inode");
+/**
+ * inode_set_ctime_current - set the ctime to current_time
+ * @inode: inode
+ *
+ * Set the inode->i_ctime to the current value for the inode. Returns
+ * the current value that was assigned to i_ctime.
+ */
+struct timespec64 inode_set_ctime_current(struct inode *inode)
+{
+ struct timespec64 now;
+ struct timespec64 ctime;
+
+ ctime.tv_nsec = READ_ONCE(inode->__i_ctime.tv_nsec);
+ if (!(ctime.tv_nsec & I_CTIME_QUERIED)) {
+ now = current_time(inode);
+
+ /* Just copy it into place if it's not multigrain */
+ if (!is_mgtime(inode)) {
+ inode_set_ctime_to_ts(inode, now);
+ return now;
+ }
+
+ /*
+ * If we've recently updated with a fine-grained timestamp,
+ * then the coarse-grained one may still be earlier than the
+ * existing ctime. Just keep the existing value if so.
+ */
+ ctime.tv_sec = inode->__i_ctime.tv_sec;
+ if (timespec64_compare(&ctime, &now) > 0)
+ return ctime;
+
+ /*
+ * Ctime updates are usually protected by the inode_lock, but
+ * we can still race with someone setting the QUERIED flag.
+ * Try to swap the new nsec value into place. If it's changed
+ * in the interim, then just go with a fine-grained timestamp.
+ */
+ if (cmpxchg(&inode->__i_ctime.tv_nsec, ctime.tv_nsec,
+ now.tv_nsec) != ctime.tv_nsec)
+ goto fine_grained;
+ inode->__i_ctime.tv_sec = now.tv_sec;
return now;
}
-
- return timestamp_truncate(now, inode);
+fine_grained:
+ ktime_get_real_ts64(&now);
+ inode_set_ctime_to_ts(inode, timestamp_truncate(now, inode));
+ return now;
}
-EXPORT_SYMBOL(current_time);
+EXPORT_SYMBOL(inode_set_ctime_current);
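On the read side, filesystems that serialize timestamps to disk now go through inode_get_ctime(), as the gfs2 and hfsplus hunks above do; a sketch with a hypothetical on-disk structure:

/*
 * Sketch, hypothetical struct foo_dinode: with i_ctime hidden behind
 * accessors (__i_ctime), on-disk serialization reads the value back
 * via inode_get_ctime() instead of touching the field directly.
 */
static void foo_dinode_out(struct inode *inode, struct foo_dinode *di)
{
        struct timespec64 ctime = inode_get_ctime(inode);

        di->di_ctime = cpu_to_be64(ctime.tv_sec);
        di->di_ctime_nsec = cpu_to_be32(ctime.tv_nsec);
}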
/**
* in_group_or_capable - check whether caller is CAP_FSETID privileged
diff --git a/fs/internal.h b/fs/internal.h
index f7a3dc111026..74d3b161dd2c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -115,7 +115,7 @@ static inline void put_file_access(struct file *file)
* super.c
*/
extern int reconfigure_super(struct fs_context *);
-extern bool trylock_super(struct super_block *sb);
+extern bool super_trylock_shared(struct super_block *sb);
struct super_block *user_get_super(dev_t, bool excl);
void put_super(struct super_block *sb);
extern bool mount_capable(struct fs_context *);
@@ -201,7 +201,7 @@ void lock_two_inodes(struct inode *inode1, struct inode *inode2,
* fs-writeback.c
*/
extern long get_nr_dirty_inodes(void);
-extern int invalidate_inodes(struct super_block *, bool);
+void invalidate_inodes(struct super_block *sb);
/*
* dcache.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5b2481cd4750..f5fd99d6b0d4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -109,9 +109,6 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
* Returns 0 on success, -errno on error, 1 if this was the last
* extent that will fit in user array.
*/
-#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
-#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
-#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
u64 phys, u64 len, u32 flags)
{
@@ -127,6 +124,10 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
return 1;
+#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
+#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
+#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
+
if (flags & SET_UNKNOWN_FLAGS)
flags |= FIEMAP_EXTENT_UNKNOWN;
if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
@@ -396,8 +397,8 @@ static int ioctl_fsfreeze(struct file *filp)
/* Freeze */
if (sb->s_op->freeze_super)
- return sb->s_op->freeze_super(sb);
- return freeze_super(sb);
+ return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+ return freeze_super(sb, FREEZE_HOLDER_USERSPACE);
}
static int ioctl_fsthaw(struct file *filp)
@@ -409,8 +410,8 @@ static int ioctl_fsthaw(struct file *filp)
/* Thaw */
if (sb->s_op->thaw_super)
- return sb->s_op->thaw_super(sb);
- return thaw_super(sb);
+ return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+ return thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}
static int ioctl_file_dedupe_range(struct file *file,
@@ -877,6 +878,9 @@ out:
#ifdef CONFIG_COMPAT
/**
* compat_ptr_ioctl - generic implementation of .compat_ioctl file operation
+ * @file: The file to operate on.
+ * @cmd: The ioctl command number.
+ * @arg: The argument to the ioctl.
*
* This is not normally called as a function, but instead set in struct
* file_operations as
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index aa8967cca1a3..283fb96f6609 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -23,65 +23,169 @@
#define IOEND_BATCH_SIZE 4096
+typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
/*
- * Structure allocated for each folio when block size < folio size
- * to track sub-folio uptodate status and I/O completions.
+ * Structure allocated for each folio to track per-block uptodate, dirty state
+ * and I/O completions.
*/
-struct iomap_page {
+struct iomap_folio_state {
atomic_t read_bytes_pending;
atomic_t write_bytes_pending;
- spinlock_t uptodate_lock;
- unsigned long uptodate[];
+ spinlock_t state_lock;
+
+ /*
+ * Each block has two bits in this bitmap:
+ * Bits [0..blocks_per_folio) hold the uptodate status.
+ * Bits [blocks_per_folio..2 * blocks_per_folio) hold the dirty status.
+ */
+ unsigned long state[];
};
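The split bitmap indexing is worth spelling out; the helper below is purely illustrative (the real accessors follow in this diff):

/*
 * Illustration only: for a folio covering N = i_blocks_per_folio()
 * blocks, bit i of state[] is block i's uptodate flag and bit i + N
 * is its dirty flag. ifs_block_is_dirty() below open-codes this.
 */
static inline unsigned int ifs_dirty_bitno(struct inode *inode,
                struct folio *folio, unsigned int block)
{
        return block + i_blocks_per_folio(inode, folio);
}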
-static inline struct iomap_page *to_iomap_page(struct folio *folio)
+static struct bio_set iomap_ioend_bioset;
+
+static inline bool ifs_is_fully_uptodate(struct folio *folio,
+ struct iomap_folio_state *ifs)
{
- if (folio_test_private(folio))
- return folio_get_private(folio);
- return NULL;
+ struct inode *inode = folio->mapping->host;
+
+ return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
}
-static struct bio_set iomap_ioend_bioset;
+static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
+ unsigned int block)
+{
+ return test_bit(block, ifs->state);
+}
+
+static void ifs_set_range_uptodate(struct folio *folio,
+ struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned int first_blk = off >> inode->i_blkbits;
+ unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+ unsigned int nr_blks = last_blk - first_blk + 1;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ bitmap_set(ifs->state, first_blk, nr_blks);
+ if (ifs_is_fully_uptodate(folio, ifs))
+ folio_mark_uptodate(folio);
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_set_range_uptodate(struct folio *folio, size_t off,
+ size_t len)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (ifs)
+ ifs_set_range_uptodate(folio, ifs, off, len);
+ else
+ folio_mark_uptodate(folio);
+}
+
+static inline bool ifs_block_is_dirty(struct folio *folio,
+ struct iomap_folio_state *ifs, int block)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+
+ return test_bit(block + blks_per_folio, ifs->state);
+}
+
+static void ifs_clear_range_dirty(struct folio *folio,
+ struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+ unsigned int first_blk = (off >> inode->i_blkbits);
+ unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+ unsigned int nr_blks = last_blk - first_blk + 1;
+ unsigned long flags;
-static struct iomap_page *
-iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
{
- struct iomap_page *iop = to_iomap_page(folio);
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (ifs)
+ ifs_clear_range_dirty(folio, ifs, off, len);
+}
+
+static void ifs_set_range_dirty(struct folio *folio,
+ struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+ unsigned int first_blk = (off >> inode->i_blkbits);
+ unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+ unsigned int nr_blks = last_blk - first_blk + 1;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (ifs)
+ ifs_set_range_dirty(folio, ifs, off, len);
+}
+
+static struct iomap_folio_state *ifs_alloc(struct inode *inode,
+ struct folio *folio, unsigned int flags)
+{
+ struct iomap_folio_state *ifs = folio->private;
unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
gfp_t gfp;
- if (iop || nr_blocks <= 1)
- return iop;
+ if (ifs || nr_blocks <= 1)
+ return ifs;
if (flags & IOMAP_NOWAIT)
gfp = GFP_NOWAIT;
else
gfp = GFP_NOFS | __GFP_NOFAIL;
- iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
- gfp);
- if (iop) {
- spin_lock_init(&iop->uptodate_lock);
- if (folio_test_uptodate(folio))
- bitmap_fill(iop->uptodate, nr_blocks);
- folio_attach_private(folio, iop);
- }
- return iop;
+ /*
+ * ifs->state tracks two sets of state flags when the
+ * filesystem block size is smaller than the folio size.
+ * The first set tracks per-block uptodate state and the
+ * second tracks per-block dirty state.
+ */
+ ifs = kzalloc(struct_size(ifs, state,
+ BITS_TO_LONGS(2 * nr_blocks)), gfp);
+ if (!ifs)
+ return ifs;
+
+ spin_lock_init(&ifs->state_lock);
+ if (folio_test_uptodate(folio))
+ bitmap_set(ifs->state, 0, nr_blocks);
+ if (folio_test_dirty(folio))
+ bitmap_set(ifs->state, nr_blocks, nr_blocks);
+ folio_attach_private(folio, ifs);
+
+ return ifs;
}
-static void iomap_page_release(struct folio *folio)
+static void ifs_free(struct folio *folio)
{
- struct iomap_page *iop = folio_detach_private(folio);
- struct inode *inode = folio->mapping->host;
- unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+ struct iomap_folio_state *ifs = folio_detach_private(folio);
- if (!iop)
+ if (!ifs)
return;
- WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
- WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
- WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
+ WARN_ON_ONCE(atomic_read(&ifs->read_bytes_pending));
+ WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending));
+ WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) !=
folio_test_uptodate(folio));
- kfree(iop);
+ kfree(ifs);
}
/*
@@ -90,7 +194,7 @@ static void iomap_page_release(struct folio *folio)
static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
{
- struct iomap_page *iop = to_iomap_page(folio);
+ struct iomap_folio_state *ifs = folio->private;
loff_t orig_pos = *pos;
loff_t isize = i_size_read(inode);
unsigned block_bits = inode->i_blkbits;
@@ -105,12 +209,12 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
* per-block uptodate status and adjust the offset and length if needed
* to avoid reading in already uptodate ranges.
*/
- if (iop) {
+ if (ifs) {
unsigned int i;
/* move forward for each leading block marked uptodate */
for (i = first; i <= last; i++) {
- if (!test_bit(i, iop->uptodate))
+ if (!ifs_block_is_uptodate(ifs, i))
break;
*pos += block_size;
poff += block_size;
@@ -120,7 +224,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
/* truncate len if we find any trailing uptodate block(s) */
for ( ; i <= last; i++) {
- if (test_bit(i, iop->uptodate)) {
+ if (ifs_block_is_uptodate(ifs, i)) {
plen -= (last - i + 1) * block_size;
last = i - 1;
break;
@@ -144,43 +248,19 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
*lenp = plen;
}
-static void iomap_iop_set_range_uptodate(struct folio *folio,
- struct iomap_page *iop, size_t off, size_t len)
-{
- struct inode *inode = folio->mapping->host;
- unsigned first = off >> inode->i_blkbits;
- unsigned last = (off + len - 1) >> inode->i_blkbits;
- unsigned long flags;
-
- spin_lock_irqsave(&iop->uptodate_lock, flags);
- bitmap_set(iop->uptodate, first, last - first + 1);
- if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
- folio_mark_uptodate(folio);
- spin_unlock_irqrestore(&iop->uptodate_lock, flags);
-}
-
-static void iomap_set_range_uptodate(struct folio *folio,
- struct iomap_page *iop, size_t off, size_t len)
-{
- if (iop)
- iomap_iop_set_range_uptodate(folio, iop, off, len);
- else
- folio_mark_uptodate(folio);
-}
-
static void iomap_finish_folio_read(struct folio *folio, size_t offset,
size_t len, int error)
{
- struct iomap_page *iop = to_iomap_page(folio);
+ struct iomap_folio_state *ifs = folio->private;
if (unlikely(error)) {
folio_clear_uptodate(folio);
folio_set_error(folio);
} else {
- iomap_set_range_uptodate(folio, iop, offset, len);
+ iomap_set_range_uptodate(folio, offset, len);
}
- if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
+ if (!ifs || atomic_sub_and_test(len, &ifs->read_bytes_pending))
folio_unlock(folio);
}
@@ -213,7 +293,6 @@ struct iomap_readpage_ctx {
static int iomap_read_inline_data(const struct iomap_iter *iter,
struct folio *folio)
{
- struct iomap_page *iop;
const struct iomap *iomap = iomap_iter_srcmap(iter);
size_t size = i_size_read(iter->inode) - iomap->offset;
size_t poff = offset_in_page(iomap->offset);
@@ -231,15 +310,13 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
if (WARN_ON_ONCE(size > iomap->length))
return -EIO;
if (offset > 0)
- iop = iomap_page_create(iter->inode, folio, iter->flags);
- else
- iop = to_iomap_page(folio);
+ ifs_alloc(iter->inode, folio, iter->flags);
addr = kmap_local_folio(folio, offset);
memcpy(addr, iomap->inline_data, size);
memset(addr + size, 0, PAGE_SIZE - poff - size);
kunmap_local(addr);
- iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
+ iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff);
return 0;
}
@@ -260,7 +337,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
loff_t pos = iter->pos + offset;
loff_t length = iomap_length(iter) - offset;
struct folio *folio = ctx->cur_folio;
- struct iomap_page *iop;
+ struct iomap_folio_state *ifs;
loff_t orig_pos = pos;
size_t poff, plen;
sector_t sector;
@@ -269,20 +346,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio);
/* zero post-eof blocks as the page may be mapped */
- iop = iomap_page_create(iter->inode, folio, iter->flags);
+ ifs = ifs_alloc(iter->inode, folio, iter->flags);
iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
if (plen == 0)
goto done;
if (iomap_block_needs_zeroing(iter, pos)) {
folio_zero_range(folio, poff, plen);
- iomap_set_range_uptodate(folio, iop, poff, plen);
+ iomap_set_range_uptodate(folio, poff, plen);
goto done;
}
ctx->cur_folio_in_bio = true;
- if (iop)
- atomic_add(plen, &iop->read_bytes_pending);
+ if (ifs)
+ atomic_add(plen, &ifs->read_bytes_pending);
sector = iomap_sector(iomap, pos);
if (!ctx->bio ||
@@ -436,11 +513,11 @@ EXPORT_SYMBOL_GPL(iomap_readahead);
*/
bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
- struct iomap_page *iop = to_iomap_page(folio);
+ struct iomap_folio_state *ifs = folio->private;
struct inode *inode = folio->mapping->host;
unsigned first, last, i;
- if (!iop)
+ if (!ifs)
return false;
/* Caller's range may extend past the end of this folio */
@@ -451,7 +528,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
last = (from + count - 1) >> inode->i_blkbits;
for (i = first; i <= last; i++)
- if (!test_bit(i, iop->uptodate))
+ if (!ifs_block_is_uptodate(ifs, i))
return false;
return true;
}
@@ -461,16 +538,18 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
* iomap_get_folio - get a folio reference for writing
* @iter: iteration structure
* @pos: start offset of write
+ * @len: Suggested size of folio to create.
*
* Returns a locked reference to the folio at @pos, or an error pointer if the
* folio could not be obtained.
*/
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
{
- unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
+ fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS;
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
+ fgp |= fgf_set_order(len);
return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
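fgf_set_order() folds a preferred folio size into the fgf_t lookup flags; a hedged usage sketch (foo_get_write_folio() is hypothetical):

/*
 * Sketch: ask the page cache for a folio sized to the pending write
 * rather than a single page. The order encoded by fgf_set_order() is
 * a hint; the allocation may still fall back to smaller folios.
 */
static struct folio *foo_get_write_folio(struct address_space *mapping,
                pgoff_t index, size_t len)
{
        fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS | fgf_set_order(len);

        return __filemap_get_folio(mapping, index, fgp,
                                   mapping_gfp_mask(mapping));
}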
@@ -483,14 +562,13 @@ bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
folio_size(folio));
/*
- * mm accommodates an old ext3 case where clean folios might
- * not have had the dirty bit cleared. Thus, it can send actual
- * dirty folios to ->release_folio() via shrink_active_list();
- * skip those here.
+ * If the folio is dirty, we refuse to release our metadata because
+ * it may be partially dirty. Once we track per-block dirty state,
+ * we can release the metadata if every block is dirty.
*/
- if (folio_test_dirty(folio) || folio_test_writeback(folio))
+ if (folio_test_dirty(folio))
return false;
- iomap_page_release(folio);
+ ifs_free(folio);
return true;
}
EXPORT_SYMBOL_GPL(iomap_release_folio);
@@ -507,16 +585,22 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
if (offset == 0 && len == folio_size(folio)) {
WARN_ON_ONCE(folio_test_writeback(folio));
folio_cancel_dirty(folio);
- iomap_page_release(folio);
- } else if (folio_test_large(folio)) {
- /* Must release the iop so the page can be split */
- WARN_ON_ONCE(!folio_test_uptodate(folio) &&
- folio_test_dirty(folio));
- iomap_page_release(folio);
+ ifs_free(folio);
}
}
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ size_t len = folio_size(folio);
+
+ ifs_alloc(inode, folio, 0);
+ iomap_set_range_dirty(folio, 0, len);
+ return filemap_dirty_folio(mapping, folio);
+}
+EXPORT_SYMBOL_GPL(iomap_dirty_folio);
+
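A filesystem built on iomap opts in to per-block dirty tracking by routing ->dirty_folio through the new helper; a sketch with a hypothetical aops table:

/*
 * Sketch, hypothetical foo_aops: wiring ->dirty_folio to
 * iomap_dirty_folio ensures an iomap_folio_state is allocated and all
 * blocks in the folio are marked dirty before filemap_dirty_folio()
 * dirties the folio itself.
 */
static const struct address_space_operations foo_aops = {
        .dirty_folio            = iomap_dirty_folio,
        .release_folio          = iomap_release_folio,
        .invalidate_folio       = iomap_invalidate_folio,
        /* read/write paths wrap the other iomap helpers as usual */
};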
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
@@ -547,7 +631,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t len, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- struct iomap_page *iop;
+ struct iomap_folio_state *ifs;
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
@@ -555,14 +639,23 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t from = offset_in_folio(folio, pos), to = from + len;
size_t poff, plen;
- if (folio_test_uptodate(folio))
+ /*
+ * If the write completely overlaps the current folio, then the
+ * entire folio will be dirtied, so there is no need for
+ * per-block state tracking structures to be attached to this folio.
+ */
+ if (pos <= folio_pos(folio) &&
+ pos + len >= folio_pos(folio) + folio_size(folio))
return 0;
- folio_clear_error(folio);
- iop = iomap_page_create(iter->inode, folio, iter->flags);
- if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
+ ifs = ifs_alloc(iter->inode, folio, iter->flags);
+ if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1)
return -EAGAIN;
+ if (folio_test_uptodate(folio))
+ return 0;
+ folio_clear_error(folio);
+
do {
iomap_adjust_read_range(iter->inode, folio, &block_start,
block_end - block_start, &poff, &plen);
@@ -589,7 +682,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
if (status)
return status;
}
- iomap_set_range_uptodate(folio, iop, poff, plen);
+ iomap_set_range_uptodate(folio, poff, plen);
} while ((block_start += plen) < block_end);
return 0;
@@ -603,7 +696,7 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
if (folio_ops && folio_ops->get_folio)
return folio_ops->get_folio(iter, pos, len);
else
- return iomap_get_folio(iter, pos);
+ return iomap_get_folio(iter, pos, len);
}
static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
@@ -696,7 +789,6 @@ out_unlock:
static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
- struct iomap_page *iop = to_iomap_page(folio);
flush_dcache_folio(folio);
/*
@@ -712,7 +804,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
*/
if (unlikely(copied < len && !folio_test_uptodate(folio)))
return 0;
- iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
+ iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
+ iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
filemap_dirty_folio(inode->i_mapping, folio);
return copied;
}
@@ -773,6 +866,7 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
loff_t length = iomap_length(iter);
+ size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
loff_t pos = iter->pos;
ssize_t written = 0;
long status = 0;
@@ -781,15 +875,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
do {
struct folio *folio;
- struct page *page;
- unsigned long offset; /* Offset into pagecache page */
- unsigned long bytes; /* Bytes to write to page */
+ size_t offset; /* Offset into folio */
+ size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
- offset = offset_in_page(pos);
- bytes = min_t(unsigned long, PAGE_SIZE - offset,
- iov_iter_count(i));
-again:
+ offset = pos & (chunk - 1);
+ bytes = min(chunk - offset, iov_iter_count(i));
status = balance_dirty_pages_ratelimited_flags(mapping,
bdp_flags);
if (unlikely(status))
@@ -819,12 +910,14 @@ again:
if (iter->iomap.flags & IOMAP_F_STALE)
break;
- page = folio_file_page(folio, pos >> PAGE_SHIFT);
- if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ offset = offset_in_folio(folio, pos);
+ if (bytes > folio_size(folio) - offset)
+ bytes = folio_size(folio) - offset;
- copied = copy_page_from_iter_atomic(page, offset, bytes, i);
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
status = iomap_write_end(iter, pos, bytes, copied, folio);
if (unlikely(copied != status))
@@ -840,11 +933,13 @@ again:
*/
if (copied)
bytes = copied;
- goto again;
+ if (chunk > PAGE_SIZE)
+ chunk /= 2;
+ } else {
+ pos += status;
+ written += status;
+ length -= status;
}
- pos += status;
- written += status;
- length -= status;
} while (iov_iter_count(i) && length);
if (status == -EAGAIN) {
@@ -880,6 +975,76 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
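Note the retry policy in the copy loop above: writes start with the largest pagecache chunk, PAGE_SIZE << MAX_PAGECACHE_ORDER, and fall back by halving whenever the atomic copy comes up short. A standalone model of that fallback, as a hedged sketch (next_copy_chunk() is a made-up name):

	/*
	 * Each short copy halves the chunk used to size the next folio,
	 * bottoming out at a single page.
	 */
	static size_t next_copy_chunk(size_t chunk, size_t copied, size_t bytes)
	{
		if (copied != bytes && chunk > PAGE_SIZE)
			chunk /= 2;
		return chunk;
	}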
+static int iomap_write_delalloc_ifs_punch(struct inode *inode,
+ struct folio *folio, loff_t start_byte, loff_t end_byte,
+ iomap_punch_t punch)
+{
+ unsigned int first_blk, last_blk, i;
+ loff_t last_byte;
+ u8 blkbits = inode->i_blkbits;
+ struct iomap_folio_state *ifs;
+ int ret = 0;
+
+ /*
+ * When we have per-block dirty tracking, there can be
+ * blocks within a folio which are marked uptodate
+ * but not dirty. In that case it is necessary to punch
+ * out such blocks to avoid leaking any delalloc blocks.
+ */
+ ifs = folio->private;
+ if (!ifs)
+ return ret;
+
+ last_byte = min_t(loff_t, end_byte - 1,
+ folio_pos(folio) + folio_size(folio) - 1);
+ first_blk = offset_in_folio(folio, start_byte) >> blkbits;
+ last_blk = offset_in_folio(folio, last_byte) >> blkbits;
+ for (i = first_blk; i <= last_blk; i++) {
+ if (!ifs_block_is_dirty(folio, ifs, i)) {
+ ret = punch(inode, folio_pos(folio) + (i << blkbits),
+ 1 << blkbits);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return ret;
+}
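To make the index arithmetic above concrete, a worked example with illustrative numbers: 4 KiB blocks (i_blkbits == 12), a 64 KiB folio at file offset 0, and a delalloc range of start_byte = 8192, end_byte = 20480:

	last_byte = min(20480 - 1, 0 + 65536 - 1) = 20479
	first_blk = offset_in_folio(folio, 8192)  >> 12 = 2
	last_blk  = offset_in_folio(folio, 20479) >> 12 = 4

Blocks 2 through 4 are inspected, and any of them that is not marked dirty is punched out individually.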
+
+static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
+ loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
+ iomap_punch_t punch)
+{
+ int ret = 0;
+
+ if (!folio_test_dirty(folio))
+ return ret;
+
+ /* if dirty, punch up to offset */
+ if (start_byte > *punch_start_byte) {
+ ret = punch(inode, *punch_start_byte,
+ start_byte - *punch_start_byte);
+ if (ret)
+ return ret;
+ }
+
+ /* Punch non-dirty blocks within folio */
+ ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
+ end_byte, punch);
+ if (ret)
+ return ret;
+
+ /*
+ * Make sure the next punch start is correctly bound to
+ * the end of this data range, not the end of the folio.
+ */
+ *punch_start_byte = min_t(loff_t, end_byte,
+ folio_pos(folio) + folio_size(folio));
+
+ return ret;
+}
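This helper preserves the scan invariant: on return, *punch_start_byte points at the first byte not yet known to be either punched or covered by a dirty folio, which is why iomap_write_delalloc_release() below only needs a single trailing punch from *punch_start_byte to the end of the scanned range.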
+
/*
* Scan the data range passed to us for dirty page cache folios. If we find a
* dirty folio, punch out the preceding range and update the offset from which
@@ -899,10 +1064,11 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
*/
static int iomap_write_delalloc_scan(struct inode *inode,
loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
- int (*punch)(struct inode *inode, loff_t offset, loff_t length))
+ iomap_punch_t punch)
{
while (start_byte < end_byte) {
struct folio *folio;
+ int ret;
/* grab locked page */
folio = filemap_lock_folio(inode->i_mapping,
@@ -913,26 +1079,12 @@ static int iomap_write_delalloc_scan(struct inode *inode,
continue;
}
- /* if dirty, punch up to offset */
- if (folio_test_dirty(folio)) {
- if (start_byte > *punch_start_byte) {
- int error;
-
- error = punch(inode, *punch_start_byte,
- start_byte - *punch_start_byte);
- if (error) {
- folio_unlock(folio);
- folio_put(folio);
- return error;
- }
- }
-
- /*
- * Make sure the next punch start is correctly bound to
- * the end of this data range, not the end of the folio.
- */
- *punch_start_byte = min_t(loff_t, end_byte,
- folio_next_index(folio) << PAGE_SHIFT);
+ ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte,
+ start_byte, end_byte, punch);
+ if (ret) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
}
/* move offset to start of next folio in range */
@@ -977,8 +1129,7 @@ static int iomap_write_delalloc_scan(struct inode *inode,
* the code to subtle off-by-one bugs....
*/
static int iomap_write_delalloc_release(struct inode *inode,
- loff_t start_byte, loff_t end_byte,
- int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+ loff_t start_byte, loff_t end_byte, iomap_punch_t punch)
{
loff_t punch_start_byte = start_byte;
loff_t scan_end_byte = min(i_size_read(inode), end_byte);
@@ -1071,8 +1222,7 @@ out_unlock:
*/
int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
struct iomap *iomap, loff_t pos, loff_t length,
- ssize_t written,
- int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+ ssize_t written, iomap_punch_t punch)
{
loff_t start_byte;
loff_t end_byte;
@@ -1293,17 +1443,17 @@ EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
size_t len, int error)
{
- struct iomap_page *iop = to_iomap_page(folio);
+ struct iomap_folio_state *ifs = folio->private;
if (error) {
folio_set_error(folio);
mapping_set_error(inode->i_mapping, error);
}
- WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop);
- WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);
+ WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
+ WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
- if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
+ if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
folio_end_writeback(folio);
}
@@ -1570,7 +1720,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
*/
static void
iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
- struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
+ struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct list_head *iolist)
{
sector_t sector = iomap_sector(&wpc->iomap, pos);
@@ -1588,8 +1738,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
}
- if (iop)
- atomic_add(len, &iop->write_bytes_pending);
+ if (ifs)
+ atomic_add(len, &ifs->write_bytes_pending);
wpc->ioend->io_size += len;
wbc_account_cgroup_owner(wbc, &folio->page, len);
}
@@ -1615,7 +1765,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode,
struct folio *folio, u64 end_pos)
{
- struct iomap_page *iop = iomap_page_create(inode, folio, 0);
+ struct iomap_folio_state *ifs = folio->private;
struct iomap_ioend *ioend, *next;
unsigned len = i_blocksize(inode);
unsigned nblocks = i_blocks_per_folio(inode, folio);
@@ -1623,7 +1773,14 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
int error = 0, count = 0, i;
LIST_HEAD(submit_list);
- WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
+ WARN_ON_ONCE(end_pos <= pos);
+
+ if (!ifs && nblocks > 1) {
+ ifs = ifs_alloc(inode, folio, 0);
+ iomap_set_range_dirty(folio, 0, end_pos - pos);
+ }
+
+ WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
/*
* Walk through the folio to find areas to write back. If we
@@ -1631,7 +1788,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* invalid, grab a new one.
*/
for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
- if (iop && !test_bit(i, iop->uptodate))
+ if (ifs && !ifs_block_is_dirty(folio, ifs, i))
continue;
error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1642,7 +1799,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
continue;
if (wpc->iomap.type == IOMAP_HOLE)
continue;
- iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc,
+ iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc,
&submit_list);
count++;
}
@@ -1675,6 +1832,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
}
}
+ /*
+ * We can have dirty bits set past the end of file in the page_mkwrite
+ * path while mapping the last partial folio. Hence it's better to clear
+ * all the dirty bits in the folio here.
+ */
+ iomap_clear_range_dirty(folio, 0, folio_size(folio));
folio_start_writeback(folio);
folio_unlock(folio);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index ea3b868c8355..bcd3f8cf5ea4 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -20,10 +20,12 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
-#define IOMAP_DIO_WRITE_FUA (1 << 28)
-#define IOMAP_DIO_NEED_SYNC (1 << 29)
-#define IOMAP_DIO_WRITE (1 << 30)
-#define IOMAP_DIO_DIRTY (1 << 31)
+#define IOMAP_DIO_CALLER_COMP (1U << 26)
+#define IOMAP_DIO_INLINE_COMP (1U << 27)
+#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
+#define IOMAP_DIO_NEED_SYNC (1U << 29)
+#define IOMAP_DIO_WRITE (1U << 30)
+#define IOMAP_DIO_DIRTY (1U << 31)
struct iomap_dio {
struct kiocb *iocb;
@@ -41,7 +43,6 @@ struct iomap_dio {
struct {
struct iov_iter *iter;
struct task_struct *waiter;
- struct bio *poll_bio;
} submit;
/* used for aio completion: */
@@ -63,12 +64,14 @@ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
+ struct kiocb *iocb = dio->iocb;
+
atomic_inc(&dio->ref);
/* Sync dio can't be polled reliably */
- if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) {
- bio_set_polled(bio, dio->iocb);
- dio->submit.poll_bio = bio;
+ if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) {
+ bio_set_polled(bio, iocb);
+ WRITE_ONCE(iocb->private, bio);
}
if (dio->dops && dio->dops->submit_io)
@@ -130,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
+static ssize_t iomap_dio_deferred_complete(void *data)
+{
+ return iomap_dio_complete(data);
+}
+
static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -152,27 +160,69 @@ void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+ struct kiocb *iocb = dio->iocb;
if (bio->bi_status)
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+ if (!atomic_dec_and_test(&dio->ref))
+ goto release_bio;
- if (atomic_dec_and_test(&dio->ref)) {
- if (dio->wait_for_completion) {
- struct task_struct *waiter = dio->submit.waiter;
- WRITE_ONCE(dio->submit.waiter, NULL);
- blk_wake_io_task(waiter);
- } else if (dio->flags & IOMAP_DIO_WRITE) {
- struct inode *inode = file_inode(dio->iocb->ki_filp);
-
- WRITE_ONCE(dio->iocb->private, NULL);
- INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
- queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
- } else {
- WRITE_ONCE(dio->iocb->private, NULL);
- iomap_dio_complete_work(&dio->aio.work);
- }
+ /*
+ * Synchronous dio: the task itself will handle any completion work
+ * that needs to happen after IO. All we need to do is wake the task.
+ */
+ if (dio->wait_for_completion) {
+ struct task_struct *waiter = dio->submit.waiter;
+
+ WRITE_ONCE(dio->submit.waiter, NULL);
+ blk_wake_io_task(waiter);
+ goto release_bio;
+ }
+
+ /*
+ * If flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline.
+ */
+ if (dio->flags & IOMAP_DIO_INLINE_COMP) {
+ WRITE_ONCE(iocb->private, NULL);
+ iomap_dio_complete_work(&dio->aio.work);
+ goto release_bio;
+ }
+
+ /*
+ * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
+ * our completion that way to avoid an async punt to a workqueue.
+ */
+ if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ /* only polled IO cares about ->private being cleared */
+ iocb->private = dio;
+ iocb->dio_complete = iomap_dio_deferred_complete;
+
+ /*
+ * Invoke ->ki_complete() directly. We've assigned our
+ * dio_complete callback handler, and since the issuer set
+ * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
+ * notice ->dio_complete being set and will defer calling that
+ * handler until it can be done from a safe task context.
+ *
+ * Note that the 'res' being passed in here is not important
+ * for this case. The actual completion value of the request
+ * will be retrieved from ->dio_complete when that is run by the
+ * issuer.
+ */
+ iocb->ki_complete(iocb, 0);
+ goto release_bio;
}
+ /*
+ * Async DIO completion that requires filesystem level completion work
+ * gets punted to a work queue to complete as the operation may require
+ * more IO to be issued to finalise filesystem metadata changes or
+ * guarantee data integrity.
+ */
+ INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+ queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
+ &dio->aio.work);
+release_bio:
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
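In summary, once the final reference drops, the rewritten end_io handler picks a completion path in strict order: a synchronous dio wakes the submitting task; IOMAP_DIO_INLINE_COMP completes directly from interrupt context; IOMAP_DIO_CALLER_COMP hands the dio back through iocb->dio_complete so the issuer can finish it from a safe task context; everything else is punted to the filesystem's s_dio_done_wq workqueue.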
@@ -203,7 +253,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
/*
* Figure out the bio's operation flags from the dio request, the
* mapping, and whether or not we want FUA. Note that we can end up
- * clearing the WRITE_FUA flag in the dio request.
+ * clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua)
@@ -217,7 +267,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
if (use_fua)
opflags |= REQ_FUA;
else
- dio->flags &= ~IOMAP_DIO_WRITE_FUA;
+ dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
return opflags;
}
@@ -257,12 +307,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* Use a FUA write if we need datasync semantics, this is a pure
* data IO that doesn't require any metadata updates (including
* after IO completion such as unwritten extent conversion) and
- * the underlying device supports FUA. This allows us to avoid
- * cache flushes on IO completion.
+ * the underlying device either supports FUA or doesn't have
+ * a volatile write cache. This allows us to avoid cache flushes
+ * on IO completion. If we can't use writethrough and need to
+ * sync, disable in-task completions as dio completion will
+ * need to call generic_write_sync() which will do a blocking
+ * fsync / cache flush call.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev))
+ (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
+ (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
use_fua = true;
+ else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
}
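The write-through decision above condenses into a single predicate. A sketch, assuming the block-layer helpers keep the signatures used in this hunk (can_use_fua() is a hypothetical name):

	static bool can_use_fua(const struct iomap *iomap,
				const struct iomap_dio *dio)
	{
		/* Not a pure overwrite: completion touches metadata anyway. */
		if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
			return false;
		/* Only datasync-style writes opt in to write-through. */
		if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
			return false;
		/* The device honours FUA, or has no volatile cache to flush. */
		return bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev);
	}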
/*
@@ -277,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
/*
- * We can only poll for single bio I/Os.
+ * We can only do deferred completion for pure overwrites that
+ * don't require additional IO at completion. This rules out
+ * writes that need zeroing or extent conversion, that extend
+ * the file size, or that issue journal IO or cache flushes
+ * during completion processing.
*/
if (need_zeroout ||
+ ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+
+ /*
+ * The rules for polled IO completions follow the same guidelines as
+ * the ones we set for inline and deferred completions. If none of those
+ * are available for this IO, clear the polled flag.
+ */
+ if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
dio->iocb->ki_flags &= ~IOCB_HIPRI;
if (need_zeroout) {
@@ -505,12 +575,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->submit.iter = iter;
dio->submit.waiter = current;
- dio->submit.poll_bio = NULL;
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
if (iov_iter_rw(iter) == READ) {
+ /* reads can always complete inline */
+ dio->flags |= IOMAP_DIO_INLINE_COMP;
+
if (iomi.pos >= dio->i_size)
goto out_free_dio;
@@ -524,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
+ /*
+ * Flag as supporting deferred completions, if the issuer
+ * groks it. This can avoid a workqueue punt for writes.
+ * We may later clear this flag if we need to do other IO
+ * as part of this IO completion.
+ */
+ if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
+ dio->flags |= IOMAP_DIO_CALLER_COMP;
+
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (iomi.pos >= dio->i_size ||
@@ -537,13 +618,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->flags |= IOMAP_DIO_NEED_SYNC;
/*
- * For datasync only writes, we optimistically try
- * using FUA for this IO. Any non-FUA write that
- * occurs will clear this flag, hence we know before
- * completion whether a cache flush is necessary.
+ * For datasync only writes, we optimistically try using
+ * WRITE_THROUGH for this IO. This flag requires either
+ * FUA writes through the device's write cache, or a
+ * normal write to a device without a volatile write
+ * cache. For the former, any non-FUA write that occurs
+ * will clear this flag, hence we know before completion
+ * whether a cache flush is necessary.
*/
if (!(iocb->ki_flags & IOCB_SYNC))
- dio->flags |= IOMAP_DIO_WRITE_FUA;
+ dio->flags |= IOMAP_DIO_WRITE_THROUGH;
}
/*
@@ -605,14 +689,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomap_dio_set_error(dio, ret);
/*
- * If all the writes we issued were FUA, we don't need to flush the
- * cache on IO completion. Clear the sync flag for this case.
+ * If all the writes we issued were already written through to the
+ * media, we don't need to flush the cache on IO completion. Clear the
+ * sync flag for this case.
*/
- if (dio->flags & IOMAP_DIO_WRITE_FUA)
+ if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
- WRITE_ONCE(iocb->private, dio->submit.poll_bio);
-
/*
* We are about to drop our additional submission reference, which
* might be the last reference to the dio. There are three different
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index df9d70588b60..2ee21286ac8f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1422,13 +1422,8 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_ino, de->flags[-high_sierra]);
}
#endif
-
- inode->i_mtime.tv_sec =
- inode->i_atime.tv_sec =
- inode->i_ctime.tv_sec = iso_date(de->date, high_sierra);
- inode->i_mtime.tv_nsec =
- inode->i_atime.tv_nsec =
- inode->i_ctime.tv_nsec = 0;
+ inode->i_mtime = inode->i_atime =
+ inode_set_ctime(inode, iso_date(de->date, high_sierra), 0);
ei->i_first_extent = (isonum_733(de->extent) +
isonum_711(de->ext_attr_length));
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 48f58c6c9e69..348783a70f57 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -421,10 +421,9 @@ repeat:
/* Rock ridge never appears on a High Sierra disk */
cnt = 0;
if (rr->u.TF.flags & TF_CREATE) {
- inode->i_ctime.tv_sec =
- iso_date(rr->u.TF.times[cnt++].time,
- 0);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode,
+ iso_date(rr->u.TF.times[cnt++].time, 0),
+ 0);
}
if (rr->u.TF.flags & TF_MODIFY) {
inode->i_mtime.tv_sec =
@@ -439,10 +438,9 @@ repeat:
inode->i_atime.tv_nsec = 0;
}
if (rr->u.TF.flags & TF_ATTRIBUTES) {
- inode->i_ctime.tv_sec =
- iso_date(rr->u.TF.times[cnt++].time,
- 0);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode,
+ iso_date(rr->u.TF.times[cnt++].time, 0),
+ 0);
}
break;
case SIG('S', 'L'):
@@ -534,7 +532,7 @@ repeat:
inode->i_size = reloc->i_size;
inode->i_blocks = reloc->i_blocks;
inode->i_atime = reloc->i_atime;
- inode->i_ctime = reloc->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(reloc));
inode->i_mtime = reloc->i_mtime;
iput(reloc);
break;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 5075a0a6d594..091ab0eaabbe 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -204,7 +204,8 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i,
if (ret)
goto fail;
- dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
+ ITIME(je32_to_cpu(ri->ctime)));
jffs2_free_raw_inode(ri);
@@ -237,7 +238,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
if (dead_f->inocache)
set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink);
if (!ret)
- dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
return ret;
}
/***********************************************************************/
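The jffs2 and jfs hunks that follow all apply the same mechanical conversion: direct stores to inode->i_ctime become calls to the new ctime accessors, and the chained assignments keep working because the setters return the timespec64 they stored. Schematically:

	/* before */
	dir->i_mtime = dir->i_ctime = ITIME(now);

	/* after: inode_set_ctime_to_ts() returns the value it stored */
	dir->i_mtime = inode_set_ctime_to_ts(dir, ITIME(now));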
@@ -271,7 +272,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink);
mutex_unlock(&f->sem);
d_instantiate(dentry, d_inode(old_dentry));
- dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
ihold(d_inode(old_dentry));
}
return ret;
@@ -422,7 +423,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
+ ITIME(je32_to_cpu(rd->mctime)));
jffs2_free_raw_dirent(rd);
@@ -566,7 +568,8 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
+ ITIME(je32_to_cpu(rd->mctime)));
inc_nlink(dir_i);
jffs2_free_raw_dirent(rd);
@@ -607,7 +610,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
dentry->d_name.len, f, now);
if (!ret) {
- dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
clear_nlink(d_inode(dentry));
drop_nlink(dir_i);
}
@@ -743,7 +746,8 @@ static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
+ ITIME(je32_to_cpu(rd->mctime)));
jffs2_free_raw_dirent(rd);
@@ -864,14 +868,16 @@ static int jffs2_rename (struct mnt_idmap *idmap,
* caller won't do it on its own since we are returning an error.
*/
d_invalidate(new_dentry);
- new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
+ new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i,
+ ITIME(now));
return ret;
}
if (d_is_dir(old_dentry))
drop_nlink(old_dir_i);
- new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now);
+ old_dir_i->i_mtime = inode_set_ctime_to_ts(old_dir_i, ITIME(now));
+ new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, ITIME(now));
return 0;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 2345ca3f09ee..11c66793960e 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -317,7 +317,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
inode->i_size = pos + writtenlen;
inode->i_blocks = (inode->i_size + 511) >> 9;
- inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime));
+ inode->i_mtime = inode_set_ctime_to_ts(inode,
+ ITIME(je32_to_cpu(ri->ctime)));
}
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 038516bee1ab..0403efab4089 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -115,7 +115,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size);
ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime));
ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime));
- ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode->i_ctime));
+ ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode_get_ctime(inode)));
ri->offset = cpu_to_je32(0);
ri->csize = ri->dsize = cpu_to_je32(mdatalen);
@@ -148,7 +148,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
}
/* It worked. Update the inode */
inode->i_atime = ITIME(je32_to_cpu(ri->atime));
- inode->i_ctime = ITIME(je32_to_cpu(ri->ctime));
+ inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime)));
inode->i_mtime = ITIME(je32_to_cpu(ri->mtime));
inode->i_mode = jemode_to_cpu(ri->mode);
i_uid_write(inode, je16_to_cpu(ri->uid));
@@ -284,7 +284,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
inode->i_size = je32_to_cpu(latest_node.isize);
inode->i_atime = ITIME(je32_to_cpu(latest_node.atime));
inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime));
- inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime));
+ inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(latest_node.ctime)));
set_nlink(inode, f->inocache->pino_nlink);
@@ -388,7 +388,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
iattr.ia_gid = inode->i_gid;
iattr.ia_atime = inode->i_atime;
iattr.ia_mtime = inode->i_mtime;
- iattr.ia_ctime = inode->i_ctime;
+ iattr.ia_ctime = inode_get_ctime(inode);
jffs2_do_setattr(inode, &iattr);
}
@@ -475,7 +475,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
inode->i_mode = jemode_to_cpu(ri->mode);
i_gid_write(inode, je16_to_cpu(ri->gid));
i_uid_write(inode, je16_to_cpu(ri->uid));
- inode->i_atime = inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
inode->i_blocks = 0;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 8da19766c101..50727a1ff931 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -35,7 +35,7 @@ struct kvec;
#define ITIME(sec) ((struct timespec64){sec, 0})
#define JFFS2_NOW() JFFS2_CLAMP_TIME(ktime_get_real_seconds())
#define I_SEC(tv) JFFS2_CLAMP_TIME((tv).tv_sec)
-#define JFFS2_F_I_CTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_ctime)
+#define JFFS2_F_I_CTIME(f) I_SEC(inode_get_ctime(OFNI_EDONI_2SFFJ(f)))
#define JFFS2_F_I_MTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_mtime)
#define JFFS2_F_I_ATIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_atime)
#define sleep_on_spinunlock(wq, s) \
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index fb96f872d207..1de3602c98de 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -116,7 +116,7 @@ int jfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
if (!rc) {
if (update_mode) {
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
rc = txCommit(tid, 1, &inode, 0);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 8ac10e396050..920d58a1566b 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -393,7 +393,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length)
break;
}
- ip->i_mtime = ip->i_ctime = current_time(ip);
+ ip->i_mtime = inode_set_ctime_current(ip);
mark_inode_dirty(ip);
txCommit(tid, 1, &ip, 0);
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index ed7989bc2db1..f7bd7e8f5be4 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -96,7 +96,7 @@ int jfs_fileattr_set(struct mnt_idmap *idmap,
jfs_inode->mode2 = flags;
jfs_set_inode_flags(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return 0;
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 390cbfce391f..a40383aa6c84 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -3064,8 +3064,8 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec);
ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec);
ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
- ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec);
- ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec);
+ inode_set_ctime(ip, le32_to_cpu(dip->di_ctime.tv_sec),
+ le32_to_cpu(dip->di_ctime.tv_nsec));
ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
ip->i_generation = le32_to_cpu(dip->di_gen);
@@ -3139,8 +3139,8 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip)
dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec);
dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec);
- dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec);
- dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec);
+ dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime(ip).tv_sec);
+ dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime(ip).tv_nsec);
dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec);
dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec);
dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 9e1f02767201..87594efa7f7c 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -97,8 +97,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
jfs_inode->mode2 |= inode->i_mode;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- jfs_inode->otime = inode->i_ctime.tv_sec;
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ jfs_inode->otime = inode_get_ctime(inode).tv_sec;
inode->i_generation = JFS_SBI(sb)->gengen++;
jfs_inode->cflag = 0;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index e98ddb2b1cf2..029d47065600 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -149,7 +149,7 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip,
mark_inode_dirty(ip);
- dip->i_ctime = dip->i_mtime = current_time(dip);
+ dip->i_mtime = inode_set_ctime_current(dip);
mark_inode_dirty(dip);
@@ -284,7 +284,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
/* update parent directory inode */
inc_nlink(dip); /* for '..' from child directory */
- dip->i_ctime = dip->i_mtime = current_time(dip);
+ dip->i_mtime = inode_set_ctime_current(dip);
mark_inode_dirty(dip);
rc = txCommit(tid, 2, &iplist[0], 0);
@@ -390,7 +390,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
/* update parent directory's link count corresponding
* to ".." entry of the target directory deleted
*/
- dip->i_ctime = dip->i_mtime = current_time(dip);
+ dip->i_mtime = inode_set_ctime_current(dip);
inode_dec_link_count(dip);
/*
@@ -512,7 +512,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
ASSERT(ip->i_nlink);
- ip->i_ctime = dip->i_ctime = dip->i_mtime = current_time(ip);
+ dip->i_mtime = inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip));
mark_inode_dirty(dip);
/* update target's inode */
@@ -827,8 +827,8 @@ static int jfs_link(struct dentry *old_dentry,
/* update object inode */
inc_nlink(ip); /* for new link */
- ip->i_ctime = current_time(ip);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ inode_set_ctime_current(ip);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
ihold(ip);
@@ -1028,7 +1028,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip,
mark_inode_dirty(ip);
- dip->i_ctime = dip->i_mtime = current_time(dip);
+ dip->i_mtime = inode_set_ctime_current(dip);
mark_inode_dirty(dip);
/*
* commit update of parent directory and link object
@@ -1205,7 +1205,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
tblk->xflag |= COMMIT_DELETE;
tblk->u.ip = new_ip;
} else {
- new_ip->i_ctime = current_time(new_ip);
+ inode_set_ctime_current(new_ip);
mark_inode_dirty(new_ip);
}
} else {
@@ -1268,10 +1268,10 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
/*
* Update ctime on changed/moved inodes & mark dirty
*/
- old_ip->i_ctime = current_time(old_ip);
+ inode_set_ctime_current(old_ip);
mark_inode_dirty(old_ip);
- new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir);
+ new_dir->i_mtime = inode_set_ctime_current(new_dir);
mark_inode_dirty(new_dir);
/* Build list of inodes modified by this transaction */
@@ -1283,7 +1283,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (old_dir != new_dir) {
iplist[ipcount++] = new_dir;
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
+ old_dir->i_mtime = inode_set_ctime_current(old_dir);
mark_inode_dirty(old_dir);
}
@@ -1416,7 +1416,7 @@ static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
mark_inode_dirty(ip);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index d2f82cb7db1b..2e2f7f6d36a0 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -818,7 +818,7 @@ out:
}
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
inode_unlock(inode);
return len - towrite;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 931e50018f88..8577ad494e05 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -647,7 +647,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
if (old_blocks)
dquot_free_block(inode, old_blocks);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
return 0;
}
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 5d826274570c..c429c42a6867 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -8,16 +8,16 @@
/**
* kernel_read_file() - read file contents into a kernel buffer
*
- * @file file to read from
- * @offset where to start reading from (see below).
- * @buf pointer to a "void *" buffer for reading into (if
+ * @file: file to read from
+ * @offset: where to start reading from (see below).
+ * @buf: pointer to a "void *" buffer for reading into (if
* *@buf is NULL, a buffer will be allocated, and
* @buf_size will be ignored)
- * @buf_size size of buf, if already allocated. If @buf not
+ * @buf_size: size of buf, if already allocated. If @buf not
* allocated, this is the largest size to allocate.
- * @file_size if non-NULL, the full size of @file will be
+ * @file_size: if non-NULL, the full size of @file will be
* written here.
- * @id the kernel_read_file_id identifying the type of
+ * @id: the kernel_read_file_id identifying the type of
* file contents being read (for LSMs to examine)
*
* @offset must be 0 unless both @buf and @file_size are non-NULL
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 5a1a4af9d3d2..660995856a04 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -556,7 +556,7 @@ void kernfs_put(struct kernfs_node *kn)
kfree_const(kn->name);
if (kn->iattr) {
- simple_xattrs_free(&kn->iattr->xattrs);
+ simple_xattrs_free(&kn->iattr->xattrs, NULL);
kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
}
spin_lock(&kernfs_idr_lock);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index b22b74d1a115..922719a343a7 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -151,8 +151,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime =
- inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
static inline void set_inode_attr(struct inode *inode,
@@ -162,7 +161,7 @@ static inline void set_inode_attr(struct inode *inode,
inode->i_gid = attrs->ia_gid;
inode->i_atime = attrs->ia_atime;
inode->i_mtime = attrs->ia_mtime;
- inode->i_ctime = attrs->ia_ctime;
+ inode_set_ctime_to_ts(inode, attrs->ia_ctime);
}
static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
@@ -191,7 +190,7 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap,
down_read(&root->kernfs_iattr_rwsem);
kernfs_refresh_inode(kn, inode);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
up_read(&root->kernfs_iattr_rwsem);
return 0;
@@ -306,11 +305,17 @@ int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
const void *value, size_t size, int flags)
{
+ struct simple_xattr *old_xattr;
struct kernfs_iattrs *attrs = kernfs_iattrs(kn);
if (!attrs)
return -ENOMEM;
- return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL);
+ old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags);
+ if (IS_ERR(old_xattr))
+ return PTR_ERR(old_xattr);
+
+ simple_xattr_free(old_xattr);
+ return 0;
}
static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
@@ -342,7 +347,7 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
{
atomic_t *sz = &kn->iattr->user_xattr_size;
atomic_t *nr = &kn->iattr->nr_user_xattrs;
- ssize_t removed_size;
+ struct simple_xattr *old_xattr;
int ret;
if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) {
@@ -355,13 +360,18 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
goto dec_size_out;
}
- ret = simple_xattr_set(xattrs, full_name, value, size, flags,
- &removed_size);
-
- if (!ret && removed_size >= 0)
- size = removed_size;
- else if (!ret)
+ old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
+ if (!old_xattr)
return 0;
+
+ if (IS_ERR(old_xattr)) {
+ ret = PTR_ERR(old_xattr);
+ goto dec_size_out;
+ }
+
+ ret = 0;
+ size = old_xattr->size;
+ simple_xattr_free(old_xattr);
dec_size_out:
atomic_sub(size, sz);
dec_count_out:
@@ -376,18 +386,19 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
{
atomic_t *sz = &kn->iattr->user_xattr_size;
atomic_t *nr = &kn->iattr->nr_user_xattrs;
- ssize_t removed_size;
- int ret;
+ struct simple_xattr *old_xattr;
- ret = simple_xattr_set(xattrs, full_name, value, size, flags,
- &removed_size);
+ old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
+ if (!old_xattr)
+ return 0;
- if (removed_size >= 0) {
- atomic_sub(removed_size, sz);
- atomic_dec(nr);
- }
+ if (IS_ERR(old_xattr))
+ return PTR_ERR(old_xattr);
- return ret;
+ atomic_sub(old_xattr->size, sz);
+ atomic_dec(nr);
+ simple_xattr_free(old_xattr);
+ return 0;
}
static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
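The kernfs conversions above all follow the new simple_xattr_set() contract: NULL means no previous xattr was replaced or removed, an ERR_PTR means the set failed, and any other return is the displaced entry, which the caller now owns and must free. A minimal wrapper capturing that contract (a sketch; xattr_set_and_reap() is an invented name):

	static int xattr_set_and_reap(struct simple_xattrs *xattrs,
				      const char *name, const void *value,
				      size_t size, int flags)
	{
		struct simple_xattr *old;

		old = simple_xattr_set(xattrs, name, value, size, flags);
		if (IS_ERR(old))
			return PTR_ERR(old);	/* the set itself failed */
		if (old)
			simple_xattr_free(old);	/* we own the displaced entry */
		return 0;
	}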
diff --git a/fs/libfs.c b/fs/libfs.c
index 5b851315eeed..da78eb64831e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -33,7 +33,7 @@ int simple_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
return 0;
}
@@ -239,6 +239,254 @@ const struct inode_operations simple_dir_inode_operations = {
};
EXPORT_SYMBOL(simple_dir_inode_operations);
+static void offset_set(struct dentry *dentry, u32 offset)
+{
+ dentry->d_fsdata = (void *)((uintptr_t)(offset));
+}
+
+static u32 dentry2offset(struct dentry *dentry)
+{
+ return (u32)((uintptr_t)(dentry->d_fsdata));
+}
+
+static struct lock_class_key simple_offset_xa_lock;
+
+/**
+ * simple_offset_init - initialize an offset_ctx
+ * @octx: directory offset map to be initialized
+ *
+ */
+void simple_offset_init(struct offset_ctx *octx)
+{
+ xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1);
+ lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock);
+
+ /* 0 is '.', 1 is '..', so always start with offset 2 */
+ octx->next_offset = 2;
+}
+
+/**
+ * simple_offset_add - Add an entry to a directory's offset map
+ * @octx: directory offset ctx to be updated
+ * @dentry: new dentry being added
+ *
+ * Returns zero on success. @octx and the dentry offset are updated.
+ * Otherwise, a negative errno value is returned.
+ */
+int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
+{
+ static const struct xa_limit limit = XA_LIMIT(2, U32_MAX);
+ u32 offset;
+ int ret;
+
+ if (dentry2offset(dentry) != 0)
+ return -EBUSY;
+
+ ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
+ &octx->next_offset, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+
+ offset_set(dentry, offset);
+ return 0;
+}
+
+/**
+ * simple_offset_remove - Remove an entry from a directory's offset map
+ * @octx: directory offset ctx to be updated
+ * @dentry: dentry being removed
+ *
+ */
+void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
+{
+ u32 offset;
+
+ offset = dentry2offset(dentry);
+ if (offset == 0)
+ return;
+
+ xa_erase(&octx->xa, offset);
+ offset_set(dentry, 0);
+}
+
+/**
+ * simple_offset_rename_exchange - exchange rename with directory offsets
+ * @old_dir: parent of dentry being moved
+ * @old_dentry: dentry being moved
+ * @new_dir: destination parent
+ * @new_dentry: destination dentry
+ *
+ * Returns zero on success. Otherwise a negative errno is returned and the
+ * rename is rolled back.
+ */
+int simple_offset_rename_exchange(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry)
+{
+ struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
+ struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
+ u32 old_index = dentry2offset(old_dentry);
+ u32 new_index = dentry2offset(new_dentry);
+ int ret;
+
+ simple_offset_remove(old_ctx, old_dentry);
+ simple_offset_remove(new_ctx, new_dentry);
+
+ ret = simple_offset_add(new_ctx, old_dentry);
+ if (ret)
+ goto out_restore;
+
+ ret = simple_offset_add(old_ctx, new_dentry);
+ if (ret) {
+ simple_offset_remove(new_ctx, old_dentry);
+ goto out_restore;
+ }
+
+ ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
+ if (ret) {
+ simple_offset_remove(new_ctx, old_dentry);
+ simple_offset_remove(old_ctx, new_dentry);
+ goto out_restore;
+ }
+ return 0;
+
+out_restore:
+ offset_set(old_dentry, old_index);
+ xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL);
+ offset_set(new_dentry, new_index);
+ xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL);
+ return ret;
+}
+
+/**
+ * simple_offset_destroy - Release offset map
+ * @octx: directory offset ctx that is about to be destroyed
+ *
+ * During fs teardown (e.g. umount), a directory's offset map might still
+ * contain entries. xa_destroy() cleans out anything that remains.
+ */
+void simple_offset_destroy(struct offset_ctx *octx)
+{
+ xa_destroy(&octx->xa);
+}
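A hedged sketch of how a filesystem adopts this offset map (the myfs_* names are invented for illustration): embed an offset_ctx in the per-inode data, expose it through ->get_offset_ctx, and pair simple_offset_add()/simple_offset_remove() with entry creation and removal:

	struct myfs_inode_info {
		struct inode		vfs_inode;
		struct offset_ctx	dir_offsets;	/* one map per directory */
	};

	static struct offset_ctx *myfs_get_offset_ctx(struct inode *inode)
	{
		struct myfs_inode_info *info =
			container_of(inode, struct myfs_inode_info, vfs_inode);

		return &info->dir_offsets;
	}

	/* on directory entry creation */
	static int myfs_add_entry(struct inode *dir, struct dentry *dentry)
	{
		return simple_offset_add(dir->i_op->get_offset_ctx(dir), dentry);
	}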
+
+/**
+ * offset_dir_llseek - Advance the read position of a directory descriptor
+ * @file: an open directory whose position is to be updated
+ * @offset: a byte offset
+ * @whence: enumerator describing the starting position for this update
+ *
+ * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
+ *
+ * Returns the updated read position if successful; otherwise a
+ * negative errno is returned and the read position remains unchanged.
+ */
+static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ switch (whence) {
+ case SEEK_CUR:
+ offset += file->f_pos;
+ fallthrough;
+ case SEEK_SET:
+ if (offset >= 0)
+ break;
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ return vfs_setpos(file, offset, U32_MAX);
+}
+
+static struct dentry *offset_find_next(struct xa_state *xas)
+{
+ struct dentry *child, *found = NULL;
+
+ rcu_read_lock();
+ child = xas_next_entry(xas, U32_MAX);
+ if (!child)
+ goto out;
+ spin_lock(&child->d_lock);
+ if (simple_positive(child))
+ found = dget_dlock(child);
+ spin_unlock(&child->d_lock);
+out:
+ rcu_read_unlock();
+ return found;
+}
+
+static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
+{
+ u32 offset = dentry2offset(dentry);
+ struct inode *inode = d_inode(dentry);
+
+ return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
+ inode->i_ino, fs_umode_to_dtype(inode->i_mode));
+}
+
+static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
+{
+ struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
+ XA_STATE(xas, &so_ctx->xa, ctx->pos);
+ struct dentry *dentry;
+
+ while (true) {
+ dentry = offset_find_next(&xas);
+ if (!dentry)
+ break;
+
+ if (!offset_dir_emit(ctx, dentry)) {
+ dput(dentry);
+ break;
+ }
+
+ dput(dentry);
+ ctx->pos = xas.xa_index + 1;
+ }
+}
+
+/**
+ * offset_readdir - Emit entries starting at offset @ctx->pos
+ * @file: an open directory to iterate over
+ * @ctx: directory iteration context
+ *
+ * Caller must hold @file's i_rwsem to prevent insertion or removal of
+ * entries during this call.
+ *
+ * On entry, @ctx->pos contains an offset that represents the first entry
+ * to be read from the directory.
+ *
+ * The operation continues until there are no more entries to read, or
+ * until the ctx->actor indicates there is no more space in the caller's
+ * output buffer.
+ *
+ * On return, @ctx->pos contains an offset that will read the next entry
+ * in this directory when offset_readdir() is called again with @ctx.
+ *
+ * Return values:
+ * %0 - Complete
+ */
+static int offset_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct dentry *dir = file->f_path.dentry;
+
+ lockdep_assert_held(&d_inode(dir)->i_rwsem);
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ offset_iterate_dir(d_inode(dir), ctx);
+ return 0;
+}
+
+const struct file_operations simple_offset_dir_operations = {
+ .llseek = offset_dir_llseek,
+ .iterate_shared = offset_readdir,
+ .read = generic_read_dir,
+ .fsync = noop_fsync,
+};
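Directories backed by the map then point ->i_fop at simple_offset_dir_operations, so llseek and readdir resolve f_pos through the xarray rather than through dcache ordering; the offsets handed to userspace therefore remain stable across concurrent insertions and removals in the same directory.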
+
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
struct dentry *child = NULL;
@@ -275,7 +523,7 @@ void simple_recursive_removal(struct dentry *dentry,
while ((child = find_next_child(this, victim)) == NULL) {
// kill and ascend
// update metadata while it's still locked
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
clear_nlink(inode);
inode_unlock(inode);
victim = this;
@@ -293,8 +541,7 @@ void simple_recursive_removal(struct dentry *dentry,
dput(victim); // unpin it
}
if (victim == dentry) {
- inode->i_ctime = inode->i_mtime =
- current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (d_is_dir(dentry))
drop_nlink(inode);
inode_unlock(inode);
@@ -335,7 +582,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
*/
root->i_ino = 1;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
- root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
+ root->i_atime = root->i_mtime = inode_set_ctime_current(root);
s->s_root = d_make_root(root);
if (!s->s_root)
return -ENOMEM;
@@ -391,7 +638,8 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
inc_nlink(inode);
ihold(inode);
dget(dentry);
@@ -425,7 +673,8 @@ int simple_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
drop_nlink(inode);
dput(dentry);
return 0;
@@ -444,6 +693,31 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
}
EXPORT_SYMBOL(simple_rmdir);
+/**
+ * simple_rename_timestamp - update the various inode timestamps for rename
+ * @old_dir: old parent directory
+ * @old_dentry: dentry that is being renamed
+ * @new_dir: new parent directory
+ * @new_dentry: target for rename
+ *
+ * POSIX mandates that the old and new parent directories have their ctime and
+ * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have
+ * their ctime updated.
+ */
+void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct inode *newino = d_inode(new_dentry);
+
+ old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ if (new_dir != old_dir)
+ new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_ctime_current(d_inode(old_dentry));
+ if (newino)
+ inode_set_ctime_current(newino);
+}
+EXPORT_SYMBOL_GPL(simple_rename_timestamp);
+
int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
@@ -459,11 +733,7 @@ int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
inc_nlink(old_dir);
}
}
- old_dir->i_ctime = old_dir->i_mtime =
- new_dir->i_ctime = new_dir->i_mtime =
- d_inode(old_dentry)->i_ctime =
- d_inode(new_dentry)->i_ctime = current_time(old_dir);
-
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
return 0;
}
EXPORT_SYMBOL_GPL(simple_rename_exchange);
@@ -472,7 +742,6 @@ int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
{
- struct inode *inode = d_inode(old_dentry);
int they_are_dirs = d_is_dir(old_dentry);
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
@@ -495,9 +764,7 @@ int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir,
inc_nlink(new_dir);
}
- old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
- new_dir->i_mtime = inode->i_ctime = current_time(old_dir);
-
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
return 0;
}
EXPORT_SYMBOL(simple_rename);
@@ -548,21 +815,20 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
- struct page *page;
- pgoff_t index;
-
- index = pos >> PAGE_SHIFT;
+ struct folio *folio;
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
- return -ENOMEM;
+ folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- *pagep = page;
+ *pagep = &folio->page;
- if (!PageUptodate(page) && (len != PAGE_SIZE)) {
- unsigned from = pos & (PAGE_SIZE - 1);
+ if (!folio_test_uptodate(folio) && (len != folio_size(folio))) {
+ size_t from = offset_in_folio(folio, pos);
- zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
+ folio_zero_segments(folio, 0, from,
+ from + len, folio_size(folio));
}
return 0;
}
@@ -594,17 +860,18 @@ static int simple_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio->mapping->host;
loff_t last_pos = pos + copied;
- /* zero the stale part of the page if we did a short copy */
- if (!PageUptodate(page)) {
+ /* zero the stale part of the folio if we did a short copy */
+ if (!folio_test_uptodate(folio)) {
if (copied < len) {
- unsigned from = pos & (PAGE_SIZE - 1);
+ size_t from = offset_in_folio(folio, pos);
- zero_user(page, from + copied, len - copied);
+ folio_zero_range(folio, from + copied, len - copied);
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
/*
* No need to use i_size_read() here, the i_size
@@ -613,9 +880,9 @@ static int simple_write_end(struct file *file, struct address_space *mapping,
if (last_pos > inode->i_size)
i_size_write(inode, last_pos);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
@@ -659,7 +926,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
*/
inode->i_ino = 1;
inode->i_mode = S_IFDIR | 0755;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
@@ -685,7 +952,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
goto out;
}
inode->i_mode = S_IFREG | files->mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_fop = files->ops;
inode->i_ino = i;
d_add(dentry, inode);
@@ -1253,7 +1520,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);
@@ -1269,7 +1536,7 @@ EXPORT_SYMBOL(alloc_anon_inode);
* All arguments are ignored and it just returns -EINVAL.
*/
int
-simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
+simple_nosetlease(struct file *filp, int arg, struct file_lock **flp,
void **priv)
{
return -EINVAL;
@@ -1315,7 +1582,7 @@ static int empty_dir_getattr(struct mnt_idmap *idmap,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
diff --git a/fs/locks.c b/fs/locks.c
index df8b26a42524..a45efc16945d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -438,7 +438,7 @@ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type)
fl->fl_end = OFFSET_MAX;
}
-static int assign_type(struct file_lock *fl, long type)
+static int assign_type(struct file_lock *fl, int type)
{
switch (type) {
case F_RDLCK:
@@ -549,7 +549,7 @@ static const struct lock_manager_operations lease_manager_ops = {
/*
* Initialize a lease, use the default lock manager operations
*/
-static int lease_init(struct file *filp, long type, struct file_lock *fl)
+static int lease_init(struct file *filp, int type, struct file_lock *fl)
{
if (assign_type(fl, type) != 0)
return -EINVAL;
@@ -567,7 +567,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
}
/* Allocate a file_lock initialised to this type of lease */
-static struct file_lock *lease_alloc(struct file *filp, long type)
+static struct file_lock *lease_alloc(struct file *filp, int type)
{
struct file_lock *fl = locks_alloc_lock();
int error = -ENOMEM;
@@ -868,6 +868,21 @@ static bool posix_locks_conflict(struct file_lock *caller_fl,
return locks_conflict(caller_fl, sys_fl);
}
+/* Determine if lock sys_fl blocks lock caller_fl. Used on the *_GETLK
+ * paths, so it checks for additional GETLK-specific things like F_UNLCK.
+ */
+static bool posix_test_locks_conflict(struct file_lock *caller_fl,
+ struct file_lock *sys_fl)
+{
+ /* F_UNLCK checks any locks on the same fd. */
+ if (caller_fl->fl_type == F_UNLCK) {
+ if (!posix_same_owner(caller_fl, sys_fl))
+ return false;
+ return locks_overlap(caller_fl, sys_fl);
+ }
+ return posix_locks_conflict(caller_fl, sys_fl);
+}
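Together with the fcntl_getlk()/fcntl_getlk64() changes further down, which accept F_UNLCK for F_OFD_GETLK, this lets a process ask whether its own open file description holds a lock over a range. A userspace sketch, assuming a kernel with this patch (error handling trimmed):

	#define _GNU_SOURCE
	#include <fcntl.h>

	/*
	 * Returns 1 if this open file description holds a lock overlapping
	 * [start, start + len), 0 if not, -1 on error.
	 */
	static int ofd_holds_lock(int fd, off_t start, off_t len)
	{
		struct flock fl = {
			.l_type   = F_UNLCK,	/* new: query our own locks */
			.l_whence = SEEK_SET,
			.l_start  = start,
			.l_len    = len,
			.l_pid    = 0,		/* required for OFD locks */
		};

		if (fcntl(fd, F_OFD_GETLK, &fl) == -1)
			return -1;
		/* l_type stays F_UNLCK when no overlapping lock exists */
		return fl.l_type != F_UNLCK;
	}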
+
/* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific
* checking before calling the locks_conflict().
*/
@@ -901,7 +916,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
retry:
spin_lock(&ctx->flc_lock);
list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
- if (!posix_locks_conflict(fl, cfl))
+ if (!posix_test_locks_conflict(fl, cfl))
continue;
if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
&& (*cfl->fl_lmops->lm_lock_expirable)(cfl)) {
@@ -1301,6 +1316,7 @@ retry:
out:
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
+ trace_posix_lock_inode(inode, request, error);
/*
* Free any unused locks.
*/
@@ -1309,7 +1325,6 @@ retry:
if (new_fl2)
locks_free_lock(new_fl2);
locks_dispose_list(&dispose);
- trace_posix_lock_inode(inode, request, error);
return error;
}
@@ -1666,7 +1681,7 @@ int fcntl_getlease(struct file *filp)
* conflict with the lease we're trying to set.
*/
static int
-check_conflicting_open(struct file *filp, const long arg, int flags)
+check_conflicting_open(struct file *filp, const int arg, int flags)
{
struct inode *inode = file_inode(filp);
int self_wcount = 0, self_rcount = 0;
@@ -1701,7 +1716,7 @@ check_conflicting_open(struct file *filp, const long arg, int flags)
}
static int
-generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
+generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **priv)
{
struct file_lock *fl, *my_fl = NULL, *lease;
struct inode *inode = file_inode(filp);
@@ -1859,7 +1874,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
* The (input) flp->fl_lmops->lm_break function is required
* by break_lease().
*/
-int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
+int generic_setlease(struct file *filp, int arg, struct file_lock **flp,
void **priv)
{
struct inode *inode = file_inode(filp);
@@ -1906,7 +1921,7 @@ lease_notifier_chain_init(void)
}
static inline void
-setlease_notifier(long arg, struct file_lock *lease)
+setlease_notifier(int arg, struct file_lock *lease)
{
if (arg != F_UNLCK)
srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
@@ -1942,7 +1957,7 @@ EXPORT_SYMBOL_GPL(lease_unregister_notifier);
* may be NULL if the lm_setup operation doesn't require it.
*/
int
-vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
+vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv)
{
if (lease)
setlease_notifier(arg, *lease);
@@ -1953,7 +1968,7 @@ vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
}
EXPORT_SYMBOL_GPL(vfs_setlease);
-static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
+static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
{
struct file_lock *fl;
struct fasync_struct *new;
@@ -1988,7 +2003,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
* Note that you also need to call %F_SETSIG to
* receive a signal when the lease is broken.
*/
-int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
+int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
{
if (arg == F_UNLCK)
return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
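
The lease hunks above only narrow the type of `arg` from long to int: fcntl(2) passes the lease type as an int, so the wider kernel-side type bought nothing. A minimal userspace sketch of the interface these functions back (the filename and the choice of SIGUSR1 are illustrative, not from this series):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static void on_break(int sig) { (void)sig; }

	int main(void)
	{
		int fd = open("leased_file", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		signal(SIGUSR1, on_break);
		/* Deliver lease-break notification as SIGUSR1 rather than SIGIO. */
		if (fcntl(fd, F_SETSIG, SIGUSR1) == -1)
			perror("F_SETSIG");
		/* The lease type really is an int: F_RDLCK, F_WRLCK or F_UNLCK. */
		if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1)
			perror("F_SETLEASE");
		pause();	/* wait for a lease break */
		return 0;
	}
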
@@ -2136,7 +2151,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
* @fl: The file_lock who's fl_pid should be translated
* @ns: The namespace into which the pid should be translated
*
- * Used to tranlate a fl_pid into a namespace virtual pid number
+ * Used to translate a fl_pid into a namespace virtual pid number
*/
static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns)
{
@@ -2207,7 +2222,8 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
if (fl == NULL)
return -ENOMEM;
error = -EINVAL;
- if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
+ if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK
+ && flock->l_type != F_WRLCK)
goto out;
error = flock_to_posix_lock(filp, fl, flock);
@@ -2414,7 +2430,8 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
return -ENOMEM;
error = -EINVAL;
- if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
+ if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK
+ && flock->l_type != F_WRLCK)
goto out;
error = flock64_to_posix_lock(filp, fl, flock);
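
Together with posix_test_locks_conflict() earlier in this file, the two fcntl_getlk hunks above let F_OFD_GETLK accept l_type = F_UNLCK, turning the call into a query for "does this file description hold any lock over the range?". A hedged userspace sketch of the new semantics (the filename is illustrative):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		struct flock fl;
		int fd = open("datafile", O_RDWR);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		memset(&fl, 0, sizeof(fl));
		fl.l_type = F_UNLCK;	/* new: ask what *we* hold, not what conflicts */
		fl.l_whence = SEEK_SET;
		fl.l_start = 0;
		fl.l_len = 0;		/* whole file */
		if (fcntl(fd, F_OFD_GETLK, &fl) == -1) {
			perror("F_OFD_GETLK");
			return 1;
		}
		if (fl.l_type == F_UNLCK)
			printf("no OFD lock held by this file description\n");
		else
			printf("we hold a type-%d lock starting at %lld\n",
			       fl.l_type, (long long)fl.l_start);
		return 0;
	}
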
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 870207ba23f1..25c08fbfcb9d 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -251,7 +251,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode)
}
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = j;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_blocks = 0;
memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
insert_inode_hash(inode);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index bf9858f76b6a..20f23e6e58ad 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -281,7 +281,7 @@ got_it:
de->inode = inode->i_ino;
}
dir_commit_chunk(page, pos, sbi->s_dirsize);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
err = minix_handle_dirsync(dir);
out_put:
@@ -313,7 +313,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
else
de->inode = 0;
dir_commit_chunk(page, pos, len);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return minix_handle_dirsync(inode);
}
@@ -436,7 +436,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page,
else
de->inode = inode->i_ino;
dir_commit_chunk(page, pos, sbi->s_dirsize);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
return minix_handle_dirsync(dir);
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e9fbb5303a22..df575473c1cc 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -501,10 +501,7 @@ static struct inode *V1_minix_iget(struct inode *inode)
i_gid_write(inode, raw_inode->i_gid);
set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
- inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time;
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
+ inode->i_mtime = inode->i_atime = inode_set_ctime(inode, raw_inode->i_time, 0);
inode->i_blocks = 0;
for (i = 0; i < 9; i++)
minix_inode->u.i1_data[i] = raw_inode->i_zone[i];
@@ -543,10 +540,9 @@ static struct inode *V2_minix_iget(struct inode *inode)
inode->i_size = raw_inode->i_size;
inode->i_mtime.tv_sec = raw_inode->i_mtime;
inode->i_atime.tv_sec = raw_inode->i_atime;
- inode->i_ctime.tv_sec = raw_inode->i_ctime;
+ inode_set_ctime(inode, raw_inode->i_ctime, 0);
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
inode->i_blocks = 0;
for (i = 0; i < 10; i++)
minix_inode->u.i2_data[i] = raw_inode->i_zone[i];
@@ -622,7 +618,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
raw_inode->i_size = inode->i_size;
raw_inode->i_mtime = inode->i_mtime.tv_sec;
raw_inode->i_atime = inode->i_atime.tv_sec;
- raw_inode->i_ctime = inode->i_ctime.tv_sec;
+ raw_inode->i_ctime = inode_get_ctime(inode).tv_sec;
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
else for (i = 0; i < 10; i++)
@@ -660,7 +656,7 @@ int minix_getattr(struct mnt_idmap *idmap, const struct path *path,
struct super_block *sb = path->dentry->d_sb;
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (INODE_VERSION(inode) == MINIX_V1)
stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb);
else
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index 446148792f41..ce18ae37c29d 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -131,7 +131,7 @@ static inline int splice_branch(struct inode *inode,
/* We are done with atomic stuff, now do the rest of housekeeping */
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
/* had we spliced it onto indirect block? */
if (where->bh)
@@ -350,7 +350,7 @@ do_indirects:
}
first_whole++;
}
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
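
The minix hunks above are mechanical instances of the new ctime accessor API: inode_set_ctime_current() returns the timestamp it stored, so mtime (and atime) can be assigned from the same call, while inode_get_ctime()/inode_set_ctime_to_ts() replace direct i_ctime access. A sketch of the idiom in a hypothetical filesystem helper (the foofs_* name is illustrative):

	#include <linux/fs.h>

	/* Hypothetical helper showing the accessor idiom used in this series. */
	static void foofs_update_times(struct inode *dir, struct inode *inode)
	{
		/* Directory contents and metadata both changed: one call sets
		 * ctime and hands back the timestamp for mtime. */
		dir->i_mtime = inode_set_ctime_current(dir);
		mark_inode_dirty(dir);

		/* The child only changed metadata; mirror the directory's ctime. */
		inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
		mark_inode_dirty(inode);
	}
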
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 956d5183828d..114084d5636a 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -98,7 +98,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_link_count(inode);
ihold(inode);
return add_nondir(dentry, inode);
@@ -154,7 +154,7 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry)
if (err)
return err;
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
return 0;
}
@@ -218,7 +218,7 @@ static int minix_rename(struct mnt_idmap *idmap,
put_page(new_page);
if (err)
goto out_dir;
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
diff --git a/fs/namei.c b/fs/namei.c
index e56ff39a79bc..567ee547492b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -643,6 +643,8 @@ static bool nd_alloc_stack(struct nameidata *nd)
/**
* path_connected - Verify that a dentry is below mnt.mnt_root
+ * @mnt: The mountpoint to check.
+ * @dentry: The dentry to check.
*
* Rename can sometimes move a file or directory outside of a bind
* mount, path_connected allows those cases to be detected.
@@ -1083,6 +1085,7 @@ fs_initcall(init_fs_namei_sysctls);
/**
* may_follow_link - Check symlink following for unsafe situations
* @nd: nameidata pathwalk data
+ * @inode: Used for idmapping.
*
* In the case of the sysctl_protected_symlinks sysctl being enabled,
* CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
@@ -2890,7 +2893,7 @@ int path_pts(struct path *path)
dput(path->dentry);
path->dentry = parent;
child = d_hash_and_lookup(parent, &this);
- if (!child)
+ if (IS_ERR_OR_NULL(child))
return -ENOENT;
path->dentry = child;
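
The path_pts() fix above matters because d_hash_and_lookup() can return either NULL or an ERR_PTR-encoded error; testing only for NULL would let an error pointer through to be dereferenced. A minimal sketch of the convention (the foo_* caller is hypothetical):

	#include <linux/dcache.h>
	#include <linux/err.h>

	static int foo_child_exists(struct dentry *parent, struct qstr *name)
	{
		struct dentry *child = d_hash_and_lookup(parent, name);

		/* NULL means "not found"; an ERR_PTR means the lookup itself
		 * failed. Both must be screened before dereferencing. */
		if (IS_ERR_OR_NULL(child))
			return child ? PTR_ERR(child) : -ENOENT;
		dput(child);
		return 0;
	}
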
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c1eda73254e1..6bed1394d748 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -59,7 +59,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp,
res->change_attr = delegation->change_attr;
if (nfs_have_writebacks(inode))
res->change_attr++;
- res->ctime = inode->i_ctime;
+ res->ctime = inode_get_ctime(inode);
res->mtime = inode->i_mtime;
res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
args->bitmap[0];
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9a18c5a69ace..aaffaaa336cc 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -472,20 +472,26 @@ out:
return result;
}
-static void
-nfs_direct_join_group(struct list_head *list, struct inode *inode)
+static void nfs_direct_join_group(struct list_head *list, struct inode *inode)
{
- struct nfs_page *req, *next;
+ struct nfs_page *req, *subreq;
list_for_each_entry(req, list, wb_list) {
- if (req->wb_head != req || req->wb_this_page == req)
+ if (req->wb_head != req)
continue;
- for (next = req->wb_this_page;
- next != req->wb_head;
- next = next->wb_this_page) {
- nfs_list_remove_request(next);
- nfs_release_request(next);
- }
+ subreq = req->wb_this_page;
+ if (subreq == req)
+ continue;
+ do {
+ /*
+ * Remove subrequests from this list before freeing
+ * them in the call to nfs_join_page_group().
+ */
+ if (!list_empty(&subreq->wb_list)) {
+ nfs_list_remove_request(subreq);
+ nfs_release_request(subreq);
+ }
+ } while ((subreq = subreq->wb_this_page) != req);
nfs_join_page_group(req, inode);
}
}
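
The rework above restructures the subrequest walk as a do/while over the wb_this_page ring and only detaches subrequests that are still on the list, so nothing is released twice when nfs_join_page_group() later frees the group. A toy standalone model of the same ring walk (node/visit_ring are illustrative, not NFS code):

	#include <stdio.h>

	/* Toy model of the wb_this_page ring: 'next' wraps back to the head,
	 * so a do/while starting after the head visits each member once. */
	struct node { int id; struct node *next; };

	static void visit_ring(struct node *head)
	{
		struct node *n = head->next;

		if (n == head)		/* ring of one: no subrequests */
			return;
		do {
			printf("visiting %d\n", n->id);
		} while ((n = n->next) != head);
	}

	int main(void)
	{
		struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
		struct node head = { 0, &a };

		c.next = &head;
		visit_ring(&head);
		return 0;
	}
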
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index e1706e736c64..2dc64454492b 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -116,8 +116,8 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *
memset(auxdata, 0, sizeof(*auxdata));
auxdata->mtime_sec = inode->i_mtime.tv_sec;
auxdata->mtime_nsec = inode->i_mtime.tv_nsec;
- auxdata->ctime_sec = inode->i_ctime.tv_sec;
- auxdata->ctime_nsec = inode->i_ctime.tv_nsec;
+ auxdata->ctime_sec = inode_get_ctime(inode).tv_sec;
+ auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec;
if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4)
auxdata->change_attr = inode_peek_iversion_raw(inode);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 8172dd4135a1..e21c073158e5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -514,7 +514,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
memset(&inode->i_atime, 0, sizeof(inode->i_atime));
memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
- memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
+ inode_set_ctime(inode, 0, 0);
inode_set_iversion_raw(inode, 0);
inode->i_size = 0;
clear_nlink(inode);
@@ -535,7 +535,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME);
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
@@ -731,7 +731,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
if ((attr->ia_valid & ATTR_GID) != 0)
inode->i_gid = attr->ia_gid;
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
| NFS_INO_INVALID_CTIME);
@@ -749,7 +749,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
| NFS_INO_INVALID_CTIME);
@@ -765,7 +765,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
| NFS_INO_INVALID_CTIME);
@@ -912,7 +912,7 @@ out_no_revalidate:
/* Only return attributes that were revalidated. */
stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask;
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
stat->change_cookie = inode_peek_iversion_raw(inode);
stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC;
@@ -1444,11 +1444,11 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
}
/* If we have atomic WCC data, we may update some attributes */
- ts = inode->i_ctime;
+ ts = inode_get_ctime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
&& (fattr->valid & NFS_ATTR_FATTR_CTIME)
&& timespec64_equal(&ts, &fattr->pre_ctime)) {
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
}
ts = inode->i_mtime;
@@ -1510,7 +1510,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime))
invalid |= NFS_INO_INVALID_MTIME;
- ts = inode->i_ctime;
+ ts = inode_get_ctime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime))
invalid |= NFS_INO_INVALID_CTIME;
@@ -1997,7 +1997,7 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
}
if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
(fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
- fattr->pre_ctime = inode->i_ctime;
+ fattr->pre_ctime = inode_get_ctime(inode);
fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
}
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
@@ -2190,7 +2190,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
save_cache_validity & NFS_INO_INVALID_MTIME;
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_CTIME;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 19d51ebf842c..e7494cdd957e 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -215,7 +215,8 @@ nfs_namespace_getattr(struct mnt_idmap *idmap,
if (NFS_FH(d_inode(path->dentry))->size != 0)
return nfs_getattr(idmap, path, stat, request_mask,
query_flags);
- generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
+ stat);
return 0;
}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 63802d195556..49f78e23b34c 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1377,7 +1377,6 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
for (i = 0; i < np; i++) {
pages[i] = alloc_page(GFP_KERNEL);
if (!pages[i]) {
- np = i + 1;
err = -ENOMEM;
goto out;
}
@@ -1401,8 +1400,8 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
} while (exception.retry);
out:
- while (--np >= 0)
- __free_page(pages[np]);
+ while (--i >= 0)
+ __free_page(pages[i]);
kfree(pages);
return err;
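
The getxattr fix above drops the "np = i + 1" bookkeeping, whose free loop included the slot that had just failed to allocate, in favour of unwinding with the loop index itself. The idiom, in a hypothetical helper:

	#include <linux/gfp.h>
	#include <linux/mm.h>
	#include <linux/slab.h>

	/* Hypothetical helper: allocate np pages, freeing exactly the ones
	 * obtained if any allocation fails. */
	static struct page **foo_alloc_page_array(unsigned int np)
	{
		struct page **pages;
		int i;

		pages = kcalloc(np, sizeof(*pages), GFP_KERNEL);
		if (!pages)
			return NULL;
		for (i = 0; i < np; i++) {
			pages[i] = alloc_page(GFP_KERNEL);
			if (!pages[i])
				goto unwind;
		}
		return pages;
	unwind:
		/* i names the failed slot; --i steps back over allocated ones. */
		while (--i >= 0)
			__free_page(pages[i]);
		kfree(pages);
		return NULL;
	}
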
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4c9f8bd866ab..47c5c1f86d66 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -328,7 +328,7 @@ extern int update_open_stateid(struct nfs4_state *state,
const nfs4_stateid *open_stateid,
const nfs4_stateid *deleg_stateid,
fmode_t fmode);
-extern int nfs4_proc_setlease(struct file *file, long arg,
+extern int nfs4_proc_setlease(struct file *file, int arg,
struct file_lock **lease, void **priv);
extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
struct nfs_fsinfo *fsinfo);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 4aeadd6e1a6d..02788c3c85e5 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -438,7 +438,7 @@ void nfs42_ssc_unregister_ops(void)
}
#endif /* CONFIG_NFS_V4_2 */
-static int nfs4_setlease(struct file *file, long arg, struct file_lock **lease,
+static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease,
void **priv)
{
return nfs4_proc_setlease(file, arg, lease, priv);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e1a886b58354..d57aaf0cc577 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6004,9 +6004,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf,
out_ok:
ret = res.acl_len;
out_free:
- for (i = 0; i < npages; i++)
- if (pages[i])
- __free_page(pages[i]);
+ while (--i >= 0)
+ __free_page(pages[i]);
if (res.acl_scratch)
__free_page(res.acl_scratch);
kfree(pages);
@@ -7181,8 +7180,15 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
} else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
goto out_restart;
break;
- case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_OLD_STATEID:
+ if (data->arg.new_lock_owner != 0 &&
+ nfs4_refresh_open_old_stateid(&data->arg.open_stateid,
+ lsp->ls_state))
+ goto out_restart;
+ if (nfs4_refresh_lock_old_stateid(&data->arg.lock_stateid, lsp))
+ goto out_restart;
+ fallthrough;
+ case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
if (data->arg.new_lock_owner != 0) {
@@ -7573,7 +7579,7 @@ static int nfs4_delete_lease(struct file *file, void **priv)
return generic_setlease(file, F_UNLCK, NULL, priv);
}
-static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease,
+static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease,
void **priv)
{
struct inode *inode = file_inode(file);
@@ -7591,7 +7597,7 @@ static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease,
return -EAGAIN;
}
-int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease,
+int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease,
void **priv)
{
switch (arg) {
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index acda8f033d30..bf378ecd5d9f 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -345,8 +345,10 @@ void nfs_sysfs_move_sb_to_server(struct nfs_server *server)
int ret = -ENOMEM;
s = kasprintf(GFP_KERNEL, "server-%d", server->s_sysfs_id);
- if (s)
+ if (s) {
ret = kobject_rename(&server->kobj, s);
+ kfree(s);
+ }
if (ret < 0)
pr_warn("NFS: rename sysfs %s failed (%d)\n",
server->kobj.name, ret);
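
The sysfs fix above plugs a leak: kobject_rename() duplicates the name it is given, so the kasprintf() buffer remains the caller's to free. The corrected ownership pattern, reduced to a hypothetical helper (kobj/id are illustrative):

	#include <linux/kobject.h>
	#include <linux/slab.h>

	static void foo_rename_kobj(struct kobject *kobj, int id)
	{
		char *s = kasprintf(GFP_KERNEL, "server-%d", id);
		int ret = -ENOMEM;

		if (s) {
			ret = kobject_rename(kobj, s);
			kfree(s);	/* kobject_rename() kept its own copy */
		}
		if (ret < 0)
			pr_warn("rename to server-%d failed (%d)\n", id, ret);
	}
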
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3aefbad4cc09..daf305daa751 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1354,9 +1354,9 @@ static void revoke_delegation(struct nfs4_delegation *dp)
trace_nfsd_stid_revoke(&dp->dl_stid);
if (clp->cl_minorversion) {
+ spin_lock(&clp->cl_lock);
dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
refcount_inc(&dp->dl_stid.sc_count);
- spin_lock(&clp->cl_lock);
list_add(&dp->dl_recall_lru, &clp->cl_revoked);
spin_unlock(&clp->cl_lock);
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 1b8b1aab9a15..3709830f90a6 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1105,6 +1105,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
if (!nn->nfsd_serv)
return -EBUSY;
trace_nfsd_end_grace(netns(file));
+ nfsd4_end_grace(nn);
break;
default:
return -EINVAL;
@@ -1131,7 +1132,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
/* Following advice from simple_fill_super documentation: */
inode->i_ino = iunique(sb, NFSD_MaxReserved);
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_fop = &simple_dir_operations;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2c9074ab2315..9b7acba382fe 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -520,7 +520,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
nfsd_sanitize_attrs(inode, iap);
- if (check_guard && guardtime != inode->i_ctime.tv_sec)
+ if (check_guard && guardtime != inode_get_ctime(inode).tv_sec)
return nfserr_notsync;
/*
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index decd6471300b..bce734b68f08 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -429,7 +429,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, mapping, from, to);
nilfs_put_page(page);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
}
/*
@@ -519,7 +519,7 @@ got_it:
de->inode = cpu_to_le64(inode->i_ino);
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, page->mapping, from, to);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
nilfs_mark_inode_dirty(dir);
/* OFFSET_CACHE */
out_put:
@@ -567,7 +567,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
pde->rec_len = nilfs_rec_len_to_disk(to - from);
dir->inode = 0;
nilfs_commit_chunk(page, mapping, from, to);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
out:
nilfs_put_page(page);
return err;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 35bc79305318..d588c719d743 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -366,7 +366,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
atomic64_inc(&root->inodes_count);
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = ino;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
err = nilfs_bmap_read(ii->i_bmap, NULL);
@@ -450,10 +450,10 @@ int nilfs_read_inode_common(struct inode *inode,
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le64_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
- inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
+ inode_set_ctime(inode, le64_to_cpu(raw_inode->i_ctime),
+ le32_to_cpu(raw_inode->i_ctime_nsec));
inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode))
return -EIO; /* this inode is for metadata and corrupted */
@@ -768,9 +768,9 @@ void nilfs_write_inode_common(struct inode *inode,
raw_inode->i_gid = cpu_to_le32(i_gid_read(inode));
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
raw_inode->i_size = cpu_to_le64(inode->i_size);
- raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ raw_inode->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
@@ -875,7 +875,7 @@ void nilfs_truncate(struct inode *inode)
nilfs_truncate_bmap(ii, blkoff);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 1dfbc0c34513..40ffade49f38 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -149,7 +149,7 @@ int nilfs_fileattr_set(struct mnt_idmap *idmap,
NILFS_I(inode)->i_flags = oldflags | (flags & FS_FL_USER_MODIFIABLE);
nilfs_set_inode_flags(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index c7024da8f1e2..2a4e7f4a8102 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -185,7 +185,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
return err;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -283,7 +283,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
drop_nlink(inode);
err = 0;
out:
@@ -387,7 +387,7 @@ static int nilfs_rename(struct mnt_idmap *idmap,
goto out_dir;
nilfs_set_link(new_dir, new_de, new_page, old_inode);
nilfs_mark_inode_dirty(new_dir);
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
drop_nlink(new_inode);
@@ -406,7 +406,7 @@ static int nilfs_rename(struct mnt_idmap *idmap,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
nilfs_delete_entry(old_de, old_page);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 581691e4be49..7ec16879756e 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -725,6 +725,11 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
struct folio *folio = fbatch.folios[i];
folio_lock(folio);
+ if (unlikely(folio->mapping != mapping)) {
+ /* Exclude folios removed from the address space */
+ folio_unlock(folio);
+ continue;
+ }
head = folio_buffers(folio);
if (!head) {
create_empty_buffers(&folio->page, i_blocksize(inode), 0);
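
The nilfs2 hunk above adds the standard revalidation step after folio_lock(): between the batched lookup and taking the lock, truncation can detach a folio from the address space, so the mapping must be rechecked before its buffers are touched. In outline:

	folio_lock(folio);
	if (unlikely(folio->mapping != mapping)) {
		/* Truncated (or otherwise removed) while we slept on the
		 * lock -- no longer this inode's folio. */
		folio_unlock(folio);
		continue;
	}
	/* Still attached to this inode: safe to use. */
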
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0ef8c71bde8e..a5d1fa4e7552 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -35,6 +35,7 @@
#include <linux/writeback.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
+#include <linux/fs_context.h>
#include "nilfs.h"
#include "export.h"
#include "mdt.h"
@@ -1216,7 +1217,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
}
struct nilfs_super_data {
- struct block_device *bdev;
__u64 cno;
int flags;
};
@@ -1283,64 +1283,49 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
static int nilfs_set_bdev_super(struct super_block *s, void *data)
{
- s->s_bdev = data;
- s->s_dev = s->s_bdev->bd_dev;
+ s->s_dev = *(dev_t *)data;
return 0;
}
static int nilfs_test_bdev_super(struct super_block *s, void *data)
{
- return (void *)s->s_bdev == data;
+ return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
}
static struct dentry *
nilfs_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data)
{
- struct nilfs_super_data sd;
+ struct nilfs_super_data sd = { .flags = flags };
struct super_block *s;
- struct dentry *root_dentry;
- int err, s_new = false;
+ dev_t dev;
+ int err;
- sd.bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type,
- NULL);
- if (IS_ERR(sd.bdev))
- return ERR_CAST(sd.bdev);
+ if (nilfs_identify(data, &sd))
+ return ERR_PTR(-EINVAL);
- sd.cno = 0;
- sd.flags = flags;
- if (nilfs_identify((char *)data, &sd)) {
- err = -EINVAL;
- goto failed;
- }
+ err = lookup_bdev(dev_name, &dev);
+ if (err)
+ return ERR_PTR(err);
- /*
- * once the super is inserted into the list by sget, s_umount
- * will protect the lockfs code from trying to start a snapshot
- * while we are mounting
- */
- mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
- if (sd.bdev->bd_fsfreeze_count > 0) {
- mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
- err = -EBUSY;
- goto failed;
- }
s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags,
- sd.bdev);
- mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
- if (IS_ERR(s)) {
- err = PTR_ERR(s);
- goto failed;
- }
+ &dev);
+ if (IS_ERR(s))
+ return ERR_CAST(s);
if (!s->s_root) {
- s_new = true;
-
- /* New superblock instance created */
- snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev);
- sb_set_blocksize(s, block_size(sd.bdev));
-
- err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0);
+ /*
+ * We drop s_umount here because we need to open the bdev and
+ * bdev->open_mutex ranks above s_umount (blkdev_put() ->
+ * __invalidate_device()). It is safe because we have active sb
+ * reference and SB_BORN is not set yet.
+ */
+ up_write(&s->s_umount);
+ err = setup_bdev_super(s, flags, NULL);
+ down_write(&s->s_umount);
+ if (!err)
+ err = nilfs_fill_super(s, data,
+ flags & SB_SILENT ? 1 : 0);
if (err)
goto failed_super;
@@ -1366,24 +1351,18 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}
if (sd.cno) {
+ struct dentry *root_dentry;
+
err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
if (err)
goto failed_super;
- } else {
- root_dentry = dget(s->s_root);
+ return root_dentry;
}
- if (!s_new)
- blkdev_put(sd.bdev, fs_type);
-
- return root_dentry;
+ return dget(s->s_root);
failed_super:
deactivate_locked_super(s);
-
- failed:
- if (!s_new)
- blkdev_put(sd.bdev, fs_type);
return ERR_PTR(err);
}
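
The mount rework above follows the series moving block filesystems off pre-opened block devices: the superblock is located by dev_t (lookup_bdev() plus an sget() test that also skips retired supers), and the device is opened only for a genuinely new instance via setup_bdev_super(), with s_umount dropped around it for the lock-ordering reason documented in the hunk. A condensed, hypothetical sketch of that flow (foofs_get_super and the test/set callbacks are illustrative):

	#include <linux/blkdev.h>
	#include <linux/fs.h>

	static struct super_block *
	foofs_get_super(struct file_system_type *fs_type, int flags,
			const char *dev_name,
			int (*test)(struct super_block *, void *),
			int (*set)(struct super_block *, void *))
	{
		struct super_block *s;
		dev_t dev;
		int err;

		err = lookup_bdev(dev_name, &dev);
		if (err)
			return ERR_PTR(err);
		s = sget(fs_type, test, set, flags, &dev);
		if (IS_ERR(s))
			return s;
		if (!s->s_root) {
			/* New instance: open the device now. s_umount is
			 * dropped because bdev->open_mutex ranks above it;
			 * safe while SB_BORN is unset and we hold an active
			 * reference. */
			up_write(&s->s_umount);
			err = setup_bdev_super(s, flags, NULL);
			down_write(&s->s_umount);
			if (err) {
				deactivate_locked_super(s);
				return ERR_PTR(err);
			}
		}
		return s;
	}
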
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 190aa717fa32..ebdcc25df0f7 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -199,7 +199,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
}
/* this conversion is done only at watch creation */
-static __u32 convert_arg(unsigned long arg)
+static __u32 convert_arg(unsigned int arg)
{
__u32 new_mask = FS_EVENT_ON_CHILD;
@@ -258,7 +258,7 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
* up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be
* attached to the fsnotify_mark.
*/
-int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
+int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
{
struct dnotify_mark *new_dn_mark, *dn_mark;
struct fsnotify_mark *new_fsn_mark, *fsn_mark;
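
The dnotify hunks above narrow the watch mask argument from unsigned long to unsigned int, matching what fcntl(2) actually delivers. The userspace side of the interface, for reference (directory path illustrative):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static void on_event(int sig) { (void)sig; }

	int main(void)
	{
		int fd = open(".", O_RDONLY | O_DIRECTORY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		signal(SIGIO, on_event);
		/* Watch for creations and renames; DN_MULTISHOT re-arms the
		 * watch after each event. The mask fits an unsigned int. */
		if (fcntl(fd, F_NOTIFY, DN_CREATE | DN_RENAME | DN_MULTISHOT) == -1) {
			perror("F_NOTIFY");
			return 1;
		}
		pause();	/* SIGIO arrives when the directory changes */
		return 0;
	}
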
diff --git a/fs/nsfs.c b/fs/nsfs.c
index f602a96a1afe..647a22433bd8 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -84,7 +84,7 @@ slow:
return -ENOMEM;
}
inode->i_ino = ns->inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_flags |= S_IMMUTABLE;
inode->i_mode = S_IFREG | S_IRUGO;
inode->i_fop = &ns_file_operations;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 6c3f38d66579..99ac6ea277c4 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -654,7 +654,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
* always changes, when mtime is changed. ctime can be changed on its
* own, mtime is then not changed, e.g. when a file is renamed.
*/
- vi->i_ctime = ntfs2utc(si->last_mft_change_time);
+ inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time));
/*
* Last access to the data within the file. Not changed during a rename
* for example but changed whenever the file is written to.
@@ -1218,7 +1218,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
vi->i_mtime = base_vi->i_mtime;
- vi->i_ctime = base_vi->i_ctime;
+ inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
vi->i_atime = base_vi->i_atime;
vi->i_generation = ni->seq_no = base_ni->seq_no;
@@ -1484,7 +1484,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
vi->i_mtime = base_vi->i_mtime;
- vi->i_ctime = base_vi->i_ctime;
+ inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
vi->i_atime = base_vi->i_atime;
vi->i_generation = ni->seq_no = base_ni->seq_no;
/* Set inode type to zero but preserve permissions. */
@@ -2804,13 +2804,14 @@ done:
*/
if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
struct timespec64 now = current_time(VFS_I(base_ni));
+ struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni));
int sync_it = 0;
if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) ||
- !timespec64_equal(&VFS_I(base_ni)->i_ctime, &now))
+ !timespec64_equal(&ctime, &now))
sync_it = 1;
+ inode_set_ctime_to_ts(VFS_I(base_ni), now);
VFS_I(base_ni)->i_mtime = now;
- VFS_I(base_ni)->i_ctime = now;
if (sync_it)
mark_inode_dirty_sync(VFS_I(base_ni));
@@ -2928,7 +2929,7 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (ia_valid & ATTR_MTIME)
vi->i_mtime = attr->ia_mtime;
if (ia_valid & ATTR_CTIME)
- vi->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(vi, attr->ia_ctime);
mark_inode_dirty(vi);
out:
return err;
@@ -3004,7 +3005,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si->last_data_change_time = nt;
modified = true;
}
- nt = utc2ntfs(vi->i_ctime);
+ nt = utc2ntfs(inode_get_ctime(vi));
if (si->last_mft_change_time != nt) {
ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino, (long long)
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 0155f106ec34..ad1a8f72da22 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2682,8 +2682,7 @@ mft_rec_already_initialized:
vi->i_mode &= ~S_IWUGO;
/* Set the inode times to the current time. */
- vi->i_atime = vi->i_mtime = vi->i_ctime =
- current_time(vi);
+ vi->i_atime = vi->i_mtime = inode_set_ctime_current(vi);
/*
* Set the file size to 0, the ntfs inode sizes are set to 0 by
* the call to ntfs_init_big_inode() below.
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 1d6c824246c4..962f12ce6c0a 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -85,7 +85,7 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED;
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
stat->result_mask |= STATX_BTIME;
stat->btime = ni->i_crtime;
@@ -342,7 +342,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
err = 0;
}
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
if (IS_SYNC(inode)) {
@@ -400,7 +400,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
ni_unlock(ni);
ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (!IS_DIRSYNC(inode)) {
dirty = 1;
} else {
@@ -642,7 +642,7 @@ out:
filemap_invalidate_unlock(mapping);
if (!err) {
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 16bd9faa2d28..2b85cb10f0be 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -3265,6 +3265,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
if (is_rec_inuse(ni->mi.mrec) &&
!(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) {
bool modified = false;
+ struct timespec64 ctime = inode_get_ctime(inode);
/* Update times in standard attribute. */
std = ni_std(ni);
@@ -3280,7 +3281,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
modified = true;
}
- dup.c_time = kernel2nt(&inode->i_ctime);
+ dup.c_time = kernel2nt(&ctime);
if (std->c_time != dup.c_time) {
std->c_time = dup.c_time;
modified = true;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index dc7e7ab701c6..4123e126c4d0 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -44,6 +44,7 @@ static struct inode *ntfs_read_mft(struct inode *inode,
u64 t64;
struct MFT_REC *rec;
struct runs_tree *run;
+ struct timespec64 ctime;
inode->i_op = NULL;
/* Setup 'uid' and 'gid' */
@@ -169,7 +170,8 @@ next_attr:
nt2kernel(std5->cr_time, &ni->i_crtime);
#endif
nt2kernel(std5->a_time, &inode->i_atime);
- nt2kernel(std5->c_time, &inode->i_ctime);
+ ctime = inode_get_ctime(inode);
+ nt2kernel(std5->c_time, &ctime);
nt2kernel(std5->m_time, &inode->i_mtime);
ni->std_fa = std5->fa;
@@ -958,7 +960,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
if (err >= 0) {
if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) {
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
dirty = true;
}
@@ -1658,8 +1660,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
/* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */
- inode->i_atime = inode->i_mtime = inode->i_ctime = dir->i_mtime =
- dir->i_ctime = ni->i_crtime;
+ inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, ni->i_crtime);
+ dir->i_mtime = inode_set_ctime_to_ts(dir, ni->i_crtime);
mark_inode_dirty(dir);
mark_inode_dirty(inode);
@@ -1765,9 +1767,9 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry)
if (!err) {
drop_nlink(inode);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
if (inode->i_nlink)
mark_inode_dirty(inode);
} else if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) {
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index 70f8c859e0ad..ad430d50bd79 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de)
err = ntfs_link_inode(inode, de);
if (!err) {
- dir->i_ctime = dir->i_mtime = inode->i_ctime =
- current_time(dir);
+ dir->i_mtime = inode_set_ctime_to_ts(inode,
+ inode_set_ctime_current(dir));
mark_inode_dirty(inode);
mark_inode_dirty(dir);
d_instantiate(de, inode);
@@ -324,14 +324,11 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
/* Restore after failed rename failed too. */
_ntfs_bad_inode(inode);
} else if (!err) {
- inode->i_ctime = dir->i_ctime = dir->i_mtime =
- current_time(dir);
+ simple_rename_timestamp(dir, dentry, new_dir, new_dentry);
mark_inode_dirty(inode);
mark_inode_dirty(dir);
- if (dir != new_dir) {
- new_dir->i_mtime = new_dir->i_ctime = dir->i_ctime;
+ if (dir != new_dir)
mark_inode_dirty(new_dir);
- }
if (IS_DIRSYNC(dir))
ntfs_sync_inode(dir);
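
The ntfs3 rename hunk above swaps four hand-rolled timestamp assignments for simple_rename_timestamp(), a libfs helper introduced alongside these conversions that updates ctime on the source (and any target) inode and mtime/ctime on both parent directories in one call. Usage reduces to the following outline:

	/* After the directory entries have been moved: */
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
	mark_inode_dirty(old_dir);
	if (old_dir != new_dir)
		mark_inode_dirty(new_dir);
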
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 1a02072b6b0e..5fffddea554f 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -569,9 +569,9 @@ static void init_once(void *foo)
}
/*
- * put_ntfs - Noinline to reduce binary size.
+ * Noinline to reduce binary size.
*/
-static noinline void put_ntfs(struct ntfs_sb_info *sbi)
+static noinline void ntfs3_free_sbi(struct ntfs_sb_info *sbi)
{
kfree(sbi->new_rec);
kvfree(ntfs_put_shared(sbi->upcase));
@@ -625,12 +625,6 @@ static void ntfs_put_super(struct super_block *sb)
/* Mark rw ntfs as clear, if possible. */
ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);
-
- put_mount_options(sbi->options);
- put_ntfs(sbi);
- sb->s_fs_info = NULL;
-
- sync_blockdev(sb->s_bdev);
}
static int ntfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1564,15 +1558,7 @@ load_root:
put_inode_out:
iput(inode);
out:
- /*
- * Free resources here.
- * ntfs_fs_free will be called with fc->s_fs_info = NULL
- */
- put_mount_options(sbi->options);
- put_ntfs(sbi);
- sb->s_fs_info = NULL;
kfree(boot2);
-
return err;
}
@@ -1659,7 +1645,7 @@ static void ntfs_fs_free(struct fs_context *fc)
struct ntfs_sb_info *sbi = fc->s_fs_info;
if (sbi)
- put_ntfs(sbi);
+ ntfs3_free_sbi(sbi);
if (opts)
put_mount_options(opts);
@@ -1728,13 +1714,24 @@ free_opts:
return -ENOMEM;
}
+static void ntfs3_kill_sb(struct super_block *sb)
+{
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+
+ kill_block_super(sb);
+
+ if (sbi->options)
+ put_mount_options(sbi->options);
+ ntfs3_free_sbi(sbi);
+}
+
// clang-format off
static struct file_system_type ntfs_fs_type = {
.owner = THIS_MODULE,
.name = "ntfs3",
.init_fs_context = ntfs_init_fs_context,
.parameters = ntfs_fs_parameters,
- .kill_sb = kill_block_super,
+ .kill_sb = ntfs3_kill_sb,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
// clang-format on
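
The ntfs3 change above moves option and sb-info teardown out of ->put_super and into a ->kill_sb wrapper: the generic teardown path invokes ->put_super and syncs the device, so filesystem-private state must outlive kill_block_super(). The ordering, as a hypothetical minimal example (foofs_* names illustrative):

	#include <linux/fs.h>
	#include <linux/slab.h>

	/* Generic teardown first -- it still needs sb->s_fs_info and the
	 * bdev -- then the private state, which ->put_super no longer
	 * frees. */
	static void foofs_kill_sb(struct super_block *sb)
	{
		struct foofs_sb_info *sbi = sb->s_fs_info;

		kill_block_super(sb);
		kfree(sbi);
	}
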
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 023f314e8950..29fd391899e5 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -637,7 +637,7 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap,
if (!err) {
set_cached_acl(inode, type, acl);
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
@@ -924,7 +924,7 @@ set_new_fa:
NULL);
out:
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return err;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 9fd03eaf15f8..e75137a8e7cb 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -191,10 +191,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_mode = new_mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
di->i_mode = cpu_to_le16(inode->i_mode);
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 51c93929a146..aef58f1395c8 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7436,10 +7436,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
- di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8dfc284e85f0..0fdba30740ab 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2048,7 +2048,7 @@ out_write_size:
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
if (handle)
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 694471fc46b8..8b123d543e6e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1658,7 +1658,7 @@ int __ocfs2_add_entry(handle_t *handle,
offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
if (ocfs2_dirent_would_fit(de, rec_len)) {
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (retval < 0) {
mlog_errno(retval);
@@ -2962,11 +2962,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_dinode_new_extent_list(dir, di);
i_size_write(dir, sb->s_blocksize);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
di->i_size = cpu_to_le64(sb->s_blocksize);
- di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(dir).tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(dir).tv_nsec);
ocfs2_update_inode_fsync_trans(handle, dir, 1);
/*
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index ba26c5567cff..81265123ce6c 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -337,7 +337,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inc_nlink(inode);
inode->i_fop = &simple_dir_operations;
@@ -360,7 +360,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, parent, mode);
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
ip = DLMFS_I(inode);
ip->ip_conn = DLMFS_I(parent)->ip_conn;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index c28bc983a7b1..c3e2961ee5db 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2162,6 +2162,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
+ struct timespec64 ctime = inode_get_ctime(inode);
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2185,7 +2186,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_iatime_packed =
cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
lvb->lvb_ictime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
+ cpu_to_be64(ocfs2_pack_timespec(&ctime));
lvb->lvb_imtime_packed =
cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
@@ -2208,6 +2209,7 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
+ struct timespec64 ctime;
mlog_meta_lvb(0, lockres);
@@ -2238,8 +2240,9 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
be64_to_cpu(lvb->lvb_iatime_packed));
ocfs2_unpack_timespec(&inode->i_mtime,
be64_to_cpu(lvb->lvb_imtime_packed));
- ocfs2_unpack_timespec(&inode->i_ctime,
+ ocfs2_unpack_timespec(&ctime,
be64_to_cpu(lvb->lvb_ictime_packed));
+ inode_set_ctime_to_ts(inode, ctime);
spin_unlock(&oi->ip_lock);
return 0;
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bf2c17ea96a0..3b91b4cc7c6a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -232,8 +232,10 @@ int ocfs2_should_update_atime(struct inode *inode,
return 0;
if (vfsmnt->mnt_flags & MNT_RELATIME) {
+ struct timespec64 ctime = inode_get_ctime(inode);
+
if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
- (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0))
+ (timespec64_compare(&inode->i_atime, &ctime) <= 0))
return 1;
return 0;
@@ -294,7 +296,7 @@ int ocfs2_set_inode_size(handle_t *handle,
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
@@ -415,12 +417,12 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
}
i_size_write(inode, new_i_size);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
di = (struct ocfs2_dinode *) fe_bh->b_data;
di->i_size = cpu_to_le64(new_i_size);
- di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, fe_bh);
@@ -824,7 +826,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
i_size_write(inode, abs_to);
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
di->i_mtime_nsec = di->i_ctime_nsec;
@@ -1317,7 +1319,7 @@ int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path,
goto bail;
}
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
/*
* If there is inline data in the inode, the inode will normally not
* have data blocks allocated (it may have an external xattr block).
@@ -2043,7 +2045,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out_inode_unlock;
}
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
if (ret < 0)
mlog_errno(ret);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index bb116c39b581..e8771600b930 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -306,8 +306,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
- inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
- inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+ inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
+ le32_to_cpu(fe->i_ctime_nsec));
if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno))
mlog(ML_ERROR,
@@ -1314,8 +1314,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_mode = cpu_to_le16(inode->i_mode);
fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
@@ -1352,8 +1352,8 @@ void ocfs2_refresh_inode(struct inode *inode,
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
- inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
- inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+ inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
+ le32_to_cpu(fe->i_ctime_nsec));
spin_unlock(&OCFS2_I(inode)->ip_lock);
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 25d8072ccfce..c19c730c26e2 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -557,7 +557,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
(unsigned long)bh,
(unsigned long long)bh->b_blocknr);
- ocfs2_error(bh->b_bdev->bd_super,
+ ocfs2_error(bh->b_assoc_map->host->i_sb,
"JBD2 has aborted our journal, ocfs2 cannot continue\n");
}
@@ -780,14 +780,14 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
mlog_errno(status);
if (!is_handle_aborted(handle)) {
journal_t *journal = handle->h_transaction->t_journal;
- struct super_block *sb = bh->b_bdev->bd_super;
mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. "
"Aborting transaction and journal.\n");
handle->h_err = status;
jbd2_journal_abort_handle(handle);
jbd2_journal_abort(journal, status);
- ocfs2_abort(sb, "Journal already aborted.\n");
+ ocfs2_abort(bh->b_assoc_map->host->i_sb,
+ "Journal already aborted.\n");
}
}
}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index b1e32ec4a9d4..05d67968a3a9 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -950,9 +950,9 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
}
di = (struct ocfs2_dinode *)di_bh->b_data;
- inode->i_ctime = current_time(inode);
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ inode_set_ctime_current(inode);
+ di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 17c52225b87d..e4a684d45308 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -793,10 +793,10 @@ static int ocfs2_link(struct dentry *old_dentry,
}
inc_nlink(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
- fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
@@ -995,7 +995,7 @@ static int ocfs2_unlink(struct inode *dir,
ocfs2_set_links_count(fe, inode->i_nlink);
ocfs2_journal_dirty(handle, fe_bh);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
if (S_ISDIR(inode->i_mode))
drop_nlink(dir);
@@ -1537,7 +1537,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
new_dir_bh, &target_insert);
}
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
mark_inode_dirty(old_inode);
status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode),
@@ -1546,8 +1546,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (status >= 0) {
old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
- old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
- old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
+ old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec);
+ old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec);
ocfs2_journal_dirty(handle, old_inode_bh);
} else
mlog_errno(status);
@@ -1586,9 +1586,9 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (new_inode) {
drop_nlink(new_inode);
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
}
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
+ old_dir->i_mtime = inode_set_ctime_current(old_dir);
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
@@ -1610,7 +1610,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (old_dir != new_dir) {
/* Keep the same times on both directories.*/
- new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime;
+ new_dir->i_mtime = inode_set_ctime_to_ts(new_dir,
+ inode_get_ctime(old_dir));
/*
* This will also pick up the i_nlink change from the
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 564ab48d03ef..25c8ec3c8c3a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3750,9 +3750,9 @@ static int ocfs2_change_ctime(struct inode *inode,
goto out_commit;
}
- inode->i_ctime = current_time(inode);
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ inode_set_ctime_current(inode);
+ di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_journal_dirty(handle, di_bh);
@@ -4073,10 +4073,10 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
* we want mtime to appear identical to the source and
* update ctime.
*/
- t_inode->i_ctime = current_time(t_inode);
+ inode_set_ctime_current(t_inode);
- di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime(t_inode).tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(t_inode).tv_nsec);
t_inode->i_mtime = s_inode->i_mtime;
di->i_mtime = s_di->i_mtime;
@@ -4456,7 +4456,7 @@ int ocfs2_reflink_update_dest(struct inode *dest,
if (newlen > i_size_read(dest))
i_size_write(dest, newlen);
spin_unlock(&OCFS2_I(dest)->ip_lock);
- dest->i_ctime = dest->i_mtime = current_time(dest);
+ dest->i_mtime = inode_set_ctime_current(dest);
ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
if (ret) {
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 4ac77ff6e676..6510ad783c91 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3421,9 +3421,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- inode->i_ctime = current_time(inode);
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ inode_set_ctime_current(inode);
+ di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
}
out:
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 82cf7e9a665f..6bda275826d6 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -143,7 +143,7 @@ static int omfs_add_link(struct dentry *dentry, struct inode *inode)
mark_buffer_dirty(bh);
brelse(bh);
- dir->i_ctime = current_time(dir);
+ inode_set_ctime_current(dir);
/* mark affected inodes dirty to rebuild checksums */
mark_inode_dirty(dir);
@@ -399,7 +399,7 @@ static int omfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (err)
goto out;
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
mark_inode_dirty(old_inode);
out:
return err;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index c4c79e07efc7..2f8c1882f45c 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -51,7 +51,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode)
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_mapping->a_ops = &omfs_aops;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_op = &omfs_dir_inops;
@@ -134,8 +134,8 @@ static int __omfs_write_inode(struct inode *inode, int wait)
oi->i_head.h_magic = OMFS_IMAGIC;
oi->i_size = cpu_to_be64(inode->i_size);
- ctime = inode->i_ctime.tv_sec * 1000LL +
- ((inode->i_ctime.tv_nsec + 999)/1000);
+ ctime = inode_get_ctime(inode).tv_sec * 1000LL +
+ ((inode_get_ctime(inode).tv_nsec + 999)/1000);
oi->i_ctime = cpu_to_be64(ctime);
omfs_update_checksums(oi);
@@ -232,10 +232,9 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
inode->i_atime.tv_sec = ctime;
inode->i_mtime.tv_sec = ctime;
- inode->i_ctime.tv_sec = ctime;
+ inode_set_ctime(inode, ctime, nsecs);
inode->i_atime.tv_nsec = nsecs;
inode->i_mtime.tv_nsec = nsecs;
- inode->i_ctime.tv_nsec = nsecs;
inode->i_mapping->a_ops = &omfs_aops;
diff --git a/fs/open.c b/fs/open.c
index e6ead0f19964..98f6601fbac6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -671,11 +671,20 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
return err;
}
-static int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
+static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
+ unsigned int flags)
{
struct path path;
int error;
- unsigned int lookup_flags = LOOKUP_FOLLOW;
+ unsigned int lookup_flags;
+
+ if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)))
+ return -EINVAL;
+
+ lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (!error) {
@@ -689,15 +698,21 @@ retry:
return error;
}
+SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename,
+ umode_t, mode, unsigned int, flags)
+{
+ return do_fchmodat(dfd, filename, mode, flags);
+}
+
SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
umode_t, mode)
{
- return do_fchmodat(dfd, filename, mode);
+ return do_fchmodat(dfd, filename, mode, 0);
}
SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
- return do_fchmodat(AT_FDCWD, filename, mode);
+ return do_fchmodat(AT_FDCWD, filename, mode, 0);
}
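The new fchmodat2() syscall finally accepts the AT_* flags that POSIX specifies for fchmodat() but which the original Linux syscall had no argument for. A hypothetical userspace sketch (SYS_fchmodat2 is only defined in sufficiently new kernel headers, and libc wrappers may not exist yet):

#define _GNU_SOURCE
#include <fcntl.h>              /* AT_FDCWD, AT_SYMLINK_NOFOLLOW */
#include <sys/syscall.h>
#include <unistd.h>

/* Pass AT_SYMLINK_NOFOLLOW so a trailing symlink is not followed;
 * the original fchmodat syscall took no flags argument at all. */
static int chmod_nofollow(const char *path, mode_t mode)
{
        return syscall(SYS_fchmodat2, AT_FDCWD, path, mode,
                       AT_SYMLINK_NOFOLLOW);
}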
/*
@@ -1150,7 +1165,7 @@ EXPORT_SYMBOL_GPL(kernel_file_open);
* backing_file_open - open a backing file for kernel internal use
* @path: path of the file to open
* @flags: open flags
- * @path: path of the backing file
+ * @real_path: path of the backing file
* @cred: credentials for open
*
* Open a backing file for a stackable filesystem (e.g., overlayfs).
@@ -1503,7 +1518,7 @@ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
* "id" is the POSIX thread ID. We use the
 * files pointer for this.
*/
-int filp_close(struct file *filp, fl_owner_t id)
+static int filp_flush(struct file *filp, fl_owner_t id)
{
int retval = 0;
@@ -1520,10 +1535,18 @@ int filp_close(struct file *filp, fl_owner_t id)
dnotify_flush(filp, id);
locks_remove_posix(filp, id);
}
- fput(filp);
return retval;
}
+int filp_close(struct file *filp, fl_owner_t id)
+{
+ int retval;
+
+ retval = filp_flush(filp, id);
+ fput(filp);
+
+ return retval;
+}
EXPORT_SYMBOL(filp_close);
/*
@@ -1533,7 +1556,20 @@ EXPORT_SYMBOL(filp_close);
*/
SYSCALL_DEFINE1(close, unsigned int, fd)
{
- int retval = close_fd(fd);
+ int retval;
+ struct file *file;
+
+ file = close_fd_get_file(fd);
+ if (!file)
+ return -EBADF;
+
+ retval = filp_flush(file, current->files);
+
+ /*
+ * We're returning to user space. Don't bother
+ * with any delayed fput() cases.
+ */
+ __fput_sync(file);
/* can't restart close syscall because file table entry was cleared */
if (unlikely(retval == -ERESTARTSYS ||
@@ -1546,7 +1582,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
}
/**
- * close_range() - Close all file descriptors in a given range.
+ * sys_close_range() - Close all file descriptors in a given range.
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index f0b7f4d51a17..b2457cb97fa0 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -237,7 +237,7 @@ found:
if (IS_ERR(inode))
return ERR_CAST(inode);
if (inode->i_state & I_NEW) {
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
ent_oi = OP_I(inode);
ent_oi->type = ent_type;
ent_oi->u = ent_data;
@@ -387,8 +387,7 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc)
goto out_no_root;
}
- root_inode->i_mtime = root_inode->i_atime =
- root_inode->i_ctime = current_time(root_inode);
+ root_inode->i_mtime = root_inode->i_atime = inode_set_ctime_current(root_inode);
root_inode->i_op = &openprom_inode_operations;
root_inode->i_fop = &openprom_operations;
root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 9014bbcc8031..085912268442 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -871,7 +871,7 @@ int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path,
ret = orangefs_inode_getattr(inode,
request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0);
if (ret == 0) {
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
/* override block size reported to stat */
if (!(request_mask & STATX_SIZE))
@@ -900,12 +900,13 @@ int orangefs_permission(struct mnt_idmap *idmap,
return generic_permission(&nop_mnt_idmap, inode, mask);
}
-int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags)
+int orangefs_update_time(struct inode *inode, int flags)
{
struct iattr iattr;
+
gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n",
get_khandle_from_ino(inode));
- generic_update_time(inode, time, flags);
+ flags = generic_update_time(inode, flags);
memset(&iattr, 0, sizeof iattr);
if (flags & S_ATIME)
iattr.ia_valid |= ATTR_ATIME;
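Throughout this series, ->update_time instances lose the timespec64 argument; the current time is derived inside the VFS, and generic_update_time() now returns the subset of flags it actually serviced, as the orangefs hunk above relies on. A minimal sketch of an instance under the new prototype (the foo_* names are hypothetical):

static int foo_update_time(struct inode *inode, int flags)
{
        /* Let the VFS stamp the in-core inode; the return value is the
         * set of S_ATIME/S_CTIME/S_MTIME/S_VERSION flags it updated. */
        flags = generic_update_time(inode, flags);

        if (flags & (S_CTIME | S_MTIME))
                foo_schedule_writeback(inode);  /* hypothetical helper */
        return 0;
}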
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 77518e248cf7..c9dfd5c6a097 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -421,7 +421,7 @@ static int orangefs_rename(struct mnt_idmap *idmap,
ret);
if (new_dentry->d_inode)
- new_dentry->d_inode->i_ctime = current_time(new_dentry->d_inode);
+ inode_set_ctime_current(d_inode(new_dentry));
op_release(new_op);
return ret;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index ce20d3443869..b711654ca18a 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -370,7 +370,7 @@ int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path,
int orangefs_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask);
-int orangefs_update_time(struct inode *, struct timespec64 *, int);
+int orangefs_update_time(struct inode *, int);
/*
* defined in xattr.c
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 46b7dcff18ac..0a9fcfdf552f 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -361,11 +361,11 @@ again2:
downcall.resp.getattr.attributes.atime;
inode->i_mtime.tv_sec = (time64_t)new_op->
downcall.resp.getattr.attributes.mtime;
- inode->i_ctime.tv_sec = (time64_t)new_op->
- downcall.resp.getattr.attributes.ctime;
+ inode_set_ctime(inode,
+ (time64_t)new_op->downcall.resp.getattr.attributes.ctime,
+ 0);
inode->i_atime.tv_nsec = 0;
inode->i_mtime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
/* special case: mark the root inode as sticky */
inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 21245b00722a..eaa1e6b3e04a 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -239,6 +239,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
static void ovl_file_accessed(struct file *file)
{
struct inode *inode, *upperinode;
+ struct timespec64 ctime, uctime;
if (file->f_flags & O_NOATIME)
return;
@@ -249,10 +250,12 @@ static void ovl_file_accessed(struct file *file)
if (!upperinode)
return;
+ ctime = inode_get_ctime(inode);
+ uctime = inode_get_ctime(upperinode);
if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) ||
- !timespec64_equal(&inode->i_ctime, &upperinode->i_ctime))) {
+ !timespec64_equal(&ctime, &uctime))) {
inode->i_mtime = upperinode->i_mtime;
- inode->i_ctime = upperinode->i_ctime;
+ inode_set_ctime_to_ts(inode, uctime);
}
touch_atime(&file->f_path);
@@ -290,10 +293,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
if (iocb->ki_flags & IOCB_WRITE) {
struct inode *inode = file_inode(orig_iocb->ki_filp);
- /* Actually acquired in ovl_write_iter() */
- __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
- SB_FREEZE_WRITE);
- file_end_write(iocb->ki_filp);
+ kiocb_end_write(iocb);
ovl_copyattr(inode);
}
@@ -409,10 +409,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!aio_req)
goto out;
- file_start_write(real.file);
- /* Pacify lockdep, same trick as done in aio_write() */
- __sb_writers_release(file_inode(real.file)->i_sb,
- SB_FREEZE_WRITE);
aio_req->fd = real;
real.flags = 0;
aio_req->orig_iocb = iocb;
@@ -420,6 +416,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
aio_req->iocb.ki_flags = ifl;
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
refcount_set(&aio_req->ref, 2);
+ kiocb_start_write(&aio_req->iocb);
ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
ovl_aio_put(aio_req);
if (ret != -EIOCBQUEUED)
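The open-coded file_start_write() plus __sb_writers_release() lockdep dance is replaced here by the kiocb_start_write()/kiocb_end_write() pair. Roughly, the helpers introduced by this series look like the following (a sketch of the include/linux/fs.h additions, not a verbatim copy):

static inline void kiocb_start_write(struct kiocb *iocb)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        sb_start_write(inode->i_sb);
        /* Matched by kiocb_end_write(), which may run from a different
         * context; tell lockdep the freeze protection is handed off. */
        __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
}

static inline void kiocb_end_write(struct kiocb *iocb)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        /* Tell lockdep we inherited freeze protection from the
         * submission thread before dropping it. */
        __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
        sb_end_write(inode->i_sb);
}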
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index a63e57447be9..f22e27b78025 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -693,7 +693,7 @@ int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
}
#endif
-int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
+int ovl_update_time(struct inode *inode, int flags)
{
if (flags & S_ATIME) {
struct ovl_fs *ofs = inode->i_sb->s_fs_info;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 9402591f12aa..8bbe6173bef4 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -665,7 +665,7 @@ static inline struct posix_acl *ovl_get_acl_path(const struct path *path,
}
#endif
-int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
+int ovl_update_time(struct inode *inode, int flags);
bool ovl_is_private_xattr(struct super_block *sb, const char *name);
struct ovl_inode_params {
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 7ef9e13c404a..c210b5d496a8 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1202,6 +1202,6 @@ void ovl_copyattr(struct inode *inode)
inode->i_mode = realinode->i_mode;
inode->i_atime = realinode->i_atime;
inode->i_mtime = realinode->i_mtime;
- inode->i_ctime = realinode->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(realinode));
i_size_write(inode, i_size_read(realinode));
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 2d88f73f585a..6c1a9b1db907 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -489,7 +489,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
unsigned int mask = pipe->ring_size - 1;
- struct pipe_buffer *buf = &pipe->bufs[head & mask];
+ struct pipe_buffer *buf;
struct page *page = pipe->tmp_page;
int copied;
@@ -899,7 +899,7 @@ static struct inode * get_pipe_inode(void)
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
return inode;
@@ -1236,7 +1236,7 @@ const struct file_operations pipefifo_fops = {
* Currently we rely on the pipe array holding a power-of-2 number
* of pages. Returns 0 on error.
*/
-unsigned int round_pipe_size(unsigned long size)
+unsigned int round_pipe_size(unsigned int size)
{
if (size > (1U << 31))
return 0;
@@ -1319,7 +1319,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
* Allocate a new array of pipe buffers and copy the info over. Returns the
* pipe size if successful, or return -ERROR on error.
*/
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
{
unsigned long user_bufs;
unsigned int nr_slots, size;
@@ -1387,7 +1387,7 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
return pipe;
}
-long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
struct pipe_inode_info *pipe;
long ret;
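The pipe_fcntl()/pipe_set_size() argument type changes match what userspace can actually deliver: the fcntl(2) argument is an int, so the upper bits of the old unsigned long were never meaningful. For context, a hypothetical userspace use of the interface being retyped here:

#define _GNU_SOURCE             /* F_SETPIPE_SZ */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fds[2];
        int sz;

        if (pipe(fds) < 0)
                return 1;

        /* Request a 1 MiB pipe; the kernel rounds up to a power-of-2
         * number of pages and returns the size actually used. */
        sz = fcntl(fds[1], F_SETPIPE_SZ, 1024 * 1024);
        if (sz < 0)
                return 1;

        printf("pipe buffer is now %d bytes\n", sz);
        return 0;
}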
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 7fa1b738bbab..a05fe94970ce 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -1027,7 +1027,7 @@ int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
return error;
}
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
set_cached_acl(inode, type, acl);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9df3f4839662..7576effe8d52 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1902,7 +1902,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb,
ei = PROC_I(inode);
inode->i_mode = mode;
inode->i_ino = get_next_ino();
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_op = &proc_def_inode_operations;
/*
@@ -1966,7 +1966,7 @@ int pid_getattr(struct mnt_idmap *idmap, const struct path *path,
struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
struct task_struct *task;
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->uid = GLOBAL_ROOT_UID;
stat->gid = GLOBAL_ROOT_GID;
@@ -3583,7 +3583,8 @@ static int proc_tid_comm_permission(struct mnt_idmap *idmap,
}
static const struct inode_operations proc_tid_comm_inode_operations = {
- .permission = proc_tid_comm_permission,
+ .setattr = proc_setattr,
+ .permission = proc_tid_comm_permission,
};
/*
@@ -3899,7 +3900,7 @@ static int proc_task_getattr(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(path->dentry);
struct task_struct *p = get_proc_task(inode);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (p) {
stat->nlink += get_nr_threads(p);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index b3140deebbbf..6276b3938842 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -352,7 +352,7 @@ static int proc_fd_getattr(struct mnt_idmap *idmap,
struct inode *inode = d_inode(path->dentry);
int rv = 0;
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
/* If it's a directory, put the number of open fds there */
if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 42ae38ff6e7e..775ce0bcf08c 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -146,7 +146,7 @@ static int proc_getattr(struct mnt_idmap *idmap,
}
}
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 67b09a1d9433..532dc9d240f7 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -660,7 +660,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
inode->i_private = de->data;
inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
PROC_I(inode)->pde = de;
if (is_empty_pde(de)) {
make_empty_dir_inode(inode);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a0c0419872e3..2ba31b6d68c0 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -308,7 +308,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap,
net = get_proc_task_net(inode);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (net != NULL) {
stat->nlink = net->proc_net->nlink;
@@ -321,6 +321,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap,
const struct inode_operations proc_net_inode_operations = {
.lookup = proc_tgid_net_lookup,
.getattr = proc_tgid_net_getattr,
+ .setattr = proc_setattr,
};
static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5ea42653126e..bf06344a42cc 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -463,7 +463,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
head->count++;
spin_unlock(&sysctl_lock);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_mode = table->mode;
if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
@@ -849,7 +849,7 @@ static int proc_sys_getattr(struct mnt_idmap *idmap,
if (IS_ERR(head))
return PTR_ERR(head);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (table)
stat->mode = (stat->mode & S_IFMT) | table->mode;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index a86e65a608da..9191248f2dac 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -314,7 +314,8 @@ static int proc_root_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
- generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
+ stat);
stat->nlink = proc_root.nlink + nr_processes();
return 0;
}
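All the ->getattr() conversions above feed the caller's request_mask through to generic_fillattr(), which lets the helper skip fields the caller never asked for. A minimal sketch of a getattr instance under the new prototype (the foo_* names are hypothetical):

static int foo_getattr(struct mnt_idmap *idmap, const struct path *path,
                       struct kstat *stat, u32 request_mask,
                       unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);

        /* New signature: the mask comes right after the idmap. */
        generic_fillattr(idmap, request_mask, inode, stat);

        /* Only do potentially expensive work when it was requested. */
        if (request_mask & STATX_BLOCKS)
                stat->blocks = foo_count_blocks(inode);  /* hypothetical */
        return 0;
}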
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 72cd69bcaf4a..ecc4da8d265e 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -46,7 +46,7 @@ int proc_setup_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 507cd4e59d07..fafff1bd34cd 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -587,8 +587,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
bool migration = false;
if (pmd_present(*pmd)) {
- /* FOLL_DUMP will return -EFAULT on huge zero page */
- page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ page = vm_normal_page_pmd(vma, addr, *pmd);
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
@@ -758,12 +757,14 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops smaps_walk_ops = {
.pmd_entry = smaps_pte_range,
.hugetlb_entry = smaps_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static const struct mm_walk_ops smaps_shmem_walk_ops = {
.pmd_entry = smaps_pte_range,
.hugetlb_entry = smaps_hugetlb_range,
.pte_hole = smaps_pte_hole,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -1245,6 +1246,7 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
static const struct mm_walk_ops clear_refs_walk_ops = {
.pmd_entry = clear_refs_pte_range,
.test_walk = clear_refs_test_walk,
+ .walk_lock = PGWALK_WRLOCK,
};
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
@@ -1622,6 +1624,7 @@ static const struct mm_walk_ops pagemap_ops = {
.pmd_entry = pagemap_pmd_range,
.pte_hole = pagemap_pte_hole,
.hugetlb_entry = pagemap_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -1935,6 +1938,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops show_numa_ops = {
.hugetlb_entry = gather_hugetlb_stats,
.pmd_entry = gather_pte_stats,
+ .walk_lock = PGWALK_RDLOCK,
};
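The new .walk_lock field declares how each page-table walker interacts with locking: PGWALK_RDLOCK walkers run under the mmap read lock without write-locking VMAs, while PGWALK_WRLOCK walkers write-lock each VMA they visit and therefore need the mmap write lock. A sketch of the caller side for a read-locked walker (foo_walk_smaps is a hypothetical wrapper, not kernel code):

static int foo_walk_smaps(struct mm_struct *mm, unsigned long start,
                          unsigned long end, struct mem_size_stats *mss)
{
        int err;

        /* smaps_walk_ops above is declared PGWALK_RDLOCK, so the read
         * lock is what the walk infrastructure expects to be held. */
        mmap_read_lock(mm);
        err = walk_page_range(mm, start, end, &smaps_walk_ops, mss);
        mmap_read_unlock(mm);
        return err;
}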
/*
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index a553273fbd41..63ac1f93289f 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -46,7 +46,7 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = thread_self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index c49d554cc9ae..3acc38600cd1 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
config PSTORE
tristate "Persistent store support"
- select CRYPTO if PSTORE_COMPRESS
default n
help
This option enables generic access to platform level
@@ -22,99 +21,18 @@ config PSTORE_DEFAULT_KMSG_BYTES
Defines the default size of the pstore kernel log storage.
Can be enlarged if needed; shrinking it is not recommended.
-config PSTORE_DEFLATE_COMPRESS
- tristate "DEFLATE (ZLIB) compression"
- default y
- depends on PSTORE
- select CRYPTO_DEFLATE
- help
- This option enables DEFLATE (also known as ZLIB) compression
- algorithm support.
-
-config PSTORE_LZO_COMPRESS
- tristate "LZO compression"
- depends on PSTORE
- select CRYPTO_LZO
- help
- This option enables LZO compression algorithm support.
-
-config PSTORE_LZ4_COMPRESS
- tristate "LZ4 compression"
- depends on PSTORE
- select CRYPTO_LZ4
- help
- This option enables LZ4 compression algorithm support.
-
-config PSTORE_LZ4HC_COMPRESS
- tristate "LZ4HC compression"
- depends on PSTORE
- select CRYPTO_LZ4HC
- help
- This option enables LZ4HC (high compression) mode algorithm.
-
-config PSTORE_842_COMPRESS
- bool "842 compression"
- depends on PSTORE
- select CRYPTO_842
- help
- This option enables 842 compression algorithm support.
-
-config PSTORE_ZSTD_COMPRESS
- bool "zstd compression"
- depends on PSTORE
- select CRYPTO_ZSTD
- help
- This option enables zstd compression algorithm support.
-
config PSTORE_COMPRESS
- def_bool y
+ bool "Pstore compression (deflate)"
depends on PSTORE
- depends on PSTORE_DEFLATE_COMPRESS || PSTORE_LZO_COMPRESS || \
- PSTORE_LZ4_COMPRESS || PSTORE_LZ4HC_COMPRESS || \
- PSTORE_842_COMPRESS || PSTORE_ZSTD_COMPRESS
-
-choice
- prompt "Default pstore compression algorithm"
- depends on PSTORE_COMPRESS
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
+ default y
help
- This option chooses the default active compression algorithm.
- This change be changed at boot with "pstore.compress=..." on
- the kernel command line.
-
- Currently, pstore has support for 6 compression algorithms:
- deflate, lzo, lz4, lz4hc, 842 and zstd.
-
- The default compression algorithm is deflate.
-
- config PSTORE_DEFLATE_COMPRESS_DEFAULT
- bool "deflate" if PSTORE_DEFLATE_COMPRESS
-
- config PSTORE_LZO_COMPRESS_DEFAULT
- bool "lzo" if PSTORE_LZO_COMPRESS
-
- config PSTORE_LZ4_COMPRESS_DEFAULT
- bool "lz4" if PSTORE_LZ4_COMPRESS
-
- config PSTORE_LZ4HC_COMPRESS_DEFAULT
- bool "lz4hc" if PSTORE_LZ4HC_COMPRESS
-
- config PSTORE_842_COMPRESS_DEFAULT
- bool "842" if PSTORE_842_COMPRESS
-
- config PSTORE_ZSTD_COMPRESS_DEFAULT
- bool "zstd" if PSTORE_ZSTD_COMPRESS
-
-endchoice
-
-config PSTORE_COMPRESS_DEFAULT
- string
- depends on PSTORE_COMPRESS
- default "deflate" if PSTORE_DEFLATE_COMPRESS_DEFAULT
- default "lzo" if PSTORE_LZO_COMPRESS_DEFAULT
- default "lz4" if PSTORE_LZ4_COMPRESS_DEFAULT
- default "lz4hc" if PSTORE_LZ4HC_COMPRESS_DEFAULT
- default "842" if PSTORE_842_COMPRESS_DEFAULT
- default "zstd" if PSTORE_ZSTD_COMPRESS_DEFAULT
+ Whether pstore records should be compressed before being written to
+ the backing store. This is implemented with the zlib 'deflate'
+ algorithm, using the library implementation rather than the full-blown
+ crypto API. This reduces the risk of secondary oopses or other
+ problems while pstore is recording panic metadata.
config PSTORE_CONSOLE
bool "Log kernel console messages"
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index ffbadb8b3032..585360706b33 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -54,7 +54,7 @@ static void free_pstore_private(struct pstore_private *private)
if (!private)
return;
if (private->record) {
- kfree(private->record->buf);
+ kvfree(private->record->buf);
kfree(private->record->priv);
kfree(private->record);
}
@@ -223,7 +223,7 @@ static struct inode *pstore_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
return inode;
}
@@ -390,7 +390,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
inode->i_private = private;
if (record->time.tv_sec)
- inode->i_mtime = inode->i_ctime = record->time;
+ inode->i_mtime = inode_set_ctime_to_ts(inode, record->time);
d_add(dentry, inode);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index cbc0b468c1ab..62356d542ef6 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -14,24 +14,17 @@
#include <linux/init.h>
#include <linux/kmsg_dump.h>
#include <linux/console.h>
+#include <linux/mm.h>
#include <linux/module.h>
#include <linux/pstore.h>
-#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
-#include <linux/lzo.h>
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
-#include <linux/lz4.h>
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS)
-#include <linux/zstd.h>
-#endif
-#include <linux/crypto.h>
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/jiffies.h>
+#include <linux/vmalloc.h>
#include <linux/workqueue.h>
+#include <linux/zlib.h>
#include "internal.h"
@@ -80,12 +73,21 @@ static char *backend;
module_param(backend, charp, 0444);
MODULE_PARM_DESC(backend, "specific backend to use");
-static char *compress =
-#ifdef CONFIG_PSTORE_COMPRESS_DEFAULT
- CONFIG_PSTORE_COMPRESS_DEFAULT;
-#else
- NULL;
-#endif
+/*
+ * pstore no longer implements compression via the crypto API, and only
+ * supports zlib deflate compression implemented using the zlib library
+ * interface. This removes additional complexity which is hard to justify for a
+ * diagnostic facility that has to operate in conditions where the system may
+ * have become unstable. Zlib deflate is comparatively small in terms of
+ * code size, and compresses ASCII text well. Deflate is not the fastest
+ * compressor, but for recording the log output on a kernel panic, speed
+ * is not considered critical.
+ *
+ * The only remaining arguments supported by the compress= module parameter are
+ * 'deflate' and 'none'. To retain compatibility with existing installations,
+ * all other values are logged and replaced with 'deflate'.
+ */
+static char *compress = "deflate";
module_param(compress, charp, 0444);
MODULE_PARM_DESC(compress, "compression to use");
@@ -94,16 +96,9 @@ unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
module_param(kmsg_bytes, ulong, 0444);
MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)");
-/* Compression parameters */
-static struct crypto_comp *tfm;
-
-struct pstore_zbackend {
- int (*zbufsize)(size_t size);
- const char *name;
-};
+static void *compress_workspace;
static char *big_oops_buf;
-static size_t big_oops_buf_sz;
void pstore_set_kmsg_bytes(int bytes)
{
@@ -168,206 +163,89 @@ static bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
}
}
-#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS)
-static int zbufsize_deflate(size_t size)
-{
- size_t cmpr;
-
- switch (size) {
- /* buffer range for efivars */
- case 1000 ... 2000:
- cmpr = 56;
- break;
- case 2001 ... 3000:
- cmpr = 54;
- break;
- case 3001 ... 3999:
- cmpr = 52;
- break;
- /* buffer range for nvram, erst */
- case 4000 ... 10000:
- cmpr = 45;
- break;
- default:
- cmpr = 60;
- break;
- }
-
- return (size * 100) / cmpr;
-}
-#endif
-
-#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
-static int zbufsize_lzo(size_t size)
-{
- return lzo1x_worst_compress(size);
-}
-#endif
-
-#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
-static int zbufsize_lz4(size_t size)
-{
- return LZ4_compressBound(size);
-}
-#endif
-
-#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS)
-static int zbufsize_842(size_t size)
-{
- return size;
-}
-#endif
-
-#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS)
-static int zbufsize_zstd(size_t size)
-{
- return zstd_compress_bound(size);
-}
-#endif
-
-static const struct pstore_zbackend *zbackend __ro_after_init;
-
-static const struct pstore_zbackend zbackends[] = {
-#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS)
- {
- .zbufsize = zbufsize_deflate,
- .name = "deflate",
- },
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
- {
- .zbufsize = zbufsize_lzo,
- .name = "lzo",
- },
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS)
- {
- .zbufsize = zbufsize_lz4,
- .name = "lz4",
- },
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
- {
- .zbufsize = zbufsize_lz4,
- .name = "lz4hc",
- },
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS)
- {
- .zbufsize = zbufsize_842,
- .name = "842",
- },
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS)
- {
- .zbufsize = zbufsize_zstd,
- .name = "zstd",
- },
-#endif
- { }
-};
-
static int pstore_compress(const void *in, void *out,
unsigned int inlen, unsigned int outlen)
{
+ struct z_stream_s zstream = {
+ .next_in = in,
+ .avail_in = inlen,
+ .next_out = out,
+ .avail_out = outlen,
+ .workspace = compress_workspace,
+ };
int ret;
if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS))
return -EINVAL;
- ret = crypto_comp_compress(tfm, in, inlen, out, &outlen);
- if (ret) {
- pr_err("crypto_comp_compress failed, ret = %d!\n", ret);
- return ret;
- }
+ ret = zlib_deflateInit2(&zstream, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
+ -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ if (ret != Z_OK)
+ return -EINVAL;
+
+ ret = zlib_deflate(&zstream, Z_FINISH);
+ if (ret != Z_STREAM_END)
+ return -EINVAL;
+
+ ret = zlib_deflateEnd(&zstream);
+ if (ret != Z_OK)
+ pr_warn_once("zlib_deflateEnd() failed: %d\n", ret);
- return outlen;
+ return zstream.total_out;
}
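Note the negative windowBits passed to zlib_deflateInit2() above (and to zlib_inflateInit2() later in this file): pstore records are stored as raw deflate streams with no zlib header. A hypothetical offline reader for such a record would therefore also have to request a raw stream; a userspace sketch using the regular zlib API:

#include <stdio.h>
#include <string.h>
#include <zlib.h>

static int inflate_pstore_record(const unsigned char *in, size_t inlen,
                                 unsigned char *out, size_t outlen)
{
        z_stream zs;

        memset(&zs, 0, sizeof(zs));
        /* Negative windowBits selects raw deflate, matching the
         * -MAX_WBITS the kernel used on the compression side. */
        if (inflateInit2(&zs, -MAX_WBITS) != Z_OK)
                return -1;

        zs.next_in = (unsigned char *)in;
        zs.avail_in = inlen;
        zs.next_out = out;
        zs.avail_out = outlen;

        if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
                inflateEnd(&zs);
                return -1;
        }
        inflateEnd(&zs);
        return (int)zs.total_out;
}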
static void allocate_buf_for_compression(void)
{
- struct crypto_comp *ctx;
- int size;
char *buf;
- /* Skip if not built-in or compression backend not selected yet. */
- if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend)
- return;
-
- /* Skip if no pstore backend yet or compression init already done. */
- if (!psinfo || tfm)
- return;
-
- if (!crypto_has_comp(zbackend->name, 0, 0)) {
- pr_err("Unknown compression: %s\n", zbackend->name);
+ /* Skip if not built-in or compression disabled. */
+ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !compress ||
+ !strcmp(compress, "none")) {
+ compress = NULL;
return;
}
- size = zbackend->zbufsize(psinfo->bufsize);
- if (size <= 0) {
- pr_err("Invalid compression size for %s: %d\n",
- zbackend->name, size);
- return;
+ if (strcmp(compress, "deflate")) {
+ pr_err("Unsupported compression '%s', falling back to deflate\n",
+ compress);
+ compress = "deflate";
}
- buf = kmalloc(size, GFP_KERNEL);
+ /*
+ * The compression buffer only needs to be as large as the maximum
+ * uncompressed record size, since any record that would be expanded by
+ * compression is just stored uncompressed.
+ */
+ buf = kvzalloc(psinfo->bufsize, GFP_KERNEL);
if (!buf) {
- pr_err("Failed %d byte compression buffer allocation for: %s\n",
- size, zbackend->name);
+ pr_err("Failed %zu byte compression buffer allocation for: %s\n",
+ psinfo->bufsize, compress);
return;
}
- ctx = crypto_alloc_comp(zbackend->name, 0, 0);
- if (IS_ERR_OR_NULL(ctx)) {
- kfree(buf);
- pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name,
- PTR_ERR(ctx));
+ compress_workspace =
+ vmalloc(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL));
+ if (!compress_workspace) {
+ pr_err("Failed to allocate zlib deflate workspace\n");
+ kvfree(buf);
return;
}
/* A non-NULL big_oops_buf indicates compression is available. */
- tfm = ctx;
- big_oops_buf_sz = size;
big_oops_buf = buf;
- pr_info("Using crash dump compression: %s\n", zbackend->name);
+ pr_info("Using crash dump compression: %s\n", compress);
}
static void free_buf_for_compression(void)
{
- if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) {
- crypto_free_comp(tfm);
- tfm = NULL;
+ if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress_workspace) {
+ vfree(compress_workspace);
+ compress_workspace = NULL;
}
- kfree(big_oops_buf);
- big_oops_buf = NULL;
- big_oops_buf_sz = 0;
-}
-/*
- * Called when compression fails, since the printk buffer
- * would be fetched for compression calling it again when
- * compression fails would have moved the iterator of
- * printk buffer which results in fetching old contents.
- * Copy the recent messages from big_oops_buf to psinfo->buf
- */
-static size_t copy_kmsg_to_buffer(int hsize, size_t len)
-{
- size_t total_len;
- size_t diff;
-
- total_len = hsize + len;
-
- if (total_len > psinfo->bufsize) {
- diff = total_len - psinfo->bufsize + hsize;
- memcpy(psinfo->buf, big_oops_buf, hsize);
- memcpy(psinfo->buf + hsize, big_oops_buf + diff,
- psinfo->bufsize - hsize);
- total_len = psinfo->bufsize;
- } else
- memcpy(psinfo->buf, big_oops_buf, total_len);
-
- return total_len;
+ kvfree(big_oops_buf);
+ big_oops_buf = NULL;
}
void pstore_record_init(struct pstore_record *record,
@@ -426,13 +304,8 @@ static void pstore_dump(struct kmsg_dumper *dumper,
record.part = part;
record.buf = psinfo->buf;
- if (big_oops_buf) {
- dst = big_oops_buf;
- dst_size = big_oops_buf_sz;
- } else {
- dst = psinfo->buf;
- dst_size = psinfo->bufsize;
- }
+ dst = big_oops_buf ?: psinfo->buf;
+ dst_size = psinfo->bufsize;
/* Write dump header. */
header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why,
@@ -453,8 +326,8 @@ static void pstore_dump(struct kmsg_dumper *dumper,
record.compressed = true;
record.size = zipped_len;
} else {
- record.size = copy_kmsg_to_buffer(header_size,
- dump_size);
+ record.size = header_size + dump_size;
+ memcpy(psinfo->buf, dst, record.size);
}
} else {
record.size = header_size + dump_size;
@@ -549,7 +422,7 @@ static int pstore_write_user_compat(struct pstore_record *record,
if (record->buf)
return -EINVAL;
- record->buf = memdup_user(buf, record->size);
+ record->buf = vmemdup_user(buf, record->size);
if (IS_ERR(record->buf)) {
ret = PTR_ERR(record->buf);
goto out;
@@ -557,7 +430,7 @@ static int pstore_write_user_compat(struct pstore_record *record,
ret = record->psi->write(record);
- kfree(record->buf);
+ kvfree(record->buf);
out:
record->buf = NULL;
@@ -681,7 +554,8 @@ void pstore_unregister(struct pstore_info *psi)
}
EXPORT_SYMBOL_GPL(pstore_unregister);
-static void decompress_record(struct pstore_record *record)
+static void decompress_record(struct pstore_record *record,
+ struct z_stream_s *zstream)
{
int ret;
int unzipped_len;
@@ -697,40 +571,50 @@ static void decompress_record(struct pstore_record *record)
}
/* Missing compression buffer means compression was not initialized. */
- if (!big_oops_buf) {
+ if (!zstream->workspace) {
pr_warn("no decompression method initialized!\n");
return;
}
+ ret = zlib_inflateReset(zstream);
+ if (ret != Z_OK) {
+ pr_err("zlib_inflateReset() failed, ret = %d!\n", ret);
+ return;
+ }
+
/* Allocate enough space to hold max decompression and ECC. */
- unzipped_len = big_oops_buf_sz;
- workspace = kmalloc(unzipped_len + record->ecc_notice_size,
- GFP_KERNEL);
+ workspace = kvzalloc(psinfo->bufsize + record->ecc_notice_size,
+ GFP_KERNEL);
if (!workspace)
return;
- /* After decompression "unzipped_len" is almost certainly smaller. */
- ret = crypto_comp_decompress(tfm, record->buf, record->size,
- workspace, &unzipped_len);
- if (ret) {
- pr_err("crypto_comp_decompress failed, ret = %d!\n", ret);
- kfree(workspace);
+ zstream->next_in = record->buf;
+ zstream->avail_in = record->size;
+ zstream->next_out = workspace;
+ zstream->avail_out = psinfo->bufsize;
+
+ ret = zlib_inflate(zstream, Z_FINISH);
+ if (ret != Z_STREAM_END) {
+ pr_err("zlib_inflate() failed, ret = %d!\n", ret);
+ kvfree(workspace);
return;
}
+ unzipped_len = zstream->total_out;
+
/* Append ECC notice to decompressed buffer. */
memcpy(workspace + unzipped_len, record->buf + record->size,
record->ecc_notice_size);
/* Copy decompressed contents into a minimum-sized allocation. */
- unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size,
- GFP_KERNEL);
- kfree(workspace);
+ unzipped = kvmemdup(workspace, unzipped_len + record->ecc_notice_size,
+ GFP_KERNEL);
+ kvfree(workspace);
if (!unzipped)
return;
/* Swap out compressed contents with decompressed contents. */
- kfree(record->buf);
+ kvfree(record->buf);
record->buf = unzipped;
record->size = unzipped_len;
record->compressed = false;
@@ -747,10 +631,17 @@ void pstore_get_backend_records(struct pstore_info *psi,
{
int failed = 0;
unsigned int stop_loop = 65536;
+ struct z_stream_s zstream = {};
if (!psi || !root)
return;
+ if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress) {
+ zstream.workspace = kvmalloc(zlib_inflate_workspacesize(),
+ GFP_KERNEL);
+ zlib_inflateInit2(&zstream, -DEF_WBITS);
+ }
+
mutex_lock(&psi->read_mutex);
if (psi->open && psi->open(psi))
goto out;
@@ -779,11 +670,11 @@ void pstore_get_backend_records(struct pstore_info *psi,
break;
}
- decompress_record(record);
+ decompress_record(record, &zstream);
rc = pstore_mkfile(root, record);
if (rc) {
/* pstore_mkfile() did not take record, so free it. */
- kfree(record->buf);
+ kvfree(record->buf);
kfree(record->priv);
kfree(record);
if (rc != -EEXIST || !quiet)
@@ -795,6 +686,12 @@ void pstore_get_backend_records(struct pstore_info *psi,
out:
mutex_unlock(&psi->read_mutex);
+ if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress) {
+ if (zlib_inflateEnd(&zstream) != Z_OK)
+ pr_warn("zlib_inflateEnd() failed\n");
+ kvfree(zstream.workspace);
+ }
+
if (failed)
pr_warn("failed to create %d record(s) from '%s'\n",
failed, psi->name);
@@ -818,34 +715,10 @@ static void pstore_timefunc(struct timer_list *unused)
pstore_timer_kick();
}
-static void __init pstore_choose_compression(void)
-{
- const struct pstore_zbackend *step;
-
- if (!compress)
- return;
-
- for (step = zbackends; step->name; step++) {
- if (!strcmp(compress, step->name)) {
- zbackend = step;
- return;
- }
- }
-}
-
static int __init pstore_init(void)
{
int ret;
- pstore_choose_compression();
-
- /*
- * Check if any pstore backends registered earlier but did not
- * initialize compression because crypto was not ready. If so,
- * initialize compression now.
- */
- allocate_buf_for_compression();
-
ret = pstore_init_fs();
if (ret)
free_buf_for_compression();
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 2f625e1fa8d8..d36702c7ab3c 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -20,6 +20,7 @@
#include <linux/compiler.h>
#include <linux/of.h>
#include <linux/of_address.h>
+#include <linux/mm.h>
#include "internal.h"
#include "ram_internal.h"
@@ -268,7 +269,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
/* ECC correction notice */
record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
- record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL);
+ record->buf = kvzalloc(size + record->ecc_notice_size + 1, GFP_KERNEL);
if (record->buf == NULL) {
size = -ENOMEM;
goto out;
@@ -282,7 +283,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
out:
if (free_prz) {
- kfree(prz->old_log);
+ kvfree(prz->old_log);
kfree(prz);
}
@@ -833,7 +834,7 @@ static int ramoops_probe(struct platform_device *pdev)
*/
if (cxt->pstore.flags & PSTORE_FLAGS_DMESG) {
cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size;
- cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL);
+ cxt->pstore.buf = kvzalloc(cxt->pstore.bufsize, GFP_KERNEL);
if (!cxt->pstore.buf) {
pr_err("cannot allocate pstore crash dump buffer\n");
err = -ENOMEM;
@@ -866,7 +867,7 @@ static int ramoops_probe(struct platform_device *pdev)
return 0;
fail_buf:
- kfree(cxt->pstore.buf);
+ kvfree(cxt->pstore.buf);
fail_clear:
cxt->pstore.bufsize = 0;
fail_init:
@@ -881,7 +882,7 @@ static void ramoops_remove(struct platform_device *pdev)
pstore_unregister(&cxt->pstore);
- kfree(cxt->pstore.buf);
+ kvfree(cxt->pstore.buf);
cxt->pstore.bufsize = 0;
ramoops_free_przs(cxt);
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 85aaf0fc6d7d..650e437b55e6 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <asm/page.h>
#include "ram_internal.h"
@@ -24,12 +25,10 @@
/**
* struct persistent_ram_buffer - persistent circular RAM buffer
*
- * @sig:
- * signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value)
- * @start:
- * offset into @data where the beginning of the stored bytes begin
- * @size:
- * number of valid bytes stored in @data
+ * @sig: Signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value)
+ * @start: First valid byte in the buffer.
+ * @size: Number of valid bytes in the buffer.
+ * @data: The contents of the buffer.
*/
struct persistent_ram_buffer {
uint32_t sig;
@@ -301,7 +300,7 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz)
if (!prz->old_log) {
persistent_ram_ecc_old(prz);
- prz->old_log = kmalloc(size, GFP_KERNEL);
+ prz->old_log = kvzalloc(size, GFP_KERNEL);
}
if (!prz->old_log) {
pr_err("failed to allocate buffer\n");
@@ -385,7 +384,7 @@ void *persistent_ram_old(struct persistent_ram_zone *prz)
void persistent_ram_free_old(struct persistent_ram_zone *prz)
{
- kfree(prz->old_log);
+ kvfree(prz->old_log);
prz->old_log = NULL;
prz->old_log_size = 0;
}
@@ -519,7 +518,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
sig ^= PERSISTENT_RAM_SIG;
if (prz->buffer->sig == sig) {
- if (buffer_size(prz) == 0) {
+ if (buffer_size(prz) == 0 && buffer_start(prz) == 0) {
pr_debug("found existing empty buffer\n");
return 0;
}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 391ea402920d..a7171f5532a1 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -305,8 +305,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime);
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->di_ctime);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode, le32_to_cpu(raw_inode->di_ctime), 0);
inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size);
memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE);
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 85b2fa3b211c..21f90d519f1a 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -562,8 +562,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_atime);
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_ctime);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0);
/* calc blocks based on 512 byte blocksize */
inode->i_blocks = (inode->i_size + 511) >> 9;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index e3e4f4047657..4d826c369da2 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2367,7 +2367,7 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
if (!fmt)
return -ESRCH;
- if (!sb->s_op->quota_write || !sb->s_op->quota_read ||
+ if (!sb->dq_op || !sb->s_qcop ||
(type == PRJQUOTA && sb->dq_op->get_projid == NULL)) {
error = -EINVAL;
goto out_fmt;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index fef477c78107..18e8387cab41 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -65,7 +65,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
inode->i_mapping->a_ops = &ram_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
@@ -105,7 +105,7 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
}
return error;
}
@@ -138,7 +138,7 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
if (!error) {
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
} else
iput(inode);
}
diff --git a/fs/read_write.c b/fs/read_write.c
index b07de77ef126..4771701c896b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -71,7 +71,7 @@ EXPORT_SYMBOL(vfs_setpos);
* @file: file structure to seek on
* @offset: file offset to seek to
* @whence: type of seek
- * @size: max size of this file in file system
+ * @maxsize: max size of this file in file system
* @eof: offset used for SEEK_END position
*
* This is a variant of generic_file_llseek that allows passing in a custom
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 77bd3b27059f..86e55d4bb10d 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1259,9 +1259,8 @@ static void init_inode(struct inode *inode, struct treepath *path)
inode->i_size = sd_v1_size(sd);
inode->i_atime.tv_sec = sd_v1_atime(sd);
inode->i_mtime.tv_sec = sd_v1_mtime(sd);
- inode->i_ctime.tv_sec = sd_v1_ctime(sd);
+ inode_set_ctime(inode, sd_v1_ctime(sd), 0);
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
inode->i_mtime.tv_nsec = 0;
inode->i_blocks = sd_v1_blocks(sd);
@@ -1314,8 +1313,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
i_gid_write(inode, sd_v2_gid(sd));
inode->i_mtime.tv_sec = sd_v2_mtime(sd);
inode->i_atime.tv_sec = sd_v2_atime(sd);
- inode->i_ctime.tv_sec = sd_v2_ctime(sd);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode, sd_v2_ctime(sd), 0);
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_nsec = 0;
inode->i_blocks = sd_v2_blocks(sd);
@@ -1374,7 +1372,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
set_sd_v2_gid(sd_v2, i_gid_read(inode));
set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
- set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
+ set_sd_v2_ctime(sd_v2, inode_get_ctime(inode).tv_sec);
set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
@@ -1394,7 +1392,7 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
set_sd_v1_nlink(sd_v1, inode->i_nlink);
set_sd_v1_size(sd_v1, size);
set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
- set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec);
+ set_sd_v1_ctime(sd_v1, inode_get_ctime(inode).tv_sec);
set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
@@ -1986,7 +1984,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
/* uid and gid must already be set by the caller for quota init */
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_size = i_size;
inode->i_blocks = 0;
inode->i_bytes = 0;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 6bf9b54e58ca..dd33f8cc6eda 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -55,7 +55,7 @@ int reiserfs_fileattr_set(struct mnt_idmap *idmap,
}
sd_attrs_to_i_attrs(flags, inode);
REISERFS_I(inode)->i_attrs = flags;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
err = 0;
unlock:
@@ -107,7 +107,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err = -EFAULT;
goto setversion_out;
}
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
setversion_out:
mnt_drop_write_file(filp);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 479aa4a57602..015bfe4e4524 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2326,7 +2326,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
int i, j;
bh = __getblk(dev, block, bufsize);
- if (buffer_uptodate(bh))
+ if (!bh || buffer_uptodate(bh))
return (bh);
if (block + BUFNR > max_block) {
@@ -2336,6 +2336,8 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
j = 1;
for (i = 1; i < blocks; i++) {
bh = __getblk(dev, block + i, bufsize);
+ if (!bh)
+ break;
if (buffer_uptodate(bh)) {
brelse(bh);
break;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 52240cc891cf..9c5704be2435 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -572,7 +572,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
}
dir->i_size += paste_size;
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
if (!S_ISDIR(inode->i_mode) && visible)
/* reiserfs_mkdir or reiserfs_rename will do that by itself */
reiserfs_update_sd(th, dir);
@@ -966,7 +966,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_nlink);
clear_nlink(inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
reiserfs_update_sd(&th, inode);
DEC_DIR_INODE_NLINK(dir)
@@ -1070,11 +1071,11 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
inc_nlink(inode);
goto end_unlink;
}
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
reiserfs_update_sd(&th, inode);
dir->i_size -= (de.de_entrylen + DEH_SIZE);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
reiserfs_update_sd(&th, dir);
if (!savelink)
@@ -1250,7 +1251,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
return err ? err : retval;
}
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
reiserfs_update_sd(&th, inode);
ihold(inode);
@@ -1325,7 +1326,6 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
int jbegin_count;
umode_t old_inode_mode;
unsigned long savelink = 1;
- struct timespec64 ctime;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -1576,14 +1576,11 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
journal_mark_dirty(&th, old_de.de_bh);
- ctime = current_time(old_dir);
- old_dir->i_ctime = old_dir->i_mtime = ctime;
- new_dir->i_ctime = new_dir->i_mtime = ctime;
/*
* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch
* which adds ctime update of renamed object
*/
- old_inode->i_ctime = ctime;
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
if (new_dentry_inode) {
/* adjust link number of the victim */
@@ -1592,7 +1589,6 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
} else {
drop_nlink(new_dentry_inode);
}
- new_dentry_inode->i_ctime = ctime;
savelink = new_dentry_inode->i_nlink;
}
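The hand-rolled "same ctime everywhere" bookkeeping is replaced by the new simple_rename_timestamp() VFS helper, which stamps one consistent current time on everything the rename touched, including the victim inode. Roughly, the fs/libfs.c helper added by this series looks like:

void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry)
{
        struct inode *newino = d_inode(new_dentry);

        old_dir->i_mtime = inode_set_ctime_current(old_dir);
        if (new_dir != old_dir)
                new_dir->i_mtime = inode_set_ctime_current(new_dir);
        inode_set_ctime_current(d_inode(old_dentry));
        if (newino)
                inode_set_ctime_current(newino);
}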
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index ce5003986789..3676e02a0232 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -2004,7 +2004,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
if (update_timestamps) {
inode->i_mtime = current_time(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
}
reiserfs_update_sd(th, inode);
@@ -2029,7 +2029,7 @@ update_and_out:
if (update_timestamps) {
/* this is truncate, not file closing */
inode->i_mtime = current_time(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
}
reiserfs_update_sd(th, inode);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 929acce6e731..7eaf36b3de12 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2587,7 +2587,7 @@ out:
return err;
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 651027967159..6000964c2b80 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -466,12 +466,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
static void update_ctime(struct inode *inode)
{
struct timespec64 now = current_time(inode);
+ struct timespec64 ctime = inode_get_ctime(inode);
if (inode_unhashed(inode) || !inode->i_nlink ||
- timespec64_equal(&inode->i_ctime, &now))
+ timespec64_equal(&ctime, &now))
return;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_to_ts(inode, now);
mark_inode_dirty(inode);
}
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 138060452678..064264992b49 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -285,7 +285,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
if (error == -ENODATA) {
error = 0;
if (type == ACL_TYPE_ACCESS) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index c59b230d55b4..5c35f6c76037 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -322,8 +322,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
set_nlink(i, 1); /* Hard to decide.. */
i->i_size = be32_to_cpu(ri.size);
- i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
- i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
+ i->i_mtime = i->i_atime = inode_set_ctime(i, 0, 0);
/* set up mode and ops */
mode = romfs_modemap[nextfh & ROMFH_TYPE];
@@ -583,16 +582,18 @@ static int romfs_init_fs_context(struct fs_context *fc)
*/
static void romfs_kill_sb(struct super_block *sb)
{
+ generic_shutdown_super(sb);
+
#ifdef CONFIG_ROMFS_ON_MTD
if (sb->s_mtd) {
- kill_mtd_super(sb);
- return;
+ put_mtd_device(sb->s_mtd);
+ sb->s_mtd = NULL;
}
#endif
#ifdef CONFIG_ROMFS_ON_BLOCK
if (sb->s_bdev) {
- kill_block_super(sb);
- return;
+ sync_blockdev(sb->s_bdev);
+ blkdev_put(sb->s_bdev, sb);
}
#endif
}
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index a4d8b0ea1c8c..6fc8f43b1c9d 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1077,7 +1077,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
}
static int
-cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv)
+cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv)
{
/*
* Note that this is called by vfs setlease with i_lock held to
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 6bc44f79d2e9..2108b3b40ce9 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -1085,7 +1085,7 @@ int cifs_close(struct inode *inode, struct file *file)
!test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) &&
dclose) {
if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
}
spin_lock(&cinode->deferred_lock);
cifs_add_deferred_close(cfile, dclose);
@@ -2596,7 +2596,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
write_data, to - from, &offset);
cifsFileInfo_put(open_file);
/* Does mm or vfs already set times? */
- inode->i_atime = inode->i_mtime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
if ((bytes_written > 0) && (offset))
rc = 0;
else if (bytes_written < 0)
diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h
index 173999610997..84f3b09367d2 100644
--- a/fs/smb/client/fscache.h
+++ b/fs/smb/client/fscache.h
@@ -50,12 +50,13 @@ void cifs_fscache_fill_coherency(struct inode *inode,
struct cifs_fscache_inode_coherency_data *cd)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct timespec64 ctime = inode_get_ctime(inode);
memset(cd, 0, sizeof(*cd));
cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec);
cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec);
- cd->last_change_time_sec = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec);
- cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec);
+ cd->last_change_time_sec = cpu_to_le64(ctime.tv_sec);
+ cd->last_change_time_nsec = cpu_to_le32(ctime.tv_nsec);
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index c3eeae07e139..93fe43789d7a 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -172,7 +172,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
else
inode->i_atime = fattr->cf_atime;
inode->i_mtime = fattr->cf_mtime;
- inode->i_ctime = fattr->cf_ctime;
+ inode_set_ctime_to_ts(inode, fattr->cf_ctime);
inode->i_rdev = fattr->cf_rdev;
cifs_nlink_fattr_to_inode(inode, fattr);
inode->i_uid = fattr->cf_uid;
@@ -1744,9 +1744,9 @@ out_reval:
cifs_inode = CIFS_I(inode);
cifs_inode->time = 0; /* will force revalidate to get info
when needed */
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
}
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
cifs_inode = CIFS_I(dir);
CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
unlink_out:
@@ -2060,8 +2060,8 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
*/
cifsInode->time = 0;
- d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime =
- current_time(inode);
+ inode_set_ctime_current(d_inode(direntry));
+ inode->i_mtime = inode_set_ctime_current(inode);
rmdir_exit:
free_dentry_path(page);
@@ -2267,8 +2267,8 @@ unlink_target:
/* force revalidate to go get info when needed */
CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
- source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime =
- target_dir->i_mtime = current_time(source_dir);
+ source_dir->i_mtime = target_dir->i_mtime = inode_set_ctime_to_ts(source_dir,
+ inode_set_ctime_current(target_dir));
cifs_rename_exit:
kfree(info_buf_source);
@@ -2540,7 +2540,7 @@ int cifs_getattr(struct mnt_idmap *idmap, const struct path *path,
return rc;
}
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->blksize = cifs_sb->ctx->bsize;
stat->ino = CIFS_I(inode)->uniqueid;
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 0f62bc373ad0..182e2e879ecf 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -1396,7 +1396,8 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
if (file_inf.LastWriteTime)
inode->i_mtime = cifs_NTtimeToUnix(file_inf.LastWriteTime);
if (file_inf.ChangeTime)
- inode->i_ctime = cifs_NTtimeToUnix(file_inf.ChangeTime);
+ inode_set_ctime_to_ts(inode,
+ cifs_NTtimeToUnix(file_inf.ChangeTime));
if (file_inf.LastAccessTime)
inode->i_atime = cifs_NTtimeToUnix(file_inf.LastAccessTime);
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 7cc1b0c47d0a..a947c18915c2 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -4402,8 +4402,8 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
}
basic_info = (struct smb2_file_basic_info *)rsp->Buffer;
- generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp),
- &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
+ file_inode(fp->filp), &stat);
basic_info->CreationTime = cpu_to_le64(fp->create_time);
time = ksmbd_UnixTimeToNT(stat.atime);
basic_info->LastAccessTime = cpu_to_le64(time);
@@ -4428,7 +4428,7 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp,
struct kstat stat;
inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
sinfo = (struct smb2_file_standard_info *)rsp->Buffer;
delete_pending = ksmbd_inode_pending_delete(fp);
@@ -4482,7 +4482,7 @@ static int get_file_all_info(struct ksmbd_work *work,
return PTR_ERR(filename);
inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
ksmbd_debug(SMB, "filename = %s\n", filename);
delete_pending = ksmbd_inode_pending_delete(fp);
@@ -4559,8 +4559,8 @@ static void get_file_stream_info(struct ksmbd_work *work,
int buf_free_len;
struct smb2_query_info_req *req = ksmbd_req_buf_next(work);
- generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp),
- &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
+ file_inode(fp->filp), &stat);
file_info = (struct smb2_file_stream_info *)rsp->Buffer;
buf_free_len =
@@ -4650,8 +4650,8 @@ static void get_file_internal_info(struct smb2_query_info_rsp *rsp,
struct smb2_file_internal_info *file_info;
struct kstat stat;
- generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp),
- &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
+ file_inode(fp->filp), &stat);
file_info = (struct smb2_file_internal_info *)rsp->Buffer;
file_info->IndexNumber = cpu_to_le64(stat.ino);
rsp->OutputBufferLength =
@@ -4676,7 +4676,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer;
inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
file_info->CreationTime = cpu_to_le64(fp->create_time);
time = ksmbd_UnixTimeToNT(stat.atime);
@@ -4737,8 +4737,8 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp,
struct smb2_file_comp_info *file_info;
struct kstat stat;
- generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp),
- &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
+ file_inode(fp->filp), &stat);
file_info = (struct smb2_file_comp_info *)rsp->Buffer;
file_info->CompressedFileSize = cpu_to_le64(stat.blocks << 9);
@@ -4790,7 +4790,7 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
file_info->LastAccessTime = cpu_to_le64(time);
time = ksmbd_UnixTimeToNT(inode->i_mtime);
file_info->LastWriteTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode->i_ctime);
+ time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
file_info->ChangeTime = cpu_to_le64(time);
file_info->DosAttributes = fp->f_ci->m_fattr;
file_info->Inode = cpu_to_le64(inode->i_ino);
@@ -5433,7 +5433,7 @@ int smb2_close(struct ksmbd_work *work)
rsp->LastAccessTime = cpu_to_le64(time);
time = ksmbd_UnixTimeToNT(inode->i_mtime);
rsp->LastWriteTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode->i_ctime);
+ time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
rsp->ChangeTime = cpu_to_le64(time);
ksmbd_fd_put(work, fp);
} else {
@@ -5656,7 +5656,7 @@ static int set_file_basic_info(struct ksmbd_file *fp,
if (file_info->ChangeTime)
attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime);
else
- attrs.ia_ctime = inode->i_ctime;
+ attrs.ia_ctime = inode_get_ctime(inode);
if (file_info->LastWriteTime) {
attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime);
@@ -5701,7 +5701,7 @@ static int set_file_basic_info(struct ksmbd_file *fp,
return -EACCES;
inode_lock(inode);
- inode->i_ctime = attrs.ia_ctime;
+ inode_set_ctime_to_ts(inode, attrs.ia_ctime);
attrs.ia_valid &= ~ATTR_CTIME;
rc = notify_change(idmap, dentry, &attrs, NULL);
inode_unlock(inode);
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 3d5d652153a5..d48756a339a5 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -1659,7 +1659,8 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
u64 time;
int rc;
- generic_fillattr(idmap, d_inode(dentry), ksmbd_kstat->kstat);
+ generic_fillattr(idmap, STATX_BASIC_STATS, d_inode(dentry),
+ ksmbd_kstat->kstat);
time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime);
ksmbd_kstat->create_time = time;
diff --git a/fs/splice.c b/fs/splice.c
index 3e2a31e1ce6a..02631013b09f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -120,17 +120,17 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- struct page *page = buf->page;
+ struct folio *folio = page_folio(buf->page);
int err;
- if (!PageUptodate(page)) {
- lock_page(page);
+ if (!folio_test_uptodate(folio)) {
+ folio_lock(folio);
/*
- * Page got truncated/unhashed. This will cause a 0-byte
+ * Folio got truncated/unhashed. This will cause a 0-byte
* splice, if this is the first page.
*/
- if (!page->mapping) {
+ if (!folio->mapping) {
err = -ENODATA;
goto error;
}
@@ -138,20 +138,18 @@ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
/*
* Uh oh, read-error from disk.
*/
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
err = -EIO;
goto error;
}
- /*
- * Page is ok afterall, we are done.
- */
- unlock_page(page);
+ /* Folio is ok after all, we are done */
+ folio_unlock(folio);
}
return 0;
error:
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
@@ -1269,10 +1267,8 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
if ((in->f_flags | out->f_flags) & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;
- return splice_pipe_to_pipe(ipipe, opipe, len, flags);
- }
-
- if (ipipe) {
+ ret = splice_pipe_to_pipe(ipipe, opipe, len, flags);
+ } else if (ipipe) {
if (off_in)
return -ESPIPE;
if (off_out) {
@@ -1297,18 +1293,11 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
ret = do_splice_from(ipipe, out, &offset, len, flags);
file_end_write(out);
- if (ret > 0)
- fsnotify_modify(out);
-
if (!off_out)
out->f_pos = offset;
else
*off_out = offset;
-
- return ret;
- }
-
- if (opipe) {
+ } else if (opipe) {
if (off_out)
return -ESPIPE;
if (off_in) {
@@ -1324,18 +1313,25 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
- if (ret > 0)
- fsnotify_access(in);
-
if (!off_in)
in->f_pos = offset;
else
*off_in = offset;
+ } else {
+ ret = -EINVAL;
+ }
- return ret;
+ if (ret > 0) {
+ /*
+ * Generate modify out before access in:
+ * do_splice_from() may've already sent modify out,
+ * and this ensures the events get merged.
+ */
+ fsnotify_modify(out);
+ fsnotify_access(in);
}
- return -EINVAL;
+ return ret;
}
static long __do_splice(struct file *in, loff_t __user *off_in,
@@ -1464,6 +1460,9 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
pipe_unlock(pipe);
}
+ if (ret > 0)
+ fsnotify_access(file);
+
return ret;
}
@@ -1493,8 +1492,10 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
if (!ret)
ret = iter_to_pipe(iter, pipe, buf_flag);
pipe_unlock(pipe);
- if (ret > 0)
+ if (ret > 0) {
wakeup_pipe_readers(pipe);
+ fsnotify_modify(file);
+ }
return ret;
}
@@ -1928,6 +1929,11 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
}
}
+ if (ret > 0) {
+ fsnotify_access(in);
+ fsnotify_modify(out);
+ }
+
return ret;
}
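
After this consolidation every successful do_splice() emits the modify event on the output before the access event on the input, and vmsplice()/tee() now generate events at all. A hedged userspace probe (file names and sizes are illustrative, and in.dat must already exist):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/inotify.h>
	#include <unistd.h>

	int main(void)
	{
		int ino = inotify_init1(IN_NONBLOCK);
		int src = open("in.dat", O_RDONLY);
		int dst = open("out.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);
		int pfd[2];
		char buf[4096];

		if (ino < 0 || src < 0 || dst < 0 || pipe(pfd))
			return 1;
		inotify_add_watch(ino, "in.dat", IN_ACCESS);
		inotify_add_watch(ino, "out.dat", IN_MODIFY);

		/* file -> pipe -> file, one splice() per hop */
		if (splice(src, NULL, pfd[1], NULL, sizeof(buf), 0) < 0 ||
		    splice(pfd[0], NULL, dst, NULL, sizeof(buf), 0) < 0)
			return 1;

		/* expect IN_ACCESS for in.dat and IN_MODIFY for out.dat
		 * (nonblocking read; returns -1 if nothing queued yet) */
		printf("read %zd bytes of inotify events\n",
		       read(ino, buf, sizeof(buf)));
		return 0;
	}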
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 24463145b351..c6e626b00546 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -61,7 +61,7 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
- inode->i_ctime.tv_sec = inode->i_mtime.tv_sec;
+ inode_set_ctime(inode, inode->i_mtime.tv_sec, 0);
inode->i_mode = le16_to_cpu(sqsh_ino->mode);
inode->i_size = 0;
diff --git a/fs/stack.c b/fs/stack.c
index c9830924eb12..b5e01bdb5f5f 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -68,7 +68,7 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
dest->i_rdev = src->i_rdev;
dest->i_atime = src->i_atime;
dest->i_mtime = src->i_mtime;
- dest->i_ctime = src->i_ctime;
+ inode_set_ctime_to_ts(dest, inode_get_ctime(src));
dest->i_blkbits = src->i_blkbits;
dest->i_flags = src->i_flags;
set_nlink(dest, src->i_nlink);
diff --git a/fs/stat.c b/fs/stat.c
index 7c238da22ef0..136711ae72fb 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -27,10 +27,42 @@
#include "mount.h"
/**
+ * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
+ * @stat: where to store the resulting values
+ * @request_mask: STATX_* values requested
+ * @inode: inode from which to grab the c/mtime
+ *
+ * Given @inode, grab the ctime and mtime out of it and store the result
+ * in @stat. When fetching the value, flag it as queried so the next write
+ * will use a fine-grained timestamp.
+ */
+void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
+{
+ atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec;
+
+ /* If neither time was requested, then don't report them */
+ if (!(request_mask & (STATX_CTIME|STATX_MTIME))) {
+ stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
+ return;
+ }
+
+ stat->mtime = inode->i_mtime;
+ stat->ctime.tv_sec = inode->__i_ctime.tv_sec;
+ /*
+ * Atomically set the QUERIED flag and fetch the new value with
+ * the flag masked off.
+ */
+ stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec) &
+ ~I_CTIME_QUERIED;
+}
+EXPORT_SYMBOL(fill_mg_cmtime);
+
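The QUERIED flag can live inside tv_nsec because nanosecond values stay below 10^9, which is less than 2^30, leaving the high bits of the word free. A minimal userspace sketch of the same set-and-fetch idiom with C11 atomics (the FLAG value mirrors the bit-30 idea but is illustrative here):

	#include <stdatomic.h>
	#include <stdio.h>

	#define FLAG (1L << 30)	/* illustrative: above any valid nsec */

	int main(void)
	{
		atomic_long nsec = 123456789;

		/* One atomic RMW sets the flag and returns the prior
		 * value; masking recovers the clean nanosecond count. */
		long seen = atomic_fetch_or(&nsec, FLAG) & ~FLAG;

		printf("seen=%ld raw=%ld\n", seen,
		       atomic_load(&nsec));	/* raw now has FLAG set */
		return 0;
	}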
+/**
* generic_fillattr - Fill in the basic attributes from the inode struct
- * @idmap: idmap of the mount the inode was found from
- * @inode: Inode to use as the source
- * @stat: Where to fill in the attributes
+ * @idmap: idmap of the mount the inode was found from
+ * @request_mask: statx request_mask
+ * @inode: Inode to use as the source
+ * @stat: Where to fill in the attributes
*
* Fill in the basic attributes in the kstat structure from data that's to be
* found on the VFS inode structure. This is the default if no getattr inode
@@ -42,8 +74,8 @@
 * uid and gid fields. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
*/
-void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode,
- struct kstat *stat)
+void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
+ struct inode *inode, struct kstat *stat)
{
vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);
vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
@@ -57,10 +89,22 @@ void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode,
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
stat->atime = inode->i_atime;
- stat->mtime = inode->i_mtime;
- stat->ctime = inode->i_ctime;
+
+ if (is_mgtime(inode)) {
+ fill_mg_cmtime(stat, request_mask, inode);
+ } else {
+ stat->mtime = inode->i_mtime;
+ stat->ctime = inode_get_ctime(inode);
+ }
+
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
+
+ if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
+ stat->result_mask |= STATX_CHANGE_COOKIE;
+ stat->change_cookie = inode_query_iversion(inode);
+ }
}
EXPORT_SYMBOL(generic_fillattr);
@@ -123,17 +167,12 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT |
STATX_ATTR_DAX);
- if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
- stat->result_mask |= STATX_CHANGE_COOKIE;
- stat->change_cookie = inode_query_iversion(inode);
- }
-
idmap = mnt_idmap(path->mnt);
if (inode->i_op->getattr)
return inode->i_op->getattr(idmap, path, stat,
request_mask, query_flags);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
EXPORT_SYMBOL(vfs_getattr_nosec);
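
With the signature change, every ->getattr instance forwards its request_mask so the multigrain path only flags ctime as QUERIED when the caller actually asked for STATX_CTIME or STATX_MTIME. A hedged sketch of the updated calling convention (examplefs_getattr is hypothetical; the sysv and cifs hunks in this series show real conversions):

	static int examplefs_getattr(struct mnt_idmap *idmap,
				     const struct path *path, struct kstat *stat,
				     u32 request_mask, unsigned int query_flags)
	{
		struct inode *inode = d_inode(path->dentry);

		/* request_mask now flows through to generic_fillattr() */
		generic_fillattr(idmap, request_mask, inode, stat);
		return 0;
	}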
diff --git a/fs/super.c b/fs/super.c
index e781226e2880..692654c2af36 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -39,7 +39,7 @@
#include <uapi/linux/mount.h>
#include "internal.h"
-static int thaw_super_locked(struct super_block *sb);
+static int thaw_super_locked(struct super_block *sb, enum freeze_holder who);
static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);
@@ -50,6 +50,130 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
"sb_internal",
};
+static inline void __super_lock(struct super_block *sb, bool excl)
+{
+ if (excl)
+ down_write(&sb->s_umount);
+ else
+ down_read(&sb->s_umount);
+}
+
+static inline void super_unlock(struct super_block *sb, bool excl)
+{
+ if (excl)
+ up_write(&sb->s_umount);
+ else
+ up_read(&sb->s_umount);
+}
+
+static inline void __super_lock_excl(struct super_block *sb)
+{
+ __super_lock(sb, true);
+}
+
+static inline void super_unlock_excl(struct super_block *sb)
+{
+ super_unlock(sb, true);
+}
+
+static inline void super_unlock_shared(struct super_block *sb)
+{
+ super_unlock(sb, false);
+}
+
+static inline bool wait_born(struct super_block *sb)
+{
+ unsigned int flags;
+
+ /*
+ * Pairs with smp_store_release() in super_wake() and ensures
+ * that we see SB_BORN or SB_DYING after we're woken.
+ */
+ flags = smp_load_acquire(&sb->s_flags);
+ return flags & (SB_BORN | SB_DYING);
+}
+
+/**
+ * super_lock - wait for superblock to become ready and lock it
+ * @sb: superblock to wait for
+ * @excl: whether exclusive access is required
+ *
+ * If the superblock has neither passed through vfs_get_tree() nor
+ * generic_shutdown_super() yet, wait for that to happen. Either superblock
+ * creation will succeed and SB_BORN is set by vfs_get_tree() or we're
+ * woken and we'll see SB_DYING.
+ *
+ * The caller must have acquired a temporary reference on @sb->s_count.
+ *
+ * Return: This returns true if SB_BORN was set, false if SB_DYING was
+ * set. The function acquires s_umount and returns with it held.
+ */
+static __must_check bool super_lock(struct super_block *sb, bool excl)
+{
+ lockdep_assert_not_held(&sb->s_umount);
+
+relock:
+ __super_lock(sb, excl);
+
+ /*
+ * Has gone through generic_shutdown_super() in the meantime.
+ * @sb->s_root is NULL and @sb->s_active is 0. No one needs to
+ * grab a reference to this. Tell them so.
+ */
+ if (sb->s_flags & SB_DYING)
+ return false;
+
+ /* Has called ->get_tree() successfully. */
+ if (sb->s_flags & SB_BORN)
+ return true;
+
+ super_unlock(sb, excl);
+
+ /* wait until the superblock is ready or dying */
+ wait_var_event(&sb->s_flags, wait_born(sb));
+
+ /*
+ * Neither SB_BORN nor SB_DYING is ever unset, so we never loop.
+ * Just reacquire @sb->s_umount for the caller.
+ */
+ goto relock;
+}
+
+/* wait and acquire read-side of @sb->s_umount */
+static inline bool super_lock_shared(struct super_block *sb)
+{
+ return super_lock(sb, false);
+}
+
+/* wait and acquire write-side of @sb->s_umount */
+static inline bool super_lock_excl(struct super_block *sb)
+{
+ return super_lock(sb, true);
+}
+
+/* wake waiters */
+#define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD)
+static void super_wake(struct super_block *sb, unsigned int flag)
+{
+ WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS));
+ WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1);
+
+ /*
+ * Pairs with smp_load_acquire() in super_lock() to make sure
+ * all initializations in the superblock are visible to anyone who
+ * observes SB_BORN having been set.
+ */
+ smp_store_release(&sb->s_flags, sb->s_flags | flag);
+ /*
+ * Pairs with the barrier in prepare_to_wait_event() to make sure
+ * ___wait_var_event() either sees SB_BORN set or
+ * waitqueue_active() check in wake_up_var() sees the waiter.
+ */
+ smp_mb();
+ wake_up_var(&sb->s_flags);
+}
+
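The publish side in super_wake() and the wait side in wait_born()/wait_dead() form a release/acquire pair around wake_up_var(). A hedged userspace analogue with C11 atomics, using a spin loop where the kernel uses wait_var_event() (all names here are illustrative):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	#define BORN	0x1u
	#define DYING	0x2u

	static atomic_uint s_flags;
	static int payload;		/* stands in for superblock contents */

	static void *mounter(void *arg)
	{
		payload = 42;		/* "set up the superblock" */
		/* release-store: payload is visible before the flag,
		 * mirroring smp_store_release() in super_wake() */
		atomic_store_explicit(&s_flags, BORN, memory_order_release);
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, mounter, NULL);
		/* acquire-load mirrors smp_load_acquire() in wait_born():
		 * once BORN or DYING is seen, payload is guaranteed visible */
		while (!(atomic_load_explicit(&s_flags, memory_order_acquire) &
			 (BORN | DYING)))
			;
		printf("payload=%d\n", payload);	/* always 42 */
		pthread_join(t, NULL);
		return 0;
	}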
/*
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
@@ -76,7 +200,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
if (!(sc->gfp_mask & __GFP_FS))
return SHRINK_STOP;
- if (!trylock_super(sb))
+ if (!super_trylock_shared(sb))
return SHRINK_STOP;
if (sb->s_op->nr_cached_objects)
@@ -110,7 +234,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
freed += sb->s_op->free_cached_objects(sb, sc);
}
- up_read(&sb->s_umount);
+ super_unlock_shared(sb);
return freed;
}
@@ -123,17 +247,17 @@ static unsigned long super_cache_count(struct shrinker *shrink,
sb = container_of(shrink, struct super_block, s_shrink);
/*
- * We don't call trylock_super() here as it is a scalability bottleneck,
- * so we're exposed to partial setup state. The shrinker rwsem does not
- * protect filesystem operations backing list_lru_shrink_count() or
- * s_op->nr_cached_objects(). Counts can change between
- * super_cache_count and super_cache_scan, so we really don't need locks
- * here.
+ * We don't call super_trylock_shared() here as it is a scalability
+ * bottleneck, so we're exposed to partial setup state. The shrinker
+ * rwsem does not protect filesystem operations backing
+ * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can
+ * change between super_cache_count and super_cache_scan, so we really
+ * don't need locks here.
*
* However, if we are currently mounting the superblock, the underlying
* filesystem might be in a state of partial construction and hence it
- * is dangerous to access it. trylock_super() uses a SB_BORN check to
- * avoid this situation, so do the same here. The memory barrier is
+ * is dangerous to access it. super_trylock_shared() uses a SB_BORN check
+ * to avoid this situation, so do the same here. The memory barrier is
* matched with the one in mount_fs() as we don't hold locks here.
*/
if (!(sb->s_flags & SB_BORN))
@@ -176,7 +300,7 @@ static void destroy_unused_super(struct super_block *s)
{
if (!s)
return;
- up_write(&s->s_umount);
+ super_unlock_excl(s);
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
security_sb_free(s);
@@ -337,10 +461,29 @@ void deactivate_locked_super(struct super_block *s)
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
+ /*
+ * Remove it from @fs_supers so it isn't found by new
+ * sget{_fc}() walkers anymore. Any concurrent mounter still
+ * managing to grab a temporary reference is guaranteed to
+ * already see SB_DYING and will wait until we notify them about
+ * SB_DEAD.
+ */
+ spin_lock(&sb_lock);
+ hlist_del_init(&s->s_instances);
+ spin_unlock(&sb_lock);
+
+ /*
+ * Let concurrent mounts know that this thing is really dead.
+ * We don't need @sb->s_umount here as every concurrent caller
+ * will see SB_DYING and either discard the superblock or wait
+ * for SB_DEAD.
+ */
+ super_wake(s, SB_DEAD);
+
put_filesystem(fs);
put_super(s);
} else {
- up_write(&s->s_umount);
+ super_unlock_excl(s);
}
}
@@ -357,7 +500,7 @@ EXPORT_SYMBOL(deactivate_locked_super);
void deactivate_super(struct super_block *s)
{
if (!atomic_add_unless(&s->s_active, -1, 1)) {
- down_write(&s->s_umount);
+ __super_lock_excl(s);
deactivate_locked_super(s);
}
}
@@ -379,20 +522,61 @@ EXPORT_SYMBOL(deactivate_super);
*/
static int grab_super(struct super_block *s) __releases(sb_lock)
{
+ bool born;
+
s->s_count++;
spin_unlock(&sb_lock);
- down_write(&s->s_umount);
- if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) {
+ born = super_lock_excl(s);
+ if (born && atomic_inc_not_zero(&s->s_active)) {
put_super(s);
return 1;
}
- up_write(&s->s_umount);
+ super_unlock_excl(s);
put_super(s);
return 0;
}
+static inline bool wait_dead(struct super_block *sb)
+{
+ unsigned int flags;
+
+ /*
+ * Pairs with memory barrier in super_wake() and ensures
+ * that we see SB_DEAD after we're woken.
+ */
+ flags = smp_load_acquire(&sb->s_flags);
+ return flags & SB_DEAD;
+}
+
+/**
+ * grab_super_dead - acquire an active reference to a superblock
+ * @sb: superblock to acquire
+ *
+ * Acquire a temporary reference on a superblock and try to trade it for
+ * an active reference. This is used in sget{_fc}() to wait for a
+ * superblock to either become SB_BORN or for it to pass through
+ * sb->kill_sb() and be marked as SB_DEAD.
+ *
+ * Return: This returns true if an active reference could be acquired,
+ * false if not.
+ */
+static bool grab_super_dead(struct super_block *sb)
+{
+ sb->s_count++;
+ if (grab_super(sb)) {
+ put_super(sb);
+ lockdep_assert_held(&sb->s_umount);
+ return true;
+ }
+ wait_var_event(&sb->s_flags, wait_dead(sb));
+ put_super(sb);
+ lockdep_assert_not_held(&sb->s_umount);
+ return false;
+}
+
/*
- * trylock_super - try to grab ->s_umount shared
+ * super_trylock_shared - try to grab ->s_umount shared
* @sb: reference we are trying to grab
*
* Try to prevent fs shutdown. This is used in places where we
@@ -408,13 +592,13 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
* of down_read(). There's a couple of places that are OK with that, but
* it's very much not a general-purpose interface.
*/
-bool trylock_super(struct super_block *sb)
+bool super_trylock_shared(struct super_block *sb)
{
if (down_read_trylock(&sb->s_umount)) {
- if (!hlist_unhashed(&sb->s_instances) &&
- sb->s_root && (sb->s_flags & SB_BORN))
+ if (!(sb->s_flags & SB_DYING) && sb->s_root &&
+ (sb->s_flags & SB_BORN))
return true;
- up_read(&sb->s_umount);
+ super_unlock_shared(sb);
}
return false;
@@ -439,13 +623,13 @@ bool trylock_super(struct super_block *sb)
void retire_super(struct super_block *sb)
{
WARN_ON(!sb->s_bdev);
- down_write(&sb->s_umount);
+ __super_lock_excl(sb);
if (sb->s_iflags & SB_I_PERSB_BDI) {
bdi_unregister(sb->s_bdi);
sb->s_iflags &= ~SB_I_PERSB_BDI;
}
sb->s_iflags |= SB_I_RETIRED;
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
}
EXPORT_SYMBOL(retire_super);
@@ -517,11 +701,17 @@ void generic_shutdown_super(struct super_block *sb)
spin_unlock(&sb->s_inode_list_lock);
}
}
- spin_lock(&sb_lock);
- /* should be initialized for __put_super_and_need_restart() */
- hlist_del_init(&sb->s_instances);
- spin_unlock(&sb_lock);
- up_write(&sb->s_umount);
+ /*
+ * Broadcast to everyone that grabbed a temporary reference to this
+ * superblock before we removed it from @fs_supers that the superblock
+ * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
+ * discard this superblock and treat it as dead.
+ *
+ * We leave the superblock on @fs_supers so it can be found by
+ * sget{_fc}() until we have passed sb->kill_sb().
+ */
+ super_wake(sb, SB_DYING);
+ super_unlock_excl(sb);
if (sb->s_bdi != &noop_backing_dev_info) {
if (sb->s_iflags & SB_I_PERSB_BDI)
bdi_unregister(sb->s_bdi);
@@ -546,17 +736,31 @@ bool mount_capable(struct fs_context *fc)
* @test: Comparison callback
* @set: Setup callback
*
- * Find or create a superblock using the parameters stored in the filesystem
- * context and the two callback functions.
+ * Create a new superblock or find an existing one.
+ *
+ * The @test callback is used to find a matching existing superblock.
+ * Whether or not the requested parameters in @fc are taken into account
+ * is specific to the @test callback that is used. They may even be
+ * completely ignored.
+ *
+ * If an extant superblock is matched, it will be returned unless:
*
- * If an extant superblock is matched, then that will be returned with an
- * elevated reference count that the caller must transfer or discard.
+ * (1) the namespace of the filesystem context @fc and the extant
+ * superblock's namespace differ
+ *
+ * (2) the filesystem context @fc has requested that reusing an extant
+ * superblock is not allowed
+ *
+ * In both cases EBUSY will be returned.
*
* If no match is made, a new superblock will be allocated and basic
- * initialisation will be performed (s_type, s_fs_info and s_id will be set and
- * the set() callback will be invoked), the superblock will be published and it
- * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE
- * as yet unset.
+ * initialisation will be performed (s_type, s_fs_info and s_id will be
+ * set and the @set callback will be invoked), the superblock will be
+ * published and it will be returned in a partially constructed state
+ * with SB_BORN and SB_ACTIVE as yet unset.
+ *
+ * Return: On success, an extant or newly created superblock is
+ * returned. On failure an error pointer is returned.
*/
struct super_block *sget_fc(struct fs_context *fc,
int (*test)(struct super_block *, struct fs_context *),
@@ -595,6 +799,11 @@ retry:
s->s_type = fc->fs_type;
s->s_iflags |= fc->s_iflags;
strscpy(s->s_id, s->s_type->name, sizeof(s->s_id));
+ /*
+ * Make the superblock visible on @super_blocks and @fs_supers.
+ * It's in a nascent state and users should wait on SB_BORN or
+ * SB_DYING to be set.
+ */
list_add_tail(&s->s_list, &super_blocks);
hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
spin_unlock(&sb_lock);
@@ -603,12 +812,16 @@ retry:
return s;
share_extant_sb:
- if (user_ns != old->s_user_ns) {
+ if (user_ns != old->s_user_ns || fc->exclusive) {
spin_unlock(&sb_lock);
destroy_unused_super(s);
+ if (fc->exclusive)
+ warnfc(fc, "reusing existing filesystem not allowed");
+ else
+ warnfc(fc, "reusing existing filesystem in another namespace not allowed");
return ERR_PTR(-EBUSY);
}
- if (!grab_super(old))
+ if (!grab_super_dead(old))
goto retry;
destroy_unused_super(s);
return old;
@@ -652,7 +865,7 @@ retry:
destroy_unused_super(s);
return ERR_PTR(-EBUSY);
}
- if (!grab_super(old))
+ if (!grab_super_dead(old))
goto retry;
destroy_unused_super(s);
return old;
@@ -685,7 +898,7 @@ EXPORT_SYMBOL(sget);
void drop_super(struct super_block *sb)
{
- up_read(&sb->s_umount);
+ super_unlock_shared(sb);
put_super(sb);
}
@@ -693,7 +906,7 @@ EXPORT_SYMBOL(drop_super);
void drop_super_exclusive(struct super_block *sb)
{
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
put_super(sb);
}
EXPORT_SYMBOL(drop_super_exclusive);
@@ -704,7 +917,8 @@ static void __iterate_supers(void (*f)(struct super_block *))
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (hlist_unhashed(&sb->s_instances))
+		/* Pairs with memory barrier in super_wake(). */
+ if (smp_load_acquire(&sb->s_flags) & SB_DYING)
continue;
sb->s_count++;
spin_unlock(&sb_lock);
@@ -734,15 +948,15 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (hlist_unhashed(&sb->s_instances))
- continue;
+ bool born;
+
sb->s_count++;
spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
- if (sb->s_root && (sb->s_flags & SB_BORN))
+ born = super_lock_shared(sb);
+ if (born && sb->s_root)
f(sb, arg);
- up_read(&sb->s_umount);
+ super_unlock_shared(sb);
spin_lock(&sb_lock);
if (p)
@@ -770,13 +984,15 @@ void iterate_supers_type(struct file_system_type *type,
spin_lock(&sb_lock);
hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
+ bool born;
+
sb->s_count++;
spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
- if (sb->s_root && (sb->s_flags & SB_BORN))
+ born = super_lock_shared(sb);
+ if (born && sb->s_root)
f(sb, arg);
- up_read(&sb->s_umount);
+ super_unlock_shared(sb);
spin_lock(&sb_lock);
if (p)
@@ -791,43 +1007,6 @@ void iterate_supers_type(struct file_system_type *type,
EXPORT_SYMBOL(iterate_supers_type);
/**
- * get_super - get the superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device given. %NULL is returned if no match is found.
- */
-struct super_block *get_super(struct block_device *bdev)
-{
- struct super_block *sb;
-
- if (!bdev)
- return NULL;
-
- spin_lock(&sb_lock);
-rescan:
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (hlist_unhashed(&sb->s_instances))
- continue;
- if (sb->s_bdev == bdev) {
- sb->s_count++;
- spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
- /* still alive? */
- if (sb->s_root && (sb->s_flags & SB_BORN))
- return sb;
- up_read(&sb->s_umount);
- /* nope, got unmounted */
- spin_lock(&sb_lock);
- __put_super(sb);
- goto rescan;
- }
- }
- spin_unlock(&sb_lock);
- return NULL;
-}
-
-/**
* get_active_super - get an active reference to the superblock of a device
* @bdev: device to get the superblock for
*
@@ -842,15 +1021,12 @@ struct super_block *get_active_super(struct block_device *bdev)
if (!bdev)
return NULL;
-restart:
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (hlist_unhashed(&sb->s_instances))
- continue;
if (sb->s_bdev == bdev) {
if (!grab_super(sb))
- goto restart;
- up_write(&sb->s_umount);
+ return NULL;
+ super_unlock_excl(sb);
return sb;
}
}
@@ -863,28 +1039,21 @@ struct super_block *user_get_super(dev_t dev, bool excl)
struct super_block *sb;
spin_lock(&sb_lock);
-rescan:
list_for_each_entry(sb, &super_blocks, s_list) {
- if (hlist_unhashed(&sb->s_instances))
- continue;
if (sb->s_dev == dev) {
+ bool born;
+
sb->s_count++;
spin_unlock(&sb_lock);
- if (excl)
- down_write(&sb->s_umount);
- else
- down_read(&sb->s_umount);
/* still alive? */
- if (sb->s_root && (sb->s_flags & SB_BORN))
+ born = super_lock(sb, excl);
+ if (born && sb->s_root)
return sb;
- if (excl)
- up_write(&sb->s_umount);
- else
- up_read(&sb->s_umount);
+ super_unlock(sb, excl);
/* nope, got unmounted */
spin_lock(&sb_lock);
__put_super(sb);
- goto rescan;
+ break;
}
}
spin_unlock(&sb_lock);
@@ -926,9 +1095,9 @@ int reconfigure_super(struct fs_context *fc)
if (remount_ro) {
if (!hlist_empty(&sb->s_pins)) {
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
group_pin_kill(&sb->s_pins);
- down_write(&sb->s_umount);
+ __super_lock_excl(sb);
if (!sb->s_root)
return 0;
if (sb->s_writers.frozen != SB_UNFROZEN)
@@ -991,9 +1160,9 @@ cancel_readonly:
static void do_emergency_remount_callback(struct super_block *sb)
{
- down_write(&sb->s_umount);
- if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
- !sb_rdonly(sb)) {
+ bool born = super_lock_excl(sb);
+
+ if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) {
struct fs_context *fc;
fc = fs_context_for_reconfigure(sb->s_root,
@@ -1004,7 +1173,7 @@ static void do_emergency_remount_callback(struct super_block *sb)
put_fs_context(fc);
}
}
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
}
static void do_emergency_remount(struct work_struct *work)
@@ -1027,12 +1196,13 @@ void emergency_remount(void)
static void do_thaw_all_callback(struct super_block *sb)
{
- down_write(&sb->s_umount);
- if (sb->s_root && sb->s_flags & SB_BORN) {
+ bool born = super_lock_excl(sb);
+
+ if (born && sb->s_root) {
emergency_thaw_bdev(sb);
- thaw_super_locked(sb);
+ thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
} else {
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
}
}
@@ -1136,7 +1306,7 @@ static int test_single_super(struct super_block *s, struct fs_context *fc)
return 1;
}
-static int vfs_get_super(struct fs_context *fc, bool reconf,
+static int vfs_get_super(struct fs_context *fc,
int (*test)(struct super_block *, struct fs_context *),
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
@@ -1154,19 +1324,9 @@ static int vfs_get_super(struct fs_context *fc, bool reconf,
goto error;
sb->s_flags |= SB_ACTIVE;
- fc->root = dget(sb->s_root);
- } else {
- fc->root = dget(sb->s_root);
- if (reconf) {
- err = reconfigure_super(fc);
- if (err < 0) {
- dput(fc->root);
- fc->root = NULL;
- goto error;
- }
- }
}
+ fc->root = dget(sb->s_root);
return 0;
error:
@@ -1178,7 +1338,7 @@ int get_tree_nodev(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, false, NULL, fill_super);
+ return vfs_get_super(fc, NULL, fill_super);
}
EXPORT_SYMBOL(get_tree_nodev);
@@ -1186,54 +1346,81 @@ int get_tree_single(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, false, test_single_super, fill_super);
+ return vfs_get_super(fc, test_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single);
-int get_tree_single_reconf(struct fs_context *fc,
- int (*fill_super)(struct super_block *sb,
- struct fs_context *fc))
-{
- return vfs_get_super(fc, true, test_single_super, fill_super);
-}
-EXPORT_SYMBOL(get_tree_single_reconf);
-
int get_tree_keyed(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc),
void *key)
{
fc->s_fs_info = key;
- return vfs_get_super(fc, false, test_keyed_super, fill_super);
+ return vfs_get_super(fc, test_keyed_super, fill_super);
}
EXPORT_SYMBOL(get_tree_keyed);
#ifdef CONFIG_BLOCK
-static void fs_mark_dead(struct block_device *bdev)
+/*
+ * Lock a super block that the caller holds a reference to.
+ *
+ * The caller needs to ensure that the super_block isn't being freed while
+ * calling this function, e.g. by holding a lock across both this call and
+ * the code that clears the bdev's pointer to the superblock before the
+ * superblock is freed.
+ */
+static bool super_lock_shared_active(struct super_block *sb)
{
- struct super_block *sb;
+ bool born = super_lock_shared(sb);
+
+ if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
+ super_unlock_shared(sb);
+ return false;
+ }
+ return true;
+}
+
+static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
+{
+ struct super_block *sb = bdev->bd_holder;
+
+ /* bd_holder_lock ensures that the sb isn't freed */
+ lockdep_assert_held(&bdev->bd_holder_lock);
- sb = get_super(bdev);
- if (!sb)
+ if (!super_lock_shared_active(sb))
return;
+ if (!surprise)
+ sync_filesystem(sb);
+ shrink_dcache_sb(sb);
+ invalidate_inodes(sb);
if (sb->s_op->shutdown)
sb->s_op->shutdown(sb);
- drop_super(sb);
+
+ super_unlock_shared(sb);
+}
+
+static void fs_bdev_sync(struct block_device *bdev)
+{
+ struct super_block *sb = bdev->bd_holder;
+
+ lockdep_assert_held(&bdev->bd_holder_lock);
+
+ if (!super_lock_shared_active(sb))
+ return;
+ sync_filesystem(sb);
+ super_unlock_shared(sb);
}
-static const struct blk_holder_ops fs_holder_ops = {
- .mark_dead = fs_mark_dead,
+const struct blk_holder_ops fs_holder_ops = {
+ .mark_dead = fs_bdev_mark_dead,
+ .sync = fs_bdev_sync,
};
+EXPORT_SYMBOL_GPL(fs_holder_ops);
static int set_bdev_super(struct super_block *s, void *data)
{
- s->s_bdev = data;
- s->s_dev = s->s_bdev->bd_dev;
- s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
-
- if (bdev_stable_writes(s->s_bdev))
- s->s_iflags |= SB_I_STABLE_WRITES;
+ s->s_dev = *(dev_t *)data;
return 0;
}
@@ -1244,8 +1431,63 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
{
- return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key;
+ return !(s->s_iflags & SB_I_RETIRED) &&
+ s->s_dev == *(dev_t *)fc->sget_key;
+}
+
+int setup_bdev_super(struct super_block *sb, int sb_flags,
+ struct fs_context *fc)
+{
+ blk_mode_t mode = sb_open_mode(sb_flags);
+ struct block_device *bdev;
+
+ bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+ if (IS_ERR(bdev)) {
+ if (fc)
+ errorf(fc, "%s: Can't open blockdev", fc->source);
+ return PTR_ERR(bdev);
+ }
+
+ /*
+ * This really should be in blkdev_get_by_dev, but right now it can't be,
+ * due to legacy issues that require us to allow opening a block device
+ * node for writing from userspace even for a read-only block device.
+ */
+ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
+ blkdev_put(bdev, sb);
+ return -EACCES;
+ }
+
+ /*
+ * Until SB_BORN flag is set, there can be no active superblock
+ * references and thus no filesystem freezing. get_active_super() will
+ * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed.
+ *
+ * It is enough to check that the bdev was not frozen before we set s_bdev.
+ */
+ mutex_lock(&bdev->bd_fsfreeze_mutex);
+ if (bdev->bd_fsfreeze_count > 0) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ if (fc)
+ warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
+ blkdev_put(bdev, sb);
+ return -EBUSY;
+ }
+ spin_lock(&sb_lock);
+ sb->s_bdev = bdev;
+ sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
+ if (bdev_stable_writes(bdev))
+ sb->s_iflags |= SB_I_STABLE_WRITES;
+ spin_unlock(&sb_lock);
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+
+ snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name,
+ sb->s_id);
+ sb_set_blocksize(sb, block_size(bdev));
+ return 0;
}
+EXPORT_SYMBOL_GPL(setup_bdev_super);
/**
* get_tree_bdev - Get a superblock based on a single block device
@@ -1256,73 +1498,49 @@ int get_tree_bdev(struct fs_context *fc,
int (*fill_super)(struct super_block *,
struct fs_context *))
{
- struct block_device *bdev;
struct super_block *s;
int error = 0;
+ dev_t dev;
if (!fc->source)
return invalf(fc, "No source specified");
- bdev = blkdev_get_by_path(fc->source, sb_open_mode(fc->sb_flags),
- fc->fs_type, &fs_holder_ops);
- if (IS_ERR(bdev)) {
- errorf(fc, "%s: Can't open blockdev", fc->source);
- return PTR_ERR(bdev);
- }
-
- /* Once the superblock is inserted into the list by sget_fc(), s_umount
- * will protect the lockfs code from trying to start a snapshot while
- * we are mounting
- */
- mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (bdev->bd_fsfreeze_count > 0) {
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
- blkdev_put(bdev, fc->fs_type);
- return -EBUSY;
+ error = lookup_bdev(fc->source, &dev);
+ if (error) {
+ errorf(fc, "%s: Can't lookup blockdev", fc->source);
+ return error;
}
fc->sb_flags |= SB_NOSEC;
- fc->sget_key = bdev;
+ fc->sget_key = &dev;
s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- if (IS_ERR(s)) {
- blkdev_put(bdev, fc->fs_type);
+ if (IS_ERR(s))
return PTR_ERR(s);
- }
if (s->s_root) {
/* Don't summarily change the RO/RW state. */
if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
- warnf(fc, "%pg: Can't mount, would change RO state", bdev);
+ warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev);
deactivate_locked_super(s);
- blkdev_put(bdev, fc->fs_type);
return -EBUSY;
}
-
+ } else {
/*
- * s_umount nests inside open_mutex during
- * __invalidate_device(). blkdev_put() acquires
- * open_mutex and can't be called under s_umount. Drop
- * s_umount temporarily. This is safe as we're
- * holding an active reference.
+ * We drop s_umount here because we need to open the bdev and
+ * bdev->open_mutex ranks above s_umount (blkdev_put() ->
+ * bdev_mark_dead()). It is safe because we hold an active sb
+ * reference and SB_BORN is not set yet.
*/
- up_write(&s->s_umount);
- blkdev_put(bdev, fc->fs_type);
- down_write(&s->s_umount);
- } else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
- fc->fs_type->name, s->s_id);
- sb_set_blocksize(s, block_size(bdev));
- error = fill_super(s, fc);
+ super_unlock_excl(s);
+ error = setup_bdev_super(s, fc->sb_flags, fc);
+ __super_lock_excl(s);
+ if (!error)
+ error = fill_super(s, fc);
if (error) {
deactivate_locked_super(s);
return error;
}
-
s->s_flags |= SB_ACTIVE;
- bdev->bd_super = s;
}
BUG_ON(fc->root);
@@ -1333,79 +1551,52 @@ EXPORT_SYMBOL(get_tree_bdev);
static int test_bdev_super(struct super_block *s, void *data)
{
- return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data;
+ return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
}
struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int))
{
- struct block_device *bdev;
struct super_block *s;
- int error = 0;
+ int error;
+ dev_t dev;
- bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type,
- &fs_holder_ops);
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
+ error = lookup_bdev(dev_name, &dev);
+ if (error)
+ return ERR_PTR(error);
- /*
- * once the super is inserted into the list by sget, s_umount
- * will protect the lockfs code from trying to start a snapshot
- * while we are mounting
- */
- mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (bdev->bd_fsfreeze_count > 0) {
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- error = -EBUSY;
- goto error_bdev;
- }
- s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
- bdev);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ flags |= SB_NOSEC;
+ s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev);
if (IS_ERR(s))
- goto error_s;
+ return ERR_CAST(s);
if (s->s_root) {
if ((flags ^ s->s_flags) & SB_RDONLY) {
deactivate_locked_super(s);
- error = -EBUSY;
- goto error_bdev;
+ return ERR_PTR(-EBUSY);
}
-
+ } else {
/*
- * s_umount nests inside open_mutex during
- * __invalidate_device(). blkdev_put() acquires
- * open_mutex and can't be called under s_umount. Drop
- * s_umount temporarily. This is safe as we're
- * holding an active reference.
+ * We drop s_umount here because we need to open the bdev and
+ * bdev->open_mutex ranks above s_umount (blkdev_put() ->
+ * bdev_mark_dead()). It is safe because we hold an active sb
+ * reference and SB_BORN is not set yet.
*/
- up_write(&s->s_umount);
- blkdev_put(bdev, fs_type);
- down_write(&s->s_umount);
- } else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
- fs_type->name, s->s_id);
- sb_set_blocksize(s, block_size(bdev));
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
+ super_unlock_excl(s);
+ error = setup_bdev_super(s, flags, NULL);
+ __super_lock_excl(s);
+ if (!error)
+ error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
if (error) {
deactivate_locked_super(s);
- goto error;
+ return ERR_PTR(error);
}
s->s_flags |= SB_ACTIVE;
- bdev->bd_super = s;
}
return dget(s->s_root);
-
-error_s:
- error = PTR_ERR(s);
-error_bdev:
- blkdev_put(bdev, fs_type);
-error:
- return ERR_PTR(error);
}
EXPORT_SYMBOL(mount_bdev);
@@ -1413,10 +1604,11 @@ void kill_block_super(struct super_block *sb)
{
struct block_device *bdev = sb->s_bdev;
- bdev->bd_super = NULL;
generic_shutdown_super(sb);
- sync_blockdev(bdev);
- blkdev_put(bdev, sb->s_type);
+ if (bdev) {
+ sync_blockdev(bdev);
+ blkdev_put(bdev, sb);
+ }
}
EXPORT_SYMBOL(kill_block_super);
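
Since setup_bdev_super() now does the device open and kill_block_super() tolerates a NULL s_bdev (covering failures before the bdev was ever opened), a block filesystem's registration reduces to the shape below. A hedged sketch with hypothetical examplefs names; get_tree_bdev() above is the canonical in-tree user:

	static int examplefs_get_tree(struct fs_context *fc)
	{
		/* get_tree_bdev() looks up the dev_t, calls sget_fc() and,
		 * for a new superblock, setup_bdev_super() + fill_super */
		return get_tree_bdev(fc, examplefs_fill_super);
	}

	static struct file_system_type examplefs_fs_type = {
		.name			= "examplefs",
		.init_fs_context	= examplefs_init_fs_context,
		/* safe even if setup_bdev_super() never ran: s_bdev stays
		 * NULL and kill_block_super() skips the blkdev_put() */
		.kill_sb		= kill_block_super,
		.fs_flags		= FS_REQUIRES_DEV,
	};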
@@ -1533,13 +1725,13 @@ int vfs_get_tree(struct fs_context *fc)
WARN_ON(!sb->s_bdi);
/*
- * Write barrier is for super_cache_count(). We place it before setting
- * SB_BORN as the data dependency between the two functions is the
- * superblock structure contents that we just set up, not the SB_BORN
- * flag.
+	 * super_wake() contains a memory barrier which also takes care of
+ * ordering for super_cache_count(). We place it before setting
+ * SB_BORN as the data dependency between the two functions is
+ * the superblock structure contents that we just set up, not
+ * the SB_BORN flag.
*/
- smp_wmb();
- sb->s_flags |= SB_BORN;
+ super_wake(sb, SB_BORN);
error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
if (unlikely(error)) {
@@ -1644,14 +1836,43 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
percpu_up_write(sb->s_writers.rw_sem + level);
}
+static int wait_for_partially_frozen(struct super_block *sb)
+{
+ int ret = 0;
+
+ do {
+ unsigned short old = sb->s_writers.frozen;
+
+ up_write(&sb->s_umount);
+ ret = wait_var_event_killable(&sb->s_writers.frozen,
+ sb->s_writers.frozen != old);
+ down_write(&sb->s_umount);
+ } while (ret == 0 &&
+ sb->s_writers.frozen != SB_UNFROZEN &&
+ sb->s_writers.frozen != SB_FREEZE_COMPLETE);
+
+ return ret;
+}
+
/**
* freeze_super - lock the filesystem and force it into a consistent state
* @sb: the super to lock
+ * @who: context that wants to freeze
*
* Syncs the super to make sure the filesystem is consistent and calls the fs's
- * freeze_fs. Subsequent calls to this without first thawing the fs will return
+ * freeze_fs. Subsequent calls to this without first thawing the fs may return
* -EBUSY.
*
+ * @who should be:
+ * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
+ * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs.
+ *
+ * The @who argument distinguishes between the kernel and userspace trying to
+ * freeze the filesystem. Although there cannot be multiple kernel freezes or
+ * multiple userspace freezes in effect at any given time, the kernel and
+ * userspace can both hold a filesystem frozen. The filesystem remains frozen
+ * until there are no kernel or userspace freezes in effect.
+ *
* During this function, sb->s_writers.frozen goes through these values:
*
* SB_UNFROZEN: File system is normal, all writes progress as usual.
@@ -1677,34 +1898,62 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
*
* sb->s_writers.frozen is protected by sb->s_umount.
*/
-int freeze_super(struct super_block *sb)
+int freeze_super(struct super_block *sb, enum freeze_holder who)
{
int ret;
atomic_inc(&sb->s_active);
- down_write(&sb->s_umount);
+ if (!super_lock_excl(sb))
+ WARN(1, "Dying superblock while freezing!");
+
+retry:
+ if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
+ if (sb->s_writers.freeze_holders & who) {
+ deactivate_locked_super(sb);
+ return -EBUSY;
+ }
+
+ WARN_ON(sb->s_writers.freeze_holders == 0);
+
+ /*
+ * Someone else already holds this type of freeze; share the
+ * freeze and assign the active ref to the freeze.
+ */
+ sb->s_writers.freeze_holders |= who;
+ super_unlock_excl(sb);
+ return 0;
+ }
+
if (sb->s_writers.frozen != SB_UNFROZEN) {
- deactivate_locked_super(sb);
- return -EBUSY;
+ ret = wait_for_partially_frozen(sb);
+ if (ret) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
+
+ goto retry;
}
if (!(sb->s_flags & SB_BORN)) {
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
return 0; /* sic - it's "nothing to do" */
}
if (sb_rdonly(sb)) {
/* Nothing to do really... */
+ sb->s_writers.freeze_holders |= who;
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
- up_write(&sb->s_umount);
+ wake_up_var(&sb->s_writers.frozen);
+ super_unlock_excl(sb);
return 0;
}
sb->s_writers.frozen = SB_FREEZE_WRITE;
/* Release s_umount to preserve sb_start_write -> s_umount ordering */
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
sb_wait_write(sb, SB_FREEZE_WRITE);
- down_write(&sb->s_umount);
+ if (!super_lock_excl(sb))
+ WARN(1, "Dying superblock while freezing!");
/* Now we go and block page faults... */
sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
@@ -1715,6 +1964,7 @@ int freeze_super(struct super_block *sb)
if (ret) {
sb->s_writers.frozen = SB_UNFROZEN;
sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
+ wake_up_var(&sb->s_writers.frozen);
deactivate_locked_super(sb);
return ret;
}
@@ -1730,6 +1980,7 @@ int freeze_super(struct super_block *sb)
"VFS:Filesystem freeze failed\n");
sb->s_writers.frozen = SB_UNFROZEN;
sb_freeze_unlock(sb, SB_FREEZE_FS);
+ wake_up_var(&sb->s_writers.frozen);
deactivate_locked_super(sb);
return ret;
}
@@ -1738,24 +1989,50 @@ int freeze_super(struct super_block *sb)
* For debugging purposes so that fs can warn if it sees write activity
* when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
*/
+ sb->s_writers.freeze_holders |= who;
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
+ wake_up_var(&sb->s_writers.frozen);
lockdep_sb_freeze_release(sb);
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
return 0;
}
EXPORT_SYMBOL(freeze_super);
-static int thaw_super_locked(struct super_block *sb)
+/*
+ * Undoes the effect of a freeze_super() call. If the filesystem is
+ * frozen both by userspace and the kernel, a thaw call from either source
+ * removes that state without releasing the other state or unlocking the
+ * filesystem.
+ */
+static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
{
int error;
- if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
- up_write(&sb->s_umount);
+ if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
+ if (!(sb->s_writers.freeze_holders & who)) {
+ super_unlock_excl(sb);
+ return -EINVAL;
+ }
+
+ /*
+ * Freeze is shared with someone else. Release our hold and
+ * drop the active ref that freeze_super assigned to the
+ * freezer.
+ */
+ if (sb->s_writers.freeze_holders & ~who) {
+ sb->s_writers.freeze_holders &= ~who;
+ deactivate_locked_super(sb);
+ return 0;
+ }
+ } else {
+ super_unlock_excl(sb);
return -EINVAL;
}
if (sb_rdonly(sb)) {
+ sb->s_writers.freeze_holders &= ~who;
sb->s_writers.frozen = SB_UNFROZEN;
+ wake_up_var(&sb->s_writers.frozen);
goto out;
}
@@ -1764,15 +2041,16 @@ static int thaw_super_locked(struct super_block *sb)
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
if (error) {
- printk(KERN_ERR
- "VFS:Filesystem thaw failed\n");
+ printk(KERN_ERR "VFS:Filesystem thaw failed\n");
lockdep_sb_freeze_release(sb);
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
return error;
}
}
+ sb->s_writers.freeze_holders &= ~who;
sb->s_writers.frozen = SB_UNFROZEN;
+ wake_up_var(&sb->s_writers.frozen);
sb_freeze_unlock(sb, SB_FREEZE_FS);
out:
deactivate_locked_super(sb);
@@ -1782,13 +2060,20 @@ out:
/**
* thaw_super -- unlock filesystem
* @sb: the super to thaw
+ * @who: context that wants to thaw
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_super()
+ * if there are no remaining freezes on the filesystem.
*
- * Unlocks the filesystem and marks it writeable again after freeze_super().
+ * @who should be:
+ * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs;
+ * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs.
*/
-int thaw_super(struct super_block *sb)
+int thaw_super(struct super_block *sb, enum freeze_holder who)
{
- down_write(&sb->s_umount);
- return thaw_super_locked(sb);
+ if (!super_lock_excl(sb))
+ WARN(1, "Dying superblock while thawing!");
+ return thaw_super_locked(sb, who);
}
EXPORT_SYMBOL(thaw_super);
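
The holder bits make kernel- and userspace-initiated freezes composable: each side records its own FREEZE_HOLDER_* bit, and the filesystem only thaws once the last holder drops. A hedged sketch of a kernel-side user (examplefs_quiesce is hypothetical, error handling trimmed):

	static int examplefs_quiesce(struct super_block *sb)
	{
		int ret;

		/* take a kernel freeze; this coexists with a userspace
		 * FIFREEZE, which holds FREEZE_HOLDER_USERSPACE */
		ret = freeze_super(sb, FREEZE_HOLDER_KERNEL);
		if (ret)
			return ret;

		/* ... work that needs a quiescent filesystem ... */

		/* drops only the kernel hold; the fs stays frozen if
		 * userspace still holds its freeze */
		return thaw_super(sb, FREEZE_HOLDER_KERNEL);
	}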
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 0140010aa0c3..2f5ead88d00b 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -224,7 +224,7 @@ got_it:
memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
err = sysv_handle_dirsync(dir);
out_page:
@@ -249,7 +249,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
}
de->inode = 0;
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return sysv_handle_dirsync(inode);
}
@@ -346,7 +346,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page,
}
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
return sysv_handle_dirsync(inode);
}
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index e732879036ab..6719da5889d9 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -165,7 +165,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
dirty_sb(sb);
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = fs16_to_cpu(sbi, ino);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_blocks = 0;
memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
SYSV_I(inode)->i_dir_start_lookup = 0;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9e8d4a6fb2f3..0aa3827d8178 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -202,8 +202,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime);
inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime);
- inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_ctime);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0);
inode->i_atime.tv_nsec = 0;
inode->i_mtime.tv_nsec = 0;
inode->i_blocks = 0;
@@ -256,7 +255,7 @@ static int __sysv_write_inode(struct inode *inode, int wait)
raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size);
raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec);
raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec);
- raw_inode->i_ctime = cpu_to_fs32(sbi, inode->i_ctime.tv_sec);
+ raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime(inode).tv_sec);
si = SYSV_I(inode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 58d7f43a1371..edb94e55de8e 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -183,7 +183,7 @@ static inline int splice_branch(struct inode *inode,
*where->p = where->key;
write_unlock(&pointers_lock);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
/* had we spliced it onto indirect block? */
if (where->bh)
@@ -423,7 +423,7 @@ do_indirects:
}
n++;
}
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (IS_SYNC(inode))
sysv_sync_inode (inode);
else
@@ -449,7 +449,8 @@ int sysv_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct super_block *s = path->dentry->d_sb;
- generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
+ stat);
stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
stat->blksize = s->s_blocksize;
return 0;
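generic_fillattr() grows a request_mask parameter in this series; a minimal ->getattr sketch under that assumption (example_getattr() is hypothetical):

    #include <linux/fs.h>
    #include <linux/stat.h>

    static int example_getattr(struct mnt_idmap *idmap,
    			   const struct path *path, struct kstat *stat,
    			   u32 request_mask, unsigned int flags)
    {
    	/*
    	 * Forwarding request_mask lets generic_fillattr() skip work for
    	 * fields the caller did not ask for (notably c/mtime once
    	 * multigrain timestamps land).
    	 */
    	generic_fillattr(idmap, request_mask, d_inode(path->dentry), stat);
    	return 0;
    }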
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index fcf163fea3ad..d6b73798071b 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -103,7 +103,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -161,7 +161,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry)
err = sysv_delete_entry(de, page);
if (!err) {
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
}
unmap_and_put_page(page, de);
@@ -230,7 +230,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
unmap_and_put_page(new_page, new_de);
if (err)
goto out_dir;
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 57ac8aa4a724..2feb6c58648c 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -132,7 +132,7 @@ static struct inode *tracefs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
return inode;
}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 9c9d3f0e36a4..eef9e527d9ff 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -243,8 +243,8 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
(unsigned int)inode->i_mtime.tv_sec,
(unsigned int)inode->i_mtime.tv_nsec);
pr_err("\tctime %u.%u\n",
- (unsigned int)inode->i_ctime.tv_sec,
- (unsigned int)inode->i_ctime.tv_nsec);
+ (unsigned int) inode_get_ctime(inode).tv_sec,
+ (unsigned int) inode_get_ctime(inode).tv_nsec);
pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum);
pr_err("\txattr_size %u\n", ui->xattr_size);
pr_err("\txattr_cnt %u\n", ui->xattr_cnt);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ef0499edc248..2f48c58d47cd 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -96,8 +96,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
inode->i_flags |= S_NOCMTIME;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_mtime = inode->i_atime = inode->i_ctime =
- current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_mapping->nrpages = 0;
if (!is_xattr) {
@@ -325,7 +324,7 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -765,10 +764,10 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
inc_nlink(inode);
ihold(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -838,11 +837,11 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
}
lock_2_inodes(dir, inode);
- inode->i_ctime = current_time(dir);
+ inode_set_ctime_current(inode);
drop_nlink(inode);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
@@ -940,12 +939,12 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
}
lock_2_inodes(dir, inode);
- inode->i_ctime = current_time(dir);
+ inode_set_ctime_current(inode);
clear_nlink(inode);
drop_nlink(dir);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
@@ -1019,7 +1018,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inc_nlink(dir);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err) {
ubifs_err(c, "cannot create directory, error %d", err);
@@ -1110,7 +1109,7 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -1210,7 +1209,7 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -1298,7 +1297,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
struct ubifs_budget_req wht_req;
- struct timespec64 time;
unsigned int saved_nlink;
struct fscrypt_name old_nm, new_nm;
@@ -1414,8 +1412,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the @i_ctime for inodes on a
* rename.
*/
- time = current_time(old_dir);
- old_inode->i_ctime = time;
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
/* We must adjust parent link count when renaming directories */
if (is_dir) {
@@ -1444,13 +1441,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dir->i_size -= old_sz;
ubifs_inode(old_dir)->ui_size = old_dir->i_size;
- old_dir->i_mtime = old_dir->i_ctime = time;
- new_dir->i_mtime = new_dir->i_ctime = time;
/*
* And finally, if we unlinked a direntry which happened to have the
* same name as the moved direntry, we have to decrement @i_nlink of
- * the unlinked inode and change its ctime.
+ * the unlinked inode.
*/
if (unlink) {
/*
@@ -1462,7 +1457,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
clear_nlink(new_inode);
else
drop_nlink(new_inode);
- new_inode->i_ctime = time;
} else {
new_dir->i_size += new_sz;
ubifs_inode(new_dir)->ui_size = new_dir->i_size;
@@ -1557,7 +1551,6 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
int sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
struct inode *fst_inode = d_inode(old_dentry);
struct inode *snd_inode = d_inode(new_dentry);
- struct timespec64 time;
int err;
struct fscrypt_name fst_nm, snd_nm;
@@ -1588,11 +1581,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
lock_4_inodes(old_dir, new_dir, NULL, NULL);
- time = current_time(old_dir);
- fst_inode->i_ctime = time;
- snd_inode->i_ctime = time;
- old_dir->i_mtime = old_dir->i_ctime = time;
- new_dir->i_mtime = new_dir->i_ctime = time;
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
if (old_dir != new_dir) {
if (S_ISDIR(fst_inode->i_mode) && !S_ISDIR(snd_inode->i_mode)) {
@@ -1665,7 +1654,7 @@ int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path,
STATX_ATTR_ENCRYPTED |
STATX_ATTR_IMMUTABLE);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->blksize = UBIFS_BLOCK_SIZE;
stat->size = ui->ui_size;
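The rename hunks above collapse several open-coded timestamp stores into one call. Per its kernel-doc, simple_rename_timestamp() sets the ctime of the renamed inode (and of the victim, if one is displaced) plus the m/ctime of both parent directories, all from a single current timestamp. A hypothetical caller sketch:

    static int example_rename(struct mnt_idmap *idmap,
    			  struct inode *old_dir, struct dentry *old_dentry,
    			  struct inode *new_dir, struct dentry *new_dentry,
    			  unsigned int flags)
    {
    	/* ... move the directory entry on disk ... */

    	/* One timestamp for all of the (up to four) inodes involved. */
    	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
    	mark_inode_dirty(old_dir);
    	if (new_dir != old_dir)
    		mark_inode_dirty(new_dir);
    	return 0;
    }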
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 6738fe43040b..e5382f0b2587 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1092,7 +1092,7 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr)
if (attr->ia_valid & ATTR_MTIME)
inode->i_mtime = attr->ia_mtime;
if (attr->ia_valid & ATTR_CTIME)
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (attr->ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
@@ -1192,7 +1192,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
ui->ui_size = inode->i_size;
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
/* Other attributes may be changed at the same time as well */
do_attr_changes(inode, attr);
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
@@ -1239,7 +1239,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
if (attr->ia_valid & ATTR_SIZE) {
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
/* 'truncate_setsize()' changed @i_size, update @ui_size */
ui->ui_size = inode->i_size;
}
@@ -1364,8 +1364,10 @@ out:
static inline int mctime_update_needed(const struct inode *inode,
const struct timespec64 *now)
{
+ struct timespec64 ctime = inode_get_ctime(inode);
+
if (!timespec64_equal(&inode->i_mtime, now) ||
- !timespec64_equal(&inode->i_ctime, now))
+ !timespec64_equal(&ctime, now))
return 1;
return 0;
}
@@ -1376,8 +1378,7 @@ static inline int mctime_update_needed(const struct inode *inode,
*
* This function updates time of the inode.
*/
-int ubifs_update_time(struct inode *inode, struct timespec64 *time,
- int flags)
+int ubifs_update_time(struct inode *inode, int flags)
{
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1385,21 +1386,17 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time,
.dirtied_ino_d = ALIGN(ui->data_len, 8) };
int err, release;
- if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
- return generic_update_time(inode, time, flags);
+ if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) {
+ generic_update_time(inode, flags);
+ return 0;
+ }
err = ubifs_budget_space(c, &req);
if (err)
return err;
mutex_lock(&ui->ui_mutex);
- if (flags & S_ATIME)
- inode->i_atime = *time;
- if (flags & S_CTIME)
- inode->i_ctime = *time;
- if (flags & S_MTIME)
- inode->i_mtime = *time;
-
+ inode_update_timestamps(inode, flags);
release = ui->dirty;
__mark_inode_dirty(inode, I_DIRTY_SYNC);
mutex_unlock(&ui->ui_mutex);
@@ -1432,7 +1429,7 @@ static int update_mctime(struct inode *inode)
return err;
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
@@ -1570,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct ubifs_inode *ui = ubifs_inode(inode);
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
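The ->update_time signature change above belongs to the same series: callers no longer pass in a timestamp, the inode layer picks one. A minimal sketch of the new contract, assuming the inode_update_timestamps() helper added by this series; example_update_time() is hypothetical:

    static int example_update_time(struct inode *inode, int flags)
    {
    	/*
    	 * Updates a/m/ctime as directed by S_ATIME/S_MTIME/S_CTIME in
    	 * @flags and returns the subset that actually changed.
    	 */
    	int dirtied = inode_update_timestamps(inode, flags);

    	if (dirtied)
    		__mark_inode_dirty(inode, I_DIRTY_SYNC);
    	return 0;
    }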
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 67c5108abd89..d79cabe193c3 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -118,7 +118,7 @@ static int setflags(struct inode *inode, int flags)
ui->flags &= ~ioctl2ubifs(UBIFS_SETTABLE_IOCTL_FLAGS);
ui->flags |= ioctl2ubifs(flags);
ubifs_set_inode_flags(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index dc52ac0f4a34..ffc9beee7be6 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -454,8 +454,8 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec);
ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec);
- ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ino->ctime_sec = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ ino->ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec);
ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
ino->uid = cpu_to_le32(i_uid_read(inode));
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 32cb14759796..b08fb28d16b5 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -146,8 +146,8 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec);
inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec);
- inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec);
- inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec);
+ inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec),
+ le32_to_cpu(ino->ctime_nsec));
inode->i_mode = le32_to_cpu(ino->mode);
inode->i_size = le64_to_cpu(ino->size);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4c36044140e7..ebb3ad6b5e7e 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -2027,7 +2027,7 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc);
int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
-int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags);
+int ubifs_update_time(struct inode *inode, int flags);
/* dir.c */
struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 349228dd1191..406c82eab513 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -134,7 +134,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
ui->data_len = size;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = current_time(host);
+ inode_set_ctime_current(host);
host_ui->xattr_cnt += 1;
host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -215,7 +215,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
ui->data_len = size;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = current_time(host);
+ inode_set_ctime_current(host);
host_ui->xattr_size -= CALC_XATTR_BYTES(old_size);
host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -474,7 +474,7 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
return err;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = current_time(host);
+ inode_set_ctime_current(host);
host_ui->xattr_cnt -= 1;
host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 5f7ac8c84798..6b558cbbeb6b 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -100,7 +100,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
else
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
iinfo->i_crtime = inode->i_mtime;
if (unlikely(insert_inode_locked(inode) < 0)) {
make_bad_inode(inode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 28cdfc57d946..d089795074e8 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -910,7 +910,7 @@ static int inode_getblk(struct inode *inode, struct udf_map_rq *map)
map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED;
iinfo->i_next_alloc_block = map->lblk + 1;
iinfo->i_next_alloc_goal = newblocknum + 1;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (IS_SYNC(inode))
udf_sync_inode(inode);
@@ -1298,7 +1298,7 @@ set_size:
goto out_unlock;
}
update_time:
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (IS_SYNC(inode))
udf_sync_inode(inode);
else
@@ -1329,6 +1329,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
int bs = inode->i_sb->s_blocksize;
int ret = -EIO;
uint32_t uid, gid;
+ struct timespec64 ctime;
reread:
if (iloc->partitionReferenceNum >= sbi->s_partitions) {
@@ -1507,7 +1508,8 @@ reread:
udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime);
udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime);
- udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime);
+ udf_disk_stamp_to_time(&ctime, fe->attrTime);
+ inode_set_ctime_to_ts(inode, ctime);
iinfo->i_unique = le64_to_cpu(fe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
@@ -1522,7 +1524,8 @@ reread:
udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime);
udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime);
udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime);
- udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime);
+ udf_disk_stamp_to_time(&ctime, efe->attrTime);
+ inode_set_ctime_to_ts(inode, ctime);
iinfo->i_unique = le64_to_cpu(efe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
@@ -1799,7 +1802,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
- udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
+ udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode));
memset(&(fe->impIdent), 0, sizeof(struct regid));
strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1830,12 +1833,12 @@ static int udf_update_inode(struct inode *inode, int do_sync)
udf_adjust_time(iinfo, inode->i_atime);
udf_adjust_time(iinfo, inode->i_mtime);
- udf_adjust_time(iinfo, inode->i_ctime);
+ udf_adjust_time(iinfo, inode_get_ctime(inode));
udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime);
udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime);
udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
- udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
+ udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode));
memset(&(efe->impIdent), 0, sizeof(efe->impIdent));
strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
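For filesystems with their own on-disk timestamp encoding, the accessor pair replaces direct i_ctime field access in both directions. A sketch assuming only inode_set_ctime()/inode_get_ctime(); the little-endian fields are purely illustrative:

    #include <linux/fs.h>

    static void example_ctime_from_disk(struct inode *inode,
    				    __le64 sec, __le32 nsec)
    {
    	inode_set_ctime(inode, le64_to_cpu(sec), le32_to_cpu(nsec));
    }

    static void example_ctime_to_disk(const struct inode *inode,
    				  __le64 *sec, __le32 *nsec)
    {
    	struct timespec64 ts = inode_get_ctime(inode);

    	*sec = cpu_to_le64(ts.tv_sec);
    	*nsec = cpu_to_le32(ts.tv_nsec);
    }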
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index a95579b043ab..ae55ab8859b6 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -365,7 +365,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
*(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse =
cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
udf_fiiter_write_fi(&iter, NULL);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
udf_fiiter_release(&iter);
udf_add_fid_counter(dir->i_sb, false, 1);
@@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
udf_fiiter_release(&iter);
udf_add_fid_counter(dir->i_sb, true, 1);
inc_nlink(dir);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
d_instantiate_new(dentry, inode);
@@ -523,8 +523,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_size = 0;
inode_dec_link_count(dir);
udf_add_fid_counter(dir->i_sb, true, -1);
- inode->i_ctime = dir->i_ctime = dir->i_mtime =
- current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
mark_inode_dirty(dir);
ret = 0;
end_rmdir:
@@ -555,11 +555,11 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
set_nlink(inode, 1);
}
udf_fiiter_delete_entry(&iter);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
inode_dec_link_count(inode);
udf_add_fid_counter(dir->i_sb, false, -1);
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
ret = 0;
end_unlink:
udf_fiiter_release(&iter);
@@ -746,9 +746,9 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
inc_nlink(inode);
udf_add_fid_counter(dir->i_sb, false, 1);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
ihold(inode);
d_instantiate(dentry, inode);
@@ -833,7 +833,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
mark_inode_dirty(old_inode);
/*
@@ -861,13 +861,13 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
if (new_inode) {
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
inode_dec_link_count(new_inode);
udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode),
-1);
}
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
- new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir);
+ old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ new_dir->i_mtime = inode_set_ctime_current(new_dir);
mark_inode_dirty(old_dir);
mark_inode_dirty(new_dir);
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 779b5c2c75f6..f7eaf7b14594 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -149,7 +149,7 @@ static int udf_symlink_getattr(struct mnt_idmap *idmap,
struct inode *inode = d_backing_inode(dentry);
struct page *page;
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
page = read_mapping_page(inode->i_mapping, 0, NULL);
if (IS_ERR(page))
return PTR_ERR(page);
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 379d75796a5c..fd57f03b6c93 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -107,7 +107,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
ufs_commit_chunk(page, pos, len);
ufs_put_page(page);
if (update_times)
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
ufs_handle_dirsync(dir);
}
@@ -397,7 +397,7 @@ got_it:
ufs_set_de_type(sb, de, inode->i_mode);
ufs_commit_chunk(page, pos, rec_len);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
err = ufs_handle_dirsync(dir);
@@ -539,7 +539,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
pde->d_reclen = cpu_to_fs16(sb, to - from);
dir->d_ino = 0;
ufs_commit_chunk(page, pos, to - from);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
err = ufs_handle_dirsync(inode);
out:
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 06bd84d555bd..a1e7bd9d1f98 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -292,7 +292,7 @@ cg_found:
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_blocks = 0;
inode->i_generation = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
ufsi->i_flags = UFS_I(dir)->i_flags;
ufsi->i_lastfrag = 0;
ufsi->i_shadow = 0;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index a4246c83a8cd..21a4779a2de5 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -296,7 +296,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
if (new)
*new = 1;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (IS_SYNC(inode))
ufs_sync_inode (inode);
mark_inode_dirty(inode);
@@ -378,7 +378,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block,
mark_buffer_dirty(bh);
if (IS_SYNC(inode))
sync_dirty_buffer(bh);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
out:
brelse (bh);
@@ -580,11 +580,12 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size);
inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
- inode->i_ctime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec);
+ inode_set_ctime(inode,
+ (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec),
+ 0);
inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec);
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen);
ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
@@ -626,10 +627,10 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size);
inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime);
- inode->i_ctime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_ctime);
+ inode_set_ctime(inode, fs64_to_cpu(sb, ufs2_inode->ui_ctime),
+ fs32_to_cpu(sb, ufs2_inode->ui_ctimensec));
inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime);
inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec);
- inode->i_ctime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_ctimensec);
inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec);
inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen);
@@ -726,7 +727,8 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec);
ufs_inode->ui_atime.tv_usec = 0;
- ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, inode->i_ctime.tv_sec);
+ ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb,
+ inode_get_ctime(inode).tv_sec);
ufs_inode->ui_ctime.tv_usec = 0;
ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec);
ufs_inode->ui_mtime.tv_usec = 0;
@@ -770,8 +772,9 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec);
ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec);
- ufs_inode->ui_ctime = cpu_to_fs64(sb, inode->i_ctime.tv_sec);
- ufs_inode->ui_ctimensec = cpu_to_fs32(sb, inode->i_ctime.tv_nsec);
+ ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime(inode).tv_sec);
+ ufs_inode->ui_ctimensec = cpu_to_fs32(sb,
+ inode_get_ctime(inode).tv_nsec);
ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec);
ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec);
@@ -1205,7 +1208,7 @@ static int ufs_truncate(struct inode *inode, loff_t size)
truncate_setsize(inode, size);
ufs_truncate_blocks(inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
out:
UFSD("EXIT: err %d\n", err);
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 36154b5aca6d..9cad29463791 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -153,7 +153,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
struct inode *inode = d_inode(old_dentry);
int error;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -220,7 +220,7 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
err = 0;
out:
@@ -282,7 +282,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (!new_de)
goto out_dir;
ufs_set_link(new_dir, new_de, new_page, old_inode, 1);
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
@@ -298,7 +298,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
ufs_delete_entry(old_dir, old_de, old_page);
mark_inode_dirty(old_inode);
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index dd0ae1188e87..83f20dd15522 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -128,8 +128,8 @@ int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
inode->i_atime = ns_to_timespec64(
info->access_time.ns_relative_to_unix_epoch);
- inode->i_ctime = ns_to_timespec64(
- info->change_time.ns_relative_to_unix_epoch);
+ inode_set_ctime_to_ts(inode,
+ ns_to_timespec64(info->change_time.ns_relative_to_unix_epoch));
inode->i_mtime = ns_to_timespec64(
info->modification_time.ns_relative_to_unix_epoch);
return 0;
@@ -252,7 +252,7 @@ int vboxsf_getattr(struct mnt_idmap *idmap, const struct path *path,
if (err)
return err;
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), kstat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), kstat);
return 0;
}
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 49bf3a1eb2a0..d071a6e32581 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -118,16 +118,16 @@ void fsverity_free_info(struct fsverity_info *vi);
int fsverity_get_descriptor(struct inode *inode,
struct fsverity_descriptor **desc_ret);
-int __init fsverity_init_info_cache(void);
-void __init fsverity_exit_info_cache(void);
+void __init fsverity_init_info_cache(void);
/* signature.c */
#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
+extern int fsverity_require_signatures;
int fsverity_verify_signature(const struct fsverity_info *vi,
const u8 *signature, size_t sig_size);
-int __init fsverity_init_signature(void);
+void __init fsverity_init_signature(void);
#else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
static inline int
fsverity_verify_signature(const struct fsverity_info *vi,
@@ -136,15 +136,13 @@ fsverity_verify_signature(const struct fsverity_info *vi,
return 0;
}
-static inline int fsverity_init_signature(void)
+static inline void fsverity_init_signature(void)
{
- return 0;
}
#endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
/* verify.c */
-int __init fsverity_init_workqueue(void);
-void __init fsverity_exit_workqueue(void);
+void __init fsverity_init_workqueue(void);
#endif /* _FSVERITY_PRIVATE_H */
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index c598d2035476..6b08b1d9a7d7 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -226,6 +226,14 @@ void __init fsverity_check_hash_algs(void)
if (!alg->name)
continue;
+ /*
+ * 0 must never be allocated as an FS_VERITY_HASH_ALG_* value,
+ * as it is reserved for users that use 0 to mean unspecified or
+ * a default value. fs/verity/ itself doesn't care and doesn't
+ * have a default algorithm, but some users make use of this.
+ */
+ BUG_ON(i == 0);
+
BUG_ON(alg->digest_size > FS_VERITY_MAX_DIGEST_SIZE);
/*
diff --git a/fs/verity/init.c b/fs/verity/init.c
index 023905151035..a29f062f6047 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -9,6 +9,37 @@
#include <linux/ratelimit.h>
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *fsverity_sysctl_header;
+
+static struct ctl_table fsverity_sysctl_table[] = {
+#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
+ {
+ .procname = "require_signatures",
+ .data = &fsverity_require_signatures,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+ { }
+};
+
+static void __init fsverity_init_sysctl(void)
+{
+ fsverity_sysctl_header = register_sysctl("fs/verity",
+ fsverity_sysctl_table);
+ if (!fsverity_sysctl_header)
+ panic("fsverity sysctl registration failed");
+}
+#else /* CONFIG_SYSCTL */
+static inline void fsverity_init_sysctl(void)
+{
+}
+#endif /* !CONFIG_SYSCTL */
+
void fsverity_msg(const struct inode *inode, const char *level,
const char *fmt, ...)
{
@@ -33,28 +64,11 @@ void fsverity_msg(const struct inode *inode, const char *level,
static int __init fsverity_init(void)
{
- int err;
-
fsverity_check_hash_algs();
-
- err = fsverity_init_info_cache();
- if (err)
- return err;
-
- err = fsverity_init_workqueue();
- if (err)
- goto err_exit_info_cache;
-
- err = fsverity_init_signature();
- if (err)
- goto err_exit_workqueue;
-
+ fsverity_init_info_cache();
+ fsverity_init_workqueue();
+ fsverity_init_sysctl();
+ fsverity_init_signature();
return 0;
-
-err_exit_workqueue:
- fsverity_exit_workqueue();
-err_exit_info_cache:
- fsverity_exit_info_cache();
- return err;
}
late_initcall(fsverity_init)
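The init rework above trades error unwinding for panics, on the reasoning that these tiny boot-time allocations essentially cannot fail, and that a kernel which cannot set up fs-verity is not worth limping along. A sketch of the resulting pattern (all example_* names are hypothetical):

    #include <linux/slab.h>
    #include <linux/workqueue.h>

    struct example_obj { u32 x; };
    static struct kmem_cache *example_cache;
    static struct workqueue_struct *example_wq;

    static int __init example_init(void)
    {
    	/* SLAB_PANIC: the cache either exists afterwards or we never return. */
    	example_cache = KMEM_CACHE(example_obj, SLAB_PANIC);
    	example_wq = alloc_workqueue("example", WQ_HIGHPRI, 0);
    	if (!example_wq)
    		panic("failed to allocate example workqueue");
    	return 0;
    }
    late_initcall(example_init);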
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 1db5106a9c38..6c31a871b84b 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -408,18 +408,10 @@ void __fsverity_cleanup_inode(struct inode *inode)
}
EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode);
-int __init fsverity_init_info_cache(void)
+void __init fsverity_init_info_cache(void)
{
- fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info,
- SLAB_RECLAIM_ACCOUNT,
- file_digest);
- if (!fsverity_info_cachep)
- return -ENOMEM;
- return 0;
-}
-
-void __init fsverity_exit_info_cache(void)
-{
- kmem_cache_destroy(fsverity_info_cachep);
- fsverity_info_cachep = NULL;
+ fsverity_info_cachep = KMEM_CACHE_USERCOPY(
+ fsverity_info,
+ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC,
+ file_digest);
}
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
index 72034bc71c9d..90c07573dd77 100644
--- a/fs/verity/signature.c
+++ b/fs/verity/signature.c
@@ -24,7 +24,7 @@
* /proc/sys/fs/verity/require_signatures
* If 1, all verity files must have a valid builtin signature.
*/
-static int fsverity_require_signatures;
+int fsverity_require_signatures;
/*
* Keyring that contains the trusted X.509 certificates.
@@ -62,6 +62,22 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
return 0;
}
+ if (fsverity_keyring->keys.nr_leaves_on_tree == 0) {
+ /*
+ * The ".fs-verity" keyring is empty, due to builtin signatures
+ * being supported by the kernel but not actually being used.
+ * In this case, verify_pkcs7_signature() would always return an
+ * error, usually ENOKEY. It could also be EBADMSG if the
+ * PKCS#7 is malformed, but that isn't very important to
+ * distinguish. So, just skip to ENOKEY to avoid the attack
+ * surface of the PKCS#7 parser, which would otherwise be
+ * reachable by any task able to execute FS_IOC_ENABLE_VERITY.
+ */
+ fsverity_err(inode,
+ "fs-verity keyring is empty, rejecting signed file!");
+ return -ENOKEY;
+ }
+
d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL);
if (!d)
return -ENOMEM;
@@ -93,59 +109,14 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
return 0;
}
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *fsverity_sysctl_header;
-
-static struct ctl_table fsverity_sysctl_table[] = {
- {
- .procname = "require_signatures",
- .data = &fsverity_require_signatures,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- { }
-};
-
-static int __init fsverity_sysctl_init(void)
-{
- fsverity_sysctl_header = register_sysctl("fs/verity", fsverity_sysctl_table);
- if (!fsverity_sysctl_header) {
- pr_err("sysctl registration failed!\n");
- return -ENOMEM;
- }
- return 0;
-}
-#else /* !CONFIG_SYSCTL */
-static inline int __init fsverity_sysctl_init(void)
+void __init fsverity_init_signature(void)
{
- return 0;
-}
-#endif /* !CONFIG_SYSCTL */
-
-int __init fsverity_init_signature(void)
-{
- struct key *ring;
- int err;
-
- ring = keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0),
- current_cred(), KEY_POS_SEARCH |
+ fsverity_keyring =
+ keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0),
+ current_cred(), KEY_POS_SEARCH |
KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE |
KEY_USR_SEARCH | KEY_USR_SETATTR,
- KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
- if (IS_ERR(ring))
- return PTR_ERR(ring);
-
- err = fsverity_sysctl_init();
- if (err)
- goto err_put_ring;
-
- fsverity_keyring = ring;
- return 0;
-
-err_put_ring:
- key_put(ring);
- return err;
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+ if (IS_ERR(fsverity_keyring))
+ panic("failed to allocate \".fs-verity\" keyring");
}
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 433cef51f5f6..904ccd7e8e16 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -346,7 +346,7 @@ void fsverity_enqueue_verify_work(struct work_struct *work)
}
EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work);
-int __init fsverity_init_workqueue(void)
+void __init fsverity_init_workqueue(void)
{
/*
* Use a high-priority workqueue to prioritize verification work, which
@@ -360,12 +360,5 @@ int __init fsverity_init_workqueue(void)
WQ_HIGHPRI,
num_online_cpus());
if (!fsverity_read_workqueue)
- return -ENOMEM;
- return 0;
-}
-
-void __init fsverity_exit_workqueue(void)
-{
- destroy_workqueue(fsverity_read_workqueue);
- fsverity_read_workqueue = NULL;
+ panic("failed to allocate fsverity_read_queue");
}
diff --git a/fs/xattr.c b/fs/xattr.c
index e7bbb7f57557..efd4736bc94b 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -1040,12 +1040,32 @@ const char *xattr_full_name(const struct xattr_handler *handler,
EXPORT_SYMBOL(xattr_full_name);
/**
- * free_simple_xattr - free an xattr object
+ * simple_xattr_space - estimate the memory used by a simple xattr
+ * @name: the full name of the xattr
+ * @size: the size of its value
+ *
+ * This takes no account of how much larger the two slab objects actually are:
+ * that would depend on the slab implementation, whereas what is required
+ * here is a deterministic number that grows with name length, size and quantity.
+ *
+ * Return: The approximate number of bytes of memory used by such an xattr.
+ */
+size_t simple_xattr_space(const char *name, size_t size)
+{
+ /*
+ * Use "40" instead of sizeof(struct simple_xattr), to return the
+ * same result on 32-bit and 64-bit, and even if simple_xattr grows.
+ */
+ return 40 + size + strlen(name);
+}
+
+/**
+ * simple_xattr_free - free an xattr object
* @xattr: the xattr object
*
* Free the xattr object. Can handle @xattr being NULL.
*/
-static inline void free_simple_xattr(struct simple_xattr *xattr)
+void simple_xattr_free(struct simple_xattr *xattr)
{
if (xattr)
kfree(xattr->name);
@@ -1073,7 +1093,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
if (len < sizeof(*new_xattr))
return NULL;
- new_xattr = kvmalloc(len, GFP_KERNEL);
+ new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT);
if (!new_xattr)
return NULL;
@@ -1164,7 +1184,6 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
* @value: the value to store along the xattr
* @size: the size of @value
* @flags: the flags determining how to set the xattr
- * @removed_size: the size of the removed xattr
*
* Set a new xattr object.
* If @value is passed a new xattr object will be allocated. If XATTR_REPLACE
@@ -1181,29 +1200,27 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
* nothing if XATTR_CREATE is specified in @flags or @flags is zero. For
* XATTR_REPLACE we fail as mentioned above.
*
- * Return: On success zero and on error a negative error code is returned.
+ * Return: On success, the removed or replaced xattr is returned, to be freed
+ * by the caller; or NULL if none. On failure a negative error code is returned.
*/
-int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags,
- ssize_t *removed_size)
+struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct simple_xattr *xattr = NULL, *new_xattr = NULL;
+ struct simple_xattr *old_xattr = NULL, *new_xattr = NULL;
struct rb_node *parent = NULL, **rbp;
int err = 0, ret;
- if (removed_size)
- *removed_size = -1;
-
/* value == NULL means remove */
if (value) {
new_xattr = simple_xattr_alloc(value, size);
if (!new_xattr)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- new_xattr->name = kstrdup(name, GFP_KERNEL);
+ new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
if (!new_xattr->name) {
- free_simple_xattr(new_xattr);
- return -ENOMEM;
+ simple_xattr_free(new_xattr);
+ return ERR_PTR(-ENOMEM);
}
}
@@ -1217,12 +1234,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
else if (ret > 0)
rbp = &(*rbp)->rb_right;
else
- xattr = rb_entry(*rbp, struct simple_xattr, rb_node);
- if (xattr)
+ old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node);
+ if (old_xattr)
break;
}
- if (xattr) {
+ if (old_xattr) {
/* Fail if XATTR_CREATE is requested and the xattr exists. */
if (flags & XATTR_CREATE) {
err = -EEXIST;
@@ -1230,12 +1247,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
}
if (new_xattr)
- rb_replace_node(&xattr->rb_node, &new_xattr->rb_node,
- &xattrs->rb_root);
+ rb_replace_node(&old_xattr->rb_node,
+ &new_xattr->rb_node, &xattrs->rb_root);
else
- rb_erase(&xattr->rb_node, &xattrs->rb_root);
- if (!err && removed_size)
- *removed_size = xattr->size;
+ rb_erase(&old_xattr->rb_node, &xattrs->rb_root);
} else {
/* Fail if XATTR_REPLACE is requested but no xattr is found. */
if (flags & XATTR_REPLACE) {
@@ -1260,12 +1275,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
out_unlock:
write_unlock(&xattrs->lock);
- if (err)
- free_simple_xattr(new_xattr);
- else
- free_simple_xattr(xattr);
- return err;
-
+ if (!err)
+ return old_xattr;
+ simple_xattr_free(new_xattr);
+ return ERR_PTR(err);
}
static bool xattr_is_trusted(const char *name)
@@ -1370,14 +1383,17 @@ void simple_xattrs_init(struct simple_xattrs *xattrs)
/**
* simple_xattrs_free - free xattrs
* @xattrs: xattr header whose xattrs to destroy
+ * @freed_space: approximate number of bytes of memory freed from @xattrs
*
* Destroy all xattrs in @xattr. When this is called no one can hold a
* reference to any of the xattrs anymore.
*/
-void simple_xattrs_free(struct simple_xattrs *xattrs)
+void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space)
{
struct rb_node *rbp;
+ if (freed_space)
+ *freed_space = 0;
rbp = rb_first(&xattrs->rb_root);
while (rbp) {
struct simple_xattr *xattr;
@@ -1386,7 +1402,10 @@ void simple_xattrs_free(struct simple_xattrs *xattrs)
rbp_next = rb_next(rbp);
xattr = rb_entry(rbp, struct simple_xattr, rb_node);
rb_erase(&xattr->rb_node, &xattrs->rb_root);
- free_simple_xattr(xattr);
+ if (freed_space)
+ *freed_space += simple_xattr_space(xattr->name,
+ xattr->size);
+ simple_xattr_free(xattr);
rbp = rbp_next;
}
}
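The simple_xattr rework changes the ownership contract: simple_xattr_set() now hands any removed or replaced object back instead of freeing it internally, so a caller (tmpfs, in this series) can account the memory delta with simple_xattr_space() before freeing. A hypothetical caller sketch:

    #include <linux/err.h>
    #include <linux/xattr.h>

    static int example_set(struct simple_xattrs *xattrs, const char *name,
    		       const void *value, size_t size, int flags,
    		       size_t *charged)
    {
    	struct simple_xattr *old;

    	old = simple_xattr_set(xattrs, name, value, size, flags);
    	if (IS_ERR(old))
    		return PTR_ERR(old);

    	if (value)
    		*charged += simple_xattr_space(name, size);
    	if (old) {
    		*charged -= simple_xattr_space(old->name, old->size);
    		simple_xattr_free(old);
    	}
    	return 0;
    }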
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 758aacd8166b..a35781577cad 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -222,7 +222,8 @@ xfs_inode_from_disk(
*/
inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
- inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime);
+ inode_set_ctime_to_ts(inode,
+ xfs_inode_from_disk_ts(from, from->di_ctime));
ip->i_disk_size = be64_to_cpu(from->di_size);
ip->i_nblocks = be64_to_cpu(from->di_nblocks);
@@ -316,7 +317,7 @@ xfs_inode_to_disk(
to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
- to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
+ to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode));
to->di_nlink = cpu_to_be32(inode->i_nlink);
to->di_gen = cpu_to_be32(inode->i_generation);
to->di_mode = cpu_to_be16(inode->i_mode);
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index cb4796b6e693..ad22656376d3 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -62,12 +62,12 @@ xfs_trans_ichgtime(
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- tv = current_time(inode);
+ /* If the mtime changes, then ctime must also change */
+ ASSERT(flags & XFS_ICHGTIME_CHG);
+ tv = inode_set_ctime_current(inode);
if (flags & XFS_ICHGTIME_MOD)
inode->i_mtime = tv;
- if (flags & XFS_ICHGTIME_CHG)
- inode->i_ctime = tv;
if (flags & XFS_ICHGTIME_CREATE)
ip->i_crtime = tv;
}
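The new ASSERT encodes the POSIX invariant that an mtime update always implies a ctime update, which is what makes setting ctime first and unconditionally safe. A hypothetical caller under the tightened contract:

    static void example_touch(struct xfs_trans *tp, struct xfs_inode *ip)
    {
    	/* XFS_ICHGTIME_CHG must now accompany every _MOD or _CREATE. */
    	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
    	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
    }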
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index e382a35e98d8..05be757668bb 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2019-2023 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
@@ -8,6 +8,8 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
@@ -16,6 +18,7 @@
#include "xfs_ag.h"
#include "xfs_rtalloc.h"
#include "xfs_inode.h"
+#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -53,6 +56,7 @@ struct xchk_fscounters {
uint64_t frextents;
unsigned long long icount_min;
unsigned long long icount_max;
+ bool frozen;
};
/*
@@ -123,6 +127,82 @@ xchk_fscount_warmup(
return error;
}
+static inline int
+xchk_fsfreeze(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ trace_xchk_fsfreeze(sc, error);
+ return error;
+}
+
+static inline int
+xchk_fsthaw(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /* This should always succeed, since we hold a kernel freeze */
+ error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ trace_xchk_fsthaw(sc, error);
+ return error;
+}
+
+/*
+ * We couldn't stabilize the filesystem long enough to sample all the variables
+ * that comprise the summary counters and compare them to the percpu counters.
+ * We need to disable all writer threads, which means taking the first two
+ * freeze levels to put userspace to sleep, and the third freeze level to
+ * prevent background threads from starting new transactions. Take one level
+ * more to prevent other callers from unfreezing the filesystem while we run.
+ */
+STATIC int
+xchk_fscounters_freeze(
+ struct xfs_scrub *sc)
+{
+ struct xchk_fscounters *fsc = sc->buf;
+ int error = 0;
+
+ if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
+ sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
+ mnt_drop_write_file(sc->file);
+ }
+
+ /* Try to grab a kernel freeze. */
+ while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ delay(HZ / 10);
+ }
+ if (error)
+ return error;
+
+ fsc->frozen = true;
+ return 0;
+}
+
+/* Thaw the filesystem after checking or repairing fscounters. */
+STATIC void
+xchk_fscounters_cleanup(
+ void *buf)
+{
+ struct xchk_fscounters *fsc = buf;
+ struct xfs_scrub *sc = fsc->sc;
+ int error;
+
+ if (!fsc->frozen)
+ return;
+
+ error = xchk_fsthaw(sc);
+ if (error)
+ xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
+ else
+ fsc->frozen = false;
+}
+
int
xchk_setup_fscounters(
struct xfs_scrub *sc)
@@ -140,6 +220,7 @@ xchk_setup_fscounters(
sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
+ sc->buf_cleanup = xchk_fscounters_cleanup;
fsc = sc->buf;
fsc->sc = sc;
@@ -150,7 +231,18 @@ xchk_setup_fscounters(
if (error)
return error;
- return xchk_trans_alloc(sc, 0);
+ /*
+ * Pause all writer activity in the filesystem while we're scrubbing to
+ * reduce the likelihood of background perturbations to the counters
+ * throwing off our calculations.
+ */
+ if (sc->flags & XCHK_TRY_HARDER) {
+ error = xchk_fscounters_freeze(sc);
+ if (error)
+ return error;
+ }
+
+ return xfs_trans_alloc_empty(sc->mp, &sc->tp);
}
/*
@@ -290,8 +382,7 @@ retry:
if (fsc->ifree > fsc->icount) {
if (tries--)
goto retry;
- xchk_set_incomplete(sc);
- return 0;
+ return -EDEADLOCK;
}
return 0;
@@ -367,6 +458,8 @@ xchk_fscount_count_frextents(
* Otherwise, we /might/ have a problem. If the change in the summations is
* more than we want to tolerate, the filesystem is probably busy and we should
* just send back INCOMPLETE and see if userspace will try again.
+ *
+ * If we're repairing then we require an exact match.
*/
static inline bool
xchk_fscount_within_range(
@@ -396,21 +489,7 @@ xchk_fscount_within_range(
if (expected >= min_value && expected <= max_value)
return true;
- /*
- * If the difference between the two summations is too large, the fs
- * might just be busy and so we'll mark the scrub incomplete. Return
- * true here so that we don't mark the counter corrupt.
- *
- * XXX: In the future when userspace can grant scrub permission to
- * quiesce the filesystem to solve the outsized variance problem, this
- * check should be moved up and the return code changed to signal to
- * userspace that we need quiesce permission.
- */
- if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
- xchk_set_incomplete(sc);
- return true;
- }
-
+ /* Everything else is bad. */
return false;
}
@@ -422,6 +501,7 @@ xchk_fscounters(
struct xfs_mount *mp = sc->mp;
struct xchk_fscounters *fsc = sc->buf;
int64_t icount, ifree, fdblocks, frextents;
+ bool try_again = false;
int error;
/* Snapshot the percpu counters. */
@@ -431,9 +511,26 @@ xchk_fscounters(
frextents = percpu_counter_sum(&mp->m_frextents);
/* No negative values, please! */
- if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0)
+ if (icount < 0 || ifree < 0)
xchk_set_corrupt(sc);
+ /*
+ * If the filesystem is not frozen, the counter summation calls above
+ * can race with xfs_mod_freecounter, which subtracts a requested space
+ * reservation from the counter and undoes the subtraction if that made
+ * the counter go negative. Therefore, it's possible to see negative
+ * values here, and we should only flag that as a corruption if we
+ * froze the fs. This is much more likely to happen with frextents
+ * since there are no reserved pools.
+ */
+ if (fdblocks < 0 || frextents < 0) {
+ if (!fsc->frozen)
+ return -EDEADLOCK;
+
+ xchk_set_corrupt(sc);
+ return 0;
+ }
+
/* See if icount is obviously wrong. */
if (icount < fsc->icount_min || icount > fsc->icount_max)
xchk_set_corrupt(sc);
@@ -447,12 +544,6 @@ xchk_fscounters(
xchk_set_corrupt(sc);
/*
- * XXX: We can't quiesce percpu counter updates, so exit early.
- * This can be re-enabled when we gain exclusive freeze functionality.
- */
- return 0;
-
- /*
* If ifree exceeds icount by more than the minimum variance then
* something's probably wrong with the counters.
*/
@@ -463,8 +554,6 @@ xchk_fscounters(
error = xchk_fscount_aggregate_agcounts(sc, fsc);
if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
return error;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
- return 0;
/* Count the free extents counter for rt volumes. */
error = xchk_fscount_count_frextents(sc, fsc);
@@ -473,20 +562,45 @@ xchk_fscounters(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
return 0;
- /* Compare the in-core counters with whatever we counted. */
- if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
- xchk_set_corrupt(sc);
+ /*
+ * Compare the in-core counters with whatever we counted. If the fs is
+ * frozen, we treat the discrepancy as a corruption because the freeze
+ * should have stabilized the counter values. Otherwise, we need
+ * userspace to call us back having granted us freeze permission.
+ */
+ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
+ fsc->icount)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
- if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
- xchk_set_corrupt(sc);
+ if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
- fsc->fdblocks))
- xchk_set_corrupt(sc);
+ fsc->fdblocks)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
- fsc->frextents))
- xchk_set_corrupt(sc);
+ fsc->frextents)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
+
+ if (try_again)
+ return -EDEADLOCK;
return 0;
}
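The -EDEADLOCK returns above lean on the scrub core's existing retry convention: the first pass runs unfrozen, and if the counters will not settle, the error makes the core set XCHK_TRY_HARDER and run the setup function again, which this patch teaches to freeze. A simplified, hypothetical driver loop (the real code also tears down and re-runs setup between passes):

    static int example_run_scrub(struct xfs_scrub *sc)
    {
    	int error;

    retry_op:
    	error = sc->ops->scrub(sc);
    	if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) {
    		sc->flags |= XCHK_TRY_HARDER;
    		goto retry_op;
    	}
    	return error;
    }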
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 3d98f604765e..a0fffbcd022b 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -184,8 +184,10 @@ xchk_teardown(
xchk_irele(sc, sc->ip);
sc->ip = NULL;
}
- if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
+ sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
mnt_drop_write_file(sc->file);
+ }
if (sc->buf) {
if (sc->buf_cleanup)
sc->buf_cleanup(sc->buf);
@@ -505,6 +507,8 @@ retry_op:
error = mnt_want_write_file(sc->file);
if (error)
goto out_sc;
+
+ sc->flags |= XCHK_HAVE_FREEZE_PROT;
}
/* Set up for the operation. */
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index e113f2f5c254..f8ba00e51ca9 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -106,6 +106,7 @@ struct xfs_scrub {
/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
#define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */
+#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index b3894daeb86a..0b54f1a1cf0c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -98,6 +98,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
#define XFS_SCRUB_STATE_STRINGS \
{ XCHK_TRY_HARDER, "try_harder" }, \
+ { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
{ XCHK_NEED_DRAIN, "need_drain" }, \
{ XREP_ALREADY_FIXED, "already_fixed" }
@@ -693,6 +694,31 @@ TRACE_EVENT(xchk_fscounters_within_range,
__entry->old_value)
)
+DECLARE_EVENT_CLASS(xchk_fsfreeze_class,
+ TP_PROTO(struct xfs_scrub *sc, int error),
+ TP_ARGS(sc, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d type %s error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->error)
+);
+#define DEFINE_XCHK_FSFREEZE_EVENT(name) \
+DEFINE_EVENT(xchk_fsfreeze_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, int error), \
+ TP_ARGS(sc, error))
+DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze);
+DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw);
+
TRACE_EVENT(xchk_refcount_incorrect,
TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec,
xfs_nlink_t seen),
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 791db7d9c849..6b840301817a 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -233,7 +233,7 @@ xfs_acl_set_mode(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (xfs_has_wsync(mp))
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 451942fb38ec..2fca4b4e7fd8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
.read_folio = xfs_vm_read_folio,
.readahead = xfs_vm_readahead,
.writepages = xfs_vm_writepages,
- .dirty_folio = filemap_dirty_folio,
+ .dirty_folio = iomap_dirty_folio,
.release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.bmap = xfs_vm_bmap,
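Switching .dirty_folio from filemap_dirty_folio to iomap_dirty_folio lets iomap allocate its per-folio state when a folio is dirtied rather than discovering it at writeback time, part of this cycle's per-block dirty-tracking work. A sketch of the relevant wiring, assuming the iomap hooks exported in this kernel (other methods omitted):

    #include <linux/iomap.h>

    static const struct address_space_operations example_aops = {
    	.dirty_folio		= iomap_dirty_folio,
    	.release_folio		= iomap_release_folio,
    	.invalidate_folio	= iomap_invalidate_folio,
    	/* .read_folio, .writepages, etc. as before */
    };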
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fbb675563208..fcefab687285 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1644,6 +1644,7 @@ xfs_swap_extents(
uint64_t f;
int resblks = 0;
unsigned int flags = 0;
+ struct timespec64 ctime;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1756,8 +1757,9 @@ xfs_swap_extents(
* process that the file was not changed out from
* under it.
*/
- if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
- (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ ctime = inode_get_ctime(VFS_I(ip));
+ if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) ||
(sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
(sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
error = -EBUSY;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 15d1e5a7c2d3..3b903f6bce98 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1938,14 +1938,17 @@ void
xfs_free_buftarg(
struct xfs_buftarg *btp)
{
+ struct block_device *bdev = btp->bt_bdev;
+
unregister_shrinker(&btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
- blkdev_issue_flush(btp->bt_bdev);
- invalidate_bdev(btp->bt_bdev);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
+ /* the main block device is closed by kill_block_super */
+ if (bdev != btp->bt_mount->m_super->s_bdev)
+ blkdev_put(bdev, btp->bt_mount->m_super);
kmem_free(btp);
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9e62cc500140..360fe83a334f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -843,10 +843,9 @@ xfs_init_new_inode(
ip->i_df.if_nextents = 0;
ASSERT(ip->i_nblocks == 0);
- tv = current_time(inode);
+ tv = inode_set_ctime_current(inode);
inode->i_mtime = tv;
inode->i_atime = tv;
- inode->i_ctime = tv;
ip->i_extsize = 0;
ip->i_diflags = 0;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 91c847a84e10..127b2410eb20 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -528,7 +528,7 @@ xfs_inode_to_log_dinode(
memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
- to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
+ to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode));
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
to->di_mode = inode->i_mode;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 24718adb3c16..2ededd3f6b8c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -573,10 +573,10 @@ xfs_vn_getattr(
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
- stat->mtime = inode->i_mtime;
- stat->ctime = inode->i_ctime;
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
+ fill_mg_cmtime(stat, request_mask, inode);
+
if (xfs_has_v3inodes(mp)) {
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
@@ -917,7 +917,7 @@ xfs_setattr_size(
if (newsize != oldsize &&
!(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
iattr->ia_ctime = iattr->ia_mtime =
- current_time(inode);
+ current_mgtime(inode);
iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
}
@@ -1029,7 +1029,6 @@ xfs_vn_setattr(
STATIC int
xfs_vn_update_time(
struct inode *inode,
- struct timespec64 *now,
int flags)
{
struct xfs_inode *ip = XFS_I(inode);
@@ -1037,13 +1036,16 @@ xfs_vn_update_time(
int log_flags = XFS_ILOG_TIMESTAMP;
struct xfs_trans *tp;
int error;
+ struct timespec64 now;
trace_xfs_update_time(ip);
if (inode->i_sb->s_flags & SB_LAZYTIME) {
if (!((flags & S_VERSION) &&
- inode_maybe_inc_iversion(inode, false)))
- return generic_update_time(inode, now, flags);
+ inode_maybe_inc_iversion(inode, false))) {
+ generic_update_time(inode, flags);
+ return 0;
+ }
/* Capture the iversion update that just occurred */
log_flags |= XFS_ILOG_CORE;
@@ -1054,12 +1056,15 @@ xfs_vn_update_time(
return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (flags & S_CTIME)
- inode->i_ctime = *now;
+ if (flags & (S_CTIME|S_MTIME))
+ now = inode_set_ctime_current(inode);
+ else
+ now = current_time(inode);
+
if (flags & S_MTIME)
- inode->i_mtime = *now;
+ inode->i_mtime = now;
if (flags & S_ATIME)
- inode->i_atime = *now;
+ inode->i_atime = now;
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, log_flags);
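
With the timespec64 pointer gone from ->update_time, the callee now decides which clock to read. A filesystem with no special transaction logging could plausibly just forward to the VFS helper (hypothetical sketch; XFS above does more because it logs the inode core):

static int myfs_update_time(struct inode *inode, int flags)
{
	/* flags is a mask of S_ATIME, S_MTIME, S_CTIME and S_VERSION. */
	return generic_update_time(inode, flags);
}

static const struct inode_operations myfs_inode_ops = {
	.update_time	= myfs_update_time,
};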
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f225413a993c..c2093cb56092 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -100,8 +100,8 @@ xfs_bulkstat_one_int(
buf->bs_atime_nsec = inode->i_atime.tv_nsec;
buf->bs_mtime = inode->i_mtime.tv_sec;
buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
- buf->bs_ctime = inode->i_ctime.tv_sec;
- buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
+ buf->bs_ctime = inode_get_ctime(inode).tv_sec;
+ buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec;
buf->bs_gen = inode->i_generation;
buf->bs_mode = inode->i_mode;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 818510243130..c79eac048456 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -377,17 +377,6 @@ disable_dax:
return 0;
}
-static void
-xfs_bdev_mark_dead(
- struct block_device *bdev)
-{
- xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED);
-}
-
-static const struct blk_holder_ops xfs_holder_ops = {
- .mark_dead = xfs_bdev_mark_dead,
-};
-
STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
@@ -396,8 +385,8 @@ xfs_blkdev_get(
{
int error = 0;
- *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp,
- &xfs_holder_ops);
+ *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ mp->m_super, &fs_holder_ops);
if (IS_ERR(*bdevp)) {
error = PTR_ERR(*bdevp);
xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
@@ -407,31 +396,45 @@ xfs_blkdev_get(
}
STATIC void
-xfs_blkdev_put(
- struct xfs_mount *mp,
- struct block_device *bdev)
-{
- if (bdev)
- blkdev_put(bdev, mp);
-}
-
-STATIC void
-xfs_close_devices(
+xfs_shutdown_devices(
struct xfs_mount *mp)
{
+ /*
+ * Udev is triggered whenever anyone closes a block device or unmounts
+	 * a file system on a block device.
+ * The default udev rules invoke blkid to read the fs super and create
+ * symlinks to the bdev under /dev/disk. For this, it uses buffered
+ * reads through the page cache.
+ *
+ * xfs_db also uses buffered reads to examine metadata. There is no
+ * coordination between xfs_db and udev, which means that they can run
+ * concurrently. Note there is no coordination between the kernel and
+ * blkid either.
+ *
+ * On a system with 64k pages, the page cache can cache the superblock
+ * and the root inode (and hence the root directory) with the same 64k
+ * page. If udev spawns blkid after the mkfs and the system is busy
+ * enough that it is still running when xfs_db starts up, they'll both
+ * read from the same page in the pagecache.
+ *
+ * The unmount writes updated inode metadata to disk directly. The XFS
+ * buffer cache does not use the bdev pagecache, so it needs to
+ * invalidate that pagecache on unmount. If the above scenario occurs,
+ * the pagecache no longer reflects what's on disk, xfs_db reads the
+ * stale metadata, and fails to find /a. Most of the time this succeeds
+ * because closing a bdev invalidates the page cache, but when processes
+ * race, everyone loses.
+ */
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
- struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
-
- xfs_free_buftarg(mp->m_logdev_targp);
- xfs_blkdev_put(mp, logdev);
+ blkdev_issue_flush(mp->m_logdev_targp->bt_bdev);
+ invalidate_bdev(mp->m_logdev_targp->bt_bdev);
}
if (mp->m_rtdev_targp) {
- struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
-
- xfs_free_buftarg(mp->m_rtdev_targp);
- xfs_blkdev_put(mp, rtdev);
+ blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
+ invalidate_bdev(mp->m_rtdev_targp->bt_bdev);
}
- xfs_free_buftarg(mp->m_ddev_targp);
+ blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ invalidate_bdev(mp->m_ddev_targp->bt_bdev);
}
/*
@@ -448,17 +451,24 @@ STATIC int
xfs_open_devices(
struct xfs_mount *mp)
{
- struct block_device *ddev = mp->m_super->s_bdev;
+ struct super_block *sb = mp->m_super;
+ struct block_device *ddev = sb->s_bdev;
struct block_device *logdev = NULL, *rtdev = NULL;
int error;
/*
+ * blkdev_put() can't be called under s_umount, see the comment
+ * in get_tree_bdev() for more details
+ */
+ up_write(&sb->s_umount);
+
+ /*
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
if (error)
- return error;
+ goto out_relock;
}
if (mp->m_rtname) {
@@ -496,7 +506,10 @@ xfs_open_devices(
mp->m_logdev_targp = mp->m_ddev_targp;
}
- return 0;
+ error = 0;
+out_relock:
+ down_write(&sb->s_umount);
+ return error;
out_free_rtdev_targ:
if (mp->m_rtdev_targp)
@@ -504,11 +517,12 @@ xfs_open_devices(
out_free_ddev_targ:
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
- xfs_blkdev_put(mp, rtdev);
+ if (rtdev)
+ blkdev_put(rtdev, sb);
out_close_logdev:
if (logdev && logdev != ddev)
- xfs_blkdev_put(mp, logdev);
- return error;
+ blkdev_put(logdev, sb);
+ goto out_relock;
}
/*
@@ -758,6 +772,17 @@ static void
xfs_mount_free(
struct xfs_mount *mp)
{
+ /*
+ * Free the buftargs here because blkdev_put needs to be called outside
+ * of sb->s_umount, which is held around the call to ->put_super.
+ */
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_free_buftarg(mp->m_logdev_targp);
+ if (mp->m_rtdev_targp)
+ xfs_free_buftarg(mp->m_rtdev_targp);
+ if (mp->m_ddev_targp)
+ xfs_free_buftarg(mp->m_ddev_targp);
+
kfree(mp->m_rtname);
kfree(mp->m_logname);
kmem_free(mp);
@@ -1133,10 +1158,6 @@ xfs_fs_put_super(
{
struct xfs_mount *mp = XFS_M(sb);
- /* if ->fill_super failed, we have no mount to tear down */
- if (!sb->s_fs_info)
- return;
-
xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid);
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
@@ -1147,10 +1168,7 @@ xfs_fs_put_super(
xfs_inodegc_free_percpu(mp);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
- xfs_close_devices(mp);
-
- sb->s_fs_info = NULL;
- xfs_mount_free(mp);
+ xfs_shutdown_devices(mp);
}
static long
@@ -1492,7 +1510,7 @@ xfs_fs_fill_super(
error = xfs_fs_validate_params(mp);
if (error)
- goto out_free_names;
+ return error;
sb_min_blocksize(sb, BBSIZE);
sb->s_xattr = xfs_xattr_handlers;
@@ -1519,11 +1537,11 @@ xfs_fs_fill_super(
error = xfs_open_devices(mp);
if (error)
- goto out_free_names;
+ return error;
error = xfs_init_mount_workqueues(mp);
if (error)
- goto out_close_devices;
+ goto out_shutdown_devices;
error = xfs_init_percpu_counters(mp);
if (error)
@@ -1737,11 +1755,8 @@ xfs_fs_fill_super(
xfs_destroy_percpu_counters(mp);
out_destroy_workqueues:
xfs_destroy_mount_workqueues(mp);
- out_close_devices:
- xfs_close_devices(mp);
- out_free_names:
- sb->s_fs_info = NULL;
- xfs_mount_free(mp);
+ out_shutdown_devices:
+ xfs_shutdown_devices(mp);
return error;
out_unmount:
@@ -1934,7 +1949,8 @@ xfs_fs_reconfigure(
return 0;
}
-static void xfs_fs_free(
+static void
+xfs_fs_free(
struct fs_context *fc)
{
struct xfs_mount *mp = fc->s_fs_info;
@@ -2003,13 +2019,21 @@ static int xfs_init_fs_context(
return 0;
}
+static void
+xfs_kill_sb(
+ struct super_block *sb)
+{
+ kill_block_super(sb);
+ xfs_mount_free(XFS_M(sb));
+}
+
static struct file_system_type xfs_fs_type = {
.owner = THIS_MODULE,
.name = "xfs",
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .kill_sb = xfs_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("xfs");
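
The net effect of the xfs_super.c changes: nothing that can reach blkdev_put() runs under sb->s_umount any more. ->put_super merely flushes and invalidates the devices, and the buftargs (and thus the block devices) are freed from ->kill_sb after kill_block_super() has dropped the lock. The shape of that pattern for a hypothetical filesystem:

static void myfs_kill_sb(struct super_block *sb)
{
	struct myfs_mount *m = sb->s_fs_info;

	/* Unmounts and closes the main bdev, then drops s_umount. */
	kill_block_super(sb);

	/* Safe point to blkdev_put() any extra (log/rt) devices. */
	if (m)
		myfs_free_mount(m);	/* hypothetical helper */
}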
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 789cfb74c146..b2c9b35df8f7 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = {
.read_folio = zonefs_read_folio,
.readahead = zonefs_readahead,
.writepages = zonefs_writepages,
- .dirty_folio = filemap_dirty_folio,
+ .dirty_folio = iomap_dirty_folio,
.release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 9350221abfc5..9d1a9808fbbb 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -658,7 +658,8 @@ static struct inode *zonefs_get_file_inode(struct inode *dir,
inode->i_ino = ino;
inode->i_mode = z->z_mode;
- inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
+ inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
+ inode_get_ctime(dir));
inode->i_uid = z->z_uid;
inode->i_gid = z->z_gid;
inode->i_size = z->z_wpoffset;
@@ -694,7 +695,8 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb,
inode->i_ino = ino;
inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555);
inode->i_size = sbi->s_zgroup[ztype].g_nr_zones;
- inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime;
+ inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
+ inode_get_ctime(root));
inode->i_private = &sbi->s_zgroup[ztype];
set_nlink(inode, 2);
@@ -1317,7 +1319,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
inode->i_ino = bdev_nr_zones(sb->s_bdev);
inode->i_mode = S_IFDIR | 0555;
- inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_op = &zonefs_dir_inode_operations;
inode->i_fop = &zonefs_dir_operations;
inode->i_size = 2;
diff --git a/include/acpi/acnames.h b/include/acpi/acnames.h
index d71291f25a80..76aa6aa346ba 100644
--- a/include/acpi/acnames.h
+++ b/include/acpi/acnames.h
@@ -22,6 +22,7 @@
#define METHOD_NAME__DDN "_DDN"
#define METHOD_NAME__DIS "_DIS"
#define METHOD_NAME__DMA "_DMA"
+#define METHOD_NAME__EVT "_EVT"
#define METHOD_NAME__HID "_HID"
#define METHOD_NAME__INI "_INI"
#define METHOD_NAME__PLD "_PLD"
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index c941d99162c0..254685085c82 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -515,6 +515,12 @@ void acpi_bus_private_data_handler(acpi_handle, void *);
int acpi_bus_get_private_data(acpi_handle, void **);
int acpi_bus_attach_private_data(acpi_handle, void *);
void acpi_bus_detach_private_data(acpi_handle);
+int acpi_dev_install_notify_handler(struct acpi_device *adev,
+ u32 handler_type,
+ acpi_notify_handler handler);
+void acpi_dev_remove_notify_handler(struct acpi_device *adev,
+ u32 handler_type,
+ acpi_notify_handler handler);
extern int acpi_notifier_call_chain(struct acpi_device *, u32, u32);
extern int register_acpi_notifier(struct notifier_block *);
extern int unregister_acpi_notifier(struct notifier_block *);
@@ -563,8 +569,6 @@ int acpi_match_device_ids(struct acpi_device *device,
const struct acpi_device_id *ids);
void acpi_set_modalias(struct acpi_device *adev, const char *default_id,
char *modalias, size_t len);
-int acpi_create_dir(struct acpi_device *);
-void acpi_remove_dir(struct acpi_device *);
static inline bool acpi_device_enumerated(struct acpi_device *adev)
{
@@ -645,6 +649,8 @@ int acpi_disable_wakeup_device_power(struct acpi_device *dev);
#ifdef CONFIG_X86
bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status);
bool acpi_quirk_skip_acpi_ac_and_battery(void);
+int acpi_install_cmos_rtc_space_handler(acpi_handle handle);
+void acpi_remove_cmos_rtc_space_handler(acpi_handle handle);
#else
static inline bool acpi_device_override_status(struct acpi_device *adev,
unsigned long long *status)
@@ -655,6 +661,13 @@ static inline bool acpi_quirk_skip_acpi_ac_and_battery(void)
{
return false;
}
+static inline int acpi_install_cmos_rtc_space_handler(acpi_handle handle)
+{
+ return 1;
+}
+static inline void acpi_remove_cmos_rtc_space_handler(acpi_handle handle)
+{
+}
#endif
#if IS_ENABLED(CONFIG_X86_ANDROID_TABLETS)
diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h
index 9ffdc0425bc2..3d90716f9522 100644
--- a/include/acpi/acpixf.h
+++ b/include/acpi/acpixf.h
@@ -12,7 +12,7 @@
/* Current ACPICA subsystem version in YYYYMMDD format */
-#define ACPI_CA_VERSION 0x20230331
+#define ACPI_CA_VERSION 0x20230628
#include <acpi/acconfig.h>
#include <acpi/actypes.h>
@@ -970,8 +970,6 @@ ACPI_EXTERNAL_RETURN_STATUS(acpi_status
void **data,
void (*callback)(void *)))
-void acpi_run_debugger(char *batch_buffer);
-
void acpi_set_debugger_thread_id(acpi_thread_id thread_id);
#endif /* __ACXFACE_H__ */
diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index 58b0490a2ad1..8d5572ad48cb 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -402,7 +402,7 @@ struct acpi_cdat_dsmas {
/* Flags for subtable above */
-#define ACPI_CEDT_DSMAS_NON_VOLATILE (1 << 2)
+#define ACPI_CDAT_DSMAS_NON_VOLATILE (1 << 2)
/* Subtable 1: Device scoped Latency and Bandwidth Information Structure (DSLBIS) */
diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 0029336775a9..3751ae69432f 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -893,7 +893,10 @@ enum acpi_madt_type {
ACPI_MADT_TYPE_BIO_PIC = 22,
ACPI_MADT_TYPE_LPC_PIC = 23,
ACPI_MADT_TYPE_RINTC = 24,
- ACPI_MADT_TYPE_RESERVED = 25, /* 25 to 0x7F are reserved */
+ ACPI_MADT_TYPE_IMSIC = 25,
+ ACPI_MADT_TYPE_APLIC = 26,
+ ACPI_MADT_TYPE_PLIC = 27,
+ ACPI_MADT_TYPE_RESERVED = 28, /* 28 to 0x7F are reserved */
ACPI_MADT_TYPE_OEM_RESERVED = 0x80 /* 0x80 to 0xFF are reserved for OEM use */
};
@@ -1261,6 +1264,9 @@ struct acpi_madt_rintc {
u32 flags;
u64 hart_id;
u32 uid; /* ACPI processor UID */
+ u32 ext_intc_id; /* External INTC Id */
+ u64 imsic_addr; /* IMSIC base address */
+ u32 imsic_size; /* IMSIC size */
};
/* Values for RISC-V INTC Version field above */
@@ -1271,6 +1277,48 @@ enum acpi_madt_rintc_version {
ACPI_MADT_RINTC_VERSION_RESERVED = 2 /* 2 and greater are reserved */
};
+/* 25: RISC-V IMSIC */
+struct acpi_madt_imsic {
+ struct acpi_subtable_header header;
+ u8 version;
+ u8 reserved;
+ u32 flags;
+ u16 num_ids;
+ u16 num_guest_ids;
+ u8 guest_index_bits;
+ u8 hart_index_bits;
+ u8 group_index_bits;
+ u8 group_index_shift;
+};
+
+/* 26: RISC-V APLIC */
+struct acpi_madt_aplic {
+ struct acpi_subtable_header header;
+ u8 version;
+ u8 id;
+ u32 flags;
+ u8 hw_id[8];
+ u16 num_idcs;
+ u16 num_sources;
+ u32 gsi_base;
+ u64 base_addr;
+ u32 size;
+};
+
+/* 27: RISC-V PLIC */
+struct acpi_madt_plic {
+ struct acpi_subtable_header header;
+ u8 version;
+ u8 id;
+ u8 hw_id[8];
+ u16 num_irqs;
+ u16 max_prio;
+ u32 flags;
+ u32 size;
+ u64 base_addr;
+ u32 gsi_base;
+};
+
/* 80: OEM data */
struct acpi_madt_oem_data {
@@ -2730,12 +2778,15 @@ enum acpi_rgrt_image_type {
struct acpi_table_rhct {
struct acpi_table_header header; /* Common ACPI table header */
- u32 reserved;
+ u32 flags; /* RHCT flags */
u64 time_base_freq;
u32 node_count;
u32 node_offset;
};
+/* RHCT Flags */
+
+#define ACPI_RHCT_TIMER_CANNOT_WAKEUP_CPU (1)
/*
* RHCT subtables
*/
@@ -2749,6 +2800,9 @@ struct acpi_rhct_node_header {
enum acpi_rhct_node_type {
ACPI_RHCT_NODE_TYPE_ISA_STRING = 0x0000,
+ ACPI_RHCT_NODE_TYPE_CMO = 0x0001,
+ ACPI_RHCT_NODE_TYPE_MMU = 0x0002,
+ ACPI_RHCT_NODE_TYPE_RESERVED = 0x0003,
ACPI_RHCT_NODE_TYPE_HART_INFO = 0xFFFF,
};
@@ -2762,6 +2816,24 @@ struct acpi_rhct_isa_string {
char isa[];
};
+struct acpi_rhct_cmo_node {
+ u8 reserved; /* Must be zero */
+	u8 cbom_size;	/* CBOM size in power of 2 */
+	u8 cbop_size;	/* CBOP size in power of 2 */
+	u8 cboz_size;	/* CBOZ size in power of 2 */
+};
+
+struct acpi_rhct_mmu_node {
+ u8 reserved; /* Must be zero */
+ u8 mmu_type; /* Virtual Address Scheme */
+};
+
+enum acpi_rhct_mmu_type {
+ ACPI_RHCT_MMU_TYPE_SV39 = 0,
+ ACPI_RHCT_MMU_TYPE_SV48 = 1,
+ ACPI_RHCT_MMU_TYPE_SV57 = 2
+};
+
/* Hart Info node structure */
struct acpi_rhct_hart_info {
u16 num_offsets;
diff --git a/include/acpi/actbl3.h b/include/acpi/actbl3.h
index 000764ab3985..c080d579a546 100644
--- a/include/acpi/actbl3.h
+++ b/include/acpi/actbl3.h
@@ -279,12 +279,14 @@ struct acpi_srat_gic_its_affinity {
* 6: ACPI_SRAT_TYPE_GENERIC_PORT_AFFINITY
*/
+#define ACPI_SRAT_DEVICE_HANDLE_SIZE 16
+
struct acpi_srat_generic_affinity {
struct acpi_subtable_header header;
u8 reserved;
u8 device_handle_type;
u32 proximity_domain;
- u8 device_handle[16];
+ u8 device_handle[ACPI_SRAT_DEVICE_HANDLE_SIZE];
u32 flags;
u32 reserved1;
};
diff --git a/include/acpi/pdc_intel.h b/include/acpi/pdc_intel.h
deleted file mode 100644
index 967c552d1cd3..000000000000
--- a/include/acpi/pdc_intel.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/* _PDC bit definition for Intel processors */
-
-#ifndef __PDC_INTEL_H__
-#define __PDC_INTEL_H__
-
-#define ACPI_PDC_P_FFH (0x0001)
-#define ACPI_PDC_C_C1_HALT (0x0002)
-#define ACPI_PDC_T_FFH (0x0004)
-#define ACPI_PDC_SMP_C1PT (0x0008)
-#define ACPI_PDC_SMP_C2C3 (0x0010)
-#define ACPI_PDC_SMP_P_SWCOORD (0x0020)
-#define ACPI_PDC_SMP_C_SWCOORD (0x0040)
-#define ACPI_PDC_SMP_T_SWCOORD (0x0080)
-#define ACPI_PDC_C_C1_FFH (0x0100)
-#define ACPI_PDC_C_C2C3_FFH (0x0200)
-#define ACPI_PDC_SMP_P_HWCOORD (0x0800)
-
-#define ACPI_PDC_EST_CAPABILITY_SMP (ACPI_PDC_SMP_C1PT | \
- ACPI_PDC_C_C1_HALT | \
- ACPI_PDC_P_FFH)
-
-#define ACPI_PDC_EST_CAPABILITY_SWSMP (ACPI_PDC_SMP_C1PT | \
- ACPI_PDC_C_C1_HALT | \
- ACPI_PDC_SMP_P_SWCOORD | \
- ACPI_PDC_SMP_P_HWCOORD | \
- ACPI_PDC_P_FFH)
-
-#define ACPI_PDC_C_CAPABILITY_SMP (ACPI_PDC_SMP_C2C3 | \
- ACPI_PDC_SMP_C1PT | \
- ACPI_PDC_C_C1_HALT | \
- ACPI_PDC_C_C1_FFH | \
- ACPI_PDC_C_C2C3_FFH)
-
-#endif /* __PDC_INTEL_H__ */
diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index 1ca450e35c0d..565341c826e3 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -182,6 +182,7 @@
#ifdef ACPI_USE_STANDARD_HEADERS
#include <stddef.h>
#include <unistd.h>
+#include <stdint.h>
#define ACPI_OFFSET(d, f) offsetof(d, f)
#endif
diff --git a/include/acpi/platform/aczephyr.h b/include/acpi/platform/aczephyr.h
index 2f0d30c3c5fd..703db4dc740d 100644
--- a/include/acpi/platform/aczephyr.h
+++ b/include/acpi/platform/aczephyr.h
@@ -10,9 +10,6 @@
#ifndef __ACZEPHYR_H__
#define __ACZEPHYR_H__
-#define SEEK_SET FS_SEEK_SET
-#define SEEK_END FS_SEEK_END
-
#define ACPI_MACHINE_WIDTH 64
#define ACPI_NO_ERROR_MESSAGES
diff --git a/include/acpi/proc_cap_intel.h b/include/acpi/proc_cap_intel.h
new file mode 100644
index 000000000000..ddcdc41d6c3e
--- /dev/null
+++ b/include/acpi/proc_cap_intel.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* Vendor specific processor capabilities bit definition
+ * for Intel processors. Those bits are used to convey OSPM
+ * power management capabilities to the platform.
+ */
+
+#ifndef __PROC_CAP_INTEL_H__
+#define __PROC_CAP_INTEL_H__
+
+#define ACPI_PROC_CAP_P_FFH (0x0001)
+#define ACPI_PROC_CAP_C_C1_HALT (0x0002)
+#define ACPI_PROC_CAP_T_FFH (0x0004)
+#define ACPI_PROC_CAP_SMP_C1PT (0x0008)
+#define ACPI_PROC_CAP_SMP_C2C3 (0x0010)
+#define ACPI_PROC_CAP_SMP_P_SWCOORD (0x0020)
+#define ACPI_PROC_CAP_SMP_C_SWCOORD (0x0040)
+#define ACPI_PROC_CAP_SMP_T_SWCOORD (0x0080)
+#define ACPI_PROC_CAP_C_C1_FFH (0x0100)
+#define ACPI_PROC_CAP_C_C2C3_FFH (0x0200)
+#define ACPI_PROC_CAP_SMP_P_HWCOORD (0x0800)
+#define ACPI_PROC_CAP_COLLAB_PROC_PERF (0x1000)
+
+#define ACPI_PROC_CAP_EST_CAPABILITY_SMP (ACPI_PROC_CAP_SMP_C1PT | \
+ ACPI_PROC_CAP_C_C1_HALT | \
+ ACPI_PROC_CAP_P_FFH)
+
+#define ACPI_PROC_CAP_EST_CAPABILITY_SWSMP (ACPI_PROC_CAP_SMP_C1PT | \
+ ACPI_PROC_CAP_C_C1_HALT | \
+ ACPI_PROC_CAP_SMP_P_SWCOORD | \
+ ACPI_PROC_CAP_SMP_P_HWCOORD | \
+ ACPI_PROC_CAP_P_FFH)
+
+#define ACPI_PROC_CAP_C_CAPABILITY_SMP (ACPI_PROC_CAP_SMP_C2C3 | \
+ ACPI_PROC_CAP_SMP_C1PT | \
+ ACPI_PROC_CAP_C_C1_HALT | \
+ ACPI_PROC_CAP_C_C1_FFH | \
+ ACPI_PROC_CAP_C_C2C3_FFH)
+
+#endif /* __PROC_CAP_INTEL_H__ */
diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h
index 02f2ac4dd2df..e69cece404b3 100644
--- a/include/drm/display/drm_dp.h
+++ b/include/drm/display/drm_dp.h
@@ -1537,7 +1537,7 @@ enum drm_dp_phy {
#define DP_BRANCH_OUI_HEADER_SIZE 0xc
#define DP_RECEIVER_CAP_SIZE 0xf
-#define DP_DSC_RECEIVER_CAP_SIZE 0xf
+#define DP_DSC_RECEIVER_CAP_SIZE 0x10 /* DSC Capabilities 0x60 through 0x6F */
#define EDP_PSR_RECEIVER_CAP_SIZE 2
#define EDP_DISPLAY_CTL_CAP_SIZE 3
#define DP_LTTPR_COMMON_CAP_SIZE 8
diff --git a/include/drm/drm_probe_helper.h b/include/drm/drm_probe_helper.h
index 4977e0ab72db..fad3c4003b2b 100644
--- a/include/drm/drm_probe_helper.h
+++ b/include/drm/drm_probe_helper.h
@@ -25,6 +25,7 @@ void drm_kms_helper_connector_hotplug_event(struct drm_connector *connector);
void drm_kms_helper_poll_disable(struct drm_device *dev);
void drm_kms_helper_poll_enable(struct drm_device *dev);
+void drm_kms_helper_poll_reschedule(struct drm_device *dev);
bool drm_kms_helper_is_poll_worker(void);
enum drm_mode_status drm_crtc_helper_mode_valid_fixed(struct drm_crtc *crtc,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 641dc4843987..a73246c3c35e 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -477,8 +477,6 @@ static inline int acpi_get_node(acpi_handle handle)
return 0;
}
#endif
-extern int acpi_paddr_to_node(u64 start_addr, u64 size);
-
extern int pnpacpi_disabled;
#define PXM_INVAL (-1)
@@ -1100,7 +1098,7 @@ void acpi_os_set_prepare_extended_sleep(int (*func)(u8 sleep_state,
acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state,
u32 val_a, u32 val_b);
-#ifdef CONFIG_X86
+#if defined(CONFIG_SUSPEND) && defined(CONFIG_X86)
struct acpi_s2idle_dev_ops {
struct list_head list_node;
void (*prepare)(void);
@@ -1109,7 +1107,13 @@ struct acpi_s2idle_dev_ops {
};
int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg);
void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg);
-#endif /* CONFIG_X86 */
+int acpi_get_lps0_constraint(struct acpi_device *adev);
+#else /* CONFIG_SUSPEND && CONFIG_X86 */
+static inline int acpi_get_lps0_constraint(struct device *dev)
+{
+ return ACPI_STATE_UNKNOWN;
+}
+#endif /* CONFIG_SUSPEND && CONFIG_X86 */
#ifndef CONFIG_IA64
void arch_reserve_mem_area(acpi_physical_address addr, size_t size);
#else
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index ee7cb6aaff71..1cb65592c95d 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -21,6 +21,7 @@
*/
#define IORT_SMMU_V3_PMCG_GENERIC 0x00000000 /* Generic SMMUv3 PMCG */
#define IORT_SMMU_V3_PMCG_HISI_HIP08 0x00000001 /* HiSilicon HIP08 PMCG */
+#define IORT_SMMU_V3_PMCG_HISI_HIP09 0x00000002 /* HiSilicon HIP09 PMCG */
int iort_register_domain_token(int trans_id, phys_addr_t base,
struct fwnode_handle *fw_node);
diff --git a/include/linux/arm_sdei.h b/include/linux/arm_sdei.h
index 14dc461b0e82..255701e1251b 100644
--- a/include/linux/arm_sdei.h
+++ b/include/linux/arm_sdei.h
@@ -47,10 +47,12 @@ int sdei_unregister_ghes(struct ghes *ghes);
int sdei_mask_local_cpu(void);
int sdei_unmask_local_cpu(void);
void __init sdei_init(void);
+void sdei_handler_abort(void);
#else
static inline int sdei_mask_local_cpu(void) { return 0; }
static inline int sdei_unmask_local_cpu(void) { return 0; }
static inline void sdei_init(void) { }
+static inline void sdei_handler_abort(void) { }
#endif /* CONFIG_ARM_SDE_INTERFACE */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0bad62cca3d0..d5c5e59ddbd2 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -52,7 +52,6 @@ struct block_device {
atomic_t bd_openers;
spinlock_t bd_size_lock; /* for bd_inode->i_size updates */
struct inode * bd_inode; /* will die */
- struct super_block * bd_super;
void * bd_claiming;
void * bd_holder;
const struct blk_holder_ops *bd_holder_ops;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 87d94be7825a..83ce87354e9a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -750,7 +750,8 @@ static inline int bdev_read_only(struct block_device *bdev)
}
bool set_capacity_and_notify(struct gendisk *disk, sector_t size);
-bool disk_force_media_change(struct gendisk *disk, unsigned int events);
+void disk_force_media_change(struct gendisk *disk);
+void bdev_mark_dead(struct block_device *bdev, bool surprise);
void add_disk_randomness(struct gendisk *disk) __latent_entropy;
void rand_initialize_disk(struct gendisk *disk);
@@ -809,7 +810,6 @@ int __register_blkdev(unsigned int major, const char *name,
void unregister_blkdev(unsigned int major, const char *name);
bool disk_check_media_change(struct gendisk *disk);
-int __invalidate_device(struct block_device *bdev, bool kill_dirty);
void set_capacity(struct gendisk *disk, sector_t size);
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
@@ -1460,9 +1460,16 @@ void blkdev_show(struct seq_file *seqf, off_t offset);
#endif
struct blk_holder_ops {
- void (*mark_dead)(struct block_device *bdev);
+ void (*mark_dead)(struct block_device *bdev, bool surprise);
+
+ /*
+ * Sync the file system mounted on the block device.
+ */
+ void (*sync)(struct block_device *bdev);
};
+extern const struct blk_holder_ops fs_holder_ops;
+
/*
* Return the correct open flags for blkdev_get_by_* for super block flags
* as stored in sb->s_flags.
@@ -1521,8 +1528,6 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev)
}
#endif /* CONFIG_BLOCK */
-int fsync_bdev(struct block_device *bdev);
-
int freeze_bdev(struct block_device *bdev);
int thaw_bdev(struct block_device *bdev);
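
With mark_dead() gaining a surprise flag and the new sync() hook, a holder implementation looks roughly like the sketch below; the helper names are hypothetical, and filesystems opened through the generic paths can simply pass the shared fs_holder_ops declared above instead of rolling their own:

static void my_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	/* surprise == true: the device vanished without prior notice. */
	my_shutdown_fs(bdev->bd_holder, surprise);
}

static void my_bdev_sync(struct block_device *bdev)
{
	/* Write back dirty state for the fs mounted on bdev. */
	my_sync_fs(bdev->bd_holder);
}

static const struct blk_holder_ops my_holder_ops = {
	.mark_dead	= my_bdev_mark_dead,
	.sync		= my_bdev_sync,
};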
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8a0d5466c7be..ae20dbb885d6 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -661,6 +661,8 @@ struct cgroup_subsys {
void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
int (*css_extra_stat_show)(struct seq_file *seq,
struct cgroup_subsys_state *css);
+ int (*css_local_stat_show)(struct seq_file *seq,
+ struct cgroup_subsys_state *css);
int (*can_attach)(struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup_taskset *tset);
diff --git a/include/linux/clk.h b/include/linux/clk.h
index 1ef013324237..06f1b292f8a0 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -183,6 +183,39 @@ int clk_get_scaled_duty_cycle(struct clk *clk, unsigned int scale);
*/
bool clk_is_match(const struct clk *p, const struct clk *q);
+/**
+ * clk_rate_exclusive_get - get exclusivity over the rate control of a
+ * producer
+ * @clk: clock source
+ *
+ * This function allows drivers to get exclusive control over the rate of a
+ * provider. It prevents any other consumer from executing, even indirectly,
+ * operations which could alter the rate of the provider or cause glitches.
+ *
+ * If exclusivity is claimed more than once on a clock, even by the same driver,
+ * the rate effectively gets locked as exclusivity can't be preempted.
+ *
+ * Must not be called from within atomic context.
+ *
+ * Returns success (0) or negative errno.
+ */
+int clk_rate_exclusive_get(struct clk *clk);
+
+/**
+ * clk_rate_exclusive_put - release exclusivity over the rate control of a
+ * producer
+ * @clk: clock source
+ *
+ * This function allows drivers to release the exclusivity it previously got
+ * from clk_rate_exclusive_get()
+ *
+ * The caller must balance the number of clk_rate_exclusive_get() and
+ * clk_rate_exclusive_put() calls.
+ *
+ * Must not be called from within atomic context.
+ */
+void clk_rate_exclusive_put(struct clk *clk);
+
#else
static inline int clk_notifier_register(struct clk *clk,
@@ -236,6 +269,13 @@ static inline bool clk_is_match(const struct clk *p, const struct clk *q)
return p == q;
}
+static inline int clk_rate_exclusive_get(struct clk *clk)
+{
+ return 0;
+}
+
+static inline void clk_rate_exclusive_put(struct clk *clk) {}
+
#endif
#ifdef CONFIG_HAVE_CLK_PREPARE
@@ -583,38 +623,6 @@ struct clk *devm_clk_get_optional_enabled(struct device *dev, const char *id);
*/
struct clk *devm_get_clk_from_child(struct device *dev,
struct device_node *np, const char *con_id);
-/**
- * clk_rate_exclusive_get - get exclusivity over the rate control of a
- * producer
- * @clk: clock source
- *
- * This function allows drivers to get exclusive control over the rate of a
- * provider. It prevents any other consumer to execute, even indirectly,
- * opereation which could alter the rate of the provider or cause glitches
- *
- * If exlusivity is claimed more than once on clock, even by the same driver,
- * the rate effectively gets locked as exclusivity can't be preempted.
- *
- * Must not be called from within atomic context.
- *
- * Returns success (0) or negative errno.
- */
-int clk_rate_exclusive_get(struct clk *clk);
-
-/**
- * clk_rate_exclusive_put - release exclusivity over the rate control of a
- * producer
- * @clk: clock source
- *
- * This function allows drivers to release the exclusivity it previously got
- * from clk_rate_exclusive_get()
- *
- * The caller must balance the number of clk_rate_exclusive_get() and
- * clk_rate_exclusive_put() calls.
- *
- * Must not be called from within atomic context.
- */
-void clk_rate_exclusive_put(struct clk *clk);
/**
* clk_enable - inform the system when the clock source should be running.
@@ -974,14 +982,6 @@ static inline void clk_bulk_put_all(int num_clks, struct clk_bulk_data *clks) {}
static inline void devm_clk_put(struct device *dev, struct clk *clk) {}
-
-static inline int clk_rate_exclusive_get(struct clk *clk)
-{
- return 0;
-}
-
-static inline void clk_rate_exclusive_put(struct clk *clk) {}
-
static inline int clk_enable(struct clk *clk)
{
return 0;
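
As a consumer-side sketch (hypothetical driver): take exclusivity, set the rate, and release on failure, leaving the matching clk_rate_exclusive_put() for the teardown path. clk_set_rate() handles a caller that already holds exclusivity on the same handle:

static int my_pin_rate(struct clk *clk, unsigned long hz)
{
	int ret;

	ret = clk_rate_exclusive_get(clk);	/* lock out other consumers */
	if (ret)
		return ret;

	ret = clk_set_rate(clk, hz);
	if (ret)
		clk_rate_exclusive_put(clk);	/* undo on failure */
	return ret;
}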
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 00efa35c350f..28566624f008 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -95,6 +95,19 @@
#endif
/*
+ * Optional: only supported since gcc >= 14
+ * Optional: only supported since clang >= 18
+ *
+ * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
+ * clang: https://reviews.llvm.org/D148381
+ */
+#if __has_attribute(__counted_by__)
+# define __counted_by(member) __attribute__((__counted_by__(member)))
+#else
+# define __counted_by(member)
+#endif
+
+/*
* Optional: not supported by gcc
* Optional: only supported since clang >= 14.0
*
@@ -130,19 +143,6 @@
#endif
/*
- * Optional: only supported since gcc >= 14
- * Optional: only supported since clang >= 17
- *
- * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
- * clang: https://reviews.llvm.org/D148381
- */
-#if __has_attribute(__element_count__)
-# define __counted_by(member) __attribute__((__element_count__(#member)))
-#else
-# define __counted_by(member)
-#endif
-
-/*
* Optional: only supported since clang >= 14.0
*
* gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-error-function-attribute
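
A struct annotated with the relocated __counted_by() above tells the compiler (and FORTIFY/UBSAN bounds checking, where enabled) how many elements the flexible array holds; the dm-verity-loadpin hunk later in this diff is a real conversion. Minimal sketch:

struct my_blob {
	unsigned int	len;			/* bytes in data[] */
	u8		data[] __counted_by(len);
};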
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 547ea1ff806e..c523c6683789 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -106,6 +106,34 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
#define __cold
#endif
+/*
+ * On x86-64 and arm64 targets, __preserve_most changes the calling convention
+ * of a function to make the code in the caller as unintrusive as possible. This
+ * convention behaves identically to the C calling convention on how arguments
+ * and return values are passed, but uses a different set of caller- and callee-
+ * saved registers.
+ *
+ * The purpose is to alleviate the burden of saving and recovering a large
+ * register set before and after the call in the caller. This is beneficial for
+ * rarely taken slow paths, such as error-reporting functions that may be called
+ * from hot paths.
+ *
+ * Note: This may conflict with instrumentation inserted on function entry which
+ * does not use __preserve_most or equivalent convention (if in assembly). Since
+ * function tracing assumes the normal C calling convention, where the attribute
+ * is supported, __preserve_most implies notrace. It is recommended to restrict
+ * use of the attribute to functions that should or already disable tracing.
+ *
+ * Optional: not supported by gcc.
+ *
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#preserve-most
+ */
+#if __has_attribute(__preserve_most__) && (defined(CONFIG_X86_64) || defined(CONFIG_ARM64))
+# define __preserve_most notrace __attribute__((__preserve_most__))
+#else
+# define __preserve_most
+#endif
+
/* Builtins */
/*
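
A plausible use, following the comment above (hypothetical function): annotate a rarely taken reporting path so that hot callers need not spill the full register set around the call; notrace is implied where the attribute is supported:

void __preserve_most my_report_slowpath(const char *what, int err);

static inline int my_check(int v)
{
	if (unlikely(v < 0))
		my_report_slowpath("bad value", v);	/* cheap call site */
	return v;
}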
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 62b32b19e0a8..fb2915676574 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -116,6 +116,7 @@ extern bool try_wait_for_completion(struct completion *x);
extern bool completion_done(struct completion *x);
extern void complete(struct completion *);
+extern void complete_on_current_cpu(struct completion *x);
extern void complete_all(struct completion *);
#endif
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index e006c719182b..0abd60a7987b 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -18,6 +18,7 @@
#include <linux/compiler.h>
#include <linux/cpumask.h>
#include <linux/cpuhotplug.h>
+#include <linux/cpu_smt.h>
struct device;
struct device_node;
@@ -194,7 +195,6 @@ void arch_cpu_finalize_init(void);
static inline void arch_cpu_finalize_init(void) { }
#endif
-void cpu_set_state_online(int cpu);
void play_idle_precise(u64 duration_ns, u64 latency_ns);
static inline void play_idle(unsigned long duration_us)
@@ -208,30 +208,6 @@ void cpuhp_report_idle_dead(void);
static inline void cpuhp_report_idle_dead(void) { }
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-enum cpuhp_smt_control {
- CPU_SMT_ENABLED,
- CPU_SMT_DISABLED,
- CPU_SMT_FORCE_DISABLED,
- CPU_SMT_NOT_SUPPORTED,
- CPU_SMT_NOT_IMPLEMENTED,
-};
-
-#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
-extern enum cpuhp_smt_control cpu_smt_control;
-extern void cpu_smt_disable(bool force);
-extern void cpu_smt_check_topology(void);
-extern bool cpu_smt_possible(void);
-extern int cpuhp_smt_enable(void);
-extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval);
-#else
-# define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED)
-static inline void cpu_smt_disable(bool force) { }
-static inline void cpu_smt_check_topology(void) { }
-static inline bool cpu_smt_possible(void) { return false; }
-static inline int cpuhp_smt_enable(void) { return 0; }
-static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; }
-#endif
-
extern bool cpu_mitigations_off(void);
extern bool cpu_mitigations_auto_nosmt(void);
diff --git a/include/linux/cpu_smt.h b/include/linux/cpu_smt.h
new file mode 100644
index 000000000000..0c1664294b57
--- /dev/null
+++ b/include/linux/cpu_smt.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CPU_SMT_H_
+#define _LINUX_CPU_SMT_H_
+
+enum cpuhp_smt_control {
+ CPU_SMT_ENABLED,
+ CPU_SMT_DISABLED,
+ CPU_SMT_FORCE_DISABLED,
+ CPU_SMT_NOT_SUPPORTED,
+ CPU_SMT_NOT_IMPLEMENTED,
+};
+
+#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
+extern enum cpuhp_smt_control cpu_smt_control;
+extern unsigned int cpu_smt_num_threads;
+extern void cpu_smt_disable(bool force);
+extern void cpu_smt_set_num_threads(unsigned int num_threads,
+ unsigned int max_threads);
+extern bool cpu_smt_possible(void);
+extern int cpuhp_smt_enable(void);
+extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval);
+#else
+# define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED)
+# define cpu_smt_num_threads 1
+static inline void cpu_smt_disable(bool force) { }
+static inline void cpu_smt_set_num_threads(unsigned int num_threads,
+ unsigned int max_threads) { }
+static inline bool cpu_smt_possible(void) { return false; }
+static inline int cpuhp_smt_enable(void) { return 0; }
+static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; }
+#endif
+
+#endif /* _LINUX_CPU_SMT_H_ */
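
Architecture setup code that learns the thread topology from firmware is the intended caller of the new cpu_smt_set_num_threads() hook, roughly (hypothetical arch hook):

void __init my_arch_init_smt(unsigned int threads_per_core,
			     unsigned int max_threads_per_core)
{
	/* Record how many SMT siblings may be brought online. */
	cpu_smt_set_num_threads(threads_per_core, max_threads_per_core);
}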
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 25b6e6e6ba6b..06dda85f0424 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -48,7 +48,7 @@
* same section.
*
* If neither #1 nor #2 apply, please use the dynamic state space when
- * setting up a state by using CPUHP_PREPARE_DYN or CPUHP_PREPARE_ONLINE
+ * setting up a state by using CPUHP_BP_PREPARE_DYN or CPUHP_AP_ONLINE_DYN
* for the @state argument of the setup function.
*
* See Documentation/core-api/cpu_hotplug.rst for further information and
diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h
index 9192986b1a73..ac862422df15 100644
--- a/include/linux/decompress/mm.h
+++ b/include/linux/decompress/mm.h
@@ -48,7 +48,7 @@ MALLOC_VISIBLE void *malloc(int size)
if (!malloc_ptr)
malloc_ptr = free_mem_ptr;
- malloc_ptr = (malloc_ptr + 3) & ~3; /* Align */
+ malloc_ptr = (malloc_ptr + 7) & ~7; /* Align */
p = (void *)malloc_ptr;
malloc_ptr += size;
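
The bump from 4- to 8-byte alignment uses the standard round-up idiom: add (alignment - 1), then clear the low bits. For example:

unsigned long p = 0x1009;

p = (p + 7) & ~7UL;	/* rounds 0x1009 up to 0x1010 */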
diff --git a/include/linux/dm-verity-loadpin.h b/include/linux/dm-verity-loadpin.h
index 552b817ab102..3ac6dbaeaa37 100644
--- a/include/linux/dm-verity-loadpin.h
+++ b/include/linux/dm-verity-loadpin.h
@@ -12,7 +12,7 @@ extern struct list_head dm_verity_loadpin_trusted_root_digests;
struct dm_verity_loadpin_trusted_root_digest {
struct list_head node;
unsigned int len;
- u8 data[];
+ u8 data[] __counted_by(len);
};
#if IS_ENABLED(CONFIG_SECURITY_LOADPIN_VERITY)
diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h
index b1d26f9f1c9f..9f183a679277 100644
--- a/include/linux/dnotify.h
+++ b/include/linux/dnotify.h
@@ -30,7 +30,7 @@ struct dnotify_struct {
FS_MOVED_FROM | FS_MOVED_TO)
extern void dnotify_flush(struct file *, fl_owner_t);
-extern int fcntl_dirnotify(int, struct file *, unsigned long);
+extern int fcntl_dirnotify(int, struct file *, unsigned int);
#else
@@ -38,7 +38,7 @@ static inline void dnotify_flush(struct file *filp, fl_owner_t id)
{
}
-static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
+static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
{
return -EINVAL;
}
diff --git a/include/linux/efi.h b/include/linux/efi.h
index ab088c662e88..5a1e39df8b26 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -726,7 +726,6 @@ static inline efi_status_t efi_query_variable_store(u32 attributes,
return EFI_SUCCESS;
}
#endif
-extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr);
extern int __init __efi_memmap_init(struct efi_memory_map_data *data);
extern int __init efi_memmap_init_early(struct efi_memory_map_data *data);
@@ -1130,7 +1129,7 @@ extern bool efi_runtime_disabled(void);
static inline bool efi_runtime_disabled(void) { return true; }
#endif
-extern void efi_call_virt_check_flags(unsigned long flags, const char *call);
+extern void efi_call_virt_check_flags(unsigned long flags, const void *caller);
extern unsigned long efi_call_virt_save_flags(void);
enum efi_secureboot_mode {
@@ -1171,8 +1170,7 @@ static inline void efi_check_for_embedded_firmwares(void) { }
#define arch_efi_call_virt(p, f, args...) ((p)->f(args))
/*
- * Arch code can implement the following three template macros, avoiding
- * reptition for the void/non-void return cases of {__,}efi_call_virt():
+ * Arch code must implement the following three routines:
*
* * arch_efi_call_virt_setup()
*
@@ -1181,9 +1179,8 @@ static inline void efi_check_for_embedded_firmwares(void) { }
*
* * arch_efi_call_virt()
*
- * Performs the call. The last expression in the macro must be the call
- * itself, allowing the logic to be shared by the void and non-void
- * cases.
+ * Performs the call. This routine takes a variable number of arguments so
+ * it must be implemented as a variadic preprocessor macro.
*
* * arch_efi_call_virt_teardown()
*
@@ -1192,33 +1189,20 @@ static inline void efi_check_for_embedded_firmwares(void) { }
#define efi_call_virt_pointer(p, f, args...) \
({ \
- efi_status_t __s; \
+ typeof((p)->f(args)) __s; \
unsigned long __flags; \
\
arch_efi_call_virt_setup(); \
\
__flags = efi_call_virt_save_flags(); \
__s = arch_efi_call_virt(p, f, args); \
- efi_call_virt_check_flags(__flags, __stringify(f)); \
+ efi_call_virt_check_flags(__flags, NULL); \
\
arch_efi_call_virt_teardown(); \
\
__s; \
})
-#define __efi_call_virt_pointer(p, f, args...) \
-({ \
- unsigned long __flags; \
- \
- arch_efi_call_virt_setup(); \
- \
- __flags = efi_call_virt_save_flags(); \
- arch_efi_call_virt(p, f, args); \
- efi_call_virt_check_flags(__flags, __stringify(f)); \
- \
- arch_efi_call_virt_teardown(); \
-})
-
#define EFI_RANDOM_SEED_SIZE 32U // BLAKE2S_HASH_SIZE
struct linux_efi_random_seed {
@@ -1244,6 +1228,10 @@ extern int efi_tpm_final_log_size;
extern unsigned long rci2_table_phys;
+efi_status_t
+efi_call_acpi_prm_handler(efi_status_t (__efiapi *handler_addr)(u64, void *),
+ u64 param_buffer_addr, void *context);
+
/*
* efi_runtime_service() function identifiers.
* "NONE" is used by efi_recover_from_page_fault() to check if the page
@@ -1263,25 +1251,26 @@ enum efi_rts_ids {
EFI_RESET_SYSTEM,
EFI_UPDATE_CAPSULE,
EFI_QUERY_CAPSULE_CAPS,
+ EFI_ACPI_PRM_HANDLER,
};
+union efi_rts_args;
+
/*
* efi_runtime_work: Details of EFI Runtime Service work
- * @arg<1-5>: EFI Runtime Service function arguments
+ * @args: Pointer to union describing the arguments
* @status: Status of executing EFI Runtime Service
* @efi_rts_id: EFI Runtime Service function identifier
* @efi_rts_comp: Struct used for handling completions
+ * @caller: The caller of the runtime service
*/
struct efi_runtime_work {
- void *arg1;
- void *arg2;
- void *arg3;
- void *arg4;
- void *arg5;
- efi_status_t status;
- struct work_struct work;
- enum efi_rts_ids efi_rts_id;
- struct completion efi_rts_comp;
+ union efi_rts_args *args;
+ efi_status_t status;
+ struct work_struct work;
+ enum efi_rts_ids efi_rts_id;
+ struct completion efi_rts_comp;
+ const void *caller;
};
extern struct efi_runtime_work efi_rts_work;
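
Collapsing the void and non-void wrappers into one macro leans on typeof() of the call expression, which is never evaluated; the remaining users all return efi_status_t or similar. The same trick in isolation (hypothetical macro):

#define my_timed_call(fn, args...)				\
({								\
	u64 __t0 = ktime_get_ns();				\
	typeof(fn(args)) __ret = fn(args);			\
								\
	trace_printk("took %llu ns\n", ktime_get_ns() - __t0);	\
	__ret;							\
})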
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index efcdd1631d9b..95e868e09e29 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -144,7 +144,7 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned int,
struct flock64 *);
#endif
-int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
+int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
int fcntl_getlease(struct file *filp);
/* fs/locks.c */
@@ -167,8 +167,8 @@ bool vfs_inode_has_locks(struct inode *inode);
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);
int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
void lease_get_mtime(struct inode *, struct timespec64 *time);
-int generic_setlease(struct file *, long, struct file_lock **, void **priv);
-int vfs_setlease(struct file *, long, struct file_lock **, void **);
+int generic_setlease(struct file *, int, struct file_lock **, void **priv);
+int vfs_setlease(struct file *, int, struct file_lock **, void **);
int lease_modify(struct file_lock *, int, struct list_head *);
struct notifier_block;
@@ -213,7 +213,7 @@ static inline int fcntl_setlk64(unsigned int fd, struct file *file,
return -EACCES;
}
#endif
-static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
+static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
{
return -EINVAL;
}
@@ -306,13 +306,13 @@ static inline void lease_get_mtime(struct inode *inode,
return;
}
-static inline int generic_setlease(struct file *filp, long arg,
+static inline int generic_setlease(struct file *filp, int arg,
struct file_lock **flp, void **priv)
{
return -EINVAL;
}
-static inline int vfs_setlease(struct file *filp, long arg,
+static inline int vfs_setlease(struct file *filp, int arg,
struct file_lock **lease, void **priv)
{
return -EINVAL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 562f2623c9c9..dda08d973639 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -338,6 +338,20 @@ enum rw_hint {
#define IOCB_NOIO (1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE (1 << 21)
+/*
+ * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
+ * iocb completion can be passed back to the owner for execution from a safe
+ * context rather than needing to be punted through a workqueue. If this
+ * flag is set, the bio completion handling may set iocb->dio_complete to a
+ * handler function and iocb->private to context information for that handler.
+ * The issuer should call the handler with that context information from task
+ * context to complete the processing of the iocb. Note that while this
+ * provides a task context for the dio_complete() callback, it should only be
+ * used on the completion side for non-IO generating completions. It's fine to
+ * call blocking functions from this callback, but they should not wait for
+ * unrelated IO (like cache flushing, new IO generation, etc).
+ */
+#define IOCB_DIO_CALLER_COMP (1 << 22)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
@@ -351,7 +365,8 @@ enum rw_hint {
{ IOCB_WRITE, "WRITE" }, \
{ IOCB_WAITQ, "WAITQ" }, \
{ IOCB_NOIO, "NOIO" }, \
- { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }
+ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \
+ { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }
struct kiocb {
struct file *ki_filp;
@@ -360,7 +375,23 @@ struct kiocb {
void *private;
int ki_flags;
u16 ki_ioprio; /* See linux/ioprio.h */
- struct wait_page_queue *ki_waitq; /* for async buffered IO */
+ union {
+ /*
+ * Only used for async buffered reads, where it denotes the
+ * page waitqueue associated with completing the read. Valid
+ * IFF IOCB_WAITQ is set.
+ */
+ struct wait_page_queue *ki_waitq;
+ /*
+ * Can be used for O_DIRECT IO, where the completion handling
+ * is punted back to the issuer of the IO. May only be set
+ * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
+ * must then check for presence of this handler when ki_complete
+ * is invoked. The data passed in to this handler must be
+ * assigned to ->private when dio_complete is assigned.
+ */
+ ssize_t (*dio_complete)(void *data);
+ };
};
static inline bool is_sync_kiocb(struct kiocb *kiocb)
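
On the issuer side, the contract above means the completion path checks for an installed handler and runs it from task context before signalling the iocb, along the lines of (hypothetical issuer code):

static void my_finish_dio(struct kiocb *iocb, ssize_t ret)
{
	/* Bio completion may have set ->dio_complete and ->private. */
	if (iocb->dio_complete)
		ret = iocb->dio_complete(iocb->private);

	iocb->ki_complete(iocb, ret);	/* we are in task context here */
}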
@@ -642,7 +673,7 @@ struct inode {
loff_t i_size;
struct timespec64 i_atime;
struct timespec64 i_mtime;
- struct timespec64 i_ctime;
+ struct timespec64 __i_ctime; /* use inode_*_ctime accessors! */
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
u8 i_blkbits;
@@ -1069,7 +1100,7 @@ extern void fasync_free(struct fasync_struct *);
extern void kill_fasync(struct fasync_struct **, int, int);
extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
-extern int f_setown(struct file *filp, unsigned long arg, int force);
+extern int f_setown(struct file *filp, int who, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);
@@ -1095,6 +1126,8 @@ extern int send_sigurg(struct fown_struct *fown);
#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */
/* These sb flags are internal to the kernel */
+#define SB_DEAD BIT(21)
+#define SB_DYING BIT(24)
#define SB_SUBMOUNT BIT(26)
#define SB_FORCE BIT(27)
#define SB_NOSEC BIT(28)
@@ -1147,7 +1180,8 @@ enum {
#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
struct sb_writers {
- int frozen; /* Is sb frozen? */
+ unsigned short frozen; /* Is sb frozen? */
+ unsigned short freeze_holders; /* Who froze fs? */
struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
};
@@ -1474,7 +1508,79 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
kgid_has_mapping(fs_userns, kgid);
}
-extern struct timespec64 current_time(struct inode *inode);
+struct timespec64 current_mgtime(struct inode *inode);
+struct timespec64 current_time(struct inode *inode);
+struct timespec64 inode_set_ctime_current(struct inode *inode);
+
+/*
+ * Multigrain timestamps
+ *
+ * Conditionally use fine-grained ctime and mtime timestamps when there
+ * are users actively observing them via getattr. The primary use-case
+ * for this is NFS clients that use the ctime to distinguish between
+ * different states of the file, and that are often fooled by multiple
+ * operations that occur in the same coarse-grained timer tick.
+ *
+ * The kernel always keeps normalized struct timespec64 values in the ctime,
+ * which means that only the first 30 bits of the value are used. Use the
+ * 31st bit of the ctime's tv_nsec field as a flag to indicate that the value
+ * has been queried since it was last updated.
+ */
+#define I_CTIME_QUERIED (1L<<30)
+
+/**
+ * inode_get_ctime - fetch the current ctime from the inode
+ * @inode: inode from which to fetch ctime
+ *
+ * Grab the current ctime tv_nsec field from the inode, mask off the
+ * I_CTIME_QUERIED flag and return it. This is mostly intended for use by
+ * internal consumers of the ctime that aren't concerned with ensuring a
+ * fine-grained update on the next change (e.g. when preparing to store
+ * the value in the backing store for later retrieval).
+ *
+ * This is safe to call regardless of whether the underlying filesystem
+ * is using multigrain timestamps.
+ */
+static inline struct timespec64 inode_get_ctime(const struct inode *inode)
+{
+ struct timespec64 ctime;
+
+ ctime.tv_sec = inode->__i_ctime.tv_sec;
+ ctime.tv_nsec = inode->__i_ctime.tv_nsec & ~I_CTIME_QUERIED;
+
+ return ctime;
+}
+
+/**
+ * inode_set_ctime_to_ts - set the ctime in the inode
+ * @inode: inode in which to set the ctime
+ * @ts: value to set in the ctime field
+ *
+ * Set the ctime in @inode to @ts
+ */
+static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
+ struct timespec64 ts)
+{
+ inode->__i_ctime = ts;
+ return ts;
+}
+
+/**
+ * inode_set_ctime - set the ctime in the inode
+ * @inode: inode in which to set the ctime
+ * @sec: tv_sec value to set
+ * @nsec: tv_nsec value to set
+ *
+ * Set the ctime in @inode to { @sec, @nsec }
+ */
+static inline struct timespec64 inode_set_ctime(struct inode *inode,
+ time64_t sec, long nsec)
+{
+ struct timespec64 ts = { .tv_sec = sec,
+ .tv_nsec = nsec };
+
+ return inode_set_ctime_to_ts(inode, ts);
+}
/*
* Snapshotting support.
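
Putting the accessors to work: writers go through inode_set_ctime_current(), while readers that only need the raw value use inode_get_ctime(), which masks off I_CTIME_QUERIED. A hypothetical filesystem sketch:

static void myfs_touch(struct inode *inode)
{
	struct timespec64 now = inode_set_ctime_current(inode);

	inode->i_mtime = now;		/* keep mtime in step with ctime */
	mark_inode_dirty(inode);
}

static u64 myfs_ctime_sec_for_disk(const struct inode *inode)
{
	return inode_get_ctime(inode).tv_sec;
}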
@@ -1770,6 +1876,7 @@ struct dir_context {
struct iov_iter;
struct io_uring_cmd;
+struct offset_ctx;
struct file_operations {
struct module *owner;
@@ -1798,7 +1905,7 @@ struct file_operations {
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
void (*splice_eof)(struct file *file);
- int (*setlease)(struct file *, long, struct file_lock **, void **);
+ int (*setlease)(struct file *, int, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
@@ -1850,7 +1957,7 @@ struct inode_operations {
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
u64 len);
- int (*update_time)(struct inode *, struct timespec64 *, int);
+ int (*update_time)(struct inode *, int);
int (*atomic_open)(struct inode *, struct dentry *,
struct file *, unsigned open_flag,
umode_t create_mode);
@@ -1863,6 +1970,7 @@ struct inode_operations {
int (*fileattr_set)(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
+ struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
} ____cacheline_aligned;
static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
@@ -1908,6 +2016,10 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos,
loff_t len, unsigned int remap_flags);
+enum freeze_holder {
+ FREEZE_HOLDER_KERNEL = (1U << 0),
+ FREEZE_HOLDER_USERSPACE = (1U << 1),
+};
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
@@ -1920,9 +2032,9 @@ struct super_operations {
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
- int (*freeze_super) (struct super_block *);
+ int (*freeze_super) (struct super_block *, enum freeze_holder who);
int (*freeze_fs) (struct super_block *);
- int (*thaw_super) (struct super_block *);
+ int (*thaw_super) (struct super_block *, enum freeze_holder who);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
@@ -2200,7 +2312,7 @@ enum file_time_flags {
extern bool atime_needs_update(const struct path *, struct inode *);
extern void touch_atime(const struct path *);
-int inode_update_time(struct inode *inode, struct timespec64 *time, int flags);
+int inode_update_time(struct inode *inode, int flags);
static inline void file_accessed(struct file *file)
{
@@ -2222,6 +2334,7 @@ struct file_system_type {
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
#define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */
+#define FS_MGTIME 64 /* FS uses multigrain timestamps */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
const struct fs_parameter_spec *parameters;
@@ -2245,6 +2358,17 @@ struct file_system_type {
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
+/**
+ * is_mgtime - is this inode using multigrain timestamps
+ * @inode: inode to test for multigrain timestamps
+ *
+ * Return true if the inode uses multigrain timestamps, false otherwise.
+ */
+static inline bool is_mgtime(const struct inode *inode)
+{
+ return inode->i_sb->s_type->fs_flags & FS_MGTIME;
+}
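/*
 * A minimal sketch (hypothetical ->getattr() fragment): callers can key
 * off is_mgtime() to pick the multigrain fill helper declared later in
 * this patch (fill_mg_cmtime):
 */
if (is_mgtime(inode))
        fill_mg_cmtime(stat, request_mask, inode);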
+
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int));
@@ -2296,8 +2420,8 @@ extern int unregister_filesystem(struct file_system_type *);
extern int vfs_statfs(const struct path *, struct kstatfs *);
extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
-extern int freeze_super(struct super_block *super);
-extern int thaw_super(struct super_block *super);
+int freeze_super(struct super_block *super, enum freeze_holder who);
+int thaw_super(struct super_block *super, enum freeze_holder who);
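/*
 * A minimal sketch of the new convention, assuming (as the enum above
 * suggests) that the holder passed to thaw_super() must match the one
 * used to freeze:
 */
error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
if (!error) {
        /* ... superblock is frozen here ... */
        error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}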
extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);
@@ -2306,7 +2430,8 @@ extern int current_umask(void);
extern void ihold(struct inode * inode);
extern void iput(struct inode *);
-extern int generic_update_time(struct inode *, struct timespec64 *, int);
+int inode_update_timestamps(struct inode *inode, int flags);
+int generic_update_time(struct inode *, int);
/* /sys/fs */
extern struct kobject *fs_kobj;
@@ -2545,6 +2670,13 @@ static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
return (inode->i_mode ^ mode) & S_IFMT;
}
+/**
+ * file_start_write - get write access to a superblock for regular file io
+ * @file: the file we want to write to
+ *
+ * This is a variant of sb_start_write() which is a no-op on non-regular files.
+ * Should be matched with a call to file_end_write().
+ */
static inline void file_start_write(struct file *file)
{
if (!S_ISREG(file_inode(file)->i_mode))
@@ -2559,11 +2691,53 @@ static inline bool file_start_write_trylock(struct file *file)
return sb_start_write_trylock(file_inode(file)->i_sb);
}
+/**
+ * file_end_write - drop write access to a superblock of a regular file
+ * @file: the file we wrote to
+ *
+ * Should be matched with a call to file_start_write().
+ */
static inline void file_end_write(struct file *file)
{
if (!S_ISREG(file_inode(file)->i_mode))
return;
- __sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+ sb_end_write(file_inode(file)->i_sb);
+}
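/*
 * Typical pairing, as documented above (a sketch; the write itself is
 * elided): both calls are no-ops for non-regular files.
 */
file_start_write(file);
/* ... write to file ... */
file_end_write(file);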
+
+/**
+ * kiocb_start_write - get write access to a superblock for async file io
+ * @iocb: the io context we want to submit the write with
+ *
+ * This is a variant of sb_start_write() for async io submission.
+ * Should be matched with a call to kiocb_end_write().
+ */
+static inline void kiocb_start_write(struct kiocb *iocb)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ sb_start_write(inode->i_sb);
+ /*
+ * Fool lockdep by telling it the lock got released so that it
+ * doesn't complain about the held lock when we return to userspace.
+ */
+ __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * kiocb_end_write - drop write access to a superblock after async file io
+ * @iocb: the io context we submitted the write with
+ *
+ * Should be matched with a call to kiocb_start_write().
+ */
+static inline void kiocb_end_write(struct kiocb *iocb)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ /*
+ * Tell lockdep we inherited freeze protection from submission thread.
+ */
+ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+ sb_end_write(inode->i_sb);
}
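/*
 * A sketch of the intended async pairing (function names hypothetical):
 * freeze protection is taken at submission time and dropped at
 * completion time, possibly from a different context.
 */
static void example_submit(struct kiocb *iocb)
{
        kiocb_start_write(iocb);
        /* ... queue the write; example_complete() runs later ... */
}

static void example_complete(struct kiocb *iocb, long ret)
{
        kiocb_end_write(iocb);
        iocb->ki_complete(iocb, ret);
}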
/*
@@ -2613,8 +2787,7 @@ static inline bool inode_is_open_for_write(const struct inode *inode)
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
static inline void i_readcount_dec(struct inode *inode)
{
- BUG_ON(!atomic_read(&inode->i_readcount));
- atomic_dec(&inode->i_readcount);
+ BUG_ON(atomic_dec_return(&inode->i_readcount) < 0);
}
static inline void i_readcount_inc(struct inode *inode)
{
@@ -2880,7 +3053,8 @@ extern void page_put_link(void *);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
-void generic_fillattr(struct mnt_idmap *, struct inode *, struct kstat *);
+void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode);
+void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
@@ -2919,7 +3093,6 @@ extern int vfs_readlink(struct dentry *, char __user *, int);
extern struct file_system_type *get_filesystem(struct file_system_type *fs);
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
-extern struct super_block *get_super(struct block_device *);
extern struct super_block *get_active_super(struct block_device *bdev);
extern void drop_super(struct super_block *sb);
extern void drop_super_exclusive(struct super_block *sb);
@@ -2940,6 +3113,8 @@ extern int simple_open(struct inode *inode, struct file *file);
extern int simple_link(struct dentry *, struct inode *, struct dentry *);
extern int simple_unlink(struct inode *, struct dentry *);
extern int simple_rmdir(struct inode *, struct dentry *);
+void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry);
extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry);
extern int simple_rename(struct mnt_idmap *, struct inode *,
@@ -2956,7 +3131,7 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping,
extern const struct address_space_operations ram_aops;
extern int always_delete_dentry(const struct dentry *);
extern struct inode *alloc_anon_inode(struct super_block *);
-extern int simple_nosetlease(struct file *, long, struct file_lock **, void **);
+extern int simple_nosetlease(struct file *, int, struct file_lock **, void **);
extern const struct dentry_operations simple_dentry_operations;
extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
@@ -2977,6 +3152,22 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
const void __user *from, size_t count);
+struct offset_ctx {
+ struct xarray xa;
+ u32 next_offset;
+};
+
+void simple_offset_init(struct offset_ctx *octx);
+int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
+void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
+int simple_offset_rename_exchange(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry);
+void simple_offset_destroy(struct offset_ctx *octx);
+
+extern const struct file_operations simple_offset_dir_operations;
+
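/*
 * A sketch of how a (hypothetical) filesystem might wire up the
 * simple_offset_* API: simple_offset_init() at directory-inode creation,
 * then per-entry add/remove, with the context exposed through the new
 * ->get_offset_ctx() inode operation:
 */
static int example_dir_add_entry(struct inode *dir, struct dentry *dentry)
{
        struct offset_ctx *octx = dir->i_op->get_offset_ctx(dir);

        return simple_offset_add(octx, dentry); /* stable readdir offset */
}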
extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index ff6341e09925..96332db693d5 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -109,6 +109,7 @@ struct fs_context {
bool need_free:1; /* Need to call ops->free() */
bool global:1; /* Goes into &init_user_ns */
bool oldapi:1; /* Coming from mount(2) */
+ bool exclusive:1; /* create new superblock, reject existing one */
};
struct fs_context_operations {
@@ -150,14 +151,13 @@ extern int get_tree_nodev(struct fs_context *fc,
extern int get_tree_single(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc));
-extern int get_tree_single_reconf(struct fs_context *fc,
- int (*fill_super)(struct super_block *sb,
- struct fs_context *fc));
extern int get_tree_keyed(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc),
void *key);
+int setup_bdev_super(struct super_block *sb, int sb_flags,
+ struct fs_context *fc);
extern int get_tree_bdev(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc));
diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h
index 54210a42c30d..010d39d0dc1c 100644
--- a/include/linux/fs_stack.h
+++ b/include/linux/fs_stack.h
@@ -24,7 +24,7 @@ static inline void fsstack_copy_attr_times(struct inode *dest,
{
dest->i_atime = src->i_atime;
dest->i_mtime = src->i_mtime;
- dest->i_ctime = src->i_ctime;
+ inode_set_ctime_to_ts(dest, inode_get_ctime(src));
}
#endif /* _LINUX_FS_STACK_H */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 20284387b841..e718dbe928ba 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -25,9 +25,6 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
#endif
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
-struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmd,
- unsigned int flags);
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr, unsigned long next);
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..fdc6e64f49d6 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -261,9 +261,10 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
diff --git a/include/linux/list.h b/include/linux/list.h
index f10344dbad4d..164b4d0e9d2a 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -38,11 +38,92 @@ static inline void INIT_LIST_HEAD(struct list_head *list)
WRITE_ONCE(list->prev, list);
}
+#ifdef CONFIG_LIST_HARDENED
+
#ifdef CONFIG_DEBUG_LIST
-extern bool __list_add_valid(struct list_head *new,
- struct list_head *prev,
- struct list_head *next);
-extern bool __list_del_entry_valid(struct list_head *entry);
+# define __list_valid_slowpath
+#else
+# define __list_valid_slowpath __cold __preserve_most
+#endif
+
+/*
+ * Performs the full set of list corruption checks before __list_add().
+ * On list corruption, it reports a warning and returns false.
+ */
+extern bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next);
+
+/*
+ * Performs list corruption checks before __list_add(). Returns false if a
+ * corruption is detected, true otherwise.
+ *
+ * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking
+ * inline to catch non-faulting corruptions, and only if a corruption is
+ * detected calls the reporting function __list_add_valid_or_report().
+ */
+static __always_inline bool __list_add_valid(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ bool ret = true;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_LIST)) {
+ /*
+ * With the hardening version, elide checking if next and prev
+ * are NULL, since the immediate dereference of them below would
+ * result in a fault if NULL.
+ *
+ * With the reduced set of checks, we can afford to inline the
+ * checks, which also gives the compiler a chance to elide some
+ * of them completely if they can be proven at compile-time. If
+ * one of the pre-conditions does not hold, the slow-path will
+ * show a report which pre-condition failed.
+ */
+ if (likely(next->prev == prev && prev->next == next && new != prev && new != next))
+ return true;
+ ret = false;
+ }
+
+ ret &= __list_add_valid_or_report(new, prev, next);
+ return ret;
+}
+
+/*
+ * Performs the full set of list corruption checks before __list_del_entry().
+ * On list corruption, it reports a warning and returns false.
+ */
+extern bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry);
+
+/*
+ * Performs list corruption checks before __list_del_entry(). Returns false if a
+ * corruption is detected, true otherwise.
+ *
+ * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking
+ * inline to catch non-faulting corruptions, and only if a corruption is
+ * detected calls the reporting function __list_del_entry_valid_or_report().
+ */
+static __always_inline bool __list_del_entry_valid(struct list_head *entry)
+{
+ bool ret = true;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_LIST)) {
+ struct list_head *prev = entry->prev;
+ struct list_head *next = entry->next;
+
+ /*
+ * With the hardening version, elide checking if next and prev
+ * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate
+ * dereference of them below would result in a fault.
+ */
+ if (likely(prev->next == entry && next->prev == entry))
+ return true;
+ ret = false;
+ }
+
+ ret &= __list_del_entry_valid_or_report(entry);
+ return ret;
+}
#else
static inline bool __list_add_valid(struct list_head *new,
struct list_head *prev,
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 7308a1a7599b..af796986baee 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -54,6 +54,7 @@ LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, struct file *f
LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm)
LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm)
+LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference)
LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
struct fs_context *src_sc)
LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 406ab9ea818f..34f9dba17c1a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3421,15 +3421,24 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
* Indicates whether GUP can follow a PROT_NONE mapped page, or whether
* a (NUMA hinting) fault is required.
*/
-static inline bool gup_can_follow_protnone(unsigned int flags)
+static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
+ unsigned int flags)
{
/*
- * FOLL_FORCE has to be able to make progress even if the VMA is
- * inaccessible. Further, FOLL_FORCE access usually does not represent
- * application behaviour and we should avoid triggering NUMA hinting
- * faults.
+ * If callers don't want to honor NUMA hinting faults, no need to
+ * determine if we would actually have to trigger a NUMA hinting fault.
*/
- return flags & FOLL_FORCE;
+ if (!(flags & FOLL_HONOR_NUMA_FAULT))
+ return true;
+
+ /*
+ * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs.
+ *
+ * Requiring a fault here even for inaccessible VMAs would mean that
+ * FOLL_FORCE cannot make any progress, because handle_mm_fault()
+ * refuses to process NUMA hinting faults in inaccessible VMAs.
+ */
+ return !vma_is_accessible(vma);
}
typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5e74ce4a28cd..7d30dc4ff0ff 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1286,6 +1286,15 @@ enum {
FOLL_PCI_P2PDMA = 1 << 10,
/* allow interrupts from generic signals */
FOLL_INTERRUPTIBLE = 1 << 11,
+ /*
+ * Always honor (trigger) NUMA hinting faults.
+ *
+ * FOLL_WRITE implicitly honors NUMA hinting faults because a
+ * PROT_NONE-mapped page is not writable (exceptions with FOLL_FORCE
+ * apply). get_user_pages_fast_only() always implicitly honors NUMA
+ * hinting faults.
+ */
+ FOLL_HONOR_NUMA_FAULT = 1 << 12,
/* See also internal only FOLL flags in mm/internal.h */
};
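/*
 * Sketch: a GUP caller that wants NUMA hinting faults honored must now
 * opt in explicitly (flag combination only; the surrounding call site is
 * assumed):
 */
const unsigned int gup_flags = FOLL_WRITE | FOLL_HONOR_NUMA_FAULT;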
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 86544707236a..45702bdcbceb 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -73,9 +73,7 @@ struct raw_notifier_head {
struct srcu_notifier_head {
struct mutex mutex;
-#ifdef CONFIG_TREE_SRCU
struct srcu_usage srcuu;
-#endif
struct srcu_struct srcu;
struct notifier_block __rcu *head;
};
@@ -106,7 +104,6 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
#define RAW_NOTIFIER_INIT(name) { \
.head = NULL }
-#ifdef CONFIG_TREE_SRCU
#define SRCU_NOTIFIER_INIT(name, pcpu) \
{ \
.mutex = __MUTEX_INITIALIZER(name.mutex), \
@@ -114,14 +111,6 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
.srcuu = __SRCU_USAGE_INIT(name.srcuu), \
.srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \
}
-#else
-#define SRCU_NOTIFIER_INIT(name, pcpu) \
- { \
- .mutex = __MUTEX_INITIALIZER(name.mutex), \
- .head = NULL, \
- .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \
- }
-#endif
#define ATOMIC_NOTIFIER_HEAD(name) \
struct atomic_notifier_head name = \
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index fee881cded01..771cb0285872 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -29,7 +29,7 @@ struct fs_struct;
* nsproxy is copied.
*/
struct nsproxy {
- atomic_t count;
+ refcount_t count;
struct uts_namespace *uts_ns;
struct ipc_namespace *ipc_ns;
struct mnt_namespace *mnt_ns;
@@ -102,14 +102,13 @@ int __init nsproxy_cache_init(void);
static inline void put_nsproxy(struct nsproxy *ns)
{
- if (atomic_dec_and_test(&ns->count)) {
+ if (refcount_dec_and_test(&ns->count))
free_nsproxy(ns);
- }
}
static inline void get_nsproxy(struct nsproxy *ns)
{
- atomic_inc(&ns->count);
+ refcount_inc(&ns->count);
}
#endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 716953ee1ebd..d87840acbfb2 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -470,6 +470,19 @@ static inline void *detach_page_private(struct page *page)
return folio_detach_private(page_folio(page));
}
+/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size. I'm not aware of any
+ * assumptions about the maximum order if THP is disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages).
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER 8
+#endif
+
#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
#else
@@ -501,22 +514,69 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t page_cache_prev_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan);
-#define FGP_ACCESSED 0x00000001
-#define FGP_LOCK 0x00000002
-#define FGP_CREAT 0x00000004
-#define FGP_WRITE 0x00000008
-#define FGP_NOFS 0x00000010
-#define FGP_NOWAIT 0x00000020
-#define FGP_FOR_MMAP 0x00000040
-#define FGP_STABLE 0x00000080
+/**
+ * typedef fgf_t - Flags for getting folios from the page cache.
+ *
+ * Most users of the page cache will not need to use these flags;
+ * there are convenience functions such as filemap_get_folio() and
+ * filemap_lock_folio(). For users which need more control over exactly
+ * what is done with the folios, these flags to __filemap_get_folio()
+ * are available.
+ *
+ * * %FGP_ACCESSED - The folio will be marked accessed.
+ * * %FGP_LOCK - The folio is returned locked.
+ * * %FGP_CREAT - If no folio is present then a new folio is allocated,
+ * added to the page cache and the VM's LRU list. The folio is
+ * returned locked.
+ * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
+ * folio is already in cache. If the folio was allocated, unlock it
+ * before returning so the caller can do the same dance.
+ * * %FGP_WRITE - The folio will be written to by the caller.
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
+ * * %FGP_NOWAIT - Don't block on the folio lock.
+ * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
+ * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin()
+ * implementation.
+ */
+typedef unsigned int __bitwise fgf_t;
+
+#define FGP_ACCESSED ((__force fgf_t)0x00000001)
+#define FGP_LOCK ((__force fgf_t)0x00000002)
+#define FGP_CREAT ((__force fgf_t)0x00000004)
+#define FGP_WRITE ((__force fgf_t)0x00000008)
+#define FGP_NOFS ((__force fgf_t)0x00000010)
+#define FGP_NOWAIT ((__force fgf_t)0x00000020)
+#define FGP_FOR_MMAP ((__force fgf_t)0x00000040)
+#define FGP_STABLE ((__force fgf_t)0x00000080)
+#define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */
#define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
+/**
+ * fgf_set_order - Encode a length in the fgf_t flags.
+ * @size: The suggested size of the folio to create.
+ *
+ * The caller of __filemap_get_folio() can use this to suggest a preferred
+ * size for the folio that is created. If there is already a folio at
+ * the index, it will be returned, no matter what its size. If a folio
+ * is freshly created, it may be of a different size than requested
+ * due to alignment constraints, memory pressure, or the presence of
+ * other folios at nearby indices.
+ */
+static inline fgf_t fgf_set_order(size_t size)
+{
+ unsigned int shift = ilog2(size);
+
+ if (shift <= PAGE_SHIFT)
+ return 0;
+ return (__force fgf_t)((shift - PAGE_SHIFT) << 26);
+}
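/*
 * Worked example, assuming PAGE_SHIFT == 12 (4KiB pages): requesting a
 * 64KiB folio gives ilog2(SZ_64K) == 16, so fgf_set_order() encodes
 * (16 - 12) << 26, and FGF_GET_ORDER() recovers 4, i.e. an order-4
 * (16-page) folio:
 */
folio = __filemap_get_folio(mapping, index,
                FGP_CREAT | FGP_WRITE | fgf_set_order(SZ_64K),
                mapping_gfp_mask(mapping));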
+
void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp);
+ fgf_t fgp_flags, gfp_t gfp);
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp);
+ fgf_t fgp_flags, gfp_t gfp);
/**
* filemap_get_folio - Find and get a folio.
@@ -590,7 +650,7 @@ static inline struct page *find_get_page(struct address_space *mapping,
}
static inline struct page *find_get_page_flags(struct address_space *mapping,
- pgoff_t offset, int fgp_flags)
+ pgoff_t offset, fgf_t fgp_flags)
{
return pagecache_get_page(mapping, offset, fgp_flags, 0);
}
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 27a6df448ee5..27cd1e59ccf7 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -6,6 +6,16 @@
struct mm_walk;
+/* Locking requirement during a page walk. */
+enum page_walk_lock {
+ /* mmap_lock should be locked for read to stabilize the vma tree */
+ PGWALK_RDLOCK = 0,
+ /* vma will be write-locked during the walk */
+ PGWALK_WRLOCK = 1,
+ /* vma is expected to be already write-locked during the walk */
+ PGWALK_WRLOCK_VERIFY = 2,
+};
+
/**
* struct mm_walk_ops - callbacks for walk_page_range
* @pgd_entry: if set, called for each non-empty PGD (top-level) entry
@@ -66,6 +76,7 @@ struct mm_walk_ops {
int (*pre_vma)(unsigned long start, unsigned long end,
struct mm_walk *walk);
void (*post_vma)(struct mm_walk *walk);
+ enum page_walk_lock walk_lock;
};
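/*
 * Sketch: a walker now declares its locking expectation alongside its
 * callbacks (the pmd_entry callback name is hypothetical):
 */
static const struct mm_walk_ops example_walk_ops = {
        .pmd_entry      = example_pmd_entry,
        .walk_lock      = PGWALK_RDLOCK,        /* mmap_lock held for read */
};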
/*
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 2dc75df1437f..8f9a459e1671 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -576,6 +576,8 @@
#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F3 0x14e3
#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F3 0x14f3
#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F3 0x12fb
+#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3
+#define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb
#define PCI_DEVICE_ID_AMD_MI200_DF_F3 0x14d3
#define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703
#define PCI_DEVICE_ID_AMD_LANCE 0x2000
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index a0801f68762b..143fbc10ecfe 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -187,5 +187,6 @@ void armpmu_free_irq(int irq, int cpu);
#endif /* CONFIG_ARM_PMU */
#define ARMV8_SPE_PDEV_NAME "arm,spe-v1"
+#define ARMV8_TRBE_PDEV_NAME "arm,trbe"
#endif /* __ARM_PMU_H__ */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2166a69e3bf2..05253af70ce9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -288,10 +288,9 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_EXTENDED_REGS 0x0008
#define PERF_PMU_CAP_EXCLUSIVE 0x0010
#define PERF_PMU_CAP_ITRACE 0x0020
-#define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x0040
-#define PERF_PMU_CAP_NO_EXCLUDE 0x0080
-#define PERF_PMU_CAP_AUX_OUTPUT 0x0100
-#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0200
+#define PERF_PMU_CAP_NO_EXCLUDE 0x0040
+#define PERF_PMU_CAP_AUX_OUTPUT 0x0080
+#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
struct perf_output_handle;
@@ -1194,7 +1193,8 @@ struct perf_sample_data {
PERF_MEM_S(LVL, NA) |\
PERF_MEM_S(SNOOP, NA) |\
PERF_MEM_S(LOCK, NA) |\
- PERF_MEM_S(TLB, NA))
+ PERF_MEM_S(TLB, NA) |\
+ PERF_MEM_S(LVLNUM, NA))
static inline void perf_sample_data_init(struct perf_sample_data *data,
u64 addr, u64 period)
@@ -1316,15 +1316,31 @@ extern int perf_event_output(struct perf_event *event,
struct pt_regs *regs);
static inline bool
-is_default_overflow_handler(struct perf_event *event)
+__is_default_overflow_handler(perf_overflow_handler_t overflow_handler)
{
- if (likely(event->overflow_handler == perf_event_output_forward))
+ if (likely(overflow_handler == perf_event_output_forward))
return true;
- if (unlikely(event->overflow_handler == perf_event_output_backward))
+ if (unlikely(overflow_handler == perf_event_output_backward))
return true;
return false;
}
+#define is_default_overflow_handler(event) \
+ __is_default_overflow_handler((event)->overflow_handler)
+
+#ifdef CONFIG_BPF_SYSCALL
+static inline bool uses_default_overflow_handler(struct perf_event *event)
+{
+ if (likely(is_default_overflow_handler(event)))
+ return true;
+
+ return __is_default_overflow_handler(event->orig_overflow_handler);
+}
+#else
+#define uses_default_overflow_handler(event) \
+ is_default_overflow_handler(event)
+#endif
+
extern void
perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
@@ -1860,10 +1876,6 @@ extern void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg,
u64 now);
-#ifdef CONFIG_MMU
-extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr);
-#endif
-
/*
* Snapshot branch stack on software events.
*
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 02e0086b10f6..608a9eb86bff 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -269,10 +269,10 @@ bool pipe_is_unprivileged_user(void);
/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots);
-long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
+long pipe_fcntl(struct file *, unsigned int, unsigned int arg);
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice);
int create_pipe_files(struct file **, int);
-unsigned int round_pipe_size(unsigned long size);
+unsigned int round_pipe_size(unsigned int size);
#endif
diff --git a/include/linux/raid_class.h b/include/linux/raid_class.h
index 6a9b177d5c41..e50416ba9cd9 100644
--- a/include/linux/raid_class.h
+++ b/include/linux/raid_class.h
@@ -77,7 +77,3 @@ DEFINE_RAID_ATTRIBUTE(enum raid_state, state)
struct raid_template *raid_class_attach(struct raid_function_template *);
void raid_class_release(struct raid_template *);
-
-int __must_check raid_component_add(struct raid_template *, struct device *,
- struct device *);
-
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 7ee7ed5de722..6dbc5a1bf6a8 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node,
rb_insert_augmented(node, &root->rb_root, augment);
}
+static __always_inline struct rb_node *
+rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
+ bool (*less)(struct rb_node *, const struct rb_node *),
+ const struct rb_augment_callbacks *augment)
+{
+ struct rb_node **link = &tree->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ bool leftmost = true;
+
+ while (*link) {
+ parent = *link;
+ if (less(node, parent)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(node, parent, link);
+ augment->propagate(parent, NULL); /* suboptimal */
+ rb_insert_augmented_cached(node, tree, leftmost, augment);
+
+ return leftmost ? node : NULL;
+}
+
/*
* Template for declaring augmented rbtree callbacks (generic case)
*
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index ba4c00dd8005..89186c499dd4 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -101,7 +101,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
{
struct hlist_nulls_node *first = h->first;
- n->next = first;
+ WRITE_ONCE(n->next, first);
WRITE_ONCE(n->pprev, &h->first);
rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
if (!is_a_nulls(first))
@@ -137,7 +137,7 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
last = i;
if (last) {
- n->next = last->next;
+ WRITE_ONCE(n->next, last->next);
n->pprev = &last->next;
rcu_assign_pointer(hlist_nulls_next_rcu(last), n);
} else {
diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h
index 9bc8cbb33340..eda493200663 100644
--- a/include/linux/rcupdate_trace.h
+++ b/include/linux/rcupdate_trace.h
@@ -87,6 +87,7 @@ static inline void rcu_read_unlock_trace(void)
void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
void synchronize_rcu_tasks_trace(void);
void rcu_barrier_tasks_trace(void);
+struct task_struct *get_rcu_tasks_trace_gp_kthread(void);
#else
/*
* The BPF JIT forms these addresses even when it doesn't call these
diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
index 699b938358bf..5e0f74f2f8ca 100644
--- a/include/linux/rcupdate_wait.h
+++ b/include/linux/rcupdate_wait.h
@@ -42,6 +42,11 @@ do { \
* call_srcu() function, with this wrapper supplying the pointer to the
* corresponding srcu_struct.
*
+ * Note that call_rcu_hurry() should be used instead of call_rcu()
+ * because in kernels built with CONFIG_RCU_LAZY=y the delay between the
+ * invocation of call_rcu() and that of the corresponding RCU callback
+ * can be multiple seconds.
+ *
* The first argument tells Tiny RCU's _wait_rcu_gp() not to
* bother waiting for RCU. The reason for this is because anywhere
* synchronize_rcu_mult() can be called is automatically already a full
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 609bde814cb0..177b3f3676ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -75,14 +75,14 @@ struct user_event_mm;
* Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state().
*
- * We have two separate sets of flags: task->state
+ * We have two separate sets of flags: task->__state
* is about runnability, while task->exit_state are
* about the task exiting. Confusing, but this way
* modifying one set can't modify the other one by
* mistake.
*/
-/* Used in tsk->state: */
+/* Used in tsk->__state: */
#define TASK_RUNNING 0x00000000
#define TASK_INTERRUPTIBLE 0x00000001
#define TASK_UNINTERRUPTIBLE 0x00000002
@@ -92,7 +92,7 @@ struct user_event_mm;
#define EXIT_DEAD 0x00000010
#define EXIT_ZOMBIE 0x00000020
#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
-/* Used in tsk->state again: */
+/* Used in tsk->__state again: */
#define TASK_PARKED 0x00000040
#define TASK_DEAD 0x00000080
#define TASK_WAKEKILL 0x00000100
@@ -173,7 +173,7 @@ struct user_event_mm;
#endif
/*
- * set_current_state() includes a barrier so that the write of current->state
+ * set_current_state() includes a barrier so that the write of current->__state
* is correctly serialised wrt the caller's subsequent test of whether to
* actually sleep:
*
@@ -196,9 +196,9 @@ struct user_event_mm;
* wake_up_state(p, TASK_UNINTERRUPTIBLE);
*
* where wake_up_state()/try_to_wake_up() executes a full memory barrier before
- * accessing p->state.
+ * accessing p->__state.
*
- * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
+ * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is,
* once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
* TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
*
@@ -549,13 +549,18 @@ struct sched_entity {
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
+ u64 deadline;
+ u64 min_deadline;
+
struct list_head group_node;
unsigned int on_rq;
u64 exec_start;
u64 sum_exec_runtime;
- u64 vruntime;
u64 prev_sum_exec_runtime;
+ u64 vruntime;
+ s64 vlag;
+ u64 slice;
u64 nr_migrations;
@@ -2433,9 +2438,11 @@ extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
unsigned long uaddr);
+extern int sched_core_idle_cpu(int cpu);
#else
static inline void sched_core_free(struct task_struct *tsk) { }
static inline void sched_core_fork(struct task_struct *p) { }
+static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
#endif
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index dd35ce28bb90..a23af225c898 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -118,11 +118,47 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
}
extern void __put_task_struct(struct task_struct *t);
+extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);
static inline void put_task_struct(struct task_struct *t)
{
- if (refcount_dec_and_test(&t->usage))
+ if (!refcount_dec_and_test(&t->usage))
+ return;
+
+ /*
+ * In !RT, it is always safe to call __put_task_struct().
+ * Under RT, we can only call it in preemptible context.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+ static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP);
+
+ lock_map_acquire_try(&put_task_map);
__put_task_struct(t);
+ lock_map_release(&put_task_map);
+ return;
+ }
+
+ /*
+ * under PREEMPT_RT, we can't call put_task_struct
+ * in atomic context because it will indirectly
+ * acquire sleeping locks.
+ *
+ * call_rcu() will schedule delayed_put_task_struct_rcu()
+ * to be called in process context.
+ *
+ * __put_task_struct() is called when
+ * refcount_dec_and_test(&t->usage) succeeds.
+ *
+ * This means that it can't "conflict" with
+ * put_task_struct_rcu_user() which abuses ->rcu the same
+ * way; rcu_users has a reference so task->usage can't be
+ * zero after rcu_users 1 -> 0 transition.
+ *
+ * delayed_free_task() also uses ->rcu, but it is only called
+ * when it fails to fork a process. Therefore, there is no
+ * way it can conflict with put_task_struct().
+ */
+ call_rcu(&t->rcu, __put_task_struct_rcu_cb);
}
DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T))
diff --git a/include/linux/security.h b/include/linux/security.h
index 32828502f09e..bac98ea18f78 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -293,6 +293,7 @@ int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file);
int security_bprm_check(struct linux_binprm *bprm);
void security_bprm_committing_creds(struct linux_binprm *bprm);
void security_bprm_committed_creds(struct linux_binprm *bprm);
+int security_fs_context_submount(struct fs_context *fc, struct super_block *reference);
int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc);
int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param);
int security_sb_alloc(struct super_block *sb);
@@ -629,6 +630,11 @@ static inline void security_bprm_committed_creds(struct linux_binprm *bprm)
{
}
+static inline int security_fs_context_submount(struct fs_context *fc,
+ struct super_block *reference)
+{
+ return 0;
+}
static inline int security_fs_context_dup(struct fs_context *fc,
struct fs_context *src_fc)
{
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index bd023dd38ae6..386ab580b839 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -249,18 +249,19 @@ static inline void seq_show_option(struct seq_file *m, const char *name,
/**
* seq_show_option_n - display mount options with appropriate escapes
- * where @value must be a specific length.
+ * where @value must be a specific length (i.e.
+ * not NUL-terminated).
* @m: the seq_file handle
* @name: the mount option name
* @value: the mount option name's value, cannot be NULL
- * @length: the length of @value to display
+ * @length: the exact length of @value to display, must be a constant expression
*
* This is a macro since this uses "length" to define the size of the
* stack buffer.
*/
#define seq_show_option_n(m, name, value, length) { \
char val_buf[length + 1]; \
- strncpy(val_buf, value, length); \
+ memcpy(val_buf, value, length); \
val_buf[length] = '\0'; \
seq_show_option(m, name, val_buf); \
}
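/*
 * Sketch: emitting a fixed-width, possibly non-NUL-terminated value
 * ("label" and the 8-byte field are hypothetical):
 */
seq_show_option_n(m, "label", opts->label, 8);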
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 9029abd29b1c..6b0c626620f5 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -13,6 +13,10 @@
/* inode in-kernel data */
+#ifdef CONFIG_TMPFS_QUOTA
+#define SHMEM_MAXQUOTAS 2
+#endif
+
struct shmem_inode_info {
spinlock_t lock;
unsigned int seals; /* shmem seals */
@@ -27,6 +31,10 @@ struct shmem_inode_info {
atomic_t stop_eviction; /* hold when working on inode */
struct timespec64 i_crtime; /* file creation time */
unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */
+#ifdef CONFIG_TMPFS_QUOTA
+ struct dquot *i_dquot[MAXQUOTAS];
+#endif
+ struct offset_ctx dir_offsets; /* stable entry offsets */
struct inode vfs_inode;
};
@@ -35,11 +43,18 @@ struct shmem_inode_info {
(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL)
#define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL)
+struct shmem_quota_limits {
+ qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */
+ qsize_t usrquota_ihardlimit; /* Default user quota inode hard limit */
+ qsize_t grpquota_bhardlimit; /* Default group quota block hard limit */
+ qsize_t grpquota_ihardlimit; /* Default group quota inode hard limit */
+};
+
struct shmem_sb_info {
unsigned long max_blocks; /* How many blocks are allowed */
struct percpu_counter used_blocks; /* How many are allocated */
unsigned long max_inodes; /* How many inodes are allowed */
- unsigned long free_inodes; /* How many are left for allocation */
+ unsigned long free_ispace; /* How much ispace left for allocation */
raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */
umode_t mode; /* Mount mode for root directory */
unsigned char huge; /* Whether to try for hugepages */
@@ -53,6 +68,7 @@ struct shmem_sb_info {
spinlock_t shrinklist_lock; /* Protects shrinklist */
struct list_head shrinklist; /* List of shrinkable inodes */
unsigned long shrinklist_len; /* Length of shrinklist */
+ struct shmem_quota_limits qlimits; /* Default quota limits */
};
static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
@@ -172,4 +188,17 @@ extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
#endif /* CONFIG_SHMEM */
#endif /* CONFIG_USERFAULTFD */
+/*
+ * Used space is stored as unsigned 64-bit value in bytes but
+ * quota core supports only signed 64-bit values so use that
+ * as a limit
+ */
+#define SHMEM_QUOTA_MAX_SPC_LIMIT 0x7fffffffffffffffLL /* 2^63-1 */
+#define SHMEM_QUOTA_MAX_INO_LIMIT 0x7fffffffffffffffLL
+
+#ifdef CONFIG_TMPFS_QUOTA
+extern const struct dquot_operations shmem_quota_operations;
+extern struct quota_format_type shmem_quota_format;
+#endif /* CONFIG_TMPFS_QUOTA */
+
#endif
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index ebd72491af99..447133171d95 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -48,6 +48,10 @@ void srcu_drive_gp(struct work_struct *wp);
#define DEFINE_STATIC_SRCU(name) \
static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name)
+// Dummy structure for srcu_notifier_head.
+struct srcu_usage { };
+#define __SRCU_USAGE_INIT(name) { }
+
void synchronize_srcu(struct srcu_struct *ssp);
/*
diff --git a/include/linux/swait.h b/include/linux/swait.h
index 6a8c22b8c2a5..d324419482a0 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -146,7 +146,7 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
extern void swake_up_one(struct swait_queue_head *q);
extern void swake_up_all(struct swait_queue_head *q);
-extern void swake_up_locked(struct swait_queue_head *q);
+extern void swake_up_locked(struct swait_queue_head *q, int wake_flags);
extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state);
extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 03e3d0121d5e..c0cb22cd607d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -284,22 +284,6 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
#endif
/*
- * Called before coming back to user-mode. Returning to user-mode with an
- * address limit different than USER_DS can allow to overwrite kernel memory.
- */
-static inline void addr_limit_user_check(void)
-{
-#ifdef TIF_FSCHECK
- if (!test_thread_flag(TIF_FSCHECK))
- return;
-#endif
-
-#ifdef TIF_FSCHECK
- clear_thread_flag(TIF_FSCHECK);
-#endif
-}
-
-/*
* These syscall function prototypes are kept in the same order as
* include/uapi/asm-generic/unistd.h. Architecture specific entries go below,
* followed by deprecated or obsolete system calls.
@@ -438,8 +422,10 @@ asmlinkage long sys_chdir(const char __user *filename);
asmlinkage long sys_fchdir(unsigned int fd);
asmlinkage long sys_chroot(const char __user *filename);
asmlinkage long sys_fchmod(unsigned int fd, umode_t mode);
-asmlinkage long sys_fchmodat(int dfd, const char __user * filename,
+asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
umode_t mode);
+asmlinkage long sys_fchmodat2(int dfd, const char __user *filename,
+ umode_t mode, unsigned int flags);
asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
gid_t group, int flag);
asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index dee66ade89a0..b449a46766f5 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -81,11 +81,13 @@ struct thermal_zone_device_ops {
* @temperature: temperature value in miliCelsius
* @hysteresis: relative hysteresis in miliCelsius
* @type: trip point type
+ * @priv: pointer to driver data associated with this trip
*/
struct thermal_trip {
int temperature;
int hysteresis;
enum thermal_trip_type type;
+ void *priv;
};
struct thermal_cooling_device_ops {
@@ -287,6 +289,9 @@ int thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id,
int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id,
const struct thermal_trip *trip);
+int for_each_thermal_trip(struct thermal_zone_device *tz,
+ int (*cb)(struct thermal_trip *, void *),
+ void *data);
int thermal_zone_get_num_trips(struct thermal_zone_device *tz);
int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp);
@@ -323,6 +328,10 @@ int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int,
struct thermal_cooling_device *);
void thermal_zone_device_update(struct thermal_zone_device *,
enum thermal_notify_event);
+void thermal_zone_device_exec(struct thermal_zone_device *tz,
+ void (*cb)(struct thermal_zone_device *,
+ unsigned long),
+ unsigned long data);
struct thermal_cooling_device *thermal_cooling_device_register(const char *,
void *, const struct thermal_cooling_device_ops *);
diff --git a/include/linux/torture.h b/include/linux/torture.h
index 7038104463e4..bb466eec01e4 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -108,12 +108,15 @@ bool torture_must_stop(void);
bool torture_must_stop_irq(void);
void torture_kthread_stopping(char *title);
int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
- char *f, struct task_struct **tp);
+ char *f, struct task_struct **tp, void (*cbf)(struct task_struct *tp));
void _torture_stop_kthread(char *m, struct task_struct **tp);
#define torture_create_kthread(n, arg, tp) \
_torture_create_kthread(n, (arg), #n, "Creating " #n " task", \
- "Failed to create " #n, &(tp))
+ "Failed to create " #n, &(tp), NULL)
+#define torture_create_kthread_cb(n, arg, tp, cbf) \
+ _torture_create_kthread(n, (arg), #n, "Creating " #n " task", \
+ "Failed to create " #n, &(tp), cbf)
#define torture_stop_kthread(n, tp) \
_torture_stop_kthread("Stopping " #n " task", &(tp))
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 3930e676436c..1e8bbdb8da90 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -59,6 +59,17 @@ int trace_raw_output_prep(struct trace_iterator *iter,
extern __printf(2, 3)
void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...);
+/* Used to find the offset and length of dynamic fields in trace events */
+struct trace_dynamic_info {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ u16 offset;
+ u16 len;
+#else
+ u16 len;
+ u16 offset;
+#endif
+};
+
/*
* The trace entry - the most basic unit of tracing. This is what
* is printed in the end as a single line in the trace output, such as:
diff --git a/include/linux/uio.h b/include/linux/uio.h
index ff81e5ccaef2..42bce38a8e87 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -163,7 +163,7 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
return ret;
}
-size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
+size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
size_t bytes, struct iov_iter *i);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
@@ -184,6 +184,13 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
{
return copy_page_to_iter(&folio->page, offset, bytes, i);
}
+
+static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
+ size_t offset, size_t bytes, struct iov_iter *i)
+{
+ return copy_page_from_iter_atomic(&folio->page, offset, bytes, i);
+}
+
size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
size_t bytes, struct iov_iter *i);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a0307b516b09..5ec7739400f4 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -210,6 +210,7 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
}
int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
unsigned int mode, void *key, wait_queue_entry_t *bookmark);
@@ -237,6 +238,8 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head);
#define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
#define wake_up_poll(x, m) \
__wake_up(x, TASK_NORMAL, 1, poll_to_key(m))
+#define wake_up_poll_on_current_cpu(x, m) \
+ __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m))
#define wake_up_locked_poll(x, m) \
__wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m))
#define wake_up_interruptible_poll(x, m) \
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index fba937999fbf..083387c00f0c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -375,11 +375,6 @@ void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end);
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio);
-void folio_account_redirty(struct folio *folio);
-static inline void account_page_redirty(struct page *page)
-{
- folio_account_redirty(page_folio(page));
-}
bool folio_redirty_for_writepage(struct writeback_control *, struct folio *);
bool redirty_page_for_writepage(struct writeback_control *, struct page *);
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index d591ef59aa98..d20051865800 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -114,13 +114,15 @@ struct simple_xattr {
};
void simple_xattrs_init(struct simple_xattrs *xattrs);
-void simple_xattrs_free(struct simple_xattrs *xattrs);
+void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space);
+size_t simple_xattr_space(const char *name, size_t size);
struct simple_xattr *simple_xattr_alloc(const void *value, size_t size);
+void simple_xattr_free(struct simple_xattr *xattr);
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
void *buffer, size_t size);
-int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags,
- ssize_t *removed_size);
+struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs,
+ const char *name, const void *value,
+ size_t size, int flags);
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
char *buffer, size_t size);
void simple_xattr_add(struct simple_xattrs *xattrs,
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 30ac427cf0c6..5b8b1b644a2d 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -722,23 +722,14 @@ static inline struct slave *bond_slave_has_mac(struct bonding *bond,
}
/* Caller must hold rcu_read_lock() for read */
-static inline bool bond_slave_has_mac_rx(struct bonding *bond, const u8 *mac)
+static inline bool bond_slave_has_mac_rcu(struct bonding *bond, const u8 *mac)
{
struct list_head *iter;
struct slave *tmp;
- struct netdev_hw_addr *ha;
bond_for_each_slave_rcu(bond, tmp, iter)
if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr))
return true;
-
- if (netdev_uc_empty(bond->dev))
- return false;
-
- netdev_for_each_uc_addr(ha, bond->dev)
- if (ether_addr_equal_64bits(mac, ha->addr))
- return true;
-
return false;
}
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 0bb32bfc6183..491ceb7ebe5d 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -222,8 +222,8 @@ struct inet_sock {
__s16 uc_ttl;
__u16 cmsg_flags;
struct ip_options_rcu __rcu *inet_opt;
+ atomic_t inet_id;
__be16 inet_sport;
- __u16 inet_id;
__u8 tos;
__u8 min_ttl;
diff --git a/include/net/ip.h b/include/net/ip.h
index 332521170d9b..19adacd5ece0 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -538,8 +538,19 @@ static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb,
* generator as much as we can.
*/
if (sk && inet_sk(sk)->inet_daddr) {
- iph->id = htons(inet_sk(sk)->inet_id);
- inet_sk(sk)->inet_id += segs;
+ int val;
+
+ /* avoid atomic operations for TCP,
+ * as we hold socket lock at this point.
+ */
+ if (sk_is_tcp(sk)) {
+ sock_owned_by_me(sk);
+ val = atomic_read(&inet_sk(sk)->inet_id);
+ atomic_set(&inet_sk(sk)->inet_id, val + segs);
+ } else {
+ val = atomic_add_return(segs, &inet_sk(sk)->inet_id);
+ }
+ iph->id = htons(val);
return;
}
if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) {
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 3a8a2d2c58c3..2a55ae932c56 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -6612,6 +6612,7 @@ void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap,
* marks frames marked in the bitmap as having been filtered. Afterwards, it
* checks if any frames in the window starting from @ssn can now be released
* (in case they were only waiting for frames that were filtered.)
+ * (Only works correctly if @max_rx_aggregation_subframes <= 64.)
*/
void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid,
u16 ssn, u64 filtered,
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index e9ae567c037d..dd40c75011d2 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -587,6 +587,11 @@ static inline void *nft_set_priv(const struct nft_set *set)
return (void *)set->data;
}
+static inline bool nft_set_gc_is_pending(const struct nft_set *s)
+{
+ return refcount_read(&s->refs) != 1;
+}
+
static inline struct nft_set *nft_set_container_of(const void *priv)
{
return (void *)priv - offsetof(struct nft_set, data);
@@ -1729,6 +1734,7 @@ struct nftables_pernet {
u64 table_handle;
unsigned int base_seq;
unsigned int gc_seq;
+ u8 validate_state;
};
extern unsigned int nf_tables_net_id;
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index d9076a7a430c..6506221c5fe3 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -190,8 +190,8 @@ int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
u32 portid, const struct nlmsghdr *nlh);
-int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len,
- struct netlink_ext_ack *exterr);
+int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
+ struct netlink_ext_ack *exterr);
struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid);
#define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind)
diff --git a/include/net/sock.h b/include/net/sock.h
index e3d987b2ef12..690e22139543 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1323,6 +1323,7 @@ struct proto {
/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
+ * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes.
* All the __sk_mem_schedule() is of this nature: accounting
* is strict, actions are advisory and have some latency.
*/
@@ -1423,7 +1424,7 @@ static inline bool sk_has_memory_pressure(const struct sock *sk)
static inline bool sk_under_global_memory_pressure(const struct sock *sk)
{
return sk->sk_prot->memory_pressure &&
- !!*sk->sk_prot->memory_pressure;
+ !!READ_ONCE(*sk->sk_prot->memory_pressure);
}
static inline bool sk_under_memory_pressure(const struct sock *sk)
@@ -1435,7 +1436,7 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
mem_cgroup_under_socket_pressure(sk->sk_memcg))
return true;
- return !!*sk->sk_prot->memory_pressure;
+ return !!READ_ONCE(*sk->sk_prot->memory_pressure);
}
static inline long
@@ -1512,7 +1513,7 @@ proto_memory_pressure(struct proto *prot)
{
if (!prot->memory_pressure)
return false;
- return !!*prot->memory_pressure;
+ return !!READ_ONCE(*prot->memory_pressure);
}
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index a8206f5332e9..b2db2c2f1c57 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -38,7 +38,6 @@ struct find_free_extent_ctl;
__print_symbolic(type, \
{ BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \
{ BTRFS_EXTENT_DATA_REF_KEY, "EXTENT_DATA_REF" }, \
- { BTRFS_EXTENT_REF_V0_KEY, "EXTENT_REF_V0" }, \
{ BTRFS_SHARED_BLOCK_REF_KEY, "SHARED_BLOCK_REF" }, \
{ BTRFS_SHARED_DATA_REF_KEY, "SHARED_DATA_REF" })
@@ -2482,7 +2481,7 @@ DECLARE_EVENT_CLASS(btrfs_raid56_bio,
__entry->offset, __entry->opf, __entry->physical, __entry->len)
);
-DEFINE_EVENT(btrfs_raid56_bio, raid56_read_partial,
+DEFINE_EVENT(btrfs_raid56_bio, raid56_read,
TP_PROTO(const struct btrfs_raid_bio *rbio,
const struct bio *bio,
const struct raid56_bio_trace_info *trace_info),
@@ -2490,32 +2489,7 @@ DEFINE_EVENT(btrfs_raid56_bio, raid56_read_partial,
TP_ARGS(rbio, bio, trace_info)
);
-DEFINE_EVENT(btrfs_raid56_bio, raid56_write_stripe,
- TP_PROTO(const struct btrfs_raid_bio *rbio,
- const struct bio *bio,
- const struct raid56_bio_trace_info *trace_info),
-
- TP_ARGS(rbio, bio, trace_info)
-);
-
-
-DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_write_stripe,
- TP_PROTO(const struct btrfs_raid_bio *rbio,
- const struct bio *bio,
- const struct raid56_bio_trace_info *trace_info),
-
- TP_ARGS(rbio, bio, trace_info)
-);
-
-DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read,
- TP_PROTO(const struct btrfs_raid_bio *rbio,
- const struct bio *bio,
- const struct raid56_bio_trace_info *trace_info),
-
- TP_ARGS(rbio, bio, trace_info)
-);
-
-DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read_recover,
+DEFINE_EVENT(btrfs_raid56_bio, raid56_write,
TP_PROTO(const struct btrfs_raid_bio *rbio,
const struct bio *bio,
const struct raid56_bio_trace_info *trace_info),
diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h
index 71dbe8bfa7db..e18684b02c3d 100644
--- a/include/trace/events/erofs.h
+++ b/include/trace/events/erofs.h
@@ -80,11 +80,11 @@ TRACE_EVENT(erofs_fill_inode,
__entry->blkaddr, __entry->ofs)
);
-TRACE_EVENT(erofs_readpage,
+TRACE_EVENT(erofs_read_folio,
- TP_PROTO(struct page *page, bool raw),
+ TP_PROTO(struct folio *folio, bool raw),
- TP_ARGS(page, raw),
+ TP_ARGS(folio, raw),
TP_STRUCT__entry(
__field(dev_t, dev )
@@ -96,11 +96,11 @@ TRACE_EVENT(erofs_readpage,
),
TP_fast_assign(
- __entry->dev = page->mapping->host->i_sb->s_dev;
- __entry->nid = EROFS_I(page->mapping->host)->nid;
- __entry->dir = S_ISDIR(page->mapping->host->i_mode);
- __entry->index = page->index;
- __entry->uptodate = PageUptodate(page);
+ __entry->dev = folio->mapping->host->i_sb->s_dev;
+ __entry->nid = EROFS_I(folio->mapping->host)->nid;
+ __entry->dir = S_ISDIR(folio->mapping->host->i_mode);
+ __entry->index = folio->index;
+ __entry->uptodate = folio_test_uptodate(folio);
__entry->raw = raw;
),
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index fd6c1cb585db..abe087c53b4b 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -820,8 +820,11 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
#define __NR_cachestat 451
__SYSCALL(__NR_cachestat, sys_cachestat)
+#define __NR_fchmodat2 452
+__SYSCALL(__NR_fchmodat2, sys_fchmodat2)
+
#undef __NR_syscalls
-#define __NR_syscalls 452
+#define __NR_syscalls 453
/*
* 32 bit systems traditionally used different
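A userspace sketch of the new syscall, invoked via raw syscall(2) since libc wrappers may lag the kernel; the helper name and the example path are illustrative:

#include <fcntl.h>		/* AT_FDCWD, AT_SYMLINK_NOFOLLOW */
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_fchmodat2
#define __NR_fchmodat2 452	/* per the table above */
#endif

/* fchmodat2() takes the flags argument that plain fchmodat() lacks. */
static int example_fchmodat2(int dfd, const char *path, mode_t mode,
			     unsigned int flags)
{
	return syscall(__NR_fchmodat2, dfd, path, mode, flags);
}

/* e.g.: example_fchmodat2(AT_FDCWD, "some-path", 0600, AT_SYMLINK_NOFOLLOW); */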
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index ab38d0f411fa..fc3c32186d7e 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -220,7 +220,11 @@
#define BTRFS_EXTENT_DATA_REF_KEY 178
-#define BTRFS_EXTENT_REF_V0_KEY 180
+/*
+ * Obsolete key. Definition removed in 6.6; the value may be reused in the future.
+ *
+ * #define BTRFS_EXTENT_REF_V0_KEY 180
+ */
#define BTRFS_SHARED_BLOCK_REF_KEY 182
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 0c8cf359ea5b..e0e159138331 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -443,7 +443,6 @@ typedef struct elf64_shdr {
#define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */
#define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */
#define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */
-#define NT_RISCV_VECTOR 0x900 /* RISC-V vector registers */
#define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */
#define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */
#define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 8eb0d7b758d2..bb242fdcfe6b 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -100,8 +100,9 @@ enum fsconfig_command {
FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */
FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */
FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */
- FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */
+ FSCONFIG_CMD_CREATE = 6, /* Create new or reuse existing superblock */
FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */
+ FSCONFIG_CMD_CREATE_EXCL = 8, /* Create new superblock, fail if reusing existing superblock */
};
/*
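A userspace sketch of the exclusive-create command; helper name illustrative, and it assumes a libc that exposes SYS_fsconfig:

#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Ask for a brand-new superblock; per the comment above, the call is
 * expected to fail rather than silently share an existing superblock.
 */
static int example_fsconfig_create_excl(int fs_fd)
{
	return syscall(SYS_fsconfig, fs_fd, FSCONFIG_CMD_CREATE_EXCL,
		       NULL, NULL, 0);
}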
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 37675437b768..39c6a250dd1b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1339,7 +1339,8 @@ union perf_mem_data_src {
#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */
#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */
#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */
-/* 5-0x8 available */
+/* 5-0x7 available */
+#define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */
#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */
#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */
#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */
diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h
index f17c9636a859..52090105b828 100644
--- a/include/uapi/linux/quota.h
+++ b/include/uapi/linux/quota.h
@@ -77,6 +77,7 @@
#define QFMT_VFS_V0 2
#define QFMT_OCFS2 3
#define QFMT_VFS_V1 4
+#define QFMT_SHMEM 5
/* Size of block in which space limits are passed through the quota
* interface */
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 0fdc6ef02b94..dbfc9b37fcae 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -115,6 +115,8 @@ struct seccomp_notif_resp {
__u32 flags;
};
+#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
+
/* valid flags for seccomp_notif_addfd */
#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */
#define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */
@@ -150,4 +152,6 @@ struct seccomp_notif_addfd {
#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \
struct seccomp_notif_addfd)
+#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64)
+
#endif /* _UAPI_LINUX_SECCOMP_H */
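A sketch of the new notifier ioctl from the supervisor side; it assumes the flag word is passed by value as the ioctl argument, matching the __u64 encoding above:

#include <linux/seccomp.h>
#include <sys/ioctl.h>

/* Opt the notification fd in to synchronous wakeups, hinting the
 * scheduler to hand off to the supervisor on the same CPU.
 * (Assumption: the flags are the direct ioctl argument, not a pointer.)
 */
static int example_enable_sync_wakeup(int notify_fd)
{
	return ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
		     SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP);
}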
diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h
index 7837ba4fe728..7c3fc3980881 100644
--- a/include/uapi/linux/stddef.h
+++ b/include/uapi/linux/stddef.h
@@ -45,3 +45,7 @@
TYPE NAME[]; \
}
#endif
+
+#ifndef __counted_by
+#define __counted_by(m)
+#endif
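Usage sketch for the fallback macro above: where the compiler supports the attribute, the annotation ties a flexible array to its length member for bounds checking; the struct here is illustrative only:

#include <linux/stddef.h>
#include <linux/types.h>

struct example_records {
	__u32 count;				/* number of entries in rec[] */
	__u64 rec[] __counted_by(count);	/* expands to nothing on older compilers */
};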
diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
index d2029556083e..375718ba4ab6 100644
--- a/include/uapi/xen/privcmd.h
+++ b/include/uapi/xen/privcmd.h
@@ -98,6 +98,18 @@ struct privcmd_mmap_resource {
__u64 addr;
};
+/* For privcmd_irqfd::flags */
+#define PRIVCMD_IRQFD_FLAG_DEASSIGN (1 << 0)
+
+struct privcmd_irqfd {
+ void __user *dm_op;
+ __u32 size; /* Size of structure pointed by dm_op */
+ __u32 fd;
+ __u32 flags;
+ domid_t dom;
+ __u8 pad[2];
+};
+
/*
* @cmd: IOCTL_PRIVCMD_HYPERCALL
* @arg: &privcmd_hypercall_t
@@ -125,5 +137,7 @@ struct privcmd_mmap_resource {
_IOC(_IOC_NONE, 'P', 6, sizeof(domid_t))
#define IOCTL_PRIVCMD_MMAP_RESOURCE \
_IOC(_IOC_NONE, 'P', 7, sizeof(struct privcmd_mmap_resource))
+#define IOCTL_PRIVCMD_IRQFD \
+ _IOC(_IOC_NONE, 'P', 8, sizeof(struct privcmd_irqfd))
#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
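A userspace sketch of binding an eventfd through the new ioctl; field usage is inferred from the struct above, and the helper, include path, and semantics described in the comments are assumptions:

#include <sys/ioctl.h>
#include <xen/privcmd.h>

/* Assumed behaviour: when event_fd is signalled, the kernel executes
 * the dm_op buffer on the caller's behalf; PRIVCMD_IRQFD_FLAG_DEASSIGN
 * unbinds the pairing again.
 */
static int example_bind_irqfd(int privcmd_fd, int event_fd,
			      void *dm_op, __u32 size, domid_t dom)
{
	struct privcmd_irqfd irqfd = {
		.dm_op	= dm_op,
		.size	= size,
		.fd	= event_fd,
		.flags	= 0,
		.dom	= dom,
	};

	return ioctl(privcmd_fd, IOCTL_PRIVCMD_IRQFD, &irqfd);
}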
diff --git a/include/xen/events.h b/include/xen/events.h
index 95970a2f7695..95d5e28de324 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -75,7 +75,6 @@ void evtchn_put(evtchn_port_t evtchn);
void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector);
void rebind_evtchn_irq(evtchn_port_t evtchn, int irq);
-int xen_set_affinity_evtchn(struct irq_desc *desc, unsigned int tcpu);
static inline void notify_remote_via_evtchn(evtchn_port_t port)
{
diff --git a/init/Kconfig b/init/Kconfig
index f7f65af4ee12..5e7d4885d1bf 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -629,6 +629,7 @@ config TASK_IO_ACCOUNTING
config PSI
bool "Pressure stall information tracking"
+ select KERNFS
help
Collect metrics that indicate how overcommitted the CPU, memory,
and IO capacity are in the system.
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 1aa015883519..5dfd30b13f48 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/ramfs.h>
#include <linux/shmem_fs.h>
+#include <linux/ktime.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_fs_sb.h>
@@ -71,12 +72,37 @@ static int __init rootwait_setup(char *str)
{
if (*str)
return 0;
- root_wait = 1;
+ root_wait = -1;
return 1;
}
__setup("rootwait", rootwait_setup);
+static int __init rootwait_timeout_setup(char *str)
+{
+ int sec;
+
+ if (kstrtoint(str, 0, &sec) || sec < 0) {
+ pr_warn("ignoring invalid rootwait value\n");
+ goto ignore;
+ }
+
+ if (check_mul_overflow(sec, MSEC_PER_SEC, &root_wait)) {
+ pr_warn("ignoring excessive rootwait value\n");
+ goto ignore;
+ }
+
+ return 1;
+
+ignore:
+ /* Fallback to indefinite wait */
+ root_wait = -1;
+
+ return 1;
+}
+
+__setup("rootwait=", rootwait_timeout_setup);
+
static char * __initdata root_mount_data;
static int __init root_data_setup(char *str)
{
@@ -384,14 +410,22 @@ void __init mount_root(char *root_device_name)
/* wait for any asynchronous scanning to complete */
static void __init wait_for_root(char *root_device_name)
{
+ ktime_t end;
+
if (ROOT_DEV != 0)
return;
pr_info("Waiting for root device %s...\n", root_device_name);
+ end = ktime_add_ms(ktime_get_raw(), root_wait);
+
while (!driver_probe_done() ||
- early_lookup_bdev(root_device_name, &ROOT_DEV) < 0)
+ early_lookup_bdev(root_device_name, &ROOT_DEV) < 0) {
msleep(5);
+ if (root_wait > 0 && ktime_after(ktime_get_raw(), end))
+ break;
+ }
+
async_synchronize_full();
}
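Usage note: with this change, booting with e.g. root=/dev/mmcblk1p2 rootwait=10 polls for the root device for at most 10 seconds (root_wait holds milliseconds internally), while a bare rootwait, an invalid value, or an overflowing value all fall back to the previous indefinite wait (root_wait == -1).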
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1bce2208b65c..b3435033fadf 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -105,6 +105,7 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
} else {
rw->kiocb.ki_ioprio = get_current_ioprio();
}
+ rw->kiocb.dio_complete = NULL;
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
@@ -220,17 +221,12 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
}
#endif
-static void kiocb_end_write(struct io_kiocb *req)
+static void io_req_end_write(struct io_kiocb *req)
{
- /*
- * Tell lockdep we inherited freeze protection from submission
- * thread.
- */
if (req->flags & REQ_F_ISREG) {
- struct super_block *sb = file_inode(req->file)->i_sb;
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
- __sb_writers_acquired(sb, SB_FREEZE_WRITE);
- sb_end_write(sb);
+ kiocb_end_write(&rw->kiocb);
}
}
@@ -243,7 +239,7 @@ static void io_req_io_end(struct io_kiocb *req)
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
if (rw->kiocb.ki_flags & IOCB_WRITE) {
- kiocb_end_write(req);
+ io_req_end_write(req);
fsnotify_modify(req->file);
} else {
fsnotify_access(req->file);
@@ -285,6 +281,15 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct kiocb *kiocb = &rw->kiocb;
+
+ if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
+ long res = kiocb->dio_complete(rw->kiocb.private);
+
+ io_req_set_res(req, io_fixup_rw_res(req, res), 0);
+ }
+
io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
@@ -300,9 +305,11 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
struct io_kiocb *req = cmd_to_io_kiocb(rw);
- if (__io_complete_rw_common(req, res))
- return;
- io_req_set_res(req, io_fixup_rw_res(req, res), 0);
+ if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
+ if (__io_complete_rw_common(req, res))
+ return;
+ io_req_set_res(req, io_fixup_rw_res(req, res), 0);
+ }
req->io_task_work.func = io_req_rw_complete;
__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}
@@ -313,7 +320,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
struct io_kiocb *req = cmd_to_io_kiocb(rw);
if (kiocb->ki_flags & IOCB_WRITE)
- kiocb_end_write(req);
+ io_req_end_write(req);
if (unlikely(res != req->cqe.res)) {
if (res == -EAGAIN && io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
@@ -902,19 +909,18 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
+ if (req->flags & REQ_F_ISREG)
+ kiocb_start_write(kiocb);
+ kiocb->ki_flags |= IOCB_WRITE;
+
/*
- * Open-code file_start_write here to grab freeze protection,
- * which will be released by another thread in
- * io_complete_rw(). Fool lockdep by telling it the lock got
- * released so that it doesn't complain about the held lock when
- * we return to userspace.
+ * For non-polled IO, set IOCB_DIO_CALLER_COMP, stating that our handler
+ * groks deferring the completion to task context. This isn't
+ * necessary or useful for polled IO, as polled IO can always
+ * complete directly.
*/
- if (req->flags & REQ_F_ISREG) {
- sb_start_write(file_inode(req->file)->i_sb);
- __sb_writers_release(file_inode(req->file)->i_sb,
- SB_FREEZE_WRITE);
- }
- kiocb->ki_flags |= IOCB_WRITE;
+ if (!(kiocb->ki_flags & IOCB_HIPRI))
+ kiocb->ki_flags |= IOCB_DIO_CALLER_COMP;
if (likely(req->file->f_op->write_iter))
ret2 = call_write_iter(req->file, kiocb, &s->iter);
@@ -961,7 +967,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
io->bytes_done += ret2;
if (kiocb->ki_flags & IOCB_WRITE)
- kiocb_end_write(req);
+ io_req_end_write(req);
return ret ? ret : -EAGAIN;
}
done:
@@ -972,7 +978,7 @@ copy_iov:
ret = io_setup_async_rw(req, iovec, s, false);
if (!ret) {
if (kiocb->ki_flags & IOCB_WRITE)
- kiocb_end_write(req);
+ io_req_end_write(req);
return -EAGAIN;
}
return ret;
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 71881bddad25..ba8215ed663a 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -302,7 +302,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_mtime = inode->i_ctime = inode->i_atime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
if (S_ISREG(mode)) {
struct mqueue_inode_info *info;
@@ -596,7 +596,7 @@ static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg)
put_ipc_ns(ipc_ns);
dir->i_size += DIRENT_SIZE;
- dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir);
+ dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
d_instantiate(dentry, inode);
dget(dentry);
@@ -618,7 +618,7 @@ static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir);
+ dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
dir->i_size -= DIRENT_SIZE;
drop_nlink(inode);
dput(dentry);
@@ -635,7 +635,8 @@ static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
size_t count, loff_t *off)
{
- struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
+ struct inode *inode = file_inode(filp);
+ struct mqueue_inode_info *info = MQUEUE_I(inode);
char buffer[FILENT_SIZE];
ssize_t ret;
@@ -656,7 +657,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
if (ret <= 0)
return ret;
- file_inode(filp)->i_atime = file_inode(filp)->i_ctime = current_time(file_inode(filp));
+ inode->i_atime = inode_set_ctime_current(inode);
return ret;
}
@@ -1162,8 +1163,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
goto out_unlock;
__do_notify(info);
}
- inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
out_unlock:
spin_unlock(&info->lock);
@@ -1257,8 +1257,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
msg_ptr = msg_get(info);
- inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
/* There is now free space in queue. */
pipelined_receive(&wake_q, info);
@@ -1396,7 +1395,7 @@ retry:
if (notification == NULL) {
if (info->notify_owner == task_tgid(current)) {
remove_notification(info);
- inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode_set_ctime_current(inode);
}
} else if (info->notify_owner != NULL) {
ret = -EBUSY;
@@ -1422,7 +1421,7 @@ retry:
info->notify_owner = get_pid(task_tgid(current));
info->notify_user_ns = get_user_ns(current_user_ns());
- inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode_set_ctime_current(inode);
}
spin_unlock(&info->lock);
out_fput:
@@ -1485,7 +1484,7 @@ static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
f.file->f_flags &= ~O_NONBLOCK;
spin_unlock(&f.file->f_lock);
- inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode_set_ctime_current(inode);
}
spin_unlock(&info->lock);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 4174f76133df..99d0625b6c82 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -118,9 +118,8 @@ static struct inode *bpf_get_inode(struct super_block *sb,
return ERR_PTR(-ENOSPC);
inode->i_ino = get_next_ino();
- inode->i_atime = current_time(inode);
+ inode->i_atime = inode_set_ctime_current(inode);
inode->i_mtime = inode->i_atime;
- inode->i_ctime = inode->i_atime;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
@@ -148,8 +147,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = current_time(dir);
- dir->i_ctime = dir->i_mtime;
+ dir->i_mtime = inode_set_ctime_current(dir);
}
static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index f55a40db065f..5fa95f86cb4d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3685,6 +3685,36 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
return ret;
}
+static int __maybe_unused cgroup_local_stat_show(struct seq_file *seq,
+ struct cgroup *cgrp, int ssid)
+{
+ struct cgroup_subsys *ss = cgroup_subsys[ssid];
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (!ss->css_local_stat_show)
+ return 0;
+
+ css = cgroup_tryget_css(cgrp, ss);
+ if (!css)
+ return 0;
+
+ ret = ss->css_local_stat_show(seq, css);
+ css_put(css);
+ return ret;
+}
+
+static int cpu_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+ int ret = 0;
+
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+ return ret;
+}
+
#ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
@@ -5235,6 +5265,10 @@ static struct cftype cgroup_base_files[] = {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ {
+ .name = "cpu.stat.local",
+ .seq_show = cpu_local_stat_show,
+ },
{ } /* terminate */
};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 88a7ede322bd..f6811c857102 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -592,7 +592,10 @@ static void lockdep_release_cpus_lock(void)
void __weak arch_smt_update(void) { }
#ifdef CONFIG_HOTPLUG_SMT
+
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
+static unsigned int cpu_smt_max_threads __ro_after_init;
+unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX;
void __init cpu_smt_disable(bool force)
{
@@ -606,16 +609,33 @@ void __init cpu_smt_disable(bool force)
pr_info("SMT: disabled\n");
cpu_smt_control = CPU_SMT_DISABLED;
}
+ cpu_smt_num_threads = 1;
}
/*
* The decision whether SMT is supported can only be done after the full
* CPU identification. Called from architecture code.
*/
-void __init cpu_smt_check_topology(void)
+void __init cpu_smt_set_num_threads(unsigned int num_threads,
+ unsigned int max_threads)
{
- if (!topology_smt_supported())
+ WARN_ON(!num_threads || (num_threads > max_threads));
+
+ if (max_threads == 1)
cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+
+ cpu_smt_max_threads = max_threads;
+
+ /*
+ * If SMT has been disabled via the kernel command line or SMT is
+ * not supported, set cpu_smt_num_threads to 1 for consistency.
+ * If enabled, take the number of threads the architecture requested
+ * to bring up into account.
+ */
+ if (cpu_smt_control != CPU_SMT_ENABLED)
+ cpu_smt_num_threads = 1;
+ else if (num_threads < cpu_smt_num_threads)
+ cpu_smt_num_threads = num_threads;
}
static int __init smt_cmdline_disable(char *str)
@@ -625,9 +645,23 @@ static int __init smt_cmdline_disable(char *str)
}
early_param("nosmt", smt_cmdline_disable);
+/*
+ * For architectures supporting partial SMT states, check whether the thread is allowed.
+ * Otherwise this has already been checked through cpu_smt_max_threads when
+ * setting the SMT level.
+ */
+static inline bool cpu_smt_thread_allowed(unsigned int cpu)
+{
+#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
+ return topology_smt_thread_allowed(cpu);
+#else
+ return true;
+#endif
+}
+
static inline bool cpu_smt_allowed(unsigned int cpu)
{
- if (cpu_smt_control == CPU_SMT_ENABLED)
+ if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
return true;
if (topology_is_primary_thread(cpu))
@@ -642,7 +676,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu)
return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
}
-/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */
+/* Returns true if SMT is supported and not forcefully (irreversibly) disabled */
bool cpu_smt_possible(void)
{
return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
@@ -650,22 +684,8 @@ bool cpu_smt_possible(void)
}
EXPORT_SYMBOL_GPL(cpu_smt_possible);
-static inline bool cpuhp_smt_aware(void)
-{
- return topology_smt_supported();
-}
-
-static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
-{
- return cpu_primary_thread_mask;
-}
#else
static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
-static inline bool cpuhp_smt_aware(void) { return false; }
-static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
-{
- return cpu_present_mask;
-}
#endif
static inline enum cpuhp_state
@@ -1793,6 +1813,16 @@ static int __init parallel_bringup_parse_param(char *arg)
}
early_param("cpuhp.parallel", parallel_bringup_parse_param);
+static inline bool cpuhp_smt_aware(void)
+{
+ return cpu_smt_max_threads > 1;
+}
+
+static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
+{
+ return cpu_primary_thread_mask;
+}
+
/*
* On architectures which have enabled parallel bringup this invokes all BP
* prepare states for each of the to be onlined APs first. The last state
@@ -2626,6 +2656,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
for_each_online_cpu(cpu) {
if (topology_is_primary_thread(cpu))
continue;
+ /*
+ * Disable can be called with CPU_SMT_ENABLED when changing
+ * from a higher to lower number of SMT threads per core.
+ */
+ if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+ continue;
ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
if (ret)
break;
@@ -2660,6 +2696,8 @@ int cpuhp_smt_enable(void)
/* Skip online CPUs and CPUs on offline nodes */
if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
continue;
+ if (!cpu_smt_thread_allowed(cpu))
+ continue;
ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
if (ret)
break;
@@ -2838,20 +2876,19 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
#ifdef CONFIG_HOTPLUG_SMT
+static bool cpu_smt_num_threads_valid(unsigned int threads)
+{
+ if (IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC))
+ return threads >= 1 && threads <= cpu_smt_max_threads;
+ return threads == 1 || threads == cpu_smt_max_threads;
+}
+
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
- int ctrlval, ret;
-
- if (sysfs_streq(buf, "on"))
- ctrlval = CPU_SMT_ENABLED;
- else if (sysfs_streq(buf, "off"))
- ctrlval = CPU_SMT_DISABLED;
- else if (sysfs_streq(buf, "forceoff"))
- ctrlval = CPU_SMT_FORCE_DISABLED;
- else
- return -EINVAL;
+ int ctrlval, ret, num_threads, orig_threads;
+ bool force_off;
if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
return -EPERM;
@@ -2859,21 +2896,39 @@ __store_smt_control(struct device *dev, struct device_attribute *attr,
if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
return -ENODEV;
+ if (sysfs_streq(buf, "on")) {
+ ctrlval = CPU_SMT_ENABLED;
+ num_threads = cpu_smt_max_threads;
+ } else if (sysfs_streq(buf, "off")) {
+ ctrlval = CPU_SMT_DISABLED;
+ num_threads = 1;
+ } else if (sysfs_streq(buf, "forceoff")) {
+ ctrlval = CPU_SMT_FORCE_DISABLED;
+ num_threads = 1;
+ } else if (kstrtoint(buf, 10, &num_threads) == 0) {
+ if (num_threads == 1)
+ ctrlval = CPU_SMT_DISABLED;
+ else if (cpu_smt_num_threads_valid(num_threads))
+ ctrlval = CPU_SMT_ENABLED;
+ else
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
- if (ctrlval != cpu_smt_control) {
- switch (ctrlval) {
- case CPU_SMT_ENABLED:
- ret = cpuhp_smt_enable();
- break;
- case CPU_SMT_DISABLED:
- case CPU_SMT_FORCE_DISABLED:
- ret = cpuhp_smt_disable(ctrlval);
- break;
- }
- }
+ orig_threads = cpu_smt_num_threads;
+ cpu_smt_num_threads = num_threads;
+
+ force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED;
+
+ if (num_threads > orig_threads)
+ ret = cpuhp_smt_enable();
+ else if (num_threads < orig_threads || force_off)
+ ret = cpuhp_smt_disable(ctrlval);
unlock_device_hotplug();
return ret ? ret : count;
@@ -2901,6 +2956,17 @@ static ssize_t control_show(struct device *dev,
{
const char *state = smt_states[cpu_smt_control];
+#ifdef CONFIG_HOTPLUG_SMT
+ /*
+ * If SMT is enabled but not all threads are enabled then show the
+ * number of threads. If all threads are enabled show "on". Otherwise
+ * show the state name.
+ */
+ if (cpu_smt_control == CPU_SMT_ENABLED &&
+ cpu_smt_num_threads != cpu_smt_max_threads)
+ return sysfs_emit(buf, "%d\n", cpu_smt_num_threads);
+#endif
+
return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
}
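With CONFIG_SMT_NUM_THREADS_DYNAMIC, the control file now also accepts a thread count; a userspace sketch (the sysfs path matches the documented ABI, the helper name is illustrative):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Write "on", "off", "forceoff", or a thread count such as "4".
 * Per cpu_smt_num_threads_valid(), arbitrary counts are only accepted
 * when CONFIG_SMT_NUM_THREADS_DYNAMIC is enabled; otherwise only 1 or
 * the maximum is valid.
 */
static int example_set_smt(const char *val)
{
	int fd = open("/sys/devices/system/cpu/smt/control", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}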
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index be61332c66b5..d7ee4bc3f2ba 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -205,8 +205,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
arch_exit_to_user_mode_prepare(regs, ti_work);
- /* Ensure that the address limit is intact and no locks are held */
- addr_limit_user_check();
+ /* Ensure that kernel state is sane for a return to userspace */
kmap_assert_nomap();
lockdep_assert_irqs_disabled();
lockdep_sys_exit();
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 78ae7b6f90fd..93015cb64d4e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8249,7 +8249,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
unsigned int size;
memset(comm, 0, sizeof(comm));
- strlcpy(comm, comm_event->task->comm, sizeof(comm));
+ strscpy(comm, comm_event->task->comm, sizeof(comm));
size = ALIGN(strlen(comm)+1, sizeof(u64));
comm_event->comm = comm;
@@ -8704,7 +8704,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
}
cpy_name:
- strlcpy(tmp, name, sizeof(tmp));
+ strscpy(tmp, name, sizeof(tmp));
name = tmp;
got_name:
/*
@@ -9128,7 +9128,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
goto err;
- strlcpy(name, sym, KSYM_NAME_LEN);
+ strscpy(name, sym, KSYM_NAME_LEN);
name_len = strlen(name) + 1;
while (!IS_ALIGNED(name_len, sizeof(u64)))
name[name_len++] = '\0';
@@ -9595,16 +9595,16 @@ u64 perf_swevent_set_period(struct perf_event *event)
hwc->last_period = hwc->sample_period;
-again:
- old = val = local64_read(&hwc->period_left);
- if (val < 0)
- return 0;
+ old = local64_read(&hwc->period_left);
+ do {
+ val = old;
+ if (val < 0)
+ return 0;
- nr = div64_u64(period + val, period);
- offset = nr * period;
- val -= offset;
- if (local64_cmpxchg(&hwc->period_left, old, val) != old)
- goto again;
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ } while (!local64_try_cmpxchg(&hwc->period_left, &old, val));
return nr;
}
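Both conversions above follow the same try_cmpxchg() shape; a distilled sketch with an illustrative helper name:

/* local64_try_cmpxchg() updates 'old' with the observed value on
 * failure, so the loop needs no explicit re-read before retrying.
 */
static s64 example_sub_period(local64_t *v, s64 period)
{
	s64 old = local64_read(v), new;

	do {
		new = old - period;	/* recompute from the fresh snapshot */
	} while (!local64_try_cmpxchg(v, &old, new));

	return new;
}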
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a0433f37b024..fb1e180b5f0a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -191,9 +191,10 @@ __perf_output_begin(struct perf_output_handle *handle,
perf_output_get_handle(handle);
+ offset = local_read(&rb->head);
do {
+ head = offset;
tail = READ_ONCE(rb->user_page->data_tail);
- offset = head = local_read(&rb->head);
if (!rb->overwrite) {
if (unlikely(!ring_buffer_has_space(head, tail,
perf_data_size(rb),
@@ -217,7 +218,7 @@ __perf_output_begin(struct perf_output_handle *handle,
head += size;
else
head -= size;
- } while (local_cmpxchg(&rb->head, offset, head) != offset);
+ } while (!local_try_cmpxchg(&rb->head, &offset, head));
if (backward) {
offset = head;
diff --git a/kernel/fork.c b/kernel/fork.c
index d2e12b6d2b18..f81149739eb9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -985,6 +985,14 @@ void __put_task_struct(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(__put_task_struct);
+void __put_task_struct_rcu_cb(struct rcu_head *rhp)
+{
+ struct task_struct *task = container_of(rhp, struct task_struct, rcu);
+
+ __put_task_struct(task);
+}
+EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
+
void __init __weak arch_task_cache_init(void) { }
/*
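The newly exported callback lets code drop task references from non-preemptible context; a sketch of the intended pattern (caller name illustrative):

/* Defer the potentially-sleeping __put_task_struct() to an RCU
 * callback when the final reference may be dropped in atomic context.
 */
static void example_put_task(struct task_struct *task)
{
	if (refcount_dec_and_test(&task->usage))
		call_rcu(&task->rcu, __put_task_struct_rcu_cb);
}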
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ee8c0acf39df..dc94e0bf2c94 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -473,11 +473,12 @@ void handle_nested_irq(unsigned int irq)
action = desc->action;
if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ raw_spin_unlock_irq(&desc->lock);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
- irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ atomic_inc(&desc->threads_active);
raw_spin_unlock_irq(&desc->lock);
action_ret = IRQ_NONE;
@@ -487,11 +488,7 @@ void handle_nested_irq(unsigned int irq)
if (!irq_settings_no_debug(desc))
note_interrupt(desc, action_ret);
- raw_spin_lock_irq(&desc->lock);
- irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
-
-out_unlock:
- raw_spin_unlock_irq(&desc->lock);
+ wake_threads_waitq(desc);
}
EXPORT_SYMBOL_GPL(handle_nested_irq);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index bdd35bb9c735..bcc7f21db9ee 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -108,8 +108,6 @@ extern int __irq_get_irqchip_state(struct irq_data *data,
enum irqchip_irq_state which,
bool *state);
-extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
@@ -121,6 +119,8 @@ void irq_resend_init(struct irq_desc *desc);
bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
+void wake_threads_waitq(struct irq_desc *desc);
+
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d2742af0f0fd..d309ba84e08a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -108,6 +108,16 @@ bool synchronize_hardirq(unsigned int irq)
}
EXPORT_SYMBOL(synchronize_hardirq);
+static void __synchronize_irq(struct irq_desc *desc)
+{
+ __synchronize_hardirq(desc, true);
+ /*
+ * We made sure that no hardirq handler is running. Now verify that no
+ * threaded handlers are active.
+ */
+ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
+}
+
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
@@ -127,16 +137,8 @@ void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (desc) {
- __synchronize_hardirq(desc, true);
- /*
- * We made sure that no hardirq handler is
- * running. Now verify that no threaded handlers are
- * active.
- */
- wait_event(desc->wait_for_threads,
- !atomic_read(&desc->threads_active));
- }
+ if (desc)
+ __synchronize_irq(desc);
}
EXPORT_SYMBOL(synchronize_irq);
@@ -1216,7 +1218,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
return ret;
}
-static void wake_threads_waitq(struct irq_desc *desc)
+void wake_threads_waitq(struct irq_desc *desc)
{
if (atomic_dec_and_test(&desc->threads_active))
wake_up(&desc->wait_for_threads);
@@ -1944,7 +1946,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* supports it also make sure that there is no (not yet serviced)
* interrupt in flight at the hardware level.
*/
- __synchronize_hardirq(desc, true);
+ __synchronize_irq(desc);
#ifdef CONFIG_DEBUG_SHIRQ
/*
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index edec335c0a7a..5f2c66860ac6 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -68,11 +68,16 @@ static int irq_sw_resend(struct irq_desc *desc)
*/
if (!desc->parent_irq)
return -EINVAL;
+
+ desc = irq_to_desc(desc->parent_irq);
+ if (!desc)
+ return -EINVAL;
}
/* Add to resend_list and activate the softirq: */
raw_spin_lock(&irq_resend_lock);
- hlist_add_head(&desc->resend_node, &irq_resend_list);
+ if (hlist_unhashed(&desc->resend_node))
+ hlist_add_head(&desc->resend_node, &irq_resend_list);
raw_spin_unlock(&irq_resend_lock);
tasklet_schedule(&resend_tasklet);
return 0;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 016d997131d4..18edd57b5fe8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -163,12 +163,12 @@ unsigned long kallsyms_sym_address(int idx)
return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
}
-static bool cleanup_symbol_name(char *s)
+static void cleanup_symbol_name(char *s)
{
char *res;
if (!IS_ENABLED(CONFIG_LTO_CLANG))
- return false;
+ return;
/*
* LLVM appends various suffixes for local functions and variables that
@@ -178,26 +178,21 @@ static bool cleanup_symbol_name(char *s)
* - foo.llvm.[0-9a-f]+
*/
res = strstr(s, ".llvm.");
- if (res) {
+ if (res)
*res = '\0';
- return true;
- }
- return false;
+ return;
}
static int compare_symbol_name(const char *name, char *namebuf)
{
- int ret;
-
- ret = strcmp(name, namebuf);
- if (!ret)
- return ret;
-
- if (cleanup_symbol_name(namebuf) && !strcmp(name, namebuf))
- return 0;
-
- return ret;
+ /* The kallsyms_seqs_of_names array is sorted based on names after
+ * cleanup_symbol_name() (see scripts/kallsyms.c) if Clang LTO is enabled.
+ * To ensure correct bisection in kallsyms_lookup_names(), apply
+ * cleanup_symbol_name() to namebuf before comparing name and namebuf.
+ */
+ cleanup_symbol_name(namebuf);
+ return strcmp(name, namebuf);
}
static unsigned int get_symbol_seq(int index)
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index a2e3745d15c4..e05ddc33a752 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -196,7 +196,7 @@ static bool match_cleanup_name(const char *s, const char *name)
if (!IS_ENABLED(CONFIG_LTO_CLANG))
return false;
- p = strchr(s, '.');
+ p = strstr(s, ".llvm.");
if (!p)
return false;
@@ -344,27 +344,6 @@ static int test_kallsyms_basic_function(void)
goto failed;
}
- /*
- * The first '.' may be the initial letter, in which case the
- * entire symbol name will be truncated to an empty string in
- * cleanup_symbol_name(). Do not test these symbols.
- *
- * For example:
- * cat /proc/kallsyms | awk '{print $3}' | grep -E "^\." | head
- * .E_read_words
- * .E_leading_bytes
- * .E_trailing_bytes
- * .E_write_words
- * .E_copy
- * .str.292.llvm.12122243386960820698
- * .str.24.llvm.12122243386960820698
- * .str.29.llvm.12122243386960820698
- * .str.75.llvm.12122243386960820698
- * .str.99.llvm.12122243386960820698
- */
- if (IS_ENABLED(CONFIG_LTO_CLANG) && !namebuf[0])
- continue;
-
lookup_addr = kallsyms_lookup_name(namebuf);
memset(stat, 0, sizeof(*stat));
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 949d3deae506..270c7f80ce84 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -45,6 +45,7 @@ torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
torture_param(int, rt_boost, 2,
"Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
+torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
@@ -809,7 +810,8 @@ static int lock_torture_writer(void *arg)
bool skip_main_lock;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
- set_user_nice(current, MAX_NICE);
+ if (!rt_task(current))
+ set_user_nice(current, MAX_NICE);
do {
if ((torture_random(&rand) & 0xfffff) == 0)
@@ -1015,8 +1017,7 @@ static void lock_torture_cleanup(void)
if (writer_tasks) {
for (i = 0; i < cxt.nrealwriters_stress; i++)
- torture_stop_kthread(lock_torture_writer,
- writer_tasks[i]);
+ torture_stop_kthread(lock_torture_writer, writer_tasks[i]);
kfree(writer_tasks);
writer_tasks = NULL;
}
@@ -1244,8 +1245,9 @@ static int __init lock_torture_init(void)
goto create_reader;
/* Create writer. */
- firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
- writer_tasks[i]);
+ firsterr = torture_create_kthread_cb(lock_torture_writer, &cxt.lwsa[i],
+ writer_tasks[i],
+ writer_fifo ? sched_set_fifo : NULL);
if (torture_init_error(firsterr))
goto unwind;
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 6afc249ce697..6a0184e9c234 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -486,6 +486,16 @@ gotlock:
}
/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
+ */
+#include <asm/qspinlock_paravirt.h>
+
+/*
* PV versions of the unlock fastpath and slowpath functions to be used
* instead of queued_spin_unlock().
*/
@@ -533,16 +543,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
pv_kick(node->cpu);
}
-/*
- * Include the architecture specific callee-save thunk of the
- * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
- * function close to each other sharing consecutive instruction cachelines.
- * Alternatively, architecture specific version of __pv_queued_spin_unlock()
- * can be defined.
- */
-#include <asm/qspinlock_paravirt.h>
-
#ifndef __pv_queued_spin_unlock
__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock)
{
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 80d9c6d77a45..15781acaac1c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -30,7 +30,7 @@
static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = {
- .count = ATOMIC_INIT(1),
+ .count = REFCOUNT_INIT(1),
.uts_ns = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
.ipc_ns = &init_ipc_ns,
@@ -55,7 +55,7 @@ static inline struct nsproxy *create_nsproxy(void)
nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
if (nsproxy)
- atomic_set(&nsproxy->count, 1);
+ refcount_set(&nsproxy->count, 1);
return nsproxy;
}
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 98c1544cf572..5befd8780dcd 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -511,6 +511,14 @@ static inline void show_rcu_tasks_gp_kthreads(void) {}
void rcu_request_urgent_qs_task(struct task_struct *t);
#endif /* #else #ifdef CONFIG_TINY_RCU */
+#ifdef CONFIG_TASKS_RCU
+struct task_struct *get_rcu_tasks_gp_kthread(void);
+#endif // # ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
+#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index d1221731c7cf..ffdb30495e3c 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -84,15 +84,17 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
#endif
torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
-torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
+torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
+torture_param(int, minruntime, 0, "Minimum run time (s)");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
torture_param(bool, shutdown, RCUSCALE_SHUTDOWN,
"Shutdown at end of scalability tests.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
+torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable");
torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?");
torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");
torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?");
@@ -139,6 +141,7 @@ struct rcu_scale_ops {
void (*gp_barrier)(void);
void (*sync)(void);
void (*exp_sync)(void);
+ struct task_struct *(*rso_gp_kthread)(void);
const char *name;
};
@@ -295,6 +298,7 @@ static struct rcu_scale_ops tasks_ops = {
.gp_barrier = rcu_barrier_tasks,
.sync = synchronize_rcu_tasks,
.exp_sync = synchronize_rcu_tasks,
+ .rso_gp_kthread = get_rcu_tasks_gp_kthread,
.name = "tasks"
};
@@ -306,6 +310,44 @@ static struct rcu_scale_ops tasks_ops = {
#endif // #else // #ifdef CONFIG_TASKS_RCU
+#ifdef CONFIG_TASKS_RUDE_RCU
+
+/*
+ * Definitions for RCU-tasks-rude scalability testing.
+ */
+
+static int tasks_rude_scale_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_rude_scale_read_unlock(int idx)
+{
+}
+
+static struct rcu_scale_ops tasks_rude_ops = {
+ .ptype = RCU_TASKS_RUDE_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = tasks_rude_scale_read_lock,
+ .readunlock = tasks_rude_scale_read_unlock,
+ .get_gp_seq = rcu_no_completed,
+ .gp_diff = rcu_seq_diff,
+ .async = call_rcu_tasks_rude,
+ .gp_barrier = rcu_barrier_tasks_rude,
+ .sync = synchronize_rcu_tasks_rude,
+ .exp_sync = synchronize_rcu_tasks_rude,
+ .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread,
+ .name = "tasks-rude"
+};
+
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RUDE_RCU
+
#ifdef CONFIG_TASKS_TRACE_RCU
/*
@@ -334,6 +376,7 @@ static struct rcu_scale_ops tasks_tracing_ops = {
.gp_barrier = rcu_barrier_tasks_trace,
.sync = synchronize_rcu_tasks_trace,
.exp_sync = synchronize_rcu_tasks_trace,
+ .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread,
.name = "tasks-tracing"
};
@@ -410,10 +453,12 @@ rcu_scale_writer(void *arg)
{
int i = 0;
int i_max;
+ unsigned long jdone;
long me = (long)arg;
struct rcu_head *rhp = NULL;
bool started = false, done = false, alldone = false;
u64 t;
+ DEFINE_TORTURE_RANDOM(tr);
u64 *wdp;
u64 *wdpp = writer_durations[me];
@@ -424,7 +469,7 @@ rcu_scale_writer(void *arg)
sched_set_fifo_low(current);
if (holdoff)
- schedule_timeout_uninterruptible(holdoff * HZ);
+ schedule_timeout_idle(holdoff * HZ);
/*
* Wait until rcu_end_inkernel_boot() is called for normal GP tests
@@ -445,9 +490,12 @@ rcu_scale_writer(void *arg)
}
}
+ jdone = jiffies + minruntime * HZ;
do {
if (writer_holdoff)
udelay(writer_holdoff);
+ if (writer_holdoff_jiffies)
+ schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1);
wdp = &wdpp[i];
*wdp = ktime_get_mono_fast_ns();
if (gp_async) {
@@ -475,7 +523,7 @@ retry:
if (!started &&
atomic_read(&n_rcu_scale_writer_started) >= nrealwriters)
started = true;
- if (!done && i >= MIN_MEAS) {
+ if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) {
done = true;
sched_set_normal(current, 0);
pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n",
@@ -518,8 +566,8 @@ static void
rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n",
- scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown);
+ "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n",
+ scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown);
}
/*
@@ -556,6 +604,8 @@ static struct task_struct **kfree_reader_tasks;
static int kfree_nrealthreads;
static atomic_t n_kfree_scale_thread_started;
static atomic_t n_kfree_scale_thread_ended;
+static struct task_struct *kthread_tp;
+static u64 kthread_stime;
struct kfree_obj {
char kfree_obj[8];
@@ -701,6 +751,10 @@ kfree_scale_init(void)
unsigned long jif_start;
unsigned long orig_jif;
+ pr_alert("%s" SCALE_FLAG
+ "--- kfree_rcu_test: kfree_mult=%d kfree_by_call_rcu=%d kfree_nthreads=%d kfree_alloc_num=%d kfree_loops=%d kfree_rcu_test_double=%d kfree_rcu_test_single=%d\n",
+ scale_type, kfree_mult, kfree_by_call_rcu, kfree_nthreads, kfree_alloc_num, kfree_loops, kfree_rcu_test_double, kfree_rcu_test_single);
+
// Also, do a quick self-test to ensure laziness is as much as
// expected.
if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) {
@@ -797,6 +851,18 @@ rcu_scale_cleanup(void)
if (gp_exp && gp_async)
SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
+ // If built-in, just report all of the GP kthread's CPU time.
+ if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread)
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp) {
+ u32 ns;
+ u64 us;
+
+ kthread_stime = kthread_tp->stime - kthread_stime;
+ us = div_u64_rem(kthread_stime, 1000, &ns);
+ pr_info("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n", us, ns);
+ show_rcu_gp_kthreads();
+ }
if (kfree_rcu_test) {
kfree_scale_cleanup();
return;
@@ -885,7 +951,7 @@ rcu_scale_init(void)
long i;
int firsterr = 0;
static struct rcu_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS
+ &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
};
if (!torture_init_begin(scale_type, verbose))
@@ -910,6 +976,11 @@ rcu_scale_init(void)
if (cur_ops->init)
cur_ops->init();
+ if (cur_ops->rso_gp_kthread) {
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp)
+ kthread_stime = kthread_tp->stime;
+ }
if (kfree_rcu_test)
return kfree_scale_init();
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 147551c23baf..ade42d6a9d9b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1581,6 +1581,7 @@ rcu_torture_writer(void *arg)
rcu_access_pointer(rcu_torture_current) !=
&rcu_tortures[i]) {
tracing_off();
+ show_rcu_gp_kthreads();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1876,7 +1877,7 @@ static int
rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
{
int mask = rcutorture_extend_mask_max();
- unsigned long randmask1 = torture_random(trsp) >> 8;
+ unsigned long randmask1 = torture_random(trsp);
unsigned long randmask2 = randmask1 >> 3;
unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
@@ -1935,7 +1936,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
if (!((mask - 1) & mask))
return rtrsp; /* Current RCU reader not extendable. */
/* Bias towards larger numbers of loops. */
- i = (torture_random(trsp) >> 3);
+ i = torture_random(trsp);
i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1;
for (j = 0; j < i; j++) {
mask = rcutorture_extend_mask(*readstate, trsp);
@@ -2136,7 +2137,7 @@ static int rcu_nocb_toggle(void *arg)
toggle_fuzz = NSEC_PER_USEC;
do {
r = torture_random(&rand);
- cpu = (r >> 4) % (maxcpu + 1);
+ cpu = (r >> 1) % (maxcpu + 1);
if (r & 0x1) {
rcu_nocb_cpu_offload(cpu);
atomic_long_inc(&n_nocb_offload);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 1970ce5f22d4..91a0fd0d4d9a 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -528,6 +528,38 @@ static struct ref_scale_ops clock_ops = {
.name = "clock"
};
+static void ref_jiffies_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += jiffies;
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_jiffies_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += jiffies;
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static struct ref_scale_ops jiffies_ops = {
+ .readsection = ref_jiffies_section,
+ .delaysection = ref_jiffies_delay_section,
+ .name = "jiffies"
+};
+
////////////////////////////////////////////////////////////////////////
//
// Methods leveraging SLAB_TYPESAFE_BY_RCU.
@@ -1047,7 +1079,7 @@ ref_scale_init(void)
int firsterr = 0;
static struct ref_scale_ops *scale_ops[] = {
&rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
- &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
+ &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops,
&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
};
@@ -1107,12 +1139,11 @@ ref_scale_init(void)
VERBOSE_SCALEOUT("Starting %d reader threads", nreaders);
for (i = 0; i < nreaders; i++) {
+ init_waitqueue_head(&reader_tasks[i].wq);
firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
reader_tasks[i].task);
if (torture_init_error(firsterr))
goto unwind;
-
- init_waitqueue_head(&(reader_tasks[i].wq));
}
// Main Task
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index b770add3f843..8d65f7d576a3 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -25,6 +25,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @cblist: Callback list.
* @lock: Lock protecting per-CPU callback list.
* @rtp_jiffies: Jiffies counter value for statistics.
+ * @lazy_timer: Timer to unlazify callbacks.
+ * @urgent_gp: Number of additional non-lazy grace periods.
* @rtp_n_lock_retries: Rough lock-contention statistic.
* @rtp_work: Work queue for invoking callbacks.
* @rtp_irq_work: IRQ work queue for deferred wakeups.
@@ -38,6 +40,8 @@ struct rcu_tasks_percpu {
raw_spinlock_t __private lock;
unsigned long rtp_jiffies;
unsigned long rtp_n_lock_retries;
+ struct timer_list lazy_timer;
+ unsigned int urgent_gp;
struct work_struct rtp_work;
struct irq_work rtp_irq_work;
struct rcu_head barrier_q_head;
@@ -51,7 +55,6 @@ struct rcu_tasks_percpu {
* @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
* @cbs_gbl_lock: Lock protecting callback list.
* @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
- * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
* @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping.
@@ -61,6 +64,8 @@ struct rcu_tasks_percpu {
* @tasks_gp_seq: Number of grace periods completed since boot.
* @n_ipis: Number of IPIs sent to encourage grace periods to end.
* @n_ipis_fails: Number of IPI-send failures.
+ * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
+ * @lazy_jiffies: Number of jiffies to allow callbacks to be lazy.
* @pregp_func: This flavor's pre-grace-period function (optional).
* @pertask_func: This flavor's per-task scan function (optional).
* @postscan_func: This flavor's post-task scan function (optional).
@@ -92,6 +97,7 @@ struct rcu_tasks {
unsigned long n_ipis;
unsigned long n_ipis_fails;
struct task_struct *kthread_ptr;
+ unsigned long lazy_jiffies;
rcu_tasks_gp_func_t gp_func;
pregp_func_t pregp_func;
pertask_func_t pertask_func;
@@ -127,6 +133,7 @@ static struct rcu_tasks rt_name = \
.gp_func = gp, \
.call_func = call, \
.rtpcpu = &rt_name ## __percpu, \
+ .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \
.name = n, \
.percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \
.percpu_enqueue_lim = 1, \
@@ -139,9 +146,7 @@ static struct rcu_tasks rt_name = \
#ifdef CONFIG_TASKS_RCU
/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
-#endif
-#ifdef CONFIG_TASKS_RCU
/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
@@ -171,6 +176,8 @@ static int rcu_task_contend_lim __read_mostly = 100;
module_param(rcu_task_contend_lim, int, 0444);
static int rcu_task_collapse_lim __read_mostly = 10;
module_param(rcu_task_collapse_lim, int, 0444);
+static int rcu_task_lazy_lim __read_mostly = 32;
+module_param(rcu_task_lazy_lim, int, 0444);
/* RCU tasks grace-period state for debugging. */
#define RTGS_INIT 0
@@ -229,7 +236,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
#endif /* #ifndef CONFIG_TINY_RCU */
// Initialize per-CPU callback lists for the specified flavor of
-// Tasks RCU.
+// Tasks RCU. Do not enqueue callbacks before this function is invoked.
static void cblist_init_generic(struct rcu_tasks *rtp)
{
int cpu;
@@ -237,7 +244,6 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
int lim;
int shift;
- raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rcu_task_enqueue_lim < 0) {
rcu_task_enqueue_lim = 1;
rcu_task_cb_adjust = true;
@@ -260,22 +266,48 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
WARN_ON_ONCE(!rtpcp);
if (cpu)
raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
- raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ local_irq_save(flags); // serialize initialization
if (rcu_segcblist_empty(&rtpcp->cblist))
rcu_segcblist_init(&rtpcp->cblist);
+ local_irq_restore(flags);
INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
rtpcp->cpu = cpu;
rtpcp->rtpp = rtp;
if (!rtpcp->rtp_blkd_tasks.next)
INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
- raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
}
- raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust);
}
+// Compute wakeup time for lazy callback timer.
+static unsigned long rcu_tasks_lazy_time(struct rcu_tasks *rtp)
+{
+ return jiffies + rtp->lazy_jiffies;
+}
+
+// Timer handler that unlazifies lazy callbacks.
+static void call_rcu_tasks_generic_timer(struct timer_list *tlp)
+{
+ unsigned long flags;
+ bool needwake = false;
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer);
+
+ rtp = rtpcp->rtpp;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (!rcu_segcblist_empty(&rtpcp->cblist) && rtp->lazy_jiffies) {
+ if (!rtpcp->urgent_gp)
+ rtpcp->urgent_gp = 1;
+ needwake = true;
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (needwake)
+ rcuwait_wake_up(&rtp->cbs_wait);
+}
+
// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic().
static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp)
{
@@ -292,6 +324,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
{
int chosen_cpu;
unsigned long flags;
+ bool havekthread = smp_load_acquire(&rtp->kthread_ptr);
int ideal_cpu;
unsigned long j;
bool needadjust = false;
@@ -316,12 +349,19 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids)
needadjust = true; // Defer adjustment to avoid deadlock.
}
- if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) {
- raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
- cblist_init_generic(rtp);
- raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ // Queuing callbacks before initialization not yet supported.
+ if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist)))
+ rcu_segcblist_init(&rtpcp->cblist);
+ needwake = (func == wakeme_after_rcu) ||
+ (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim);
+ if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) {
+ if (rtp->lazy_jiffies)
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ else
+ needwake = rcu_segcblist_empty(&rtpcp->cblist);
}
- needwake = rcu_segcblist_empty(&rtpcp->cblist);
+ if (needwake)
+ rtpcp->urgent_gp = 3;
rcu_segcblist_enqueue(&rtpcp->cblist, rhp);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
if (unlikely(needadjust)) {
@@ -415,9 +455,14 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
}
rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
(void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
- if (rcu_segcblist_pend_cbs(&rtpcp->cblist))
+ if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) {
+ if (rtp->lazy_jiffies)
+ rtpcp->urgent_gp--;
needgpcb |= 0x3;
- if (!rcu_segcblist_empty(&rtpcp->cblist))
+ } else if (rcu_segcblist_empty(&rtpcp->cblist)) {
+ rtpcp->urgent_gp = 0;
+ }
+ if (rcu_segcblist_ready_cbs(&rtpcp->cblist))
needgpcb |= 0x1;
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
}
@@ -525,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
if (unlikely(midboot)) {
needgpcb = 0x2;
} else {
+ mutex_unlock(&rtp->tasks_gp_mutex);
set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
rcuwait_wait_event(&rtp->cbs_wait,
(needgpcb = rcu_tasks_need_gpcb(rtp)),
TASK_IDLE);
+ mutex_lock(&rtp->tasks_gp_mutex);
}
if (needgpcb & 0x2) {
@@ -549,11 +596,19 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
// RCU-tasks kthread that detects grace periods and invokes callbacks.
static int __noreturn rcu_tasks_kthread(void *arg)
{
+ int cpu;
struct rcu_tasks *rtp = arg;
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ timer_setup(&rtpcp->lazy_timer, call_rcu_tasks_generic_timer, 0);
+ rtpcp->urgent_gp = 1;
+ }
+
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
housekeeping_affine(current, HK_TYPE_RCU);
- WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!
+ smp_store_release(&rtp->kthread_ptr, current); // Let GPs start!
/*
* Each pass through the following loop makes one check for
@@ -635,16 +690,22 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
int cpu;
bool havecbs = false;
+ bool haveurgent = false;
+ bool haveurgentcbs = false;
for_each_possible_cpu(cpu) {
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
- if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) {
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)))
havecbs = true;
+ if (data_race(rtpcp->urgent_gp))
+ haveurgent = true;
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)) && data_race(rtpcp->urgent_gp))
+ haveurgentcbs = true;
+ if (havecbs && haveurgent && haveurgentcbs)
break;
- }
}
- pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n",
+ pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c%c%c l:%lu %s\n",
rtp->kname,
tasks_gp_state_getname(rtp), data_race(rtp->gp_state),
jiffies - data_race(rtp->gp_jiffies),
@@ -652,6 +713,9 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis),
".k"[!!data_race(rtp->kthread_ptr)],
".C"[havecbs],
+ ".u"[haveurgent],
+ ".U"[haveurgentcbs],
+ rtp->lazy_jiffies,
s);
}
#endif // #ifndef CONFIG_TINY_RCU
@@ -1020,11 +1084,16 @@ void rcu_barrier_tasks(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
+int rcu_tasks_lazy_ms = -1;
+module_param(rcu_tasks_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_kthread(void)
{
cblist_init_generic(&rcu_tasks);
rcu_tasks.gp_sleep = HZ / 10;
rcu_tasks.init_fract = HZ / 10;
+ if (rcu_tasks_lazy_ms >= 0)
+ rcu_tasks.lazy_jiffies = msecs_to_jiffies(rcu_tasks_lazy_ms);
rcu_tasks.pregp_func = rcu_tasks_pregp_step;
rcu_tasks.pertask_func = rcu_tasks_pertask;
rcu_tasks.postscan_func = rcu_tasks_postscan;
@@ -1042,6 +1111,12 @@ void show_rcu_tasks_classic_gp_kthread(void)
EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
+struct task_struct *get_rcu_tasks_gp_kthread(void)
+{
+ return rcu_tasks.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
+
/*
* Contribute to protect against tasklist scan blind spot while the
* task is exiting and may be removed from the tasklist. See
@@ -1173,10 +1248,15 @@ void rcu_barrier_tasks_rude(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);
+int rcu_tasks_rude_lazy_ms = -1;
+module_param(rcu_tasks_rude_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_rude_kthread(void)
{
cblist_init_generic(&rcu_tasks_rude);
rcu_tasks_rude.gp_sleep = HZ / 10;
+ if (rcu_tasks_rude_lazy_ms >= 0)
+ rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms);
rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
return 0;
}
@@ -1188,6 +1268,13 @@ void show_rcu_tasks_rude_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
+
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
+{
+ return rcu_tasks_rude.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread);
+
#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
////////////////////////////////////////////////////////////////////////
@@ -1793,6 +1880,9 @@ void rcu_barrier_tasks_trace(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace);
+int rcu_tasks_trace_lazy_ms = -1;
+module_param(rcu_tasks_trace_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_trace_kthread(void)
{
cblist_init_generic(&rcu_tasks_trace);
@@ -1807,6 +1897,8 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
if (rcu_tasks_trace.init_fract <= 0)
rcu_tasks_trace.init_fract = 1;
}
+ if (rcu_tasks_trace_lazy_ms >= 0)
+ rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms);
rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
@@ -1830,6 +1922,12 @@ void show_rcu_tasks_trace_gp_kthread(void)
EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
+struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
+{
+ return rcu_tasks_trace.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
+
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1449cb69a0e0..cb1caefa8bd0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -632,7 +632,7 @@ void __rcu_irq_enter_check_tick(void)
// prevents self-deadlock. So we can safely recheck under the lock.
// Note that the nohz_full state currently cannot change.
raw_spin_lock_rcu_node(rdp->mynode);
- if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
+ if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
// A nohz_full CPU is in the kernel and RCU needs a
// quiescent state. Turn on the tick!
WRITE_ONCE(rdp->rcu_forced_tick, true);
@@ -677,12 +677,16 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
}
/**
- * rcu_is_watching - see if RCU thinks that the current CPU is not idle
+ * rcu_is_watching - RCU read-side critical sections permitted on current CPU?
*
- * Return true if RCU is watching the running CPU, which means that this
- * CPU can safely enter RCU read-side critical sections. In other words,
- * if the current CPU is not in its idle loop or is in an interrupt or
- * NMI handler, return true.
+ * Return @true if RCU is watching the running CPU and @false otherwise.
+ * A @true return means that this CPU can safely enter RCU read-side
+ * critical sections.
+ *
+ * Although calls to rcu_is_watching() from most parts of the kernel
+ * will return @true, there are important exceptions. For example, if the
+ * current CPU is deep within its idle loop, in kernel entry/exit code,
+ * or offline, rcu_is_watching() will return @false.
*
* Make notrace because it can be called by the internal functions of
* ftrace, and making this notrace avoids unnecessary recursive calls.
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 43229d2b0c44..5598212d1f27 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -77,9 +77,9 @@ __setup("rcu_nocbs", rcu_nocb_setup);
static int __init parse_rcu_nocb_poll(char *arg)
{
rcu_nocb_poll = true;
- return 0;
+ return 1;
}
-early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+__setup("rcu_nocb_poll", parse_rcu_nocb_poll);
/*
* Don't bother bypassing ->cblist if the call_rcu() rate is low.
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 5d113aa59e77..59032aaccd18 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -171,7 +171,8 @@ static void scf_torture_stats_print(void)
scfs.n_all_wait += scf_stats_p[i].n_all_wait;
}
if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) ||
- atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs))
+ atomic_read(&n_mb_out_errs) ||
+ (!IS_ENABLED(CONFIG_KASAN) && atomic_read(&n_alloc_errs)))
bangstr = "!!! ";
pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ",
SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched,
@@ -312,6 +313,7 @@ static void scf_handler_1(void *scfc_in)
// Randomly do an smp_call_function*() invocation.
static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp)
{
+ bool allocfail = false;
uintptr_t cpu;
int ret = 0;
struct scf_check *scfcp = NULL;
@@ -323,8 +325,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
preempt_disable();
if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) {
scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC);
- if (WARN_ON_ONCE(!scfcp)) {
+ if (!scfcp) {
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_KASAN));
atomic_inc(&n_alloc_errs);
+ allocfail = true;
} else {
scfcp->scfc_cpu = -1;
scfcp->scfc_wait = scfsp->scfs_wait;
@@ -431,7 +435,9 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
cpus_read_unlock();
else
preempt_enable();
- if (!(torture_random(trsp) & 0xfff))
+ if (allocfail)
+ schedule_timeout_idle((1 + longwait) * HZ); // Let no-wait handlers complete.
+ else if (!(torture_random(trsp) & 0xfff))
schedule_timeout_uninterruptible(1);
}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index d57a5c1c1cd9..3561ab533dd4 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -13,6 +13,23 @@
* Waiting for completion is typically a sync point, but not an exclusion point.
*/
+static void complete_with_flags(struct completion *x, int wake_flags)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+
+ if (x->done != UINT_MAX)
+ x->done++;
+ swake_up_locked(&x->wait, wake_flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+
+void complete_on_current_cpu(struct completion *x)
+{
+ return complete_with_flags(x, WF_CURRENT_CPU);
+}
+
/**
* complete: - signals a single thread waiting on this completion
* @x: holds the state of this particular completion
@@ -27,14 +44,7 @@
*/
void complete(struct completion *x)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&x->wait.lock, flags);
-
- if (x->done != UINT_MAX)
- x->done++;
- swake_up_locked(&x->wait);
- raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+ complete_with_flags(x, 0);
}
EXPORT_SYMBOL(complete);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c52c2eba7c73..2299a5cfbfb9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1097,25 +1097,22 @@ int get_nohz_timer_target(void)
hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
- rcu_read_lock();
+ guard(rcu)();
+
for_each_domain(cpu, sd) {
for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
- if (!idle_cpu(i)) {
- cpu = i;
- goto unlock;
- }
+ if (!idle_cpu(i))
+ return i;
}
}
if (default_cpu == -1)
default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
- cpu = default_cpu;
-unlock:
- rcu_read_unlock();
- return cpu;
+
+ return default_cpu;
}
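
Many hunks below replace open-coded lock/unlock pairs and goto-unlock exits with the kernel's scope-based guards (guard(), scoped_guard() from include/linux/cleanup.h), so every early return releases the lock automatically. A rough userspace analogue of the mechanism, built on the GCC/Clang cleanup attribute and pthreads; guard_mutex() is an invented stand-in, not the kernel macro:

#include <pthread.h>
#include <stdio.h>

static void unlock_cleanup(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

/* Lock now, unlock automatically when the enclosing scope is left. */
#define guard_mutex(m) \
	pthread_mutex_t *__guard __attribute__((cleanup(unlock_cleanup))) = \
		(pthread_mutex_lock(m), (m))

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int counter;

static int bump(int limit)
{
	guard_mutex(&lock);
	if (counter >= limit)
		return -1;	/* early return still unlocks */
	counter++;
	return 0;		/* and so does the normal path */
}

int main(void)
{
	while (!bump(3))
		;
	printf("counter = %d\n", counter);	/* 3 */
	return 0;
}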
/*
@@ -1194,6 +1191,20 @@ static void nohz_csd_func(void *info)
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
+static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
+{
+ if (rq->nr_running != 1)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ if (!task_on_rq_queued(p))
+ return false;
+
+ return true;
+}
+
bool sched_can_stop_tick(struct rq *rq)
{
int fifo_nr_running;
@@ -1229,6 +1240,18 @@ bool sched_can_stop_tick(struct rq *rq)
if (rq->nr_running > 1)
return false;
+ /*
+ * If there is one task and it has CFS runtime bandwidth constraints
+ * and it's on the CPU now, we don't want to stop the tick.
+ * This check prevents clearing the bit if a newly enqueued task here is
+ * dequeued by migrating while the constrained task continues to run.
+ * E.g. going from 2->1 without going through pick_next_task().
+ */
+ if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
+ if (cfs_task_bw_constrained(rq->curr))
+ return false;
+ }
+
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -1804,7 +1827,8 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
int old_min, old_max, old_min_rt;
int result;
- mutex_lock(&uclamp_mutex);
+ guard(mutex)(&uclamp_mutex);
+
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
@@ -1813,7 +1837,7 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
if (result)
goto undo;
if (!write)
- goto done;
+ return 0;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
@@ -1849,16 +1873,12 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
* Otherwise, keep it simple and do just a lazy update at each next
* task enqueue time.
*/
-
- goto done;
+ return 0;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
-done:
- mutex_unlock(&uclamp_mutex);
-
return result;
}
#endif
@@ -3413,7 +3433,6 @@ static int migrate_swap_stop(void *data)
{
struct migration_swap_arg *arg = data;
struct rq *src_rq, *dst_rq;
- int ret = -EAGAIN;
if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
return -EAGAIN;
@@ -3421,33 +3440,25 @@ static int migrate_swap_stop(void *data)
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
- double_raw_lock(&arg->src_task->pi_lock,
- &arg->dst_task->pi_lock);
- double_rq_lock(src_rq, dst_rq);
+ guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
+ guard(double_rq_lock)(src_rq, dst_rq);
if (task_cpu(arg->dst_task) != arg->dst_cpu)
- goto unlock;
+ return -EAGAIN;
if (task_cpu(arg->src_task) != arg->src_cpu)
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
__migrate_swap_task(arg->dst_task, arg->src_cpu);
- ret = 0;
-
-unlock:
- double_rq_unlock(src_rq, dst_rq);
- raw_spin_unlock(&arg->dst_task->pi_lock);
- raw_spin_unlock(&arg->src_task->pi_lock);
-
- return ret;
+ return 0;
}
/*
@@ -3722,14 +3733,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
struct sched_domain *sd;
__schedstat_inc(p->stats.nr_wakeups_remote);
- rcu_read_lock();
+
+ guard(rcu)();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
__schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
- rcu_read_unlock();
}
if (wake_flags & WF_MIGRATED)
@@ -3928,21 +3939,13 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
-
- rcu_read_lock();
- if (!is_idle_task(rcu_dereference(rq->curr)))
- goto out;
-
- rq_lock_irqsave(rq, &rf);
- if (is_idle_task(rq->curr))
- resched_curr(rq);
- /* Else CPU is not idle, do nothing here: */
- rq_unlock_irqrestore(rq, &rf);
-
-out:
- rcu_read_unlock();
+ guard(rcu)();
+ if (is_idle_task(rcu_dereference(rq->curr))) {
+ guard(rq_lock_irqsave)(rq);
+ if (is_idle_task(rq->curr))
+ resched_curr(rq);
+ }
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -4193,13 +4196,11 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
*/
-static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- unsigned long flags;
+ guard(preempt)();
int cpu, success = 0;
- preempt_disable();
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -4226,129 +4227,127 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* reordered with p->state check below. This pairs with smp_store_mb()
* in set_current_state() that the waiting thread does.
*/
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- smp_mb__after_spinlock();
- if (!ttwu_state_match(p, state, &success))
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ smp_mb__after_spinlock();
+ if (!ttwu_state_match(p, state, &success))
+ break;
- trace_sched_waking(p);
+ trace_sched_waking(p);
- /*
- * Ensure we load p->on_rq _after_ p->state, otherwise it would
- * be possible to, falsely, observe p->on_rq == 0 and get stuck
- * in smp_cond_load_acquire() below.
- *
- * sched_ttwu_pending() try_to_wake_up()
- * STORE p->on_rq = 1 LOAD p->state
- * UNLOCK rq->lock
- *
- * __schedule() (switch to task 'p')
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * UNLOCK rq->lock
- *
- * [task p]
- * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
- */
- smp_rmb();
- if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
- goto unlock;
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * STORE p->on_rq = 1 LOAD p->state
+ * UNLOCK rq->lock
+ *
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
+ */
+ smp_rmb();
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
+ break;
#ifdef CONFIG_SMP
- /*
- * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
- * possible to, falsely, observe p->on_cpu == 0.
- *
- * One must be running (->on_cpu == 1) in order to remove oneself
- * from the runqueue.
- *
- * __schedule() (switch to task 'p') try_to_wake_up()
- * STORE p->on_cpu = 1 LOAD p->on_rq
- * UNLOCK rq->lock
- *
- * __schedule() (put 'p' to sleep)
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * STORE p->on_rq = 0 LOAD p->on_cpu
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
- * care about it's own p->state. See the comment in __schedule().
- */
- smp_acquire__after_ctrl_dep();
+ /*
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * __schedule() (switch to task 'p') try_to_wake_up()
+ * STORE p->on_cpu = 1 LOAD p->on_rq
+ * UNLOCK rq->lock
+ *
+ * __schedule() (put 'p' to sleep)
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * STORE p->on_rq = 0 LOAD p->on_cpu
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * care about its own p->state. See the comment in __schedule().
+ */
+ smp_acquire__after_ctrl_dep();
- /*
- * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
- * == 0), which means we need to do an enqueue, change p->state to
- * TASK_WAKING such that we can unlock p->pi_lock before doing the
- * enqueue, such as ttwu_queue_wakelist().
- */
- WRITE_ONCE(p->__state, TASK_WAKING);
+ /*
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+ * == 0), which means we need to do an enqueue, change p->state to
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
+ * enqueue, such as ttwu_queue_wakelist().
+ */
+ WRITE_ONCE(p->__state, TASK_WAKING);
- /*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, considering queueing p on the remote CPUs wake_list
- * which potentially sends an IPI instead of spinning on p->on_cpu to
- * let the waker make forward progress. This is safe because IRQs are
- * disabled and the IPI will deliver after on_cpu is cleared.
- *
- * Ensure we load task_cpu(p) after p->on_cpu:
- *
- * set_task_cpu(p, cpu);
- * STORE p->cpu = @cpu
- * __schedule() (switch to task 'p')
- * LOCK rq->lock
- * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
- * STORE p->on_cpu = 1 LOAD p->cpu
- *
- * to ensure we observe the correct CPU on which the task is currently
- * scheduling.
- */
- if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
- goto unlock;
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, considering queueing p on the remote CPUs wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ *
+ * Ensure we load task_cpu(p) after p->on_cpu:
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spinlock() smp_cond_load_acquire(&p->on_cpu)
+ * STORE p->on_cpu = 1 LOAD p->cpu
+ *
+ * to ensure we observe the correct CPU on which the task is currently
+ * scheduling.
+ */
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
+ break;
- /*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, wait until it's done referencing the task.
- *
- * Pairs with the smp_store_release() in finish_task().
- *
- * This ensures that tasks getting woken will be fully ordered against
- * their previous state and preserve Program Order.
- */
- smp_cond_load_acquire(&p->on_cpu, !VAL);
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, wait until it's done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_task().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
+ */
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
- cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
- if (task_cpu(p) != cpu) {
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
+ cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
+ if (task_cpu(p) != cpu) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
- wake_flags |= WF_MIGRATED;
- psi_ttwu_dequeue(p);
- set_task_cpu(p, cpu);
- }
+ wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
+ set_task_cpu(p, cpu);
+ }
#else
- cpu = task_cpu(p);
+ cpu = task_cpu(p);
#endif /* CONFIG_SMP */
- ttwu_queue(p, cpu, wake_flags);
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ ttwu_queue(p, cpu, wake_flags);
+ }
out:
if (success)
ttwu_stat(p, task_cpu(p), wake_flags);
- preempt_enable();
return success;
}
@@ -4501,6 +4500,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+ p->se.vlag = 0;
+ p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -5496,23 +5497,20 @@ unsigned int nr_iowait(void)
void sched_exec(void)
{
struct task_struct *p = current;
- unsigned long flags;
+ struct migration_arg arg;
int dest_cpu;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
- if (dest_cpu == smp_processor_id())
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
+ if (dest_cpu == smp_processor_id())
+ return;
- if (likely(cpu_active(dest_cpu))) {
- struct migration_arg arg = { p, dest_cpu };
+ if (unlikely(!cpu_active(dest_cpu)))
+ return;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
- return;
+ arg = (struct migration_arg){ p, dest_cpu };
}
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
#endif
@@ -5722,9 +5720,6 @@ static void sched_tick_remote(struct work_struct *work)
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr;
- struct rq_flags rf;
- u64 delta;
int os;
/*
@@ -5734,30 +5729,26 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (!tick_nohz_tick_stopped_cpu(cpu))
- goto out_requeue;
+ if (tick_nohz_tick_stopped_cpu(cpu)) {
+ guard(rq_lock_irq)(rq);
+ struct task_struct *curr = rq->curr;
- rq_lock_irq(rq, &rf);
- curr = rq->curr;
- if (cpu_is_offline(cpu))
- goto out_unlock;
+ if (cpu_online(cpu)) {
+ update_rq_clock(rq);
- update_rq_clock(rq);
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a
+ * reasonable amount of time.
+ */
+ u64 delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
+ curr->sched_class->task_tick(rq, curr, 0);
- if (!is_idle_task(curr)) {
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- delta = rq_clock_task(rq) - curr->se.exec_start;
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ calc_load_nohz_remote(rq);
+ }
}
- curr->sched_class->task_tick(rq, curr, 0);
-
- calc_load_nohz_remote(rq);
-out_unlock:
- rq_unlock_irq(rq, &rf);
-out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
@@ -6306,19 +6297,19 @@ static bool try_steal_cookie(int this, int that)
unsigned long cookie;
bool success = false;
- local_irq_disable();
- double_rq_lock(dst, src);
+ guard(irq)();
+ guard(double_rq_lock)(dst, src);
cookie = dst->core->core_cookie;
if (!cookie)
- goto unlock;
+ return false;
if (dst->curr != dst->idle)
- goto unlock;
+ return false;
p = sched_core_find(src, cookie);
if (!p)
- goto unlock;
+ return false;
do {
if (p == src->core_pick || p == src->curr)
@@ -6330,9 +6321,10 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
/*
- * sched_core_find() and sched_core_next() will ensure that task @p
- * is not throttled now, we also need to check whether the runqueue
- * of the destination CPU is being throttled.
+ * sched_core_find() and sched_core_next() will ensure
+ * that task @p is not throttled now; we also need to
+ * check whether the runqueue of the destination CPU is
+ * being throttled.
*/
if (sched_task_is_throttled(p, this))
goto next;
@@ -6350,10 +6342,6 @@ next:
p = sched_core_next(p, cookie);
} while (p);
-unlock:
- double_rq_unlock(dst, src);
- local_irq_enable();
-
return success;
}
@@ -6411,20 +6399,24 @@ static void queue_core_balance(struct rq *rq)
queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
}
+DEFINE_LOCK_GUARD_1(core_lock, int,
+ sched_core_lock(*_T->lock, &_T->flags),
+ sched_core_unlock(*_T->lock, &_T->flags),
+ unsigned long flags)
+
static void sched_core_cpu_starting(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
- unsigned long flags;
int t;
- sched_core_lock(cpu, &flags);
+ guard(core_lock)(&cpu);
WARN_ON_ONCE(rq->core != rq);
/* if we're the first, we'll be our own leader */
if (cpumask_weight(smt_mask) == 1)
- goto unlock;
+ return;
/* find the leader */
for_each_cpu(t, smt_mask) {
@@ -6438,7 +6430,7 @@ static void sched_core_cpu_starting(unsigned int cpu)
}
if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
- goto unlock;
+ return;
/* install and validate core_rq */
for_each_cpu(t, smt_mask) {
@@ -6449,29 +6441,25 @@ static void sched_core_cpu_starting(unsigned int cpu)
WARN_ON_ONCE(rq->core != core_rq);
}
-
-unlock:
- sched_core_unlock(cpu, &flags);
}
static void sched_core_cpu_deactivate(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
- unsigned long flags;
int t;
- sched_core_lock(cpu, &flags);
+ guard(core_lock)(&cpu);
/* if we're the last man standing, nothing to do */
if (cpumask_weight(smt_mask) == 1) {
WARN_ON_ONCE(rq->core != rq);
- goto unlock;
+ return;
}
/* if we're not the leader, nothing to do */
if (rq->core != rq)
- goto unlock;
+ return;
/* find a new leader */
for_each_cpu(t, smt_mask) {
@@ -6482,7 +6470,7 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
}
if (WARN_ON_ONCE(!core_rq)) /* impossible */
- goto unlock;
+ return;
/* copy the shared state to the new leader */
core_rq->core_task_seq = rq->core_task_seq;
@@ -6504,9 +6492,6 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
rq = cpu_rq(t);
rq->core = core_rq;
}
-
-unlock:
- sched_core_unlock(cpu, &flags);
}
static inline void sched_core_cpu_dying(unsigned int cpu)
@@ -7030,7 +7015,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
- WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
@@ -7383,6 +7368,19 @@ struct task_struct *idle_task(int cpu)
return cpu_rq(cpu)->idle;
}
+#ifdef CONFIG_SCHED_CORE
+int sched_core_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (sched_core_enabled(rq) && rq->curr == rq->idle)
+ return 1;
+
+ return idle_cpu(cpu);
+}
+
+#endif
+
#ifdef CONFIG_SMP
/*
* This function computes an effective utilization for the given CPU, to be
@@ -9940,7 +9938,7 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -11074,11 +11072,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
/*
* Ensure max(child_quota) <= parent_quota. On cgroup2,
- * always take the min. On cgroup1, only inherit when no
- * limit is set:
+ * always take the non-RUNTIME_INF min. On cgroup1, only
+ * inherit when no limit is set. In both cases this is used
+ * by the scheduler to determine if a given CFS task has a
+ * bandwidth constraint at some higher level.
*/
if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
- quota = min(quota, parent_quota);
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF)
+ quota = min(quota, parent_quota);
} else {
if (quota == RUNTIME_INF)
quota = parent_quota;
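
The cgroup2 branch above folds the hierarchy by taking the minimum over all non-RUNTIME_INF quotas, which is what lets the scheduler ask whether any ancestor constrains a given CFS task's bandwidth. A hedged sketch of that folding rule, modelling RUNTIME_INF as -1 over a made-up root-to-leaf path:

#include <stdio.h>

/* Effective quota along a path: the smallest non-infinite value, if any. */
static long long effective_quota(const long long *quota, int depth)
{
	long long eff = -1;			/* -1 models RUNTIME_INF */

	for (int i = 0; i < depth; i++) {
		if (quota[i] < 0)		/* no limit at this level */
			continue;
		if (eff < 0 || quota[i] < eff)	/* non-INF minimum */
			eff = quota[i];
	}
	return eff;
}

int main(void)
{
	long long path[] = { -1, 200000, -1, 50000 };	/* root .. leaf */

	printf("effective quota: %lld\n", effective_quota(path, 4));	/* 50000 */
	return 0;
}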
@@ -11139,6 +11142,27 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
return 0;
}
+
+static u64 throttled_time_self(struct task_group *tg)
+{
+ int i;
+ u64 total = 0;
+
+ for_each_possible_cpu(i) {
+ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+ }
+
+ return total;
+}
+
+static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg));
+
+ return 0;
+}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -11215,6 +11239,10 @@ static struct cftype cpu_legacy_files[] = {
.name = "stat",
.seq_show = cpu_cfs_stat_show,
},
+ {
+ .name = "stat.local",
+ .seq_show = cpu_cfs_local_stat_show,
+ },
#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
@@ -11271,6 +11299,24 @@ static int cpu_extra_stat_show(struct seq_file *sf,
return 0;
}
+static int cpu_local_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ u64 throttled_self_usec;
+
+ throttled_self_usec = throttled_time_self(tg);
+ do_div(throttled_self_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "throttled_usec %llu\n",
+ throttled_self_usec);
+ }
+#endif
+ return 0;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
@@ -11449,6 +11495,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
+ .css_local_stat_show = cpu_local_stat_show,
#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 066ff1c8ae4e..4c3d0d9f3db6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -347,10 +347,7 @@ static __init int sched_init_debug(void)
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
#endif
- debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
- debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
- debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
- debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
@@ -427,6 +424,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent)
#undef SDM
debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
+ debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
}
void update_sched_domain_debugfs(void)
@@ -581,9 +579,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+ SPLIT_NS(p->se.deadline),
+ SPLIT_NS(p->se.slice),
+ SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
@@ -626,10 +628,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
- spread, rq0_min_vruntime, spread0;
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
+ struct sched_entity *last, *first;
struct rq *rq = cpu_rq(cpu);
- struct sched_entity *last;
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -643,26 +644,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_rq_lock_irqsave(rq, flags);
- if (rb_first_cached(&cfs_rq->tasks_timeline))
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+ first = __pick_first_entity(cfs_rq);
+ if (first)
+ left_vruntime = first->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
- max_vruntime = last->vruntime;
+ right_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
raw_spin_rq_unlock_irqrestore(rq, flags);
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
- SPLIT_NS(MIN_vruntime));
+
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
+ SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
SPLIT_NS(min_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
- SPLIT_NS(max_vruntime));
- spread = max_vruntime - MIN_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
- SPLIT_NS(spread));
- spread0 = min_vruntime - rq0_min_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
- SPLIT_NS(spread0));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
+ SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
+ SPLIT_NS(right_vruntime));
+ spread = right_vruntime - left_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
@@ -863,10 +863,7 @@ static void sched_debug_header(struct seq_file *m)
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
- PN(sysctl_sched_latency);
- PN(sysctl_sched_min_granularity);
- PN(sysctl_sched_idle_min_granularity);
- PN(sysctl_sched_wakeup_granularity);
+ PN(sysctl_sched_base_slice);
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#undef PN
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b3e25be58e2b..911d0063763c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>
#include <asm/switch_to.h>
@@ -57,22 +58,6 @@
#include "autogroup.h"
/*
- * Targeted preemption latency for CPU-bound tasks:
- *
- * NOTE: this latency value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS are of variable length
- * and have no persistent notion like in traditional, time-slice
- * based scheduling concepts.
- *
- * (to see the precise effective timeslice length of your workload,
- * run vmstat and monitor the context-switches (cs) field)
- *
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_latency = 6000000ULL;
-static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
-
-/*
* The initial- and re-scaling of tunables is configurable
*
* Options are:
@@ -90,21 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
-
-/*
- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
- * Applies only when SCHED_IDLE tasks compete with normal tasks.
- *
- * (default: 0.75 msec)
- */
-unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
-
-/*
- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
- */
-static unsigned int sched_nr_latency = 8;
+unsigned int sysctl_sched_base_slice = 750000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
/*
* After fork, child runs first. If set to 0 (default) then
@@ -112,18 +84,6 @@ static unsigned int sched_nr_latency = 8;
*/
unsigned int sysctl_sched_child_runs_first __read_mostly;
-/*
- * SCHED_OTHER wake-up granularity.
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- *
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
-
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
int sched_thermal_decay_shift;
@@ -277,9 +237,7 @@ static void update_sysctl(void)
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
- SET_SYSCTL(sched_min_granularity);
- SET_SYSCTL(sched_latency);
- SET_SYSCTL(sched_wakeup_granularity);
+ SET_SYSCTL(sched_base_slice);
#undef SET_SYSCTL
}
@@ -347,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}
const struct sched_class fair_sched_class;
@@ -601,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a,
return (s64)(a->vruntime - b->vruntime) < 0;
}
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return (s64)(se->vruntime - cfs_rq->min_vruntime);
+}
+
#define __node_2_se(node) \
rb_entry((node), struct sched_entity, run_node)
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag:
+ *
+ * \Sum lag_i = 0
+ *
+ * Where lag_i is given by:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * Where S is the ideal service time and V is its virtual time counterpart.
+ * Therefore:
+ *
+ * \Sum lag_i = 0
+ * \Sum w_i * (V - v_i) = 0
+ * \Sum w_i * V - w_i * v_i = 0
+ *
+ * From which we can solve an expression for V in v_i (which we have in
+ * se->vruntime):
+ *
+ * \Sum v_i * w_i \Sum v_i * w_i
+ * V = -------------- = --------------
+ * \Sum w_i W
+ *
+ * Specifically, this is the weighted average of all entity virtual runtimes.
+ *
+ * [[ NOTE: this is only equal to the ideal scheduler under the condition
+ * that join/leave operations happen at lag_i = 0, otherwise the
+ * virtual time has non-contiguous motion equivalent to:
+ *
+ * V +-= lag_i / W
+ *
+ * Also see the comment in place_entity() that deals with this. ]]
+ *
+ * However, since v_i is u64 and the multiplication could easily overflow,
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v0) + v0
+ *
+ * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
+ * V = ---------------------------- = --------------------- + v0
+ * W W
+ *
+ * Which we track using:
+ *
+ * v0 := cfs_rq->min_vruntime
+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
+ * \Sum w_i := cfs_rq->avg_load
+ *
+ * Since min_vruntime is a monotonically increasing variable that closely
+ * tracks the per-task service, these deltas: (v_i - v0), will be in the order of the
+ * maximal (virtual) lag induced in the system due to quantisation.
+ *
+ * Also, we use scale_load_down() to reduce the size.
+ *
+ * As measured, the max (key * weight) value was ~44 bits for a kernel build.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_load += weight;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_load -= weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ /*
+ * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
+ */
+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ if (load)
+ avg = div_s64(avg, load);
+
+ return cfs_rq->min_vruntime + avg;
+}
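+
A quick numeric check of the relative form derived in the comment above and used by avg_vruntime(): summing (v_i - v0)*w_i against v0 = min_vruntime keeps the summands small but yields the same V as the direct weighted average. Standalone, with made-up vruntimes and weights:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long long v[] = { 1000, 1040, 1100 };	/* entity vruntimes */
	unsigned long w[] = { 1024, 2048, 1024 };	/* entity weights */
	unsigned long long v0 = 1000;			/* min_vruntime */
	long long avg = 0, num = 0;
	long W = 0;

	for (int i = 0; i < 3; i++) {
		avg += (long long)(v[i] - v0) * w[i];	/* \Sum (v_i - v0) * w_i */
		num += (long long)v[i] * w[i];		/* \Sum v_i * w_i */
		W += w[i];
	}

	/* relative form: V = v0 + avg/W; direct form: V = num/W */
	assert((long long)v0 + avg / W == num / W);
	printf("V = %lld\n", (long long)v0 + avg / W);	/* 1045 */
	return 0;
}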
+
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to double the slice length, with a minimum of TICK_NSEC,
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ * -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
+ */
+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 lag, limit;
+
+ SCHED_WARN_ON(!se->on_rq);
+ lag = avg_vruntime(cfs_rq) - se->vruntime;
+
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ se->vlag = clamp(lag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * eg. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ * \Sum (v_j - v)*w_j
+ * V = ------------------ + v
+ * \Sum w_j
+ *
+ * lag_i >= 0 -> \Sum (v_j - v)*w_j >= (v_i - v)*(\Sum w_j)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
+ * to the loss in precision caused by the division.
+ */
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ return avg >= entity_key(cfs_rq, se) * load;
+}
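+
entity_eligible() dodges the division (and the precision loss the comment warns about) by cross-multiplying: V >= v_i becomes avg >= key*load in the relative quantities. A toy version of that test, reusing the numbers from the averaging sketch above; eligible() is illustrative only:

#include <stdbool.h>
#include <stdio.h>

/* avg = \Sum (v_j - v0)*w_j, load = \Sum w_j, key = v_i - v0 */
static bool eligible(long long avg, long load, long long key)
{
	return avg >= key * load;	/* V >= v_i, with no avg/load division */
}

int main(void)
{
	long long avg = 184320;		/* V = v0 + 45 for these sums */
	long load = 4096;

	printf("key 40: %d\n", eligible(avg, load, 40));	/* 1: owed service */
	printf("key 100: %d\n", eligible(avg, load, 100));	/* 0: ran ahead of V */
	return 0;
}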
+
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ u64 min_vruntime = cfs_rq->min_vruntime;
+ /*
+ * open coded max_vruntime() to allow updating avg_vruntime
+ */
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta > 0) {
+ avg_vruntime_update(cfs_rq, delta);
+ min_vruntime = vruntime;
+ }
+ return min_vruntime;
+}
+
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
@@ -618,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (leftmost) { /* non-empty tree */
- struct sched_entity *se = __node_2_se(leftmost);
-
+ if (se) {
if (!curr)
vruntime = se->vruntime;
else
@@ -629,7 +780,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
/* ensure we never gain time by being placed backwards. */
u64_u32_store(cfs_rq->min_vruntime,
- max_vruntime(cfs_rq->min_vruntime, vruntime));
+ __update_min_vruntime(cfs_rq, vruntime));
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -637,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (deadline_gt(min_deadline, se, rse))
+ se->min_deadline = rse->min_deadline;
+ }
+}
+
+/*
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ */
+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+{
+ u64 old_min_deadline = se->min_deadline;
+ struct rb_node *node = &se->run_node;
+
+ se->min_deadline = se->deadline;
+ __update_min_deadline(se, node->rb_right);
+ __update_min_deadline(se, node->rb_left);
+
+ return se->min_deadline == old_min_deadline;
+}
+
+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
+ run_node, min_deadline, min_deadline_update);
+
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
+ avg_vruntime_add(cfs_rq, se);
+ se->min_deadline = se->deadline;
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ __entity_less, &min_deadline_cb);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &min_deadline_cb);
+ avg_vruntime_sub(cfs_rq, se);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -660,14 +845,88 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
return __node_2_se(left);
}
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ * 1) the task must be eligible (must be owed service)
+ *
+ * 2) from those tasks that meet 1), we select the one
+ * with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on service, but also functions as a
+ * heap based on the deadline by keeping:
+ *
+ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *
+ * Which allows an EDF like search on (sub)trees.
+ */
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
- struct rb_node *next = rb_next(&se->run_node);
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *curr = cfs_rq->curr;
+ struct sched_entity *best = NULL;
- if (!next)
- return NULL;
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+ curr = NULL;
+
+ /*
+ * Once selected, run a task until it either becomes non-eligible or
+ * until it gets a new slice. See the HACK in set_next_entity().
+ */
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ return curr;
+
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ /*
+ * If this entity is not eligible, try the left subtree.
+ */
+ if (!entity_eligible(cfs_rq, se)) {
+ node = node->rb_left;
+ continue;
+ }
+
+ /*
+ * If this entity has an earlier deadline than the previous
+ * best, take this one. If it also has the earliest deadline
+ * of its subtree, we're done.
+ */
+ if (!best || deadline_gt(deadline, best, se)) {
+ best = se;
+ if (best->deadline == best->min_deadline)
+ break;
+ }
- return __node_2_se(next);
+ /*
+ * If the earliest deadline in this subtree is in the fully
+ * eligible left half of our space, go there.
+ */
+ if (node->rb_left &&
+ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
+ node = node->rb_left;
+ continue;
+ }
+
+ node = node->rb_right;
+ }
+
+ if (!best || (curr && deadline_gt(deadline, best, curr)))
+ best = curr;
+
+ if (unlikely(!best)) {
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ if (left) {
+ pr_err("EEVDF scheduling fail, picking leftmost\n");
+ return left;
+ }
+ }
+
+ return best;
}
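
pick_eevdf() owes its O(log n) bound to the min_deadline heap kept in the augmented rbtree, but the selection rule itself is just the two criteria from the comment above. A linear reference version, assuming equal weights so that eligibility reduces to v_i <= V; struct ent and pick() are illustrative, not the kernel types:

#include <stdio.h>

struct ent { long long vruntime, deadline; };

/* Index of the eligible entity with the earliest virtual deadline, or -1. */
static int pick(const struct ent *e, int n, long long V)
{
	int best = -1;

	for (int i = 0; i < n; i++) {
		if (e[i].vruntime > V)		/* lag < 0: not eligible */
			continue;
		if (best < 0 || e[i].deadline < e[best].deadline)
			best = i;
	}
	return best;
}

int main(void)
{
	struct ent e[] = {
		{ .vruntime =  90, .deadline = 140 },
		{ .vruntime = 100, .deadline = 120 },	/* eligible, earliest vd */
		{ .vruntime = 130, .deadline = 110 },	/* earliest vd, not eligible */
	};

	printf("picked %d\n", pick(e, 3, 105));	/* prints 1 */
	return 0;
}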
#ifdef CONFIG_SCHED_DEBUG
@@ -684,109 +943,51 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
/**************************************************************
* Scheduling class statistics methods:
*/
-
+#ifdef CONFIG_SMP
int sched_update_scaling(void)
{
unsigned int factor = get_update_sysctl_factor();
- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
- sysctl_sched_min_granularity);
-
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
- WRT_SYSCTL(sched_min_granularity);
- WRT_SYSCTL(sched_latency);
- WRT_SYSCTL(sched_wakeup_granularity);
+ WRT_SYSCTL(sched_base_slice);
#undef WRT_SYSCTL
return 0;
}
#endif
+#endif
-/*
- * delta /= w
- */
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-{
- if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-
- return delta;
-}
-
-/*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
- */
-static u64 __sched_period(unsigned long nr_running)
-{
- if (unlikely(nr_running > sched_nr_latency))
- return nr_running * sysctl_sched_min_granularity;
- else
- return sysctl_sched_latency;
-}
-
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
- * We calculate the wall-time slice from the period by taking a part
- * proportional to the weight.
- *
- * s = p*P[w/rw]
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i,
+ * but this is probably good enough.
*/
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- unsigned int nr_running = cfs_rq->nr_running;
- struct sched_entity *init_se = se;
- unsigned int min_gran;
- u64 slice;
-
- if (sched_feat(ALT_PERIOD))
- nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
-
- slice = __sched_period(nr_running + !se->on_rq);
-
- for_each_sched_entity(se) {
- struct load_weight *load;
- struct load_weight lw;
- struct cfs_rq *qcfs_rq;
-
- qcfs_rq = cfs_rq_of(se);
- load = &qcfs_rq->load;
-
- if (unlikely(!se->on_rq)) {
- lw = qcfs_rq->load;
+ if ((s64)(se->vruntime - se->deadline) < 0)
+ return;
- update_load_add(&lw, se->load.weight);
- load = &lw;
- }
- slice = __calc_delta(slice, se->load.weight, load);
- }
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * sysctl_sched_base_slice.
+ */
+ se->slice = sysctl_sched_base_slice;
- if (sched_feat(BASE_SLICE)) {
- if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
- min_gran = sysctl_sched_idle_min_granularity;
- else
- min_gran = sysctl_sched_min_granularity;
+ /*
+ * EEVDF: vd_i = ve_i + r_i / w_i
+ */
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
- slice = max_t(u64, slice, min_gran);
+ /*
+ * The task has consumed its request, reschedule.
+ */
+ if (cfs_rq->nr_running > 1) {
+ resched_curr(rq_of(cfs_rq));
+ clear_buddies(cfs_rq, se);
}
-
- return slice;
-}
-
-/*
- * We calculate the vruntime slice of a to-be-inserted task.
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
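
update_deadline() places the next virtual deadline one request ahead: vd_i = ve_i + r_i/w_i, with r_i fixed at sysctl_sched_base_slice and only the weight varying. A small arithmetic sketch of why a heavier (lower-nice) entity gets earlier deadlines, hence more frequent picks; NICE_0_LOAD is assumed to be 1024:

#include <stdio.h>

#define NICE_0_LOAD 1024ULL

/* calc_delta_fair(): scale a wall-clock request into virtual time. */
static unsigned long long vslice(unsigned long long slice_ns, unsigned long w)
{
	return slice_ns * NICE_0_LOAD / w;
}

int main(void)
{
	unsigned long long r = 750000;	/* sysctl_sched_base_slice: 0.75 ms */

	/* nice 0: virtual deadlines advance by the full request */
	printf("w=1024: vslice=%llu ns\n", vslice(r, 1024));	/* 750000 */
	/* double weight: deadlines come twice as often -> 2x the CPU share */
	printf("w=2048: vslice=%llu ns\n", vslice(r, 2048));	/* 375000 */
	return 0;
}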
#include "pelt.h"
@@ -921,6 +1122,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
@@ -3375,16 +3577,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
+ unsigned long old_weight = se->load.weight;
+
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
+ else
+ avg_vruntime_sub(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
update_load_set(&se->load, weight);
+ if (!se->on_rq) {
+ /*
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+ * we need to scale se->vlag when w_i changes.
+ */
+ se->vlag = div_s64(se->vlag * old_weight, weight);
+ } else {
+ s64 deadline = se->deadline - se->vruntime;
+ /*
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ */
+ deadline = div_s64(deadline * old_weight, weight);
+ se->deadline = se->vruntime + deadline;
+ }
+
#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
@@ -3394,9 +3616,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
#endif
enqueue_load_avg(cfs_rq, se);
- if (se->on_rq)
+ if (se->on_rq) {
update_load_add(&cfs_rq->load, se->load.weight);
-
+ if (cfs_rq->curr != se)
+ avg_vruntime_add(cfs_rq, se);
+ }
}
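
Only the virtual lag vl_i = V - v_i is stored, while the real lag is lag_i = w_i * vl_i, so the rescaling above keeps w * vlag constant across a reweight. A hedged check of that invariant in plain C:

typedef long long s64;

/* se->vlag = se->vlag * old_weight / new_weight preserves w * vlag. */
static s64 rescale_vlag(s64 vlag, s64 old_weight, s64 new_weight)
{
        return vlag * old_weight / new_weight;
}

/* e.g. vlag 600 with weight 1024 -> 2048 becomes 300: 1024*600 == 2048*300 */
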
void reweight_task(struct task_struct *p, int prio)
@@ -4692,159 +4916,125 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHED_DEBUG
- s64 d = se->vruntime - cfs_rq->min_vruntime;
-
- if (d < 0)
- d = -d;
-
- if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq->nr_spread_over);
-#endif
-}
-
-static inline bool entity_is_long_sleeper(struct sched_entity *se)
+static void
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq;
- u64 sleep_time;
+ u64 vslice = calc_delta_fair(se->slice, se);
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 lag = 0;
- if (se->exec_start == 0)
- return false;
-
- cfs_rq = cfs_rq_of(se);
-
- sleep_time = rq_clock_task(rq_of(cfs_rq));
+ /*
+ * Due to how V is constructed as the weighted average of entities,
+ * adding tasks with positive lag, or removing tasks with negative lag
+ * will move 'time' backwards; this can screw around with the lag of
+ * other tasks.
+ *
+ * EEVDF: placement strategy #1 / #2
+ */
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
+ struct sched_entity *curr = cfs_rq->curr;
+ unsigned long load;
- /* Happen while migrating because of clock task divergence */
- if (sleep_time <= se->exec_start)
- return false;
+ lag = se->vlag;
- sleep_time -= se->exec_start;
- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
- return true;
+ /*
+ * If we want to place a task and preserve lag, we have to
+ * consider the effect of the new entity on the weighted
+ * average and compensate for this, otherwise lag can quickly
+ * evaporate.
+ *
+ * Lag is defined as:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * To avoid the 'w_i' term all over the place, we only track
+ * the virtual lag:
+ *
+ * vl_i = V - v_i <=> v_i = V - vl_i
+ *
+ * And we take V to be the weighted average of all v:
+ *
+ * V = (\Sum w_j*v_j) / W
+ *
+ * Where W is: \Sum w_j
+ *
+ * Then, the weighted average after adding an entity with lag
+ * vl_i is given by:
+ *
+ * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+ * = (W*V + w_i*(V - vl_i)) / (W + w_i)
+ * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+ * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
+ * = V - w_i*vl_i / (W + w_i)
+ *
+ * And the actual lag after adding an entity with vl_i is:
+ *
+ * vl'_i = V' - v_i
+ * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+ * = vl_i - w_i*vl_i / (W + w_i)
+ *
+ * Which is strictly less than vl_i. So in order to preserve lag
+ * we should inflate the lag before placement such that the
+ * effective lag after placement comes out right.
+ *
+ * As such, invert the above relation for vl'_i to get the vl_i
+ * we need to use such that the lag after placement is the lag
+ * we computed before dequeue.
+ *
+ * vl'_i = vl_i - w_i*vl_i / (W + w_i)
+ * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+ *
+ * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+ * = W*vl_i
+ *
+ * vl_i = (W + w_i)*vl'_i / W
+ */
+ load = cfs_rq->avg_load;
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
- return false;
-}
+ lag *= load + scale_load_down(se->load.weight);
+ if (WARN_ON_ONCE(!load))
+ load = 1;
+ lag = div_s64(lag, load);
+ }
-static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
-{
- u64 vruntime = cfs_rq->min_vruntime;
+ se->vruntime = vruntime - lag;
/*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
+ * When joining the competition, the existing tasks will be, on
+ * average, halfway through their slice; as such, start tasks off
+ * with half a slice to ease into the competition.
*/
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
-
- /* sleeps up to a single latency don't count. */
- if (!initial) {
- unsigned long thresh;
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
+ vslice /= 2;
- if (se_is_idle(se))
- thresh = sysctl_sched_min_granularity;
- else
- thresh = sysctl_sched_latency;
-
- /*
- * Halve their sleep time's effect, to allow
- * for a gentler effect of sleepers:
- */
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
- thresh >>= 1;
-
- vruntime -= thresh;
- }
-
- /*
- * Pull vruntime of the entity being placed to the base level of
- * cfs_rq, to prevent boosting it if placed backwards.
- * However, min_vruntime can advance much faster than real time, with
- * the extreme being when an entity with the minimal weight always runs
- * on the cfs_rq. If the waking entity slept for a long time, its
- * vruntime difference from min_vruntime may overflow s64 and their
- * comparison may get inversed, so ignore the entity's original
- * vruntime in that case.
- * The maximal vruntime speedup is given by the ratio of normal to
- * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
- * When placing a migrated waking entity, its exec_start has been set
- * from a different rq. In order to take into account a possible
- * divergence between new and prev rq's clocks task because of irq and
- * stolen time, we take an additional margin.
- * So, cutting off on the sleep time of
- * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
- * should be safe.
- */
- if (entity_is_long_sleeper(se))
- se->vruntime = vruntime;
- else
- se->vruntime = max_vruntime(se->vruntime, vruntime);
+ /*
+ * EEVDF: vd_i = ve_i + r_i/w_i
+ */
+ se->deadline = se->vruntime + vslice;
}
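
The inversion vl_i = (W + w_i)*vl'_i / W derived in the comment block can be sanity-checked with concrete numbers; the weights below are arbitrary nice-0 values, not taken from the patch:

typedef long long s64;

/* Inflate the stored lag so the post-placement lag comes out right. */
static s64 inflate_vlag(s64 vlag, s64 W, s64 w_i)
{
        return vlag * (W + w_i) / W;
}

/*
 * W = 3072 (three nice-0 entities), w_i = 1024, stored vlag = 300:
 * inflated vlag = 300 * 4096 / 3072 = 400.  Adding the entity moves
 * V by w_i*vl_i/(W + w_i) = 1024*400/4096 = 100, so the effective
 * lag is 400 - 100 = 300, exactly what was saved at dequeue.
 */
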
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
static inline bool cfs_bandwidth_used(void);
-/*
- * MIGRATION
- *
- * dequeue
- * update_curr()
- * update_min_vruntime()
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way the vruntime transition between RQs is done when both
- * min_vruntime are up-to-date.
- *
- * WAKEUP (remote)
- *
- * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way we don't have the most up-to-date min_vruntime on the originating
- * CPU and an up-to-date min_vruntime on the destination CPU.
- */
-
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;
/*
* If we're the current task, we must renormalise before calling
* update_curr().
*/
- if (renorm && curr)
- se->vruntime += cfs_rq->min_vruntime;
+ if (curr)
+ place_entity(cfs_rq, se, flags);
update_curr(cfs_rq);
/*
- * Otherwise, renormalise after, such that we're placed at the current
- * moment in time, instead of some random moment in the past. Being
- * placed in the past could significantly boost this task to the
- * fairness detriment of existing tasks.
- */
- if (renorm && !curr)
- se->vruntime += cfs_rq->min_vruntime;
-
- /*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
@@ -4855,37 +5045,46 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
se_update_runnable(se);
+ /*
+ * XXX update_load_avg() above will have attached us to the pelt sum;
+ * but update_cfs_group() here will re-adjust the weight and have to
+ * undo/redo all that. Seems wasteful.
+ */
update_cfs_group(se);
+
+ /*
+ * XXX now that the entity has been re-weighted, and its lag adjusted,
+ * we can place the entity.
+ */
+ if (!curr)
+ place_entity(cfs_rq, se, flags);
+
account_entity_enqueue(cfs_rq, se);
- if (flags & ENQUEUE_WAKEUP)
- place_entity(cfs_rq, se, 0);
/* Entity has migrated, no longer consider this task hot */
if (flags & ENQUEUE_MIGRATED)
se->exec_start = 0;
check_schedstat_required();
update_stats_enqueue_fair(cfs_rq, se, flags);
- check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
if (cfs_rq->nr_running == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq))
+ if (!throttled_hierarchy(cfs_rq)) {
list_add_leaf_cfs_rq(cfs_rq);
- }
-}
-
-static void __clear_buddies_last(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last != se)
- break;
+ } else {
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct rq *rq = rq_of(cfs_rq);
- cfs_rq->last = NULL;
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+#endif
+ }
}
}
@@ -4900,27 +5099,10 @@ static void __clear_buddies_next(struct sched_entity *se)
}
}
-static void __clear_buddies_skip(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip != se)
- break;
-
- cfs_rq->skip = NULL;
- }
-}
-
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->last == se)
- __clear_buddies_last(se);
-
if (cfs_rq->next == se)
__clear_buddies_next(se);
-
- if (cfs_rq->skip == se)
- __clear_buddies_skip(se);
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -4954,20 +5136,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
clear_buddies(cfs_rq, se);
+ update_entity_lag(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
- /*
- * Normalize after update_curr(); which will also have moved
- * min_vruntime if @se is the one holding it back. But before doing
- * update_min_vruntime() again, which will discount @se's position and
- * can move min_vruntime forward still more.
- */
- if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
-
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
@@ -4986,52 +5160,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_idle_cfs_rq_clock_pelt(cfs_rq);
}
-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- unsigned long ideal_runtime, delta_exec;
- struct sched_entity *se;
- s64 delta;
-
- /*
- * When many tasks blow up the sched_period; it is possible that
- * sched_slice() reports unusually large results (when many tasks are
- * very light for example). Therefore impose a maximum.
- */
- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
-
- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime) {
- resched_curr(rq_of(cfs_rq));
- /*
- * The current task ran long enough, ensure it doesn't get
- * re-elected due to buddy favours.
- */
- clear_buddies(cfs_rq, curr);
- return;
- }
-
- /*
- * Ensure that a task that missed wakeup preemption by a
- * narrow margin doesn't have to wait for a full slice.
- * This also mitigates buddy induced latencies under load.
- */
- if (delta_exec < sysctl_sched_min_granularity)
- return;
-
- se = __pick_first_entity(cfs_rq);
- delta = curr->vruntime - se->vruntime;
-
- if (delta < 0)
- return;
-
- if (delta > ideal_runtime)
- resched_curr(rq_of(cfs_rq));
-}
-
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -5047,6 +5175,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ /*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+ se->vlag = se->deadline;
}
update_stats_curr_start(cfs_rq, se);
@@ -5070,9 +5203,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
@@ -5083,50 +5213,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *left = __pick_first_entity(cfs_rq);
- struct sched_entity *se;
-
/*
- * If curr is set we have to see if its left of the leftmost entity
- * still in the tree, provided there was anything in the tree at all.
+ * Enabling NEXT_BUDDY will affect latency but not fairness.
*/
- if (!left || (curr && entity_before(curr, left)))
- left = curr;
-
- se = left; /* ideally we run the leftmost entity */
-
- /*
- * Avoid running the skip buddy, if running something else can
- * be done without getting too unfair.
- */
- if (cfs_rq->skip && cfs_rq->skip == se) {
- struct sched_entity *second;
-
- if (se == curr) {
- second = __pick_first_entity(cfs_rq);
- } else {
- second = __pick_next_entity(se);
- if (!second || (curr && entity_before(curr, second)))
- second = curr;
- }
-
- if (second && wakeup_preempt_entity(second, left) < 1)
- se = second;
- }
+ if (sched_feat(NEXT_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ return cfs_rq->next;
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
- /*
- * Someone really wants this to run. If it's not unfair, run it.
- */
- se = cfs_rq->next;
- } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
- /*
- * Prefer last buddy, try to return the CPU to a preempted task.
- */
- se = cfs_rq->last;
- }
-
- return se;
+ return pick_eevdf(cfs_rq);
}
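
pick_eevdf() itself is defined elsewhere in the series; as a rough model (an assumption about its behaviour, not the patched implementation, which walks an augmented rbtree), it returns the entity with the earliest virtual deadline among those that are eligible, i.e. whose vruntime does not exceed the weighted average V:

struct entity_sketch {
        long long vruntime, deadline;
        struct entity_sketch *next;
};

static struct entity_sketch *
pick_eevdf_sketch(struct entity_sketch *head, long long V)
{
        struct entity_sketch *best = 0, *e;

        for (e = head; e; e = e->next) {
                if (e->vruntime > V)	/* not eligible: already ahead of V */
                        continue;
                if (!best || e->deadline < best->deadline)
                        best = e;
        }
        return best;	/* earliest eligible virtual deadline */
}
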
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -5143,8 +5237,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
-
if (prev->on_rq) {
update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
@@ -5185,9 +5277,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
-
- if (cfs_rq->nr_running > 1)
- check_preempt_tick(cfs_rq, curr);
}
@@ -5377,6 +5466,17 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
/* Add cfs_rq with load or one or more already running entities to the list */
if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
+
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+
+ cfs_rq->throttled_clock_self = 0;
+
+ if (SCHED_WARN_ON((s64)delta < 0))
+ delta = 0;
+
+ cfs_rq->throttled_clock_self_time += delta;
+ }
}
return 0;
@@ -5391,6 +5491,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
+
+ SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_running)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
}
cfs_rq->throttle_count++;
@@ -5480,7 +5584,9 @@ done:
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
+ SCHED_WARN_ON(cfs_rq->throttled_clock);
+ if (cfs_rq->nr_running)
+ cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -5498,7 +5604,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
update_rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ if (cfs_rq->throttled_clock) {
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ cfs_rq->throttled_clock = 0;
+ }
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
@@ -6014,13 +6123,14 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
{
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
cfs_b->burst = 0;
+ cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
@@ -6157,6 +6267,46 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
rq_clock_stop_loop_update(rq);
}
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ if (!cfs_bandwidth_used())
+ return false;
+
+ if (cfs_rq->runtime_enabled ||
+ tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/* called from pick_next_task_fair() */
+static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq);
+
+ if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
+ return;
+
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ if (rq->nr_running != 1)
+ return;
+
+ /*
+ * We know there is only one task runnable and we've just picked it. The
+ * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we would
+ * otherwise be able to stop the tick. We just need to check whether
+ * bandwidth control is in use.
+ */
+ if (cfs_task_bw_constrained(p))
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#endif
+
#else /* CONFIG_CFS_BANDWIDTH */
static inline bool cfs_bandwidth_used(void)
@@ -6186,9 +6336,8 @@ static inline int throttled_lb_pair(struct task_group *tg,
return 0;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif
@@ -6199,9 +6348,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
-
+#ifdef CONFIG_CGROUP_SCHED
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ return false;
+}
+#endif
#endif /* CONFIG_CFS_BANDWIDTH */
+#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
+static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
+#endif
+
/**************************************************
* CFS operations on tasks:
*/
@@ -6210,13 +6368,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
SCHED_WARN_ON(task_rq(p) != rq);
if (rq->cfs.h_nr_running > 1) {
- u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
s64 delta = slice - ran;
if (delta < 0) {
@@ -6240,8 +6397,7 @@ static void hrtick_update(struct rq *rq)
if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
return;
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
@@ -6282,17 +6438,6 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-/*
- * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
- * of idle_nr_running, which does not consider idle descendants of normal
- * entities.
- */
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->nr_running &&
- cfs_rq->nr_running == cfs_rq->idle_nr_running;
-}
-
#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
@@ -7065,7 +7210,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);
- for_each_cpu_wrap(cpu, cpus, target + 1) {
+ for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
@@ -7289,9 +7434,6 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
- if (boost)
- util_est = max(util_est, runnable);
-
/*
* During wake-up @p isn't enqueued yet and doesn't contribute
* to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
@@ -7741,6 +7883,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
if (wake_flags & WF_TTWU) {
record_wakee(p);
+ if ((wake_flags & WF_CURRENT_CPU) &&
+ cpumask_test_cpu(cpu, p->cpus_ptr))
+ return cpu;
+
if (sched_energy_enabled()) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
@@ -7798,18 +7944,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
struct sched_entity *se = &p->se;
- /*
- * As blocked tasks retain absolute vruntime the migration needs to
- * deal with this by subtracting the old and adding the new
- * min_vruntime -- the latter is done by enqueue_entity() when placing
- * the task on the new runqueue.
- */
- if (READ_ONCE(p->__state) == TASK_WAKING) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
- }
-
if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);
@@ -7847,66 +7981,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
#endif /* CONFIG_SMP */
-static unsigned long wakeup_gran(struct sched_entity *se)
-{
- unsigned long gran = sysctl_sched_wakeup_granularity;
-
- /*
- * Since its curr running now, convert the gran from real-time
- * to virtual-time in his units.
- *
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- return calc_delta_fair(gran, se);
-}
-
-/*
- * Should 'se' preempt 'curr'.
- *
- * |s1
- * |s2
- * |s3
- * g
- * |<--->|c
- *
- * w(c, s1) = -1
- * w(c, s2) = 0
- * w(c, s3) = 1
- *
- */
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
- s64 gran, vdiff = curr->vruntime - se->vruntime;
-
- if (vdiff <= 0)
- return -1;
-
- gran = wakeup_gran(se);
- if (vdiff > gran)
- return 1;
-
- return 0;
-}
-
-static void set_last_buddy(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
- return;
- if (se_is_idle(se))
- return;
- cfs_rq_of(se)->last = se;
- }
-}
-
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
@@ -7918,12 +7992,6 @@ static void set_next_buddy(struct sched_entity *se)
}
}
-static void set_skip_buddy(struct sched_entity *se)
-{
- for_each_sched_entity(se)
- cfs_rq_of(se)->skip = se;
-}
-
/*
* Preempt the current task with a newly woken task if needed:
*/
@@ -7932,7 +8000,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
int cse_is_idle, pse_is_idle;
@@ -7948,7 +8015,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
return;
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
next_buddy_marked = 1;
}
@@ -7993,35 +8060,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (cse_is_idle != pse_is_idle)
return;
- update_curr(cfs_rq_of(se));
- if (wakeup_preempt_entity(se, pse) == 1) {
- /*
- * Bias pick_next to pick the sched entity that is
- * triggering this preemption.
- */
- if (!next_buddy_marked)
- set_next_buddy(pse);
+ cfs_rq = cfs_rq_of(se);
+ update_curr(cfs_rq);
+
+ /*
+ * XXX pick_eevdf(cfs_rq) != se ?
+ */
+ if (pick_eevdf(cfs_rq) == pse)
goto preempt;
- }
return;
preempt:
resched_curr(rq);
- /*
- * Only set the backward buddy when the current task is still
- * on the rq. This can happen when a wakeup gets interleaved
- * with schedule on the ->pre_schedule() or idle_balance()
- * point, either of which can * drop the rq lock.
- *
- * Also, during early boot the idle thread is in the fair class,
- * for obvious reasons its a bad idea to schedule back to it.
- */
- if (unlikely(!se->on_rq || curr == rq->idle))
- return;
-
- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
- set_last_buddy(se);
}
#ifdef CONFIG_SMP
@@ -8172,6 +8223,7 @@ done: __maybe_unused;
hrtick_start_fair(rq, p);
update_misfit_status(p, rq);
+ sched_fair_update_stop_tick(rq, p);
return p;
@@ -8222,8 +8274,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
/*
* sched_yield() is very simple
- *
- * The magic of dealing with the ->skip buddy is in pick_next_entity.
*/
static void yield_task_fair(struct rq *rq)
{
@@ -8239,21 +8289,19 @@ static void yield_task_fair(struct rq *rq)
clear_buddies(cfs_rq, se);
- if (curr->policy != SCHED_BATCH) {
- update_rq_clock(rq);
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
- /*
- * Tell update_rq_clock() that we've just updated,
- * so we don't do microscopic update in schedule()
- * and double the fastpath cost.
- */
- rq_clock_skip_update(rq);
- }
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq);
- set_skip_buddy(se);
+ se->deadline += calc_delta_fair(se->slice, se);
}
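
With the skip buddy gone, deferring the yielding task falls out of the deadline: pushing vd_i one request further out means pick_eevdf() will prefer anyone with an earlier deadline. A one-line model, again assuming a 1024-based weight scale:

typedef unsigned long long u64;

/* yield == postpone my virtual deadline by one request r_i/w_i */
static void yield_sketch(u64 *deadline, u64 slice, u64 weight)
{
        *deadline += slice * 1024ULL / weight;
}
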
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
@@ -8416,6 +8464,11 @@ enum group_type {
*/
group_misfit_task,
/*
+ * Balance SMT group that's fully busy. Can benefit from migrating
+ * a task on an SMT CPU with a busy sibling to another CPU on an idle core.
+ */
+ group_smt_balance,
+ /*
* SD_ASYM_PACKING only: One local CPU with higher capacity is available,
* and the task should be migrated to it instead of running on the
* current CPU.
@@ -8496,8 +8549,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
* Buddy candidates are cache hot:
*/
if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
- (&p->se == cfs_rq_of(&p->se)->next ||
- &p->se == cfs_rq_of(&p->se)->last))
+ (&p->se == cfs_rq_of(&p->se)->next))
return 1;
if (sysctl_sched_migration_cost == -1)
@@ -9123,6 +9175,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+ unsigned int group_smt_balance; /* Task on busy SMT should be moved */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -9396,6 +9449,9 @@ group_type group_classify(unsigned int imbalance_pct,
if (sgs->group_asym_packing)
return group_asym_packing;
+ if (sgs->group_smt_balance)
+ return group_smt_balance;
+
if (sgs->group_misfit_task_load)
return group_misfit_task;
@@ -9465,6 +9521,71 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
+/* One group has more than one SMT CPU while the other group does not */
+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
+ struct sched_group *sg2)
+{
+ if (!sg1 || !sg2)
+ return false;
+
+ return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
+ (sg2->flags & SD_SHARE_CPUCAPACITY);
+}
+
+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ if (env->idle == CPU_NOT_IDLE)
+ return false;
+
+ /*
+ * For an SMT source group, it is better to move a task
+ * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
+ * Note that if a group has only a single SMT sibling, SD_SHARE_CPUCAPACITY
+ * will not be set.
+ */
+ if (group->flags & SD_SHARE_CPUCAPACITY &&
+ sgs->sum_h_nr_running > 1)
+ return true;
+
+ return false;
+}
+
+static inline long sibling_imbalance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *local)
+{
+ int ncores_busiest, ncores_local;
+ long imbalance;
+
+ if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+ return 0;
+
+ ncores_busiest = sds->busiest->cores;
+ ncores_local = sds->local->cores;
+
+ if (ncores_busiest == ncores_local) {
+ imbalance = busiest->sum_nr_running;
+ lsub_positive(&imbalance, local->sum_nr_running);
+ return imbalance;
+ }
+
+ /* Balance such that the nr_running/ncores ratio is the same on both groups */
+ imbalance = ncores_local * busiest->sum_nr_running;
+ lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
+ /* Normalize the imbalance, rounding rather than truncating */
+ imbalance = 2 * imbalance + ncores_local + ncores_busiest;
+ imbalance /= ncores_local + ncores_busiest;
+
+ /* Take advantage of the resources in an empty sched group */
+ if (imbalance == 0 && local->sum_nr_running == 0 &&
+ busiest->sum_nr_running > 1)
+ imbalance = 2;
+
+ return imbalance;
+}
+
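
A worked example of the normalization (core counts invented for illustration): a busiest group with 5 tasks on 2 cores against a local group with 1 task on 4 cores gives a raw deficit of 4*5 - 2*1 = 18 task-cores, which normalizes to (2*18 + 6) / 6 = 7, roughly twice the three-task move needed to equalize the per-core ratios and in the same "difference" units the equal-core branch returns:

static long sibling_imbalance_sketch(long busiest_run, long busiest_cores,
                                     long local_run, long local_cores)
{
        long imb = local_cores * busiest_run - busiest_cores * local_run;

        if (imb < 0)
                imb = 0;	/* lsub_positive() clamps at zero */
        imb = (2 * imb + local_cores + busiest_cores) /
              (local_cores + busiest_cores);
        return imb;	/* e.g. (5 on 2) vs (1 on 4) -> 7 */
}
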
static inline bool
sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
{
@@ -9557,6 +9678,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_asym_packing = 1;
}
+ /* Check for loaded SMT group to be balanced to dst CPU */
+ if (!local_group && smt_balance(env, sgs, group))
+ sgs->group_smt_balance = 1;
+
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
/* Computing avg_load makes sense only when group is overloaded */
@@ -9641,6 +9766,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
return false;
break;
+ case group_smt_balance:
case group_fully_busy:
/*
* Select the fully busy group with highest avg_load. In
@@ -9670,6 +9796,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_has_spare:
/*
+ * Do not pick an sg with SMT CPUs over an sg with pure CPUs,
+ * as we do not want to pull a task off an SMT core with one task
+ * and make that core idle.
+ */
+ if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
+ if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
+ return false;
+ else
+ return true;
+ }
+
+ /*
* Select not overloaded group with lowest number of idle cpus
* and highest number of running tasks. We could also compare
* the spare capacity which is more stable but it can end up
@@ -9865,6 +10003,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those types are not used in the slow wakeup path */
return false;
@@ -9996,6 +10135,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those type are not used in the slow wakeup path */
return NULL;
@@ -10250,6 +10390,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return;
}
+ if (busiest->group_type == group_smt_balance) {
+ /* Reduce number of tasks sharing CPU capacity */
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ return;
+ }
+
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
@@ -10297,14 +10444,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
if (busiest->group_weight == 1 || sds->prefer_sibling) {
- unsigned int nr_diff = busiest->sum_nr_running;
/*
* When prefer sibling, evenly spread running tasks on
* groups.
*/
env->migration_type = migrate_task;
- lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff;
+ env->imbalance = sibling_imbalance(env, sds, busiest, local);
} else {
/*
@@ -10501,20 +10646,27 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* group's child domain.
*/
if (sds.prefer_sibling && local->group_type == group_has_spare &&
- busiest->sum_nr_running > local->sum_nr_running + 1)
+ sibling_imbalance(env, &sds, busiest, local) > 1)
goto force_balance;
if (busiest->group_type != group_overloaded) {
- if (env->idle == CPU_NOT_IDLE)
+ if (env->idle == CPU_NOT_IDLE) {
/*
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
* busy, let another idle CPU try to pull task.
*/
goto out_balanced;
+ }
+
+ if (busiest->group_type == group_smt_balance &&
+ smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
+ /* Let a non-SMT CPU pull from an SMT CPU with a busy sibling */
+ goto force_balance;
+ }
if (busiest->group_weight > 1 &&
- local->idle_cpus <= (busiest->idle_cpus + 1))
+ local->idle_cpus <= (busiest->idle_cpus + 1)) {
/*
* If the busiest group is not overloaded
* and there is no imbalance between this and busiest
@@ -10525,12 +10677,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* there is more than 1 CPU per group.
*/
goto out_balanced;
+ }
- if (busiest->sum_h_nr_running == 1)
+ if (busiest->sum_h_nr_running == 1) {
/*
* busiest doesn't have any tasks waiting to run
*/
goto out_balanced;
+ }
}
force_balance:
@@ -10764,7 +10918,7 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- int cpu;
+ int cpu, idle_smt = -1;
/*
* Ensure the balancing environment is consistent; can happen
@@ -10791,10 +10945,24 @@ static int should_we_balance(struct lb_env *env)
if (!idle_cpu(cpu))
continue;
+ /*
+ * Don't balance to an idle SMT CPU in a busy core right away when
+ * balancing cores, but remember the first such idle SMT CPU for
+ * later consideration. Look for a CPU on an idle core first.
+ */
+ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
+ if (idle_smt == -1)
+ idle_smt = cpu;
+ continue;
+ }
+
/* Are we the first idle CPU? */
return cpu == env->dst_cpu;
}
+ if (idle_smt == env->dst_cpu)
+ return true;
+
/* Are we the first CPU of this group ? */
return group_balance_cpu(sg) == env->dst_cpu;
}
@@ -12007,8 +12175,8 @@ static void rq_offline_fair(struct rq *rq)
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
{
- u64 slice = sched_slice(cfs_rq_of(se), se);
u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
return (rtime * min_nr_tasks > slice);
}
@@ -12164,8 +12332,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
+ struct cfs_rq *cfs_rq;
struct rq *rq = this_rq();
struct rq_flags rf;
@@ -12174,22 +12342,9 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (curr) {
+ if (curr)
update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
- }
- place_entity(cfs_rq, se, 1);
-
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- /*
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
- }
-
- se->vruntime -= cfs_rq->min_vruntime;
+ place_entity(cfs_rq, se, ENQUEUE_INITIAL);
rq_unlock(rq, &rf);
}
@@ -12218,34 +12373,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
check_preempt_curr(rq, p, 0);
}
-static inline bool vruntime_normalized(struct task_struct *p)
-{
- struct sched_entity *se = &p->se;
-
- /*
- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
- * the dequeue_entity(.flags=0) will already have normalized the
- * vruntime.
- */
- if (p->on_rq)
- return true;
-
- /*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - A forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - A task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- */
- if (!se->sum_exec_runtime ||
- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
- return true;
-
- return false;
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Propagate the changes of the sched_entity across the tg tree to make it
@@ -12316,16 +12443,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- if (!vruntime_normalized(p)) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
- }
detach_entity_cfs_rq(se);
}
@@ -12333,12 +12450,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
attach_entity_cfs_rq(se);
-
- if (!vruntime_normalized(p))
- se->vruntime += cfs_rq->min_vruntime;
}
static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -12450,7 +12563,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
tg->shares = NICE_0_LOAD;
- init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
for_each_possible_cpu(i) {
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
@@ -12703,7 +12816,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+ rr_interval = NS_TO_JIFFIES(se->slice);
return rr_interval;
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd3..f770168230ae 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,16 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Only give sleepers 50% of their service deficit. This allows
- * them to run sooner, but does not allow tons of sleepers to
- * rip the spread apart.
- */
-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
/*
- * Place new tasks ahead so that they do not starve already running
- * tasks
+ * Using the avg_vruntime, do the right thing and preserve lag across
+ * sleep+wake cycles (EEVDF placement strategy #1; strategy #2 is
+ * used if the feature is disabled).
*/
-SCHED_FEAT(START_DEBIT, true)
+SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+SCHED_FEAT(RUN_TO_PARITY, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
@@ -20,13 +16,6 @@ SCHED_FEAT(START_DEBIT, true)
SCHED_FEAT(NEXT_BUDDY, false)
/*
- * Prefer to schedule the task that ran last (when we did
- * wake-preempt) as that likely will touch the same data, increases
- * cache locality.
- */
-SCHED_FEAT(LAST_BUDDY, true)
-
-/*
* Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
*/
@@ -99,5 +88,4 @@ SCHED_FEAT(UTIL_EST_FASTUP, true)
SCHED_FEAT(LATENCY_WARN, false)
-SCHED_FEAT(ALT_PERIOD, true)
-SCHED_FEAT(BASE_SLICE, true)
+SCHED_FEAT(HZ_BW, true)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 9bb3f2b3ccfc..1d0f634725a6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -140,7 +140,7 @@
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
-DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
+static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 00e0e5074115..0597ba0f85ff 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -25,7 +25,7 @@ unsigned int sysctl_sched_rt_period = 1000000;
int sysctl_sched_rt_runtime = 950000;
#ifdef CONFIG_SYSCTL
-static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
@@ -3062,6 +3062,9 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
sched_rr_timeslice =
sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
msecs_to_jiffies(sysctl_sched_rr_timeslice);
+
+ if (sysctl_sched_rr_timeslice <= 0)
+ sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
}
mutex_unlock(&mutex);
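
The reordered expression above matters whenever HZ does not divide MSEC_PER_SEC evenly, because the old form truncated the per-jiffy quotient before multiplying. A standalone check (HZ = 300 chosen arbitrarily; RR_TIMESLICE modelled as 100 ms worth of jiffies):

#include <stdio.h>

int main(void)
{
        const long MSEC_PER_SEC = 1000, HZ = 300;
        const long RR_TIMESLICE = 100 * HZ / 1000;	/* 30 jiffies */

        /* old: (1000/300) truncates to 3 -> 3 * 30 = 90 ms */
        printf("%ld\n", (MSEC_PER_SEC / HZ) * RR_TIMESLICE);
        /* new: 1000 * 30 / 300 = 100 ms, the intended value */
        printf("%ld\n", (MSEC_PER_SEC * RR_TIMESLICE) / HZ);
        return 0;
}
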
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e93e006a942b..04846272409c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -454,11 +454,12 @@ extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
-extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern bool cfs_task_bw_constrained(struct task_struct *p);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
@@ -494,6 +495,7 @@ static inline void set_task_rq_fair(struct sched_entity *se,
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
+static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
#endif /* CONFIG_CGROUP_SCHED */
@@ -548,6 +550,9 @@ struct cfs_rq {
unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
+ s64 avg_vruntime;
+ u64 avg_load;
+
u64 exec_clock;
u64 min_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -567,8 +572,6 @@ struct cfs_rq {
*/
struct sched_entity *curr;
struct sched_entity *next;
- struct sched_entity *last;
- struct sched_entity *skip;
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
@@ -636,6 +639,8 @@ struct cfs_rq {
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
+ u64 throttled_clock_self;
+ u64 throttled_clock_self_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
@@ -1245,6 +1250,7 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
bool fi);
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
/*
* Helpers to check if the CPU's core cookie matches with the task's cookie
@@ -1700,6 +1706,21 @@ rq_unlock(struct rq *rq, struct rq_flags *rf)
raw_spin_rq_unlock(rq);
}
+DEFINE_LOCK_GUARD_1(rq_lock, struct rq,
+ rq_lock(_T->lock, &_T->rf),
+ rq_unlock(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq,
+ rq_lock_irq(_T->lock, &_T->rf),
+ rq_unlock_irq(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq,
+ rq_lock_irqsave(_T->lock, &_T->rf),
+ rq_unlock_irqrestore(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
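
These guard definitions let callers pair lock and unlock automatically via the kernel's cleanup helpers; a hedged usage sketch (the function and its body are hypothetical, only the guard names come from the patch):

/* Hypothetical caller: the lock is dropped automatically at scope exit. */
static void example_poke_rq(struct rq *rq)
{
        guard(rq_lock_irqsave)(rq);	/* rq_lock_irqsave(rq, &rf) */

        /* ... modify rq state under the lock ... */
}	/* rq_unlock_irqrestore(rq, &rf) runs here */
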
static inline struct rq *
this_rq_lock_irq(struct rq_flags *rf)
__acquires(rq->lock)
@@ -1882,6 +1903,7 @@ struct sched_group {
atomic_t ref;
unsigned int group_weight;
+ unsigned int cores;
struct sched_group_capacity *sgc;
int asym_prefer_cpu; /* CPU of highest priority in group */
int flags;
@@ -2131,12 +2153,13 @@ static inline int task_on_rq_migrating(struct task_struct *p)
}
/* Wake flags. The first three directly map to some SD flag value */
-#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
-#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
-#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
+#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
+#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
-#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
-#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
+#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -2195,6 +2218,7 @@ extern const u32 sched_prio_to_wmult[40];
#else
#define ENQUEUE_MIGRATED 0x00
#endif
+#define ENQUEUE_INITIAL 0x80
#define RETRY_TASK ((void *)-1UL)
@@ -2398,6 +2422,7 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
#endif
extern void schedule_idle(void);
+asmlinkage void schedule_user(void);
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
@@ -2499,11 +2524,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_base_slice;
+
#ifdef CONFIG_SCHED_DEBUG
-extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
-extern unsigned int sysctl_sched_idle_min_granularity;
-extern unsigned int sysctl_sched_wakeup_granularity;
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
@@ -2609,6 +2632,12 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
#endif
+#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
+__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
+static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \
+{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
+ _lock; return _t; }
+
#ifdef CONFIG_SMP
static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
@@ -2738,6 +2767,16 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
+static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+{
+ raw_spin_unlock(l1);
+ raw_spin_unlock(l2);
+}
+
+DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t,
+ double_raw_lock(_T->lock, _T->lock2),
+ double_raw_unlock(_T->lock, _T->lock2))
+
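
DEFINE_LOCK_GUARD_2 extends the same pattern to lock pairs; a hypothetical user of the new double_raw_spinlock guard (only the guard name is from the patch):

/*
 * Hypothetical: both locks are taken in the order double_raw_lock()
 * imposes, and both are released automatically when the scope ends.
 */
static void example_transfer(raw_spinlock_t *a, raw_spinlock_t *b)
{
        guard(double_raw_spinlock)(a, b);

        /* ... move state between the two protected objects ... */
}
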
/*
* double_rq_unlock - safely unlock two runqueues
*
@@ -2795,6 +2834,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
#endif
+DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
+ double_rq_lock(_T->lock, _T->lock2),
+ double_rq_unlock(_T->lock, _T->lock2))
+
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
@@ -3229,6 +3272,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
extern void swake_up_all_locked(struct swait_queue_head *q);
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags);
+
#ifdef CONFIG_PREEMPT_DYNAMIC
extern int preempt_dynamic_mode;
extern int sched_dynamic_mode(const char *str);
@@ -3480,4 +3525,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif
+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 76b9b796e695..72505cd3b60a 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -18,7 +18,7 @@ EXPORT_SYMBOL(__init_swait_queue_head);
* If for some reason it would return 0, that means the previously waiting
* task is already running, so it will observe condition true (or has already).
*/
-void swake_up_locked(struct swait_queue_head *q)
+void swake_up_locked(struct swait_queue_head *q, int wake_flags)
{
struct swait_queue *curr;
@@ -26,7 +26,7 @@ void swake_up_locked(struct swait_queue_head *q)
return;
curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
- wake_up_process(curr->task);
+ try_to_wake_up(curr->task, TASK_NORMAL, wake_flags);
list_del_init(&curr->task_list);
}
EXPORT_SYMBOL(swake_up_locked);
@@ -41,7 +41,7 @@ EXPORT_SYMBOL(swake_up_locked);
void swake_up_all_locked(struct swait_queue_head *q)
{
while (!list_empty(&q->task_list))
- swake_up_locked(q);
+ swake_up_locked(q, 0);
}
void swake_up_one(struct swait_queue_head *q)
@@ -49,7 +49,7 @@ void swake_up_one(struct swait_queue_head *q)
unsigned long flags;
raw_spin_lock_irqsave(&q->lock, flags);
- swake_up_locked(q);
+ swake_up_locked(q, 0);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(swake_up_one);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d3a3b2646ec4..05a5bc678c08 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -722,8 +722,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (parent->parent) {
parent->parent->child = tmp;
- if (tmp->flags & SD_SHARE_CPUCAPACITY)
- parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY;
+ parent->parent->groups->flags = tmp->flags;
}
/*
@@ -1275,14 +1274,24 @@ build_sched_groups(struct sched_domain *sd, int cpu)
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
+ struct cpumask *mask = sched_domains_tmpmask2;
WARN_ON(!sg);
do {
- int cpu, max_cpu = -1;
+ int cpu, cores = 0, max_cpu = -1;
sg->group_weight = cpumask_weight(sched_group_span(sg));
+ cpumask_copy(mask, sched_group_span(sg));
+ for_each_cpu(cpu, mask) {
+ cores++;
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(mask, mask, cpu_smt_mask(cpu));
+#endif
+ }
+ sg->cores = cores;
+
if (!(sd->flags & SD_ASYM_PACKING))
goto next;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 48c53e4739ea..802d98cf2de3 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -161,6 +161,11 @@ int __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
}
EXPORT_SYMBOL(__wake_up);
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key)
+{
+ __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key);
+}
+
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d3e584065c7f..255999ba9190 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -110,11 +110,13 @@ struct seccomp_knotif {
* @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
* is allowed.
* @ioctl_flags: The flags used for the seccomp_addfd ioctl.
+ * @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd
* @ret: The return value of the installing process. It is set to the fd num
* upon success (>= 0).
* @completion: Indicates that the installing process has completed fd
* installation, or gone away (either due to successful
* reply, or signal)
+ * @list: list_head for chaining seccomp_kaddfd together.
*
*/
struct seccomp_kaddfd {
@@ -138,14 +140,17 @@ struct seccomp_kaddfd {
* structure is fairly large, we store the notification-specific stuff in a
* separate structure.
*
- * @request: A semaphore that users of this notification can wait on for
- * changes. Actual reads and writes are still controlled with
- * filter->notify_lock.
+ * @requests: A counter of pending requests that users of this notification
+ * can wait on for changes. Actual reads and writes are still
+ * controlled with filter->notify_lock.
+ * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
* @next_id: The id of the next request.
* @notifications: A list of struct seccomp_knotif elements.
*/
+
struct notification {
- struct semaphore request;
+ atomic_t requests;
+ u32 flags;
u64 next_id;
struct list_head notifications;
};
@@ -555,6 +560,8 @@ static void __seccomp_filter_release(struct seccomp_filter *orig)
* drop its reference count, and notify
* about unused filters
*
+ * @tsk: task the filter should be released from.
+ *
* This function should only be called when the task is exiting as
* it detaches it from its filter tree. As such, READ_ONCE() and
* barriers are not needed here, as would normally be needed.
@@ -574,6 +581,8 @@ void seccomp_filter_release(struct task_struct *tsk)
/**
* seccomp_sync_threads: sets all threads to use current's filter
*
+ * @flags: SECCOMP_FILTER_FLAG_* flags to set during sync.
+ *
* Expects sighand and cred_guard_mutex locks to be held, and for
* seccomp_can_sync_threads() to have returned success already
* without dropping the locks.
@@ -1116,8 +1125,11 @@ static int seccomp_do_user_notification(int this_syscall,
list_add_tail(&n.list, &match->notif->notifications);
INIT_LIST_HEAD(&n.addfd);
- up(&match->notif->request);
- wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
+ atomic_inc(&match->notif->requests);
+ if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
+ else
+ wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
/*
* This is where we wait for a reply from userspace.
@@ -1450,6 +1462,37 @@ find_notification(struct seccomp_filter *filter, u64 id)
return NULL;
}
+static int recv_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
+ void *key)
+{
+ /* Avoid a wakeup if event not interesting for us. */
+ if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
+static int recv_wait_event(struct seccomp_filter *filter)
+{
+ DEFINE_WAIT_FUNC(wait, recv_wake_function);
+ int ret;
+
+ if (atomic_dec_if_positive(&filter->notif->requests) >= 0)
+ return 0;
+
+ for (;;) {
+ ret = prepare_to_wait_event(&filter->wqh, &wait, TASK_INTERRUPTIBLE);
+
+ if (atomic_dec_if_positive(&filter->notif->requests) >= 0)
+ break;
+
+ if (ret)
+ return ret;
+
+ schedule();
+ }
+ finish_wait(&filter->wqh, &wait);
+ return 0;
+}
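
The semaphore is effectively re-implemented as an atomic counter plus the existing waitqueue, so the wake side can choose between wake_up_poll() flavours. The helper this leans on, atomic_dec_if_positive(), consumes a token only when one is available; its semantics, sketched with C11 atomics:

#include <stdatomic.h>

/*
 * Decrement only if positive; return the (possibly hypothetical)
 * decremented value, so a negative result means "no token, sleep".
 */
static int dec_if_positive_sketch(atomic_int *v)
{
        int old = atomic_load(v);

        while (old > 0 &&
               !atomic_compare_exchange_weak(v, &old, old - 1))
                ;	/* CAS failure reloads 'old'; retry */
        return old - 1;
}
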
static long seccomp_notify_recv(struct seccomp_filter *filter,
void __user *buf)
@@ -1467,7 +1510,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
memset(&unotif, 0, sizeof(unotif));
- ret = down_interruptible(&filter->notif->request);
+ ret = recv_wait_event(filter);
if (ret < 0)
return ret;
@@ -1515,7 +1558,8 @@ out:
if (should_sleep_killable(filter, knotif))
complete(&knotif->ready);
knotif->state = SECCOMP_NOTIFY_INIT;
- up(&filter->notif->request);
+ atomic_inc(&filter->notif->requests);
+ wake_up_poll(&filter->wqh, EPOLLIN | EPOLLRDNORM);
}
mutex_unlock(&filter->notify_lock);
}
@@ -1561,7 +1605,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
knotif->error = resp.error;
knotif->val = resp.val;
knotif->flags = resp.flags;
- complete(&knotif->ready);
+ if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ complete_on_current_cpu(&knotif->ready);
+ else
+ complete(&knotif->ready);
out:
mutex_unlock(&filter->notify_lock);
return ret;
@@ -1591,6 +1638,22 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
return ret;
}
+static long seccomp_notify_set_flags(struct seccomp_filter *filter,
+ unsigned long flags)
+{
+ long ret;
+
+ if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ return -EINVAL;
+
+ ret = mutex_lock_interruptible(&filter->notify_lock);
+ if (ret < 0)
+ return ret;
+ filter->notif->flags = flags;
+ mutex_unlock(&filter->notify_lock);
+ return 0;
+}
+
static long seccomp_notify_addfd(struct seccomp_filter *filter,
struct seccomp_notif_addfd __user *uaddfd,
unsigned int size)
@@ -1720,6 +1783,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
case SECCOMP_IOCTL_NOTIF_ID_VALID:
return seccomp_notify_id_valid(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
+ return seccomp_notify_set_flags(filter, arg);
}
/* Extensible Argument ioctls */
@@ -1777,7 +1842,6 @@ static struct file *init_listener(struct seccomp_filter *filter)
if (!filter->notif)
goto out;
- sema_init(&filter->notif->request, 0);
filter->notif->next_id = get_random_u64();
INIT_LIST_HEAD(&filter->notif->notifications);
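For context, the new SECCOMP_IOCTL_NOTIF_SET_FLAGS ioctl is issued by the supervisor on the listener fd before it starts receiving notifications. A minimal userspace sketch, assuming headers from a kernel that carries this series (the trailing 0 fills the otherwise unused ioctl argument):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/seccomp.h>

/* Opt the listener into synchronous, same-CPU wake-ups. */
static int enable_sync_wakeup(int listener)
{
        if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
                  SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0) < 0) {
                perror("SECCOMP_IOCTL_NOTIF_SET_FLAGS");
                return -1;
        }
        return 0;
}

The flag trades wake-up fairness for cache locality: the supervisor is woken on the CPU that triggered the notification, which suits strictly synchronous supervisor designs.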
diff --git a/kernel/smp.c b/kernel/smp.c
index 385179dae360..8455a53465af 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -46,6 +46,8 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
+static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);
+
static void __flush_smp_call_function_queue(bool warn_cpu_offline);
int smpcfd_prepare_cpu(unsigned int cpu)
@@ -253,13 +255,15 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
*bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
}
if (cpu >= 0) {
- dump_cpu_task(cpu);
+ if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0))
+ dump_cpu_task(cpu);
if (!cpu_cur_csd) {
pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
arch_send_call_function_single_ipi(cpu);
}
}
- dump_stack();
+ if (firsttime)
+ dump_stack();
*ts1 = ts2;
return false;
@@ -433,9 +437,14 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
struct llist_node *entry, *prev;
struct llist_head *head;
static bool warned;
+ atomic_t *tbt;
lockdep_assert_irqs_disabled();
+ /* Allow waiters to send backtrace NMI from here onwards */
+ tbt = this_cpu_ptr(&trigger_backtrace);
+ atomic_set_release(tbt, 1);
+
head = this_cpu_ptr(&call_single_queue);
entry = llist_del_all(head);
entry = llist_reverse_order(entry);
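The per-CPU trigger_backtrace flag above is a one-shot gate: the first waiter on a stuck CSD lock claims it with a cmpxchg and dumps the target CPU, and the gate is re-armed only once that CPU flushes its call-function queue, so repeated warnings do not spam identical backtraces. A standalone C11 sketch of the same pattern (names are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int gate = 1;

/* First caller wins and fires the expensive diagnostic; later callers
 * are suppressed until the consumer re-arms the gate. */
static bool try_dump(void)
{
        int expected = 1;

        if (!atomic_compare_exchange_strong(&gate, &expected, 0))
                return false;
        puts("dumping backtrace (once)");
        return true;
}

static void rearm(void)
{
        atomic_store_explicit(&gate, 1, memory_order_release);
}

int main(void)
{
        try_dump();     /* fires */
        try_dump();     /* suppressed */
        rearm();        /* consumer made progress */
        try_dump();     /* fires again */
        return 0;
}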
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 807b34ccd797..210cf5f8d92c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -612,7 +612,7 @@ static inline void tick_irq_exit(void)
int cpu = smp_processor_id();
/* Make sure that timer wheel updates are propagated */
- if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
+ if ((sched_core_idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
if (!in_hardirq())
tick_nohz_irq_exit();
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 88cbc1181b23..c108ed8a9804 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -473,8 +473,8 @@ static void clocksource_watchdog(struct timer_list *unused)
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
- u64 cs_wd_msec;
- u64 wd_msec;
+ s64 cs_wd_msec;
+ s64 wd_msec;
u32 wd_rem;
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
@@ -483,8 +483,8 @@ static void clocksource_watchdog(struct timer_list *unused)
watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, cs_nsec, csnow, cslast, cs->mask);
- cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem);
- wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem);
+ cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem);
+ wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem);
pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
if (curr_clocksource == cs)
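The u64 to s64 change matters because cs_nsec - wd_nsec is negative whenever the clocksource runs behind the watchdog; feeding that delta to an unsigned divide prints an absurd millisecond value in the warning. A standalone illustration of the failure mode:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int64_t skew_ns = -3000000;     /* clocksource 3 ms behind watchdog */
        uint64_t as_u64 = (uint64_t)skew_ns / 1000000;  /* wraps: huge value */
        int64_t as_s64 = skew_ns / 1000000;             /* -3, as intended */

        printf("u64 math: %llu ms, s64 math: %lld ms\n",
               (unsigned long long)as_u64, (long long)as_s64);
        return 0;
}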
diff --git a/kernel/torture.c b/kernel/torture.c
index 1a0519b836ac..b28b05bbef02 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -37,6 +37,7 @@
#include <linux/ktime.h>
#include <asm/byteorder.h>
#include <linux/torture.h>
+#include <linux/sched/rt.h>
#include "rcu/rcu.h"
MODULE_LICENSE("GPL");
@@ -54,6 +55,9 @@ module_param(verbose_sleep_frequency, int, 0444);
static int verbose_sleep_duration = 1;
module_param(verbose_sleep_duration, int, 0444);
+static int random_shuffle;
+module_param(random_shuffle, int, 0444);
+
static char *torture_type;
static int verbose;
@@ -88,8 +92,8 @@ int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_s
ktime_t hto = baset_ns;
if (trsp)
- hto += (torture_random(trsp) >> 3) % fuzzt_ns;
- set_current_state(TASK_UNINTERRUPTIBLE);
+ hto += torture_random(trsp) % fuzzt_ns;
+ set_current_state(TASK_IDLE);
return schedule_hrtimeout(&hto, HRTIMER_MODE_REL);
}
EXPORT_SYMBOL_GPL(torture_hrtimeout_ns);
@@ -350,22 +354,22 @@ torture_onoff(void *arg)
if (onoff_holdoff > 0) {
VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
- schedule_timeout_interruptible(onoff_holdoff);
+ torture_hrtimeout_jiffies(onoff_holdoff, &rand);
VERBOSE_TOROUT_STRING("torture_onoff end holdoff");
}
while (!torture_must_stop()) {
if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) {
- schedule_timeout_interruptible(HZ / 10);
+ torture_hrtimeout_jiffies(HZ / 10, &rand);
continue;
}
- cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
+ cpu = torture_random(&rand) % (maxcpu + 1);
if (!torture_offline(cpu,
&n_offline_attempts, &n_offline_successes,
&sum_offline, &min_offline, &max_offline))
torture_online(cpu,
&n_online_attempts, &n_online_successes,
&sum_online, &min_online, &max_online);
- schedule_timeout_interruptible(onoff_interval);
+ torture_hrtimeout_jiffies(onoff_interval, &rand);
}
stop:
@@ -518,6 +522,7 @@ static void torture_shuffle_task_unregister_all(void)
*/
static void torture_shuffle_tasks(void)
{
+ DEFINE_TORTURE_RANDOM(rand);
struct shuffle_task *stp;
cpumask_setall(shuffle_tmp_mask);
@@ -537,8 +542,10 @@ static void torture_shuffle_tasks(void)
cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
mutex_lock(&shuffle_task_mutex);
- list_for_each_entry(stp, &shuffle_task_list, st_l)
- set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
+ list_for_each_entry(stp, &shuffle_task_list, st_l) {
+ if (!random_shuffle || torture_random(&rand) & 0x1)
+ set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
+ }
mutex_unlock(&shuffle_task_mutex);
cpus_read_unlock();
@@ -550,9 +557,11 @@ static void torture_shuffle_tasks(void)
*/
static int torture_shuffle(void *arg)
{
+ DEFINE_TORTURE_RANDOM(rand);
+
VERBOSE_TOROUT_STRING("torture_shuffle task started");
do {
- schedule_timeout_interruptible(shuffle_interval);
+ torture_hrtimeout_jiffies(shuffle_interval, &rand);
torture_shuffle_tasks();
torture_shutdown_absorb("torture_shuffle");
} while (!torture_must_stop());
@@ -728,12 +737,12 @@ bool stutter_wait(const char *title)
cond_resched_tasks_rcu_qs();
spt = READ_ONCE(stutter_pause_test);
for (; spt; spt = READ_ONCE(stutter_pause_test)) {
- if (!ret) {
+ if (!ret && !rt_task(current)) {
sched_set_normal(current, MAX_NICE);
ret = true;
}
if (spt == 1) {
- schedule_timeout_interruptible(1);
+ torture_hrtimeout_jiffies(1, NULL);
} else if (spt == 2) {
while (READ_ONCE(stutter_pause_test)) {
if (!(i++ & 0xffff))
@@ -741,7 +750,7 @@ bool stutter_wait(const char *title)
cond_resched();
}
} else {
- schedule_timeout_interruptible(round_jiffies_relative(HZ));
+ torture_hrtimeout_jiffies(round_jiffies_relative(HZ), NULL);
}
torture_shutdown_absorb(title);
}
@@ -926,7 +935,7 @@ EXPORT_SYMBOL_GPL(torture_kthread_stopping);
* it starts, you will need to open-code your own.
*/
int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
- char *f, struct task_struct **tp)
+ char *f, struct task_struct **tp, void (*cbf)(struct task_struct *tp))
{
int ret = 0;
@@ -938,6 +947,10 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
*tp = NULL;
return ret;
}
+
+ if (cbf)
+ cbf(*tp);
+
wake_up_process(*tp); // Process is sleeping, so ordering provided.
torture_shuffle_task_register(*tp);
return ret;
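The new cbf parameter gives callers a hook between kthread creation and wake-up. A hedged sketch of a hypothetical caller (boost_cb, my_kthread_fn, and my_task are illustrative; sched_set_fifo() is an existing kernel helper):

/* Hypothetical user of the callback: make the torture kthread
 * SCHED_FIFO before it first runs. */
static void boost_cb(struct task_struct *tp)
{
        sched_set_fifo(tp);
}

static struct task_struct *my_task;

static int start_it(void)
{
        return _torture_create_kthread(my_kthread_fn, NULL, "my_kthread",
                                       "Failed to create my_kthread",
                                       "my_kthread task started",
                                       &my_task, boost_cb);
}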
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b8870078ef58..8e64aaad5361 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4213,8 +4213,15 @@ static void *s_start(struct seq_file *m, loff_t *pos)
* will point to the same string as current_trace->name.
*/
mutex_lock(&trace_types_lock);
- if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
+ if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) {
+ /* Close iter->trace before switching to the new current tracer */
+ if (iter->trace->close)
+ iter->trace->close(iter);
*iter->trace = *tr->current_trace;
+ /* Reopen the new current tracer */
+ if (iter->trace->open)
+ iter->trace->open(iter);
+ }
mutex_unlock(&trace_types_lock);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -5277,11 +5284,17 @@ int tracing_set_cpumask(struct trace_array *tr,
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu);
+#endif
}
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu);
+#endif
}
}
arch_spin_unlock(&tr->max_lock);
@@ -6705,10 +6718,36 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
#endif
+static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ if (cpumask_empty(tr->pipe_cpumask)) {
+ cpumask_setall(tr->pipe_cpumask);
+ return 0;
+ }
+ } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+ cpumask_set_cpu(cpu, tr->pipe_cpumask);
+ return 0;
+ }
+ return -EBUSY;
+}
+
+static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ WARN_ON(!cpumask_full(tr->pipe_cpumask));
+ cpumask_clear(tr->pipe_cpumask);
+ } else {
+ WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+ cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+ }
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
+ int cpu;
int ret;
ret = tracing_check_open_get_tr(tr);
@@ -6716,13 +6755,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
return ret;
mutex_lock(&trace_types_lock);
+ cpu = tracing_get_cpu(inode);
+ ret = open_pipe_on_cpu(tr, cpu);
+ if (ret)
+ goto fail_pipe_on_cpu;
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
ret = -ENOMEM;
- __trace_array_put(tr);
- goto out;
+ goto fail_alloc_iter;
}
trace_seq_init(&iter->seq);
@@ -6745,7 +6787,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->tr = tr;
iter->array_buffer = &tr->array_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
+ iter->cpu_file = cpu;
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -6755,12 +6797,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
tr->trace_ref++;
-out:
+
mutex_unlock(&trace_types_lock);
return ret;
fail:
kfree(iter);
+fail_alloc_iter:
+ close_pipe_on_cpu(tr, cpu);
+fail_pipe_on_cpu:
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
return ret;
@@ -6777,7 +6822,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
-
+ close_pipe_on_cpu(tr, iter->cpu_file);
mutex_unlock(&trace_types_lock);
free_cpumask_var(iter->started);
@@ -9441,6 +9486,9 @@ static struct trace_array *trace_array_create(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+ goto out_free_tr;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9482,6 +9530,7 @@ static struct trace_array *trace_array_create(const char *name)
out_free_tr:
ftrace_free_ftrace_ops(tr);
free_trace_buffers(tr);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -9584,6 +9633,7 @@ static int __remove_instance(struct trace_array *tr)
}
kfree(tr->topts);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -10381,12 +10431,14 @@ __init static int tracer_alloc_buffers(void)
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
+ if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+ goto out_free_savedcmd;
+
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
- goto out_free_savedcmd;
+ goto out_free_pipe_cpumask;
}
-
if (global_trace.buffer_disabled)
tracing_off();
@@ -10439,6 +10491,8 @@ __init static int tracer_alloc_buffers(void)
return 0;
+out_free_pipe_cpumask:
+ free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
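With pipe_cpumask in place, a second concurrent open of the same per-CPU trace_pipe now fails with EBUSY instead of two readers sharing (and corrupting) one iterator. A quick userspace check, assuming tracefs is mounted at its usual location:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *p = "/sys/kernel/tracing/per_cpu/cpu0/trace_pipe";
        int a = open(p, O_RDONLY);
        int b = open(p, O_RDONLY);      /* expected: -1, errno == EBUSY */

        printf("first open: %d, second open: %d (%s)\n",
               a, b, b < 0 ? strerror(errno) : "ok");
        if (a >= 0)
                close(a);
        return 0;
}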
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e1edc2197fc8..73eaec158473 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,8 @@ struct trace_array {
struct list_head events;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
+ /* each per_cpu trace_pipe can be opened by only one user */
+ cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
@@ -1295,6 +1297,14 @@ static inline void trace_branch_disable(void)
/* set ring buffers to default size if not already done so */
int tracing_update_buffers(void);
+union trace_synth_field {
+ u8 as_u8;
+ u16 as_u16;
+ u32 as_u32;
+ u64 as_u64;
+ struct trace_dynamic_info as_dynamic;
+};
+
struct ftrace_event_field {
struct list_head link;
const char *name;
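The union above makes the synthetic-event type punning explicit: each access names its width instead of casting the address of a u64 slot, as the trace_events_synth.c hunks below show. A standalone sketch of the idiom (trace_dynamic_info is stubbed here purely for illustration):

#include <stdint.h>
#include <stdio.h>

struct trace_dynamic_info {     /* stub of the kernel struct */
        uint16_t offset;
        uint16_t len;
};

union trace_synth_field {
        uint8_t as_u8;
        uint16_t as_u16;
        uint32_t as_u32;
        uint64_t as_u64;
        struct trace_dynamic_info as_dynamic;
};

int main(void)
{
        union trace_synth_field f = { .as_u64 = 0 };

        f.as_u16 = 0xabcd;      /* was: *(u16 *)&fields[n] = val */
        printf("as_u16=%#x as_u64=%#llx\n",
               f.as_u16, (unsigned long long)f.as_u64);
        return 0;
}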
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index dd398afc8e25..9897d0bfcab7 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -127,7 +127,7 @@ static bool synth_event_match(const char *system, const char *event,
struct synth_trace_event {
struct trace_entry ent;
- u64 fields[];
+ union trace_synth_field fields[];
};
static int synth_event_define_fields(struct trace_event_call *call)
@@ -321,19 +321,19 @@ static const char *synth_field_fmt(char *type)
static void print_synth_event_num_val(struct trace_seq *s,
char *print_fmt, char *name,
- int size, u64 val, char *space)
+ int size, union trace_synth_field *val, char *space)
{
switch (size) {
case 1:
- trace_seq_printf(s, print_fmt, name, (u8)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u8, space);
break;
case 2:
- trace_seq_printf(s, print_fmt, name, (u16)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u16, space);
break;
case 4:
- trace_seq_printf(s, print_fmt, name, (u32)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u32, space);
break;
default:
@@ -350,7 +350,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
struct synth_trace_event *entry;
struct synth_event *se;
- unsigned int i, n_u64;
+ unsigned int i, j, n_u64;
char print_fmt[32];
const char *fmt;
@@ -374,43 +374,28 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
/* parameter values */
if (se->fields[i]->is_string) {
if (se->fields[i]->is_dynamic) {
- u32 offset, data_offset;
- char *str_field;
-
- offset = (u32)entry->fields[n_u64];
- data_offset = offset & 0xffff;
-
- str_field = (char *)entry + data_offset;
+ union trace_synth_field *data = &entry->fields[n_u64];
trace_seq_printf(s, print_fmt, se->fields[i]->name,
STR_VAR_LEN_MAX,
- str_field,
+ (char *)entry + data->as_dynamic.offset,
i == se->n_fields - 1 ? "" : " ");
n_u64++;
} else {
trace_seq_printf(s, print_fmt, se->fields[i]->name,
STR_VAR_LEN_MAX,
- (char *)&entry->fields[n_u64],
+ (char *)&entry->fields[n_u64].as_u64,
i == se->n_fields - 1 ? "" : " ");
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
}
} else if (se->fields[i]->is_stack) {
- u32 offset, data_offset, len;
- unsigned long *p, *end;
-
- offset = (u32)entry->fields[n_u64];
- data_offset = offset & 0xffff;
- len = offset >> 16;
-
- p = (void *)entry + data_offset;
- end = (void *)p + len - (sizeof(long) - 1);
+ union trace_synth_field *data = &entry->fields[n_u64];
+ unsigned long *p = (void *)entry + data->as_dynamic.offset;
trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name);
-
- for (; *p && p < end; p++)
- trace_seq_printf(s, "=> %pS\n", (void *)*p);
+ for (j = 1; j < data->as_dynamic.len / sizeof(long); j++)
+ trace_seq_printf(s, "=> %pS\n", (void *)p[j]);
n_u64++;
-
} else {
struct trace_print_flags __flags[] = {
__def_gfpflag_names, {-1, NULL} };
@@ -419,13 +404,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
print_synth_event_num_val(s, print_fmt,
se->fields[i]->name,
se->fields[i]->size,
- entry->fields[n_u64],
+ &entry->fields[n_u64],
space);
if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
trace_seq_puts(s, " (");
trace_print_flags_seq(s, "|",
- entry->fields[n_u64],
+ entry->fields[n_u64].as_u64,
__flags);
trace_seq_putc(s, ')');
}
@@ -454,21 +439,16 @@ static unsigned int trace_string(struct synth_trace_event *entry,
int ret;
if (is_dynamic) {
- u32 data_offset;
-
- data_offset = struct_size(entry, fields, event->n_u64);
- data_offset += data_size;
-
- len = fetch_store_strlen((unsigned long)str_val);
+ union trace_synth_field *data = &entry->fields[*n_u64];
- data_offset |= len << 16;
- *(u32 *)&entry->fields[*n_u64] = data_offset;
+ data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size;
+ data->as_dynamic.len = fetch_store_strlen((unsigned long)str_val);
ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
(*n_u64)++;
} else {
- str_field = (char *)&entry->fields[*n_u64];
+ str_field = (char *)&entry->fields[*n_u64].as_u64;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if ((unsigned long)str_val < TASK_SIZE)
@@ -492,6 +472,7 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
unsigned int data_size,
unsigned int *n_u64)
{
+ union trace_synth_field *data = &entry->fields[*n_u64];
unsigned int len;
u32 data_offset;
void *data_loc;
@@ -504,10 +485,6 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
break;
}
- /* Include the zero'd element if it fits */
- if (len < HIST_STACKTRACE_DEPTH)
- len++;
-
len *= sizeof(long);
/* Find the dynamic section to copy the stack into. */
@@ -515,8 +492,9 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
memcpy(data_loc, stack, len);
/* Fill in the field that holds the offset/len combo */
- data_offset |= len << 16;
- *(u32 *)&entry->fields[*n_u64] = data_offset;
+
+ data->as_dynamic.offset = data_offset;
+ data->as_dynamic.len = len;
(*n_u64)++;
@@ -550,7 +528,8 @@ static notrace void trace_event_raw_event_synth(void *__data,
str_val = (char *)(long)var_ref_vals[val_idx];
if (event->dynamic_fields[i]->is_stack) {
- len = *((unsigned long *)str_val);
+ /* reserve one extra element for size */
+ len = *((unsigned long *)str_val) + 1;
len *= sizeof(unsigned long);
} else {
len = fetch_store_strlen((unsigned long)str_val);
@@ -592,19 +571,19 @@ static notrace void trace_event_raw_event_synth(void *__data,
switch (field->size) {
case 1:
- *(u8 *)&entry->fields[n_u64] = (u8)val;
+ entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&entry->fields[n_u64] = (u16)val;
+ entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&entry->fields[n_u64] = (u32)val;
+ entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- entry->fields[n_u64] = val;
+ entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -1791,19 +1770,19 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
switch (field->size) {
case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ state.entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ state.entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ state.entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- state.entry->fields[n_u64] = val;
+ state.entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -1884,19 +1863,19 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
switch (field->size) {
case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ state.entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ state.entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ state.entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- state.entry->fields[n_u64] = val;
+ state.entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -2031,19 +2010,19 @@ static int __synth_event_add_val(const char *field_name, u64 val,
} else {
switch (field->size) {
case 1:
- *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val;
+ trace_state->entry->fields[field->offset].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val;
+ trace_state->entry->fields[field->offset].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val;
+ trace_state->entry->fields[field->offset].as_u32 = (u32)val;
break;
default:
- trace_state->entry->fields[field->offset] = val;
+ trace_state->entry->fields[field->offset].as_u64 = val;
break;
}
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 590b3d51afae..ba37f768e2f2 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -231,7 +231,8 @@ static void irqsoff_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
-
+ else
+ iter->private = NULL;
}
static void irqsoff_trace_close(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 330aee1c1a49..0469a04a355f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -168,6 +168,8 @@ static void wakeup_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
+ else
+ iter->private = NULL;
}
static void wakeup_trace_close(struct trace_iterator *iter)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d6798513a8c2..73a0c8e41559 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1673,10 +1673,15 @@ menu "Debug kernel data structures"
config DEBUG_LIST
bool "Debug linked list manipulation"
- depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION
+ depends on DEBUG_KERNEL
+ select LIST_HARDENED
help
- Enable this to turn on extended checks in the linked-list
- walking routines.
+ Enable this to turn on extended checks in the linked-list walking
+ routines.
+
+ This option trades performance for better-quality error reports, and
+ is more suitable for kernel debugging. If you care about performance,
+ you should only enable CONFIG_LIST_HARDENED instead.
If unsure, say N.
@@ -1710,16 +1715,6 @@ config DEBUG_NOTIFIERS
This is a relatively cheap check but if you care about maximum
performance, say N.
-config BUG_ON_DATA_CORRUPTION
- bool "Trigger a BUG when data corruption is detected"
- select DEBUG_LIST
- help
- Select this option if the kernel should BUG when it encounters
- data corruption in kernel memory structures when they get checked
- for validity.
-
- If unsure, say N.
-
config DEBUG_MAPLE_TREE
bool "Debug maple trees"
depends on DEBUG_KERNEL
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index efae7e011956..59e21bfec188 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -13,7 +13,7 @@ menuconfig UBSAN
if UBSAN
config UBSAN_TRAP
- bool "On Sanitizer warnings, abort the running kernel code"
+ bool "Abort on Sanitizer warnings (smaller kernel but less verbose)"
depends on !COMPILE_TEST
help
Building kernels with Sanitizer features enabled tends to grow
@@ -26,6 +26,14 @@ config UBSAN_TRAP
the system. For some system builders this is an acceptable
trade-off.
+ Also note that selecting Y will cause your kernel to Oops
+ with an "illegal instruction" error with no further details
+ when a UBSAN violation occurs. (Except on arm64, which will
+ report which Sanitizer failed.) This may make it hard to
+ determine whether an Oops was caused by UBSAN or to figure
+ out the details of a UBSAN violation. It makes the kernel log
+ output less useful for bug reports.
+
config CC_HAS_UBSAN_BOUNDS_STRICT
def_bool $(cc-option,-fsanitize=bounds-strict)
help
diff --git a/lib/Makefile b/lib/Makefile
index 1ffae65bb7ee..d1397785ec16 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -167,7 +167,7 @@ obj-$(CONFIG_BTREE) += btree.o
obj-$(CONFIG_INTERVAL_TREE) += interval_tree.o
obj-$(CONFIG_ASSOCIATIVE_ARRAY) += assoc_array.o
obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
-obj-$(CONFIG_DEBUG_LIST) += list_debug.o
+obj-$(CONFIG_LIST_HARDENED) += list_debug.o
obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o
obj-$(CONFIG_BITREVERSE) += bitrev.o
diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c
index 0d3a686b5ba2..fb8c0c5c2bd2 100644
--- a/lib/clz_ctz.c
+++ b/lib/clz_ctz.c
@@ -28,36 +28,16 @@ int __weak __clzsi2(int val)
}
EXPORT_SYMBOL(__clzsi2);
-int __weak __clzdi2(long val);
-int __weak __ctzdi2(long val);
-#if BITS_PER_LONG == 32
-
-int __weak __clzdi2(long val)
+int __weak __clzdi2(u64 val);
+int __weak __clzdi2(u64 val)
{
- return 32 - fls((int)val);
+ return 64 - fls64(val);
}
EXPORT_SYMBOL(__clzdi2);
-int __weak __ctzdi2(long val)
+int __weak __ctzdi2(u64 val);
+int __weak __ctzdi2(u64 val)
{
- return __ffs((u32)val);
+ return __ffs64(val);
}
EXPORT_SYMBOL(__ctzdi2);
-
-#elif BITS_PER_LONG == 64
-
-int __weak __clzdi2(long val)
-{
- return 64 - fls64((u64)val);
-}
-EXPORT_SYMBOL(__clzdi2);
-
-int __weak __ctzdi2(long val)
-{
- return __ffs64((u64)val);
-}
-EXPORT_SYMBOL(__ctzdi2);
-
-#else
-#error BITS_PER_LONG not 32 or 64
-#endif
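The rewrite keys both helpers off a fixed u64 argument instead of BITS_PER_LONG, so a single definition serves 32-bit and 64-bit builds. The intended semantics can be cross-checked against the compiler builtins:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t v = 0x0000f00000000000ULL;

        /* highest set bit is 47, lowest set bit is 44 */
        assert(__builtin_clzll(v) == 16);       /* 64 - fls64(v) */
        assert(__builtin_ctzll(v) == 44);       /* __ffs64(v) */
        puts("builtin semantics match");
        return 0;
}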
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index e4dc809d1075..424737045b97 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -566,24 +566,37 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
}
EXPORT_SYMBOL(iov_iter_zero);
-size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
- struct iov_iter *i)
+size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
+ size_t bytes, struct iov_iter *i)
{
- char *kaddr = kmap_atomic(page), *p = kaddr + offset;
- if (!page_copy_sane(page, offset, bytes)) {
- kunmap_atomic(kaddr);
+ size_t n, copied = 0;
+
+ if (!page_copy_sane(page, offset, bytes))
return 0;
- }
- if (WARN_ON_ONCE(!i->data_source)) {
- kunmap_atomic(kaddr);
+ if (WARN_ON_ONCE(!i->data_source))
return 0;
- }
- iterate_and_advance(i, bytes, base, len, off,
- copyin(p + off, base, len),
- memcpy_from_iter(i, p + off, base, len)
- )
- kunmap_atomic(kaddr);
- return bytes;
+
+ do {
+ char *p;
+
+ n = bytes - copied;
+ if (PageHighMem(page)) {
+ page += offset / PAGE_SIZE;
+ offset %= PAGE_SIZE;
+ n = min_t(size_t, n, PAGE_SIZE - offset);
+ }
+
+ p = kmap_atomic(page) + offset;
+ iterate_and_advance(i, n, base, len, off,
+ copyin(p + off, base, len),
+ memcpy_from_iter(i, p + off, base, len)
+ )
+ kunmap_atomic(p);
+ copied += n;
+ offset += n;
+ } while (PageHighMem(page) && copied != bytes && n > 0);
+
+ return copied;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);
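The loop only iterates for highmem, where a multi-page folio cannot be mapped with one kmap_atomic() call, so the copy advances one page at a time. A simplified, standalone sketch of the chunking arithmetic (PAGE_SIZE stands in for the kernel constant):

#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        size_t offset = 5000, bytes = 9000, copied = 0;
        size_t page = 0;        /* index of the currently mapped page */

        while (copied < bytes) {
                size_t n = bytes - copied;

                page += offset / PAGE_SIZE;     /* step to the right page */
                offset %= PAGE_SIZE;
                if (n > PAGE_SIZE - offset)
                        n = PAGE_SIZE - offset; /* never cross a page */
                printf("map page %zu, copy %zu bytes at offset %zu\n",
                       page, n, offset);
                copied += n;
                offset += n;
        }
        return 0;
}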
diff --git a/lib/list_debug.c b/lib/list_debug.c
index d98d43f80958..db602417febf 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -2,7 +2,8 @@
* Copyright 2006, Red Hat, Inc., Dave Jones
* Released under the General Public License (GPL).
*
- * This file contains the linked list validation for DEBUG_LIST.
+ * This file contains the linked list validation and error reporting for
+ * LIST_HARDENED and DEBUG_LIST.
*/
#include <linux/export.h>
@@ -17,8 +18,9 @@
* attempt).
*/
-bool __list_add_valid(struct list_head *new, struct list_head *prev,
- struct list_head *next)
+__list_valid_slowpath
+bool __list_add_valid_or_report(struct list_head *new, struct list_head *prev,
+ struct list_head *next)
{
if (CHECK_DATA_CORRUPTION(prev == NULL,
"list_add corruption. prev is NULL.\n") ||
@@ -37,9 +39,10 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev,
return true;
}
-EXPORT_SYMBOL(__list_add_valid);
+EXPORT_SYMBOL(__list_add_valid_or_report);
-bool __list_del_entry_valid(struct list_head *entry)
+__list_valid_slowpath
+bool __list_del_entry_valid_or_report(struct list_head *entry)
{
struct list_head *prev, *next;
@@ -65,6 +68,5 @@ bool __list_del_entry_valid(struct list_head *entry)
return false;
return true;
-
}
-EXPORT_SYMBOL(__list_del_entry_valid);
+EXPORT_SYMBOL(__list_del_entry_valid_or_report);
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 8d24279fad05..6f6a5fc85b42 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -2506,94 +2506,29 @@ static void fs_reclaim_tests(void)
pr_cont("\n");
}
-#define __guard(cleanup) __maybe_unused __attribute__((__cleanup__(cleanup)))
+/* Defines guard classes to create contexts */
+DEFINE_LOCK_GUARD_0(HARDIRQ, HARDIRQ_ENTER(), HARDIRQ_EXIT())
+DEFINE_LOCK_GUARD_0(NOTTHREADED_HARDIRQ,
+ do {
+ local_irq_disable();
+ __irq_enter();
+ WARN_ON(!in_irq());
+ } while(0), HARDIRQ_EXIT())
+DEFINE_LOCK_GUARD_0(SOFTIRQ, SOFTIRQ_ENTER(), SOFTIRQ_EXIT())
+
+/* Define RCU guards; these should go away when RCU has its own guard definitions */
+DEFINE_LOCK_GUARD_0(RCU, rcu_read_lock(), rcu_read_unlock())
+DEFINE_LOCK_GUARD_0(RCU_BH, rcu_read_lock_bh(), rcu_read_unlock_bh())
+DEFINE_LOCK_GUARD_0(RCU_SCHED, rcu_read_lock_sched(), rcu_read_unlock_sched())
-static void hardirq_exit(int *_)
-{
- HARDIRQ_EXIT();
-}
-
-#define HARDIRQ_CONTEXT(name, ...) \
- int hardirq_guard_##name __guard(hardirq_exit); \
- HARDIRQ_ENTER();
-
-#define NOTTHREADED_HARDIRQ_CONTEXT(name, ...) \
- int notthreaded_hardirq_guard_##name __guard(hardirq_exit); \
- local_irq_disable(); \
- __irq_enter(); \
- WARN_ON(!in_irq());
-
-static void softirq_exit(int *_)
-{
- SOFTIRQ_EXIT();
-}
-
-#define SOFTIRQ_CONTEXT(name, ...) \
- int softirq_guard_##name __guard(softirq_exit); \
- SOFTIRQ_ENTER();
-
-static void rcu_exit(int *_)
-{
- rcu_read_unlock();
-}
-
-#define RCU_CONTEXT(name, ...) \
- int rcu_guard_##name __guard(rcu_exit); \
- rcu_read_lock();
-
-static void rcu_bh_exit(int *_)
-{
- rcu_read_unlock_bh();
-}
-
-#define RCU_BH_CONTEXT(name, ...) \
- int rcu_bh_guard_##name __guard(rcu_bh_exit); \
- rcu_read_lock_bh();
-
-static void rcu_sched_exit(int *_)
-{
- rcu_read_unlock_sched();
-}
-
-#define RCU_SCHED_CONTEXT(name, ...) \
- int rcu_sched_guard_##name __guard(rcu_sched_exit); \
- rcu_read_lock_sched();
-
-static void raw_spinlock_exit(raw_spinlock_t **lock)
-{
- raw_spin_unlock(*lock);
-}
-
-#define RAW_SPINLOCK_CONTEXT(name, lock) \
- raw_spinlock_t *raw_spinlock_guard_##name __guard(raw_spinlock_exit) = &(lock); \
- raw_spin_lock(&(lock));
-
-static void spinlock_exit(spinlock_t **lock)
-{
- spin_unlock(*lock);
-}
-
-#define SPINLOCK_CONTEXT(name, lock) \
- spinlock_t *spinlock_guard_##name __guard(spinlock_exit) = &(lock); \
- spin_lock(&(lock));
-
-static void mutex_exit(struct mutex **lock)
-{
- mutex_unlock(*lock);
-}
-
-#define MUTEX_CONTEXT(name, lock) \
- struct mutex *mutex_guard_##name __guard(mutex_exit) = &(lock); \
- mutex_lock(&(lock));
#define GENERATE_2_CONTEXT_TESTCASE(outer, outer_lock, inner, inner_lock) \
\
static void __maybe_unused inner##_in_##outer(void) \
{ \
- outer##_CONTEXT(_, outer_lock); \
- { \
- inner##_CONTEXT(_, inner_lock); \
- } \
+ /* Relies on the reversed clean-up ordering: inner first */ \
+ guard(outer)(outer_lock); \
+ guard(inner)(inner_lock); \
}
/*
@@ -2632,21 +2567,21 @@ GENERATE_2_CONTEXT_TESTCASE(SOFTIRQ, , inner, inner_lock) \
GENERATE_2_CONTEXT_TESTCASE(RCU, , inner, inner_lock) \
GENERATE_2_CONTEXT_TESTCASE(RCU_BH, , inner, inner_lock) \
GENERATE_2_CONTEXT_TESTCASE(RCU_SCHED, , inner, inner_lock) \
-GENERATE_2_CONTEXT_TESTCASE(RAW_SPINLOCK, raw_lock_A, inner, inner_lock) \
-GENERATE_2_CONTEXT_TESTCASE(SPINLOCK, lock_A, inner, inner_lock) \
-GENERATE_2_CONTEXT_TESTCASE(MUTEX, mutex_A, inner, inner_lock)
+GENERATE_2_CONTEXT_TESTCASE(raw_spinlock, &raw_lock_A, inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(spinlock, &lock_A, inner, inner_lock) \
+GENERATE_2_CONTEXT_TESTCASE(mutex, &mutex_A, inner, inner_lock)
GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(RCU, )
-GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(RAW_SPINLOCK, raw_lock_B)
-GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(SPINLOCK, lock_B)
-GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(MUTEX, mutex_B)
+GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(raw_spinlock, &raw_lock_B)
+GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(spinlock, &lock_B)
+GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(mutex, &mutex_B)
/* the outer context allows all kinds of preemption */
#define DO_CONTEXT_TESTCASE_OUTER_PREEMPTIBLE(outer) \
dotest(RCU_in_##outer, SUCCESS, LOCKTYPE_RWLOCK); \
- dotest(RAW_SPINLOCK_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
- dotest(SPINLOCK_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
- dotest(MUTEX_in_##outer, SUCCESS, LOCKTYPE_MUTEX); \
+ dotest(raw_spinlock_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
+ dotest(spinlock_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
+ dotest(mutex_in_##outer, SUCCESS, LOCKTYPE_MUTEX); \
/*
* the outer context only allows the preemption introduced by spinlock_t (which
@@ -2654,16 +2589,16 @@ GENERATE_2_CONTEXT_TESTCASE_FOR_ALL_OUTER(MUTEX, mutex_B)
*/
#define DO_CONTEXT_TESTCASE_OUTER_LIMITED_PREEMPTIBLE(outer) \
dotest(RCU_in_##outer, SUCCESS, LOCKTYPE_RWLOCK); \
- dotest(RAW_SPINLOCK_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
- dotest(SPINLOCK_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
- dotest(MUTEX_in_##outer, FAILURE, LOCKTYPE_MUTEX); \
+ dotest(raw_spinlock_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
+ dotest(spinlock_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
+ dotest(mutex_in_##outer, FAILURE, LOCKTYPE_MUTEX); \
/* the outer context doesn't allow any kind of preemption */
#define DO_CONTEXT_TESTCASE_OUTER_NOT_PREEMPTIBLE(outer) \
dotest(RCU_in_##outer, SUCCESS, LOCKTYPE_RWLOCK); \
- dotest(RAW_SPINLOCK_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
- dotest(SPINLOCK_in_##outer, FAILURE, LOCKTYPE_SPIN); \
- dotest(MUTEX_in_##outer, FAILURE, LOCKTYPE_MUTEX); \
+ dotest(raw_spinlock_in_##outer, SUCCESS, LOCKTYPE_SPIN); \
+ dotest(spinlock_in_##outer, FAILURE, LOCKTYPE_SPIN); \
+ dotest(mutex_in_##outer, FAILURE, LOCKTYPE_MUTEX); \
static void wait_context_tests(void)
{
@@ -2697,15 +2632,15 @@ static void wait_context_tests(void)
pr_cont("\n");
print_testname("in RAW_SPINLOCK context");
- DO_CONTEXT_TESTCASE_OUTER_NOT_PREEMPTIBLE(RAW_SPINLOCK);
+ DO_CONTEXT_TESTCASE_OUTER_NOT_PREEMPTIBLE(raw_spinlock);
pr_cont("\n");
print_testname("in SPINLOCK context");
- DO_CONTEXT_TESTCASE_OUTER_LIMITED_PREEMPTIBLE(SPINLOCK);
+ DO_CONTEXT_TESTCASE_OUTER_LIMITED_PREEMPTIBLE(spinlock);
pr_cont("\n");
print_testname("in MUTEX context");
- DO_CONTEXT_TESTCASE_OUTER_PREEMPTIBLE(MUTEX);
+ DO_CONTEXT_TESTCASE_OUTER_PREEMPTIBLE(mutex);
pr_cont("\n");
}
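For readers new to the guard infrastructure from <linux/cleanup.h> that these selftests now build on: DEFINE_LOCK_GUARD_0() declares a guard class whose release statement runs automatically at scope exit, in reverse declaration order. A minimal kernel-style sketch (demo_irq and demo() are illustrative):

/* Sketch only: a guard class wrapping IRQ disable/enable. */
DEFINE_LOCK_GUARD_0(demo_irq, local_irq_disable(), local_irq_enable())

static void demo(void)
{
        guard(demo_irq)();      /* IRQs stay off until demo() returns */
        /* ... critical section; every return path re-enables IRQs ... */
}

The reversed clean-up ordering is what lets inner_in_outer() above stack two guards and still exit the inner context before the outer one.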
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 4dd73cf936a6..f723024e1426 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -4265,6 +4265,10 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas)
* mas_wr_append: Attempt to append
* @wr_mas: the maple write state
*
+ * This is currently unsafe in rcu mode since the end of the node may be cached
+ * by readers while the node contents may be updated, which could result in
+ * inaccurate information.
+ *
* Return: True if appended, false otherwise
*/
static inline bool mas_wr_append(struct ma_wr_state *wr_mas)
@@ -4274,6 +4278,9 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas)
struct ma_state *mas = wr_mas->mas;
unsigned char node_pivots = mt_pivots[wr_mas->type];
+ if (mt_in_rcu(mas->tree))
+ return false;
+
if (mas->offset != wr_mas->node_end)
return false;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 1a31065b2036..976b9bd02a1b 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1136,7 +1136,6 @@ static void set_iter_tags(struct radix_tree_iter *iter,
void __rcu **radix_tree_iter_resume(void __rcu **slot,
struct radix_tree_iter *iter)
{
- slot++;
iter->index = __radix_tree_iter_add(iter, 1);
iter->next_index = iter->index;
iter->tags = 0;
diff --git a/mm/Makefile b/mm/Makefile
index 678530a07326..d4ee20988dd1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,7 +51,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
- compaction.o show_mem.o\
+ compaction.o show_mem.o shmem_quota.o\
interval_tree.o list_lru.o workingset.o \
debug.o gup.o mmap_lock.o $(mmu-y)
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 2fcc9731528a..e0e59d420fca 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -386,6 +386,7 @@ out:
static const struct mm_walk_ops damon_mkold_ops = {
.pmd_entry = damon_mkold_pmd_entry,
.hugetlb_entry = damon_mkold_hugetlb_entry,
+ .walk_lock = PGWALK_RDLOCK,
};
static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
@@ -525,6 +526,7 @@ out:
static const struct mm_walk_ops damon_young_ops = {
.pmd_entry = damon_young_pmd_entry,
.hugetlb_entry = damon_young_hugetlb_entry,
+ .walk_lock = PGWALK_RDLOCK,
};
static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
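This hunk is the first of many like it in this series: every mm_walk_ops instance must now declare how the walk locks the VMAs it visits. PGWALK_RDLOCK preserves the old read-locked behavior, PGWALK_WRLOCK write-locks each VMA, and PGWALK_WRLOCK_VERIFY asserts that the caller already did. A sketch of the shape (demo_pmd_entry is hypothetical):

static int demo_pmd_entry(pmd_t *pmd, unsigned long addr,
                          unsigned long next, struct mm_walk *walk)
{
        return 0;       /* visit logic elided */
}

static const struct mm_walk_ops demo_walk_ops = {
        .pmd_entry = demo_pmd_entry,
        .walk_lock = PGWALK_RDLOCK,     /* read-only walk, mmap read lock */
};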
diff --git a/mm/filemap.c b/mm/filemap.c
index 9e44a49bbd74..baafbf324c9f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1855,30 +1855,15 @@ out:
*
* Looks up the page cache entry at @mapping & @index.
*
- * @fgp_flags can be zero or more of these flags:
- *
- * * %FGP_ACCESSED - The folio will be marked accessed.
- * * %FGP_LOCK - The folio is returned locked.
- * * %FGP_CREAT - If no page is present then a new page is allocated using
- * @gfp and added to the page cache and the VM's LRU list.
- * The page is returned locked and with an increased refcount.
- * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
- * page is already in cache. If the page was allocated, unlock it before
- * returning so the caller can do the same dance.
- * * %FGP_WRITE - The page will be written to by the caller.
- * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
- * * %FGP_NOWAIT - Don't get blocked by page lock.
- * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
- *
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
*
- * If there is a page cache page, it is returned with an increased refcount.
+ * If this function returns a folio, it is returned with an increased refcount.
*
* Return: The found folio or an ERR_PTR() otherwise.
*/
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp)
+ fgf_t fgp_flags, gfp_t gfp)
{
struct folio *folio;
@@ -1920,7 +1905,9 @@ repeat:
folio_wait_stable(folio);
no_page:
if (!folio && (fgp_flags & FGP_CREAT)) {
+ unsigned order = FGF_GET_ORDER(fgp_flags);
int err;
+
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
@@ -1929,26 +1916,44 @@ no_page:
gfp &= ~GFP_KERNEL;
gfp |= GFP_NOWAIT | __GFP_NOWARN;
}
-
- folio = filemap_alloc_folio(gfp, 0);
- if (!folio)
- return ERR_PTR(-ENOMEM);
-
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
- /* Init accessed so avoid atomic mark_page_accessed later */
- if (fgp_flags & FGP_ACCESSED)
- __folio_set_referenced(folio);
+ if (!mapping_large_folio_support(mapping))
+ order = 0;
+ if (order > MAX_PAGECACHE_ORDER)
+ order = MAX_PAGECACHE_ORDER;
+ /* If we're not aligned, allocate a smaller folio */
+ if (index & ((1UL << order) - 1))
+ order = __ffs(index);
- err = filemap_add_folio(mapping, folio, index, gfp);
- if (unlikely(err)) {
+ do {
+ gfp_t alloc_gfp = gfp;
+
+ err = -ENOMEM;
+ if (order == 1)
+ order = 0;
+ if (order > 0)
+ alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ folio = filemap_alloc_folio(alloc_gfp, order);
+ if (!folio)
+ continue;
+
+ /* Init accessed so avoid atomic mark_page_accessed later */
+ if (fgp_flags & FGP_ACCESSED)
+ __folio_set_referenced(folio);
+
+ err = filemap_add_folio(mapping, folio, index, gfp);
+ if (!err)
+ break;
folio_put(folio);
folio = NULL;
- if (err == -EEXIST)
- goto repeat;
- }
+ } while (order-- > 0);
+ if (err == -EEXIST)
+ goto repeat;
+ if (err)
+ return ERR_PTR(err);
/*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
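Callers request large folios through the flag word itself: the order rides in the upper fgf_t bits that FGF_GET_ORDER() extracts above. A hedged caller sketch, assuming the fgf_set_order() helper introduced alongside fgf_t (which encodes an order big enough to cover a byte count):

/* Sketch: ask the page cache for a folio covering @len bytes at @index;
 * the allocation transparently falls back to smaller orders on failure. */
static struct folio *demo_get_large_folio(struct address_space *mapping,
                                          pgoff_t index, size_t len)
{
        return __filemap_get_folio(mapping, index,
                                   FGP_CREAT | FGP_LOCK | fgf_set_order(len),
                                   mapping_gfp_mask(mapping));
}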
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index c6f056c20503..10c3247542cb 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(add_to_page_cache_lru);
noinline
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp)
+ fgf_t fgp_flags, gfp_t gfp)
{
struct folio *folio;
diff --git a/mm/gup.c b/mm/gup.c
index 76d222ccc3ff..6e2f9e9d6537 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -597,7 +597,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
pte = ptep_get(ptep);
if (!pte_present(pte))
goto no_page;
- if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
+ if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags))
goto no_page;
page = vm_normal_page(vma, address, pte);
@@ -714,7 +714,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
if (likely(!pmd_trans_huge(pmdval)))
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
- if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
+ if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
return no_page_table(vma, flags);
ptl = pmd_lock(mm, pmd);
@@ -851,6 +851,10 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
if (WARN_ON_ONCE(foll_flags & FOLL_PIN))
return NULL;
+ /*
+ * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect
+ * to fail on PROT_NONE-mapped pages.
+ */
page = follow_page_mask(vma, address, foll_flags, &ctx);
if (ctx.pgmap)
put_dev_pagemap(ctx.pgmap);
@@ -2227,6 +2231,13 @@ static bool is_valid_gup_args(struct page **pages, int *locked,
gup_flags |= FOLL_UNLOCKABLE;
}
+ /*
+ * For now, always trigger NUMA hinting faults. Some GUP users like
+ * KVM require the hint to be honored, as the calling context of GUP
+ * is functionally similar to a memory reference from task context.
+ */
+ gup_flags |= FOLL_HONOR_NUMA_FAULT;
+
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
@@ -2551,7 +2562,14 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
struct page *page;
struct folio *folio;
- if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
+ /*
+ * Always fall back to ordinary GUP on PROT_NONE-mapped pages:
+ * pte_access_permitted() should reject these pages either way;
+ * otherwise, GUP-fast might succeed in cases where ordinary GUP
+ * would fail due to VMA access permissions.
+ */
+ if (pte_protnone(pte))
goto pte_unmap;
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
@@ -2970,8 +2988,8 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
pmd_devmap(pmd))) {
- if (pmd_protnone(pmd) &&
- !gup_can_follow_protnone(flags))
+ /* See gup_pte_range() */
+ if (pmd_protnone(pmd))
return 0;
if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
@@ -3151,7 +3169,7 @@ static int internal_get_user_pages_fast(unsigned long start,
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
FOLL_FAST_ONLY | FOLL_NOFAULT |
- FOLL_PCI_P2PDMA)))
+ FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
return -EINVAL;
if (gup_flags & FOLL_PIN)
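With FOLL_HONOR_NUMA_FAULT threaded through, gup_can_follow_protnone() becomes VMA-aware rather than a pure flag test. A plausible shape of the helper after this change, reconstructed here as a hedged sketch rather than quoted from the patch (vma_is_accessible() is an existing kernel helper):

static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
                                           unsigned int flags)
{
        /* Callers that don't honor NUMA faults may follow PROT_NONE PTEs. */
        if (!(flags & FOLL_HONOR_NUMA_FAULT))
                return true;

        /* pte_protnone() in an inaccessible VMA is not a NUMA hint. */
        return !vma_is_accessible(vma);
}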
diff --git a/mm/hmm.c b/mm/hmm.c
index 855e25e59d8f..277ddcab4947 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -562,6 +562,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
.pte_hole = hmm_vma_walk_hole,
.hugetlb_entry = hmm_vma_walk_hugetlb_entry,
.test_walk = hmm_vma_walk_test,
+ .walk_lock = PGWALK_RDLOCK,
};
/**
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index eb3678360b97..e4f0266a22d4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1467,8 +1467,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
return ERR_PTR(-EFAULT);
- /* Full NUMA hinting faults to serialise migration in fault paths */
- if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags))
+ if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
return NULL;
if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
@@ -1613,7 +1612,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If other processes are mapping this folio, we couldn't discard
* the folio unless they all do MADV_FREE so let's skip the folio.
*/
- if (folio_mapcount(folio) != 1)
+ if (folio_estimated_sharers(folio) != 1)
goto out;
if (!folio_trylock(folio))
@@ -2521,7 +2520,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
unsigned int nr = thp_nr_pages(head);
- int i;
+ int i, nr_dropped = 0;
/* complete memcg works before add pages to LRU */
split_page_memcg(head, nr);
@@ -2546,7 +2545,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct folio *tail = page_folio(head + i);
if (shmem_mapping(head->mapping))
- shmem_uncharge(head->mapping->host, 1);
+ nr_dropped++;
else if (folio_test_clear_dirty(tail))
folio_account_cleaned(tail,
inode_to_wb(folio->mapping->host));
@@ -2583,6 +2582,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
local_irq_enable();
+ if (nr_dropped)
+ shmem_uncharge(head->mapping->host, nr_dropped);
remap_page(folio, nr);
if (PageSwapCache(head)) {
diff --git a/mm/internal.h b/mm/internal.h
index a7d9e980429a..8ed127c1c808 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -924,6 +924,13 @@ int migrate_device_coherent_page(struct page *page);
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
int __must_check try_grab_page(struct page *page, unsigned int flags);
+/*
+ * mm/huge_memory.c
+ */
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd,
+ unsigned int flags);
+
enum {
/* mark page accessed */
FOLL_TOUCH = 1 << 16,
@@ -998,6 +1005,16 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
smp_rmb();
/*
+ * During GUP-fast we might not get called on the head page for a
+ * hugetlb page that is mapped using cont-PTE, because GUP-fast does
+ * not work with the abstracted hugetlb PTEs that always point at the
+ * head page. For hugetlb, PageAnonExclusive only applies on the head
+ * page (as it cannot be partially COW-shared), so lookup the head page.
+ */
+ if (unlikely(!PageHead(page) && PageHuge(page)))
+ page = compound_head(page);
+
+ /*
* Note that PageKsm() pages cannot be exclusive, and consequently,
* cannot get pinned.
*/
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 78c8d5d8b628..47d1d32c734f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1955,10 +1955,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
goto xa_locked;
}
}
- if (!shmem_charge(mapping->host, 1)) {
- result = SCAN_FAIL;
- goto xa_locked;
- }
nr_none++;
continue;
}
@@ -2145,8 +2141,13 @@ xa_unlocked:
*/
try_to_unmap_flush();
- if (result != SCAN_SUCCEED)
+ if (result == SCAN_SUCCEED && nr_none &&
+ !shmem_charge(mapping->host, nr_none))
+ result = SCAN_FAIL;
+ if (result != SCAN_SUCCEED) {
+ nr_none = 0;
goto rollback;
+ }
/*
* The old pages are locked, so they won't change anymore.
@@ -2283,8 +2284,8 @@ rollback:
if (nr_none) {
xas_lock_irq(&xas);
mapping->nrpages -= nr_none;
- shmem_uncharge(mapping->host, nr_none);
xas_unlock_irq(&xas);
+ shmem_uncharge(mapping->host, nr_none);
}
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
diff --git a/mm/ksm.c b/mm/ksm.c
index d20d7662419b..d7b5b95e936e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -455,6 +455,12 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
static const struct mm_walk_ops break_ksm_ops = {
.pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops break_ksm_lock_vma_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK,
};
/*
@@ -470,16 +476,17 @@ static const struct mm_walk_ops break_ksm_ops = {
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
-static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
vm_fault_t ret = 0;
+ const struct mm_walk_ops *ops = lock_vma ?
+ &break_ksm_lock_vma_ops : &break_ksm_ops;
do {
int ksm_page;
cond_resched();
- ksm_page = walk_page_range_vma(vma, addr, addr + 1,
- &break_ksm_ops, NULL);
+ ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
if (WARN_ON_ONCE(ksm_page < 0))
return ksm_page;
if (!ksm_page)
@@ -565,7 +572,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (vma)
- break_ksm(vma, addr);
+ break_ksm(vma, addr, false);
mmap_read_unlock(mm);
}
@@ -871,7 +878,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
* in cmp_and_merge_page on one of the rmap_items we would be removing.
*/
static int unmerge_ksm_pages(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end, bool lock_vma)
{
unsigned long addr;
int err = 0;
@@ -882,7 +889,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
if (signal_pending(current))
err = -ERESTARTSYS;
else
- err = break_ksm(vma, addr);
+ err = break_ksm(vma, addr, lock_vma);
}
return err;
}
@@ -1029,7 +1036,7 @@ static int unmerge_and_remove_all_rmap_items(void)
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
continue;
err = unmerge_ksm_pages(vma,
- vma->vm_start, vma->vm_end);
+ vma->vm_start, vma->vm_end, false);
if (err)
goto error;
}
@@ -2530,7 +2537,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
return 0;
if (vma->anon_vma) {
- err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
+ err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
if (err)
return err;
}
@@ -2668,7 +2675,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
return 0; /* just ignore the advice */
if (vma->anon_vma) {
- err = unmerge_ksm_pages(vma, start, end);
+ err = unmerge_ksm_pages(vma, start, end, true);
if (err)
return err;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 886f06066622..ec30f48f8f2e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -233,6 +233,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
static const struct mm_walk_ops swapin_walk_ops = {
.pmd_entry = swapin_walk_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
};
static void shmem_swapin_range(struct vm_area_struct *vma,
@@ -383,7 +384,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
folio = pfn_folio(pmd_pfn(orig_pmd));
/* Do not interfere with other mappings of this folio */
- if (folio_mapcount(folio) != 1)
+ if (folio_estimated_sharers(folio) != 1)
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -457,7 +458,7 @@ regular_folio:
if (folio_test_large(folio)) {
int err;
- if (folio_mapcount(folio) != 1)
+ if (folio_estimated_sharers(folio) != 1)
break;
if (pageout_anon_only_filter && !folio_test_anon(folio))
break;
@@ -534,6 +535,7 @@ regular_folio:
static const struct mm_walk_ops cold_walk_ops = {
.pmd_entry = madvise_cold_or_pageout_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static void madvise_cold_page_range(struct mmu_gather *tlb,
@@ -678,7 +680,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (folio_test_large(folio)) {
int err;
- if (folio_mapcount(folio) != 1)
+ if (folio_estimated_sharers(folio) != 1)
break;
if (!folio_trylock(folio))
break;
@@ -757,6 +759,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
static const struct mm_walk_ops madvise_free_walk_ops = {
.pmd_entry = madvise_free_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static int madvise_free_single_vma(struct vm_area_struct *vma,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e8ca4bdcb03c..315fd5f45e3c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6024,6 +6024,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
static const struct mm_walk_ops precharge_walk_ops = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
@@ -6303,6 +6304,7 @@ put: /* get_mctgt_type() gets & locks the page */
static const struct mm_walk_ops charge_walk_ops = {
.pmd_entry = mem_cgroup_move_charge_pte_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static void mem_cgroup_move_charge(void)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9a285038d765..fe121fdb05f7 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -831,6 +831,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
static const struct mm_walk_ops hwp_walk_ops = {
.pmd_entry = hwpoison_pte_range,
.hugetlb_entry = hwpoison_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -2740,10 +2741,13 @@ retry:
if (ret > 0) {
ret = soft_offline_in_use_page(page);
} else if (ret == 0) {
- if (!page_handle_poison(page, true, false) && try_again) {
- try_again = false;
- flags &= ~MF_COUNT_INCREASED;
- goto retry;
+ if (!page_handle_poison(page, true, false)) {
+ if (try_again) {
+ try_again = false;
+ flags &= ~MF_COUNT_INCREASED;
+ goto retry;
+ }
+ ret = -EBUSY;
}
}
diff --git a/mm/memory.c b/mm/memory.c
index 1ec1ef3418bf..cdc4d4c1c858 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5257,11 +5257,8 @@ EXPORT_SYMBOL_GPL(handle_mm_fault);
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
- /* Even if this succeeds, make it clear we *might* have slept */
- if (likely(mmap_read_trylock(mm))) {
- might_sleep();
+ if (likely(mmap_read_trylock(mm)))
return true;
- }
if (regs && !user_mode(regs)) {
unsigned long ip = instruction_pointer(regs);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c53f8beeb507..ec2eaceffd74 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -718,6 +718,14 @@ static const struct mm_walk_ops queue_pages_walk_ops = {
.hugetlb_entry = queue_folios_hugetlb,
.pmd_entry = queue_folios_pte_range,
.test_walk = queue_pages_test_walk,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
+ .hugetlb_entry = queue_folios_hugetlb,
+ .pmd_entry = queue_folios_pte_range,
+ .test_walk = queue_pages_test_walk,
+ .walk_lock = PGWALK_WRLOCK,
};
/*
@@ -738,7 +746,7 @@ static const struct mm_walk_ops queue_pages_walk_ops = {
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
nodemask_t *nodes, unsigned long flags,
- struct list_head *pagelist)
+ struct list_head *pagelist, bool lock_vma)
{
int err;
struct queue_pages qp = {
@@ -749,8 +757,10 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
.end = end,
.first = NULL,
};
+ const struct mm_walk_ops *ops = lock_vma ?
+ &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
- err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+ err = walk_page_range(mm, start, end, ops, &qp);
if (!qp.first)
/* whole range in hole */
@@ -1078,7 +1088,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
vma = find_vma(mm, 0);
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
- flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist, false);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, alloc_migration_target, NULL,
@@ -1321,12 +1331,8 @@ static long do_mbind(unsigned long start, unsigned long len,
* Lock the VMAs before scanning for pages to migrate, to ensure we don't
* miss a concurrently inserted page.
*/
- vma_iter_init(&vmi, mm, start);
- for_each_vma_range(vmi, vma, end)
- vma_start_write(vma);
-
ret = queue_pages_range(mm, start, end, nmask,
- flags | MPOL_MF_INVERT, &pagelist);
+ flags | MPOL_MF_INVERT, &pagelist, true);
if (ret < 0) {
err = ret;
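Here do_mbind() stops write-locking VMAs by hand; instead it selects a second
ops table whose .walk_lock is PGWALK_WRLOCK, and the page walker performs the
vma_start_write() on each VMA it visits. The selection, condensed from the
hunks above:

    const struct mm_walk_ops *ops = lock_vma ?
            &queue_pages_lock_vma_walk_ops :        /* PGWALK_WRLOCK */
            &queue_pages_walk_ops;                  /* PGWALK_RDLOCK */

    err = walk_page_range(mm, start, end, ops, &qp);
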
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 8365158460ed..d5f492356e3e 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -279,6 +279,7 @@ next:
static const struct mm_walk_ops migrate_vma_walk_ops = {
.pmd_entry = migrate_vma_collect_pmd,
.pte_hole = migrate_vma_collect_hole,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
diff --git a/mm/mincore.c b/mm/mincore.c
index b7f7a516b26c..dad3622cc963 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -176,6 +176,7 @@ static const struct mm_walk_ops mincore_walk_ops = {
.pmd_entry = mincore_pte_range,
.pte_hole = mincore_unmapped_range,
.hugetlb_entry = mincore_hugetlb,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
diff --git a/mm/mlock.c b/mm/mlock.c
index 0a0c996c5c21..479e09d0994c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -371,6 +371,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
{
static const struct mm_walk_ops mlock_walk_ops = {
.pmd_entry = mlock_pte_range,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
};
/*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6f658d483704..3aef1340533a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -568,6 +568,7 @@ static const struct mm_walk_ops prot_none_walk_ops = {
.pte_entry = prot_none_pte_entry,
.hugetlb_entry = prot_none_hugetlb_entry,
.test_walk = prot_none_test,
+ .walk_lock = PGWALK_WRLOCK,
};
int
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d3f42009bb70..b8d3d7040a50 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1193,7 +1193,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
* bw * elapsed + write_bandwidth * (period - elapsed)
* write_bandwidth = ---------------------------------------------------
* period
*
- * @written may have decreased due to folio_account_redirty().
+ * @written may have decreased due to folio_redirty_for_writepage().
* Avoid underflowing @bw calculation.
*/
bw = written - min(written, wb->written_stamp);
@@ -2712,37 +2712,6 @@ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
EXPORT_SYMBOL(filemap_dirty_folio);
/**
- * folio_account_redirty - Manually account for redirtying a page.
- * @folio: The folio which is being redirtied.
- *
- * Most filesystems should call folio_redirty_for_writepage() instead
- * of this function. If your filesystem is doing writeback outside the
- * context of a writeback_control(), it can call this when redirtying
- * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED,
- * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN,
- * WB_WRITTEN) in long term. The mismatches will lead to systematic errors
- * in balanced_dirty_ratelimit and the dirty pages position control.
- */
-void folio_account_redirty(struct folio *folio)
-{
- struct address_space *mapping = folio->mapping;
-
- if (mapping && mapping_can_writeback(mapping)) {
- struct inode *inode = mapping->host;
- struct bdi_writeback *wb;
- struct wb_lock_cookie cookie = {};
- long nr = folio_nr_pages(folio);
-
- wb = unlocked_inode_to_wb_begin(inode, &cookie);
- current->nr_dirtied -= nr;
- node_stat_mod_folio(folio, NR_DIRTIED, -nr);
- wb_stat_mod(wb, WB_DIRTIED, -nr);
- unlocked_inode_to_wb_end(inode, &cookie);
- }
-}
-EXPORT_SYMBOL(folio_account_redirty);
-
-/**
* folio_redirty_for_writepage - Decline to write a dirty folio.
* @wbc: The writeback control.
* @folio: The folio.
@@ -2757,13 +2726,23 @@ EXPORT_SYMBOL(folio_account_redirty);
bool folio_redirty_for_writepage(struct writeback_control *wbc,
struct folio *folio)
{
- bool ret;
+ struct address_space *mapping = folio->mapping;
long nr = folio_nr_pages(folio);
+ bool ret;
wbc->pages_skipped += nr;
- ret = filemap_dirty_folio(folio->mapping, folio);
- folio_account_redirty(folio);
+ ret = filemap_dirty_folio(mapping, folio);
+ if (mapping && mapping_can_writeback(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct wb_lock_cookie cookie = {};
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
+ current->nr_dirtied -= nr;
+ node_stat_mod_folio(folio, NR_DIRTIED, -nr);
+ wb_stat_mod(wb, WB_DIRTIED, -nr);
+ unlocked_inode_to_wb_end(inode, &cookie);
+ }
return ret;
}
EXPORT_SYMBOL(folio_redirty_for_writepage);
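With folio_account_redirty() folded into its only sensible caller, a filesystem
that declines to write a dirty folio needs just one call. A minimal sketch of
the usual bail-out pattern in a ->writepage implementation (the congestion
check is hypothetical):

    static int example_writepage(struct page *page, struct writeback_control *wbc)
    {
            struct folio *folio = page_folio(page);

            if (device_is_congested()) {    /* hypothetical condition */
                    /* skip this folio; dirty counters are re-balanced inside */
                    folio_redirty_for_writepage(wbc, folio);
                    folio_unlock(folio);
                    return 0;
            }
            /* ... actually write the folio ... */
            return 0;
    }
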
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2022333805d3..9b2d23fbf4d3 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -400,6 +400,33 @@ static int __walk_page_range(unsigned long start, unsigned long end,
return err;
}
+static inline void process_mm_walk_lock(struct mm_struct *mm,
+ enum page_walk_lock walk_lock)
+{
+ if (walk_lock == PGWALK_RDLOCK)
+ mmap_assert_locked(mm);
+ else
+ mmap_assert_write_locked(mm);
+}
+
+static inline void process_vma_walk_lock(struct vm_area_struct *vma,
+ enum page_walk_lock walk_lock)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ switch (walk_lock) {
+ case PGWALK_WRLOCK:
+ vma_start_write(vma);
+ break;
+ case PGWALK_WRLOCK_VERIFY:
+ vma_assert_write_locked(vma);
+ break;
+ case PGWALK_RDLOCK:
+ /* PGWALK_RDLOCK is handled by process_mm_walk_lock */
+ break;
+ }
+#endif
+}
+
/**
* walk_page_range - walk page table with caller specific callbacks
* @mm: mm_struct representing the target process of page table walk
@@ -459,7 +486,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
if (!walk.mm)
return -EINVAL;
- mmap_assert_locked(walk.mm);
+ process_mm_walk_lock(walk.mm, ops->walk_lock);
vma = find_vma(walk.mm, start);
do {
@@ -474,6 +501,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
if (ops->pte_hole)
err = ops->pte_hole(start, next, -1, &walk);
} else { /* inside vma */
+ process_vma_walk_lock(vma, ops->walk_lock);
walk.vma = vma;
next = min(end, vma->vm_end);
vma = find_vma(mm, vma->vm_end);
@@ -549,7 +577,8 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
- mmap_assert_locked(walk.mm);
+ process_mm_walk_lock(walk.mm, ops->walk_lock);
+ process_vma_walk_lock(vma, ops->walk_lock);
return __walk_page_range(start, end, &walk);
}
@@ -566,7 +595,8 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
if (!walk.mm)
return -EINVAL;
- mmap_assert_locked(walk.mm);
+ process_mm_walk_lock(walk.mm, ops->walk_lock);
+ process_vma_walk_lock(vma, ops->walk_lock);
return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
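Taken together, the pagewalk changes map each walk_lock mode onto an assertion
or an action before the walk starts. A compact summary, assuming the enum
definition that accompanies this series in include/linux/pagewalk.h:

    enum page_walk_lock {
            PGWALK_RDLOCK,          /* walker holds mmap_read_lock()             */
            PGWALK_WRLOCK,          /* holds mmap_write_lock(); write-locks each
                                       VMA via vma_start_write() as it is walked */
            PGWALK_WRLOCK_VERIFY,   /* holds mmap_write_lock(); VMAs are already
                                       write-locked, so only assert it           */
    };
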
diff --git a/mm/readahead.c b/mm/readahead.c
index a9c999aa19af..e815c114de21 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -461,19 +461,6 @@ static int try_context_readahead(struct address_space *mapping,
return 1;
}
-/*
- * There are some parts of the kernel which assume that PMD entries
- * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
- * limit the maximum allocation order to PMD size. I'm not aware of any
- * assumptions about maximum order if THP are disabled, but 8 seems like
- * a good order (that's 1MB if you're using 4kB pages)
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
-#else
-#define MAX_PAGECACHE_ORDER 8
-#endif
-
static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
pgoff_t mark, unsigned int order, gfp_t gfp)
{
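The MAX_PAGECACHE_ORDER definition leaves readahead.c so the limit can be
shared rather than kept private to readahead (in later kernels it lives in
include/linux/pagemap.h). The arithmetic behind the non-THP fallback value,
as a sketch:

    /* without THP: order 8 => (1 << 8) pages * 4 KiB/page = 1 MiB folios max */
    #define MAX_PAGECACHE_ORDER 8
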
diff --git a/mm/shmem.c b/mm/shmem.c
index f5af4b943e42..479d1ce65868 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -78,6 +78,7 @@ static struct vfsmount *shm_mnt;
#include <uapi/linux/memfd.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
+#include <linux/quotaops.h>
#include <linux/uaccess.h>
@@ -89,6 +90,9 @@ static struct vfsmount *shm_mnt;
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
+/* Pretend that one inode + its dentry occupy this much memory */
+#define BOGO_INODE_SIZE 1024
+
/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
@@ -116,11 +120,14 @@ struct shmem_options {
int huge;
int seen;
bool noswap;
+ unsigned short quota_types;
+ struct shmem_quota_limits qlimits;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_NOSWAP 16
+#define SHMEM_SEEN_QUOTA 32
};
#ifdef CONFIG_TMPFS
@@ -133,7 +140,8 @@ static unsigned long shmem_default_max_inodes(void)
{
unsigned long nr_pages = totalram_pages();
- return min(nr_pages - totalhigh_pages(), nr_pages / 2);
+ return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
+ ULONG_MAX / BOGO_INODE_SIZE);
}
#endif
@@ -199,33 +207,47 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
-static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
+static int shmem_inode_acct_block(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ int err = -ENOSPC;
if (shmem_acct_block(info->flags, pages))
- return false;
+ return err;
+ might_sleep(); /* when quotas */
if (sbinfo->max_blocks) {
if (percpu_counter_compare(&sbinfo->used_blocks,
sbinfo->max_blocks - pages) > 0)
goto unacct;
+
+ err = dquot_alloc_block_nodirty(inode, pages);
+ if (err)
+ goto unacct;
+
percpu_counter_add(&sbinfo->used_blocks, pages);
+ } else {
+ err = dquot_alloc_block_nodirty(inode, pages);
+ if (err)
+ goto unacct;
}
- return true;
+ return 0;
unacct:
shmem_unacct_blocks(info->flags, pages);
- return false;
+ return err;
}
-static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
+static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ might_sleep(); /* when quotas */
+ dquot_free_block_nodirty(inode, pages);
+
if (sbinfo->max_blocks)
percpu_counter_sub(&sbinfo->used_blocks, pages);
shmem_unacct_blocks(info->flags, pages);
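shmem_inode_acct_block() now returns 0 or a -errno (so quota failures can be
told apart from plain -ENOSPC), which inverts the test at every call site. The
caller pattern changes from a boolean check to the usual error-code idiom,
sketched from the hunks in this patch:

    /* before: */
    if (!shmem_inode_acct_block(inode, pages))
            return false;

    /* after: */
    err = shmem_inode_acct_block(inode, pages);
    if (err)
            return err;     /* -ENOSPC, -EDQUOT, ... */
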
@@ -254,6 +276,47 @@ bool vma_is_shmem(struct vm_area_struct *vma)
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
+#ifdef CONFIG_TMPFS_QUOTA
+
+static int shmem_enable_quotas(struct super_block *sb,
+ unsigned short quota_types)
+{
+ int type, err = 0;
+
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
+ for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
+ if (!(quota_types & (1 << type)))
+ continue;
+ err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
+ DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED);
+ if (err)
+ goto out_err;
+ }
+ return 0;
+
+out_err:
+ pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
+ type, err);
+ for (type--; type >= 0; type--)
+ dquot_quota_off(sb, type);
+ return err;
+}
+
+static void shmem_disable_quotas(struct super_block *sb)
+{
+ int type;
+
+ for (type = 0; type < SHMEM_MAXQUOTAS; type++)
+ dquot_quota_off(sb, type);
+}
+
+static struct dquot **shmem_get_dquots(struct inode *inode)
+{
+ return SHMEM_I(inode)->i_dquot;
+}
+#endif /* CONFIG_TMPFS_QUOTA */
+
/*
* shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
* produces a novel ino for the newly allocated inode.
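shmem_enable_quotas() walks the requested quota types, loading each with usage
and limit tracking enabled and unwinding the ones already loaded on failure. A
sketch of how the mount path invokes it (see the shmem_fill_super() hunk later
in this patch), assuming both quota types were requested:

    /* from shmem_fill_super(), once sb->dq_op and sb->s_qcop are set */
    if (shmem_enable_quotas(sb, QTYPE_MASK_USR | QTYPE_MASK_GRP))
            goto failed;
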
@@ -272,11 +335,11 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
if (!(sb->s_flags & SB_KERNMOUNT)) {
raw_spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_inodes) {
- if (!sbinfo->free_inodes) {
+ if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
raw_spin_unlock(&sbinfo->stat_lock);
return -ENOSPC;
}
- sbinfo->free_inodes--;
+ sbinfo->free_ispace -= BOGO_INODE_SIZE;
}
if (inop) {
ino = sbinfo->next_ino++;
@@ -330,12 +393,12 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
return 0;
}
-static void shmem_free_inode(struct super_block *sb)
+static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
if (sbinfo->max_inodes) {
raw_spin_lock(&sbinfo->stat_lock);
- sbinfo->free_inodes++;
+ sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
raw_spin_unlock(&sbinfo->stat_lock);
}
}
@@ -343,62 +406,65 @@ static void shmem_free_inode(struct super_block *sb)
/**
* shmem_recalc_inode - recalculate the block usage of an inode
* @inode: inode to recalc
+ * @alloced: the change in number of pages allocated to inode
+ * @swapped: the change in number of pages swapped from inode
*
* We have to calculate the free blocks since the mm can drop
* undirtied hole pages behind our back.
*
* But normally info->alloced == inode->i_mapping->nrpages + info->swapped
* So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
- *
- * It has to be called with the spinlock held.
*/
-static void shmem_recalc_inode(struct inode *inode)
+static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
struct shmem_inode_info *info = SHMEM_I(inode);
long freed;
- freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
- if (freed > 0) {
+ spin_lock(&info->lock);
+ info->alloced += alloced;
+ info->swapped += swapped;
+ freed = info->alloced - info->swapped -
+ READ_ONCE(inode->i_mapping->nrpages);
+ /*
+ * Special case: whereas normally shmem_recalc_inode() is called
+ * after i_mapping->nrpages has already been adjusted (up or down),
+ * shmem_writepage() has to raise swapped before nrpages is lowered -
+ * to stop a racing shmem_recalc_inode() from thinking that a page has
+ * been freed. Compensate here, to avoid the need for a followup call.
+ */
+ if (swapped > 0)
+ freed += swapped;
+ if (freed > 0)
info->alloced -= freed;
- inode->i_blocks -= freed * BLOCKS_PER_PAGE;
+ spin_unlock(&info->lock);
+
+ /* The quota case may block */
+ if (freed > 0)
shmem_inode_unacct_blocks(inode, freed);
- }
}
bool shmem_charge(struct inode *inode, long pages)
{
- struct shmem_inode_info *info = SHMEM_I(inode);
- unsigned long flags;
+ struct address_space *mapping = inode->i_mapping;
- if (!shmem_inode_acct_block(inode, pages))
+ if (shmem_inode_acct_block(inode, pages))
return false;
/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
- inode->i_mapping->nrpages += pages;
-
- spin_lock_irqsave(&info->lock, flags);
- info->alloced += pages;
- inode->i_blocks += pages * BLOCKS_PER_PAGE;
- shmem_recalc_inode(inode);
- spin_unlock_irqrestore(&info->lock, flags);
+ xa_lock_irq(&mapping->i_pages);
+ mapping->nrpages += pages;
+ xa_unlock_irq(&mapping->i_pages);
+ shmem_recalc_inode(inode, pages, 0);
return true;
}
void shmem_uncharge(struct inode *inode, long pages)
{
- struct shmem_inode_info *info = SHMEM_I(inode);
- unsigned long flags;
-
+ /* pages argument is currently unused: keep it to help debugging */
/* nrpages adjustment done by __filemap_remove_folio() or caller */
- spin_lock_irqsave(&info->lock, flags);
- info->alloced -= pages;
- inode->i_blocks -= pages * BLOCKS_PER_PAGE;
- shmem_recalc_inode(inode);
- spin_unlock_irqrestore(&info->lock, flags);
-
- shmem_inode_unacct_blocks(inode, pages);
+ shmem_recalc_inode(inode, 0, 0);
}
/*
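The reworked shmem_recalc_inode() takes the alloced/swapped deltas as arguments
and does its own locking, so every open-coded lock/adjust/recalc/unlock
sequence in this file collapses to one call. The before/after shape, condensed
from the surrounding hunks:

    /* before: caller took info->lock itself */
    spin_lock_irq(&info->lock);
    info->alloced += pages;
    inode->i_blocks += pages * BLOCKS_PER_PAGE;
    shmem_recalc_inode(inode);
    spin_unlock_irq(&info->lock);

    /* after: deltas passed in; the lock is taken (and dropped before the
     * possibly-blocking quota work) inside the helper */
    shmem_recalc_inode(inode, pages, 0);
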
@@ -806,14 +872,16 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
unsigned long swapped = 0;
+ unsigned long max = end - 1;
rcu_read_lock();
- xas_for_each(&xas, page, end - 1) {
+ xas_for_each(&xas, page, max) {
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page))
swapped++;
-
+ if (xas.xa_index == max)
+ break;
if (need_resched()) {
xas_pause(&xas);
cond_resched_rcu();
@@ -1038,16 +1106,13 @@ whole_folios:
folio_batch_release(&fbatch);
}
- spin_lock_irq(&info->lock);
- info->swapped -= nr_swaps_freed;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, -nr_swaps_freed);
}
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -1059,11 +1124,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
struct inode *inode = path->dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
- if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
- }
+ if (info->alloced - info->swapped != inode->i_mapping->nrpages)
+ shmem_recalc_inode(inode, 0, 0);
+
if (info->fsflags & FS_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
if (info->fsflags & FS_IMMUTABLE_FL)
@@ -1073,7 +1136,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
if (shmem_is_huge(inode, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
@@ -1140,13 +1203,28 @@ static int shmem_setattr(struct mnt_idmap *idmap,
}
}
+ if (is_quota_modification(idmap, inode, attr)) {
+ error = dquot_initialize(inode);
+ if (error)
+ return error;
+ }
+
+ /* Transfer quota accounting */
+ if (i_uid_needs_update(idmap, attr, inode) ||
+ i_gid_needs_update(idmap, attr, inode)) {
+ error = dquot_transfer(idmap, inode, attr);
+
+ if (error)
+ return error;
+ }
+
setattr_copy(idmap, inode, attr);
if (attr->ia_valid & ATTR_MODE)
error = posix_acl_chmod(idmap, dentry, inode->i_mode);
if (!error && update_ctime) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (update_mtime)
- inode->i_mtime = inode->i_ctime;
+ inode->i_mtime = inode_get_ctime(inode);
inode_inc_iversion(inode);
}
return error;
@@ -1156,6 +1234,7 @@ static void shmem_evict_inode(struct inode *inode)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ size_t freed = 0;
if (shmem_mapping(inode->i_mapping)) {
shmem_unacct_size(info->flags, inode->i_size);
@@ -1182,10 +1261,14 @@ static void shmem_evict_inode(struct inode *inode)
}
}
- simple_xattrs_free(&info->xattrs);
+ simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
+ shmem_free_inode(inode->i_sb, freed);
WARN_ON(inode->i_blocks);
- shmem_free_inode(inode->i_sb);
clear_inode(inode);
+#ifdef CONFIG_TMPFS_QUOTA
+ dquot_free_inode(inode);
+ dquot_drop(inode);
+#endif
}
static int shmem_find_swap_entries(struct address_space *mapping,
@@ -1429,11 +1512,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (add_to_swap_cache(folio, swap,
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
NULL) == 0) {
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- info->swapped++;
- spin_unlock_irq(&info->lock);
-
+ shmem_recalc_inode(inode, 0, 1);
swap_shmem_alloc(swap);
shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
@@ -1588,13 +1667,14 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
struct shmem_inode_info *info = SHMEM_I(inode);
struct folio *folio;
int nr;
- int err = -ENOSPC;
+ int err;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
huge = false;
nr = huge ? HPAGE_PMD_NR : 1;
- if (!shmem_inode_acct_block(inode, nr))
+ err = shmem_inode_acct_block(inode, nr);
+ if (err)
goto failed;
if (huge)
@@ -1703,7 +1783,6 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
struct folio *folio, swp_entry_t swap)
{
struct address_space *mapping = inode->i_mapping;
- struct shmem_inode_info *info = SHMEM_I(inode);
swp_entry_t swapin_error;
void *old;
@@ -1716,16 +1795,12 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
folio_wait_writeback(folio);
delete_from_swap_cache(folio);
- spin_lock_irq(&info->lock);
/*
- * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
- * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
- * shmem_evict_inode.
+ * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
+ * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
+ * in shmem_evict_inode().
*/
- info->alloced--;
- info->swapped--;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, -1, -1);
swap_free(swap);
}
@@ -1812,10 +1887,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (error)
goto failed;
- spin_lock_irq(&info->lock);
- info->swapped--;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, -1);
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
@@ -1980,13 +2052,9 @@ alloc_nohuge:
charge_mm);
if (error)
goto unacct;
- folio_add_lru(folio);
- spin_lock_irq(&info->lock);
- info->alloced += folio_nr_pages(folio);
- inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ folio_add_lru(folio);
+ shmem_recalc_inode(inode, folio_nr_pages(folio), 0);
alloced = true;
if (folio_test_pmd_mappable(folio) &&
@@ -2035,9 +2103,7 @@ clear:
if (alloced) {
folio_clear_dirty(folio);
filemap_remove_folio(folio);
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, 0);
}
error = -EINVAL;
goto unlock;
@@ -2063,9 +2129,7 @@ unlock:
folio_put(folio);
}
if (error == -ENOSPC && !once++) {
- spin_lock_irq(&info->lock);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ shmem_recalc_inode(inode, 0, 0);
goto repeat;
}
if (error == -EEXIST)
@@ -2326,6 +2390,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+static int shmem_file_open(struct inode *inode, struct file *file)
+{
+ file->f_mode |= FMODE_CAN_ODIRECT;
+ return generic_file_open(inode, file);
+}
+
#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
@@ -2355,77 +2425,127 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
#define shmem_initxattrs NULL
#endif
-static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb,
- struct inode *dir, umode_t mode, dev_t dev,
- unsigned long flags)
+static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
+{
+ return &SHMEM_I(inode)->dir_offsets;
+}
+
+static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
+ struct super_block *sb,
+ struct inode *dir, umode_t mode,
+ dev_t dev, unsigned long flags)
{
struct inode *inode;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
ino_t ino;
+ int err;
+
+ err = shmem_reserve_inode(sb, &ino);
+ if (err)
+ return ERR_PTR(err);
- if (shmem_reserve_inode(sb, &ino))
- return NULL;
inode = new_inode(sb);
- if (inode) {
- inode->i_ino = ino;
- inode_init_owner(idmap, inode, dir, mode);
- inode->i_blocks = 0;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_generation = get_random_u32();
- info = SHMEM_I(inode);
- memset(info, 0, (char *)inode - (char *)info);
- spin_lock_init(&info->lock);
- atomic_set(&info->stop_eviction, 0);
- info->seals = F_SEAL_SEAL;
- info->flags = flags & VM_NORESERVE;
- info->i_crtime = inode->i_mtime;
- info->fsflags = (dir == NULL) ? 0 :
- SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
- if (info->fsflags)
- shmem_set_inode_flags(inode, info->fsflags);
- INIT_LIST_HEAD(&info->shrinklist);
- INIT_LIST_HEAD(&info->swaplist);
- if (sbinfo->noswap)
- mapping_set_unevictable(inode->i_mapping);
- simple_xattrs_init(&info->xattrs);
- cache_no_acl(inode);
- mapping_set_large_folios(inode->i_mapping);
-
- switch (mode & S_IFMT) {
- default:
- inode->i_op = &shmem_special_inode_operations;
- init_special_inode(inode, mode, dev);
- break;
- case S_IFREG:
- inode->i_mapping->a_ops = &shmem_aops;
- inode->i_op = &shmem_inode_operations;
- inode->i_fop = &shmem_file_operations;
- mpol_shared_policy_init(&info->policy,
- shmem_get_sbmpol(sbinfo));
- break;
- case S_IFDIR:
- inc_nlink(inode);
- /* Some things misbehave if size == 0 on a directory */
- inode->i_size = 2 * BOGO_DIRENT_SIZE;
- inode->i_op = &shmem_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
- break;
- case S_IFLNK:
- /*
- * Must not load anything in the rbtree,
- * mpol_free_shared_policy will not be called.
- */
- mpol_shared_policy_init(&info->policy, NULL);
- break;
- }
+ if (!inode) {
+ shmem_free_inode(sb, 0);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ inode->i_ino = ino;
+ inode_init_owner(idmap, inode, dir, mode);
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ inode->i_generation = get_random_u32();
+ info = SHMEM_I(inode);
+ memset(info, 0, (char *)inode - (char *)info);
+ spin_lock_init(&info->lock);
+ atomic_set(&info->stop_eviction, 0);
+ info->seals = F_SEAL_SEAL;
+ info->flags = flags & VM_NORESERVE;
+ info->i_crtime = inode->i_mtime;
+ info->fsflags = (dir == NULL) ? 0 :
+ SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
+ if (info->fsflags)
+ shmem_set_inode_flags(inode, info->fsflags);
+ INIT_LIST_HEAD(&info->shrinklist);
+ INIT_LIST_HEAD(&info->swaplist);
+ if (sbinfo->noswap)
+ mapping_set_unevictable(inode->i_mapping);
+ simple_xattrs_init(&info->xattrs);
+ cache_no_acl(inode);
+ mapping_set_large_folios(inode->i_mapping);
+
+ switch (mode & S_IFMT) {
+ default:
+ inode->i_op = &shmem_special_inode_operations;
+ init_special_inode(inode, mode, dev);
+ break;
+ case S_IFREG:
+ inode->i_mapping->a_ops = &shmem_aops;
+ inode->i_op = &shmem_inode_operations;
+ inode->i_fop = &shmem_file_operations;
+ mpol_shared_policy_init(&info->policy,
+ shmem_get_sbmpol(sbinfo));
+ break;
+ case S_IFDIR:
+ inc_nlink(inode);
+ /* Some things misbehave if size == 0 on a directory */
+ inode->i_size = 2 * BOGO_DIRENT_SIZE;
+ inode->i_op = &shmem_dir_inode_operations;
+ inode->i_fop = &simple_offset_dir_operations;
+ simple_offset_init(shmem_get_offset_ctx(inode));
+ break;
+ case S_IFLNK:
+ /*
+ * Must not load anything in the rbtree,
+ * mpol_free_shared_policy will not be called.
+ */
+ mpol_shared_policy_init(&info->policy, NULL);
+ break;
+ }
+
+ lockdep_annotate_inode_mutex_key(inode);
+ return inode;
+}
- lockdep_annotate_inode_mutex_key(inode);
- } else
- shmem_free_inode(sb);
+#ifdef CONFIG_TMPFS_QUOTA
+static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
+ struct super_block *sb, struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
+{
+ int err;
+ struct inode *inode;
+
+ inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
+ if (IS_ERR(inode))
+ return inode;
+
+ err = dquot_initialize(inode);
+ if (err)
+ goto errout;
+
+ err = dquot_alloc_inode(inode);
+ if (err) {
+ dquot_drop(inode);
+ goto errout;
+ }
return inode;
+
+errout:
+ inode->i_flags |= S_NOQUOTA;
+ iput(inode);
+ return ERR_PTR(err);
+}
+#else
+static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
+ struct super_block *sb, struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
+{
+ return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
}
+#endif /* CONFIG_TMPFS_QUOTA */
#ifdef CONFIG_USERFAULTFD
int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
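shmem_get_inode() now reports failure as an ERR_PTR instead of NULL, which lets
the quota-enabled wrapper distinguish -ENOSPC from -EDQUOT and other errors.
Every caller switches to the IS_ERR()/PTR_ERR() idiom, as the later hunks show:

    inode = shmem_get_inode(idmap, sb, dir, mode, dev, VM_NORESERVE);
    if (IS_ERR(inode))
            return PTR_ERR(inode);
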
@@ -2445,7 +2565,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
int ret;
pgoff_t max_off;
- if (!shmem_inode_acct_block(inode, 1)) {
+ if (shmem_inode_acct_block(inode, 1)) {
/*
* We may have got a page, returned -ENOENT triggering a retry,
* and now we find ourselves with -ENOMEM. Release the page, to
@@ -2527,12 +2647,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
if (ret)
goto out_delete_from_cache;
- spin_lock_irq(&info->lock);
- info->alloced++;
- inode->i_blocks += BLOCKS_PER_PAGE;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
-
+ shmem_recalc_inode(inode, 1, 0);
folio_unlock(folio);
return 0;
out_delete_from_cache:
@@ -2731,6 +2846,28 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval ? retval : error;
}
+static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto unlock;
+ ret = file_remove_privs(file);
+ if (ret)
+ goto unlock;
+ ret = file_update_time(file);
+ if (ret)
+ goto unlock;
+ ret = generic_perform_write(iocb, from);
+unlock:
+ inode_unlock(inode);
+ return ret;
+}
+
static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
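tmpfs grows its own ->write_iter so writes run under inode_lock with the usual
generic_write_checks()/file_remove_privs()/file_update_time() preamble, and
shmem_file_open() sets FMODE_CAN_ODIRECT so O_DIRECT opens are no longer
rejected outright. A sketch of the gate this flag satisfies, roughly what the
generic VFS open path checks (not tmpfs code):

    /* roughly what the VFS checks on open(O_DIRECT) */
    if ((file->f_flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT))
            return -EINVAL;
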
@@ -3055,7 +3192,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
}
if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
- buf->f_ffree = sbinfo->free_inodes;
+ buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
}
/* else leave those fields 0 like simple_statfs */
@@ -3072,27 +3209,32 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
- int error = -ENOSPC;
+ int error;
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
- if (inode) {
- error = simple_acl_create(dir, inode);
- if (error)
- goto out_iput;
- error = security_inode_init_security(inode, dir,
- &dentry->d_name,
- shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP)
- goto out_iput;
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- error = 0;
- dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_ctime = dir->i_mtime = current_time(dir);
- inode_inc_iversion(dir);
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
- }
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
+ error = security_inode_init_security(inode, dir,
+ &dentry->d_name,
+ shmem_initxattrs, NULL);
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+
+ error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
+ if (error)
+ goto out_iput;
+
+ dir->i_size += BOGO_DIRENT_SIZE;
+ dir->i_mtime = inode_set_ctime_current(dir);
+ inode_inc_iversion(dir);
+ d_instantiate(dentry, inode);
+ dget(dentry); /* Extra count - pin the dentry in core */
return error;
+
out_iput:
iput(inode);
return error;
@@ -3103,20 +3245,26 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct file *file, umode_t mode)
{
struct inode *inode;
- int error = -ENOSPC;
+ int error;
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
- if (inode) {
- error = security_inode_init_security(inode, dir,
- NULL,
- shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP)
- goto out_iput;
- error = simple_acl_create(dir, inode);
- if (error)
- goto out_iput;
- d_tmpfile(file, inode);
+
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
+ goto err_out;
}
+
+ error = security_inode_init_security(inode, dir,
+ NULL,
+ shmem_initxattrs, NULL);
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
+ d_tmpfile(file, inode);
+
+err_out:
return finish_open_simple(file, error);
out_iput:
iput(inode);
@@ -3162,8 +3310,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
goto out;
}
+ ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
+ if (ret) {
+ if (inode->i_nlink)
+ shmem_free_inode(inode->i_sb, 0);
+ goto out;
+ }
+
dir->i_size += BOGO_DIRENT_SIZE;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
inode_inc_iversion(dir);
inc_nlink(inode);
ihold(inode); /* New dentry reference */
@@ -3178,10 +3334,13 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = d_inode(dentry);
if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
- shmem_free_inode(inode->i_sb);
+ shmem_free_inode(inode->i_sb, 0);
+
+ simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
dir->i_size -= BOGO_DIRENT_SIZE;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
inode_inc_iversion(dir);
drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - this does all the work */
@@ -3238,24 +3397,29 @@ static int shmem_rename2(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = S_ISDIR(inode->i_mode);
+ int error;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE)
- return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
+ return simple_offset_rename_exchange(old_dir, old_dentry,
+ new_dir, new_dentry);
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
if (flags & RENAME_WHITEOUT) {
- int error;
-
error = shmem_whiteout(idmap, old_dir, old_dentry);
if (error)
return error;
}
+ simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry);
+ error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry);
+ if (error)
+ return error;
+
if (d_really_is_positive(new_dentry)) {
(void) shmem_unlink(new_dir, new_dentry);
if (they_are_dirs) {
@@ -3269,9 +3433,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
old_dir->i_size -= BOGO_DIRENT_SIZE;
new_dir->i_size += BOGO_DIRENT_SIZE;
- old_dir->i_ctime = old_dir->i_mtime =
- new_dir->i_ctime = new_dir->i_mtime =
- inode->i_ctime = current_time(old_dir);
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
return 0;
@@ -3291,31 +3453,32 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
VM_NORESERVE);
- if (!inode)
- return -ENOSPC;
+
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
error = security_inode_init_security(inode, dir, &dentry->d_name,
shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+
+ error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
+ if (error)
+ goto out_iput;
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
inode->i_link = kmemdup(symname, len, GFP_KERNEL);
if (!inode->i_link) {
- iput(inode);
- return -ENOMEM;
+ error = -ENOMEM;
+ goto out_remove_offset;
}
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
- if (error) {
- iput(inode);
- return error;
- }
+ if (error)
+ goto out_remove_offset;
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
memcpy(folio_address(folio), symname, len);
@@ -3325,11 +3488,17 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
folio_put(folio);
}
dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry);
return 0;
+
+out_remove_offset:
+ simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
+out_iput:
+ iput(inode);
+ return error;
}
static void shmem_put_link(void *arg)
@@ -3397,7 +3566,7 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap,
(fa->flags & SHMEM_FL_USER_MODIFIABLE);
shmem_set_inode_flags(inode, info->fsflags);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
return 0;
}
@@ -3417,21 +3586,40 @@ static int shmem_initxattrs(struct inode *inode,
void *fs_info)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
const struct xattr *xattr;
struct simple_xattr *new_xattr;
+ size_t ispace = 0;
size_t len;
+ if (sbinfo->max_inodes) {
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ ispace += simple_xattr_space(xattr->name,
+ xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
+ }
+ if (ispace) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_ispace < ispace)
+ ispace = 0;
+ else
+ sbinfo->free_ispace -= ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ if (!ispace)
+ return -ENOSPC;
+ }
+ }
+
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
if (!new_xattr)
- return -ENOMEM;
+ break;
len = strlen(xattr->name) + 1;
new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!new_xattr->name) {
kvfree(new_xattr);
- return -ENOMEM;
+ break;
}
memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
@@ -3442,6 +3630,16 @@ static int shmem_initxattrs(struct inode *inode,
simple_xattr_add(&info->xattrs, new_xattr);
}
+ if (xattr->name != NULL) {
+ if (ispace) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_ispace += ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ }
+ simple_xattrs_free(&info->xattrs, NULL);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -3462,15 +3660,40 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
size_t size, int flags)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- int err;
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct simple_xattr *old_xattr;
+ size_t ispace = 0;
name = xattr_full_name(handler, name);
- err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
- if (!err) {
- inode->i_ctime = current_time(inode);
+ if (value && sbinfo->max_inodes) {
+ ispace = simple_xattr_space(name, size);
+ raw_spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_ispace < ispace)
+ ispace = 0;
+ else
+ sbinfo->free_ispace -= ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ if (!ispace)
+ return -ENOSPC;
+ }
+
+ old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags);
+ if (!IS_ERR(old_xattr)) {
+ ispace = 0;
+ if (old_xattr && sbinfo->max_inodes)
+ ispace = simple_xattr_space(old_xattr->name,
+ old_xattr->size);
+ simple_xattr_free(old_xattr);
+ old_xattr = NULL;
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
}
- return err;
+ if (ispace) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_ispace += ispace;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ }
+ return PTR_ERR(old_xattr);
}
static const struct xattr_handler shmem_security_xattr_handler = {
@@ -3485,9 +3708,16 @@ static const struct xattr_handler shmem_trusted_xattr_handler = {
.set = shmem_xattr_handler_set,
};
+static const struct xattr_handler shmem_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = shmem_xattr_handler_get,
+ .set = shmem_xattr_handler_set,
+};
+
static const struct xattr_handler *shmem_xattr_handlers[] = {
&shmem_security_xattr_handler,
&shmem_trusted_xattr_handler,
+ &shmem_user_xattr_handler,
NULL
};
@@ -3500,6 +3730,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
static const struct inode_operations shmem_short_symlink_operations = {
.getattr = shmem_getattr,
+ .setattr = shmem_setattr,
.get_link = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3508,6 +3739,7 @@ static const struct inode_operations shmem_short_symlink_operations = {
static const struct inode_operations shmem_symlink_inode_operations = {
.getattr = shmem_getattr,
+ .setattr = shmem_setattr,
.get_link = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3607,6 +3839,13 @@ enum shmem_param {
Opt_inode32,
Opt_inode64,
Opt_noswap,
+ Opt_quota,
+ Opt_usrquota,
+ Opt_grpquota,
+ Opt_usrquota_block_hardlimit,
+ Opt_usrquota_inode_hardlimit,
+ Opt_grpquota_block_hardlimit,
+ Opt_grpquota_inode_hardlimit,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -3629,6 +3868,15 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_flag ("inode32", Opt_inode32),
fsparam_flag ("inode64", Opt_inode64),
fsparam_flag ("noswap", Opt_noswap),
+#ifdef CONFIG_TMPFS_QUOTA
+ fsparam_flag ("quota", Opt_quota),
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
+ fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
+ fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
+ fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
+#endif
{}
};
@@ -3639,6 +3887,8 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
unsigned long long size;
char *rest;
int opt;
+ kuid_t kuid;
+ kgid_t kgid;
opt = fs_parse(fc, shmem_fs_parameters, param, &result);
if (opt < 0)
@@ -3660,13 +3910,13 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_nr_blocks:
ctx->blocks = memparse(param->string, &rest);
- if (*rest || ctx->blocks > S64_MAX)
+ if (*rest || ctx->blocks > LONG_MAX)
goto bad_value;
ctx->seen |= SHMEM_SEEN_BLOCKS;
break;
case Opt_nr_inodes:
ctx->inodes = memparse(param->string, &rest);
- if (*rest)
+ if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
goto bad_value;
ctx->seen |= SHMEM_SEEN_INODES;
break;
@@ -3674,14 +3924,32 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
ctx->mode = result.uint_32 & 07777;
break;
case Opt_uid:
- ctx->uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(ctx->uid))
+ kuid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(kuid))
+ goto bad_value;
+
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fc->user_ns, kuid))
goto bad_value;
+
+ ctx->uid = kuid;
break;
case Opt_gid:
- ctx->gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(ctx->gid))
+ kgid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(kgid))
+ goto bad_value;
+
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fc->user_ns, kgid))
goto bad_value;
+
+ ctx->gid = kgid;
break;
case Opt_huge:
ctx->huge = result.uint_32;
@@ -3720,6 +3988,60 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
ctx->noswap = true;
ctx->seen |= SHMEM_SEEN_NOSWAP;
break;
+ case Opt_quota:
+ if (fc->user_ns != &init_user_ns)
+ return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
+ ctx->seen |= SHMEM_SEEN_QUOTA;
+ ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
+ break;
+ case Opt_usrquota:
+ if (fc->user_ns != &init_user_ns)
+ return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
+ ctx->seen |= SHMEM_SEEN_QUOTA;
+ ctx->quota_types |= QTYPE_MASK_USR;
+ break;
+ case Opt_grpquota:
+ if (fc->user_ns != &init_user_ns)
+ return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
+ ctx->seen |= SHMEM_SEEN_QUOTA;
+ ctx->quota_types |= QTYPE_MASK_GRP;
+ break;
+ case Opt_usrquota_block_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
+ return invalfc(fc,
+ "User quota block hardlimit too large.");
+ ctx->qlimits.usrquota_bhardlimit = size;
+ break;
+ case Opt_grpquota_block_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
+ return invalfc(fc,
+ "Group quota block hardlimit too large.");
+ ctx->qlimits.grpquota_bhardlimit = size;
+ break;
+ case Opt_usrquota_inode_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
+ return invalfc(fc,
+ "User quota inode hardlimit too large.");
+ ctx->qlimits.usrquota_ihardlimit = size;
+ break;
+ case Opt_grpquota_inode_hardlimit:
+ size = memparse(param->string, &rest);
+ if (*rest || !size)
+ goto bad_value;
+ if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
+ return invalfc(fc,
+ "Group quota inode hardlimit too large.");
+ ctx->qlimits.grpquota_ihardlimit = size;
+ break;
}
return 0;
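All four hardlimit options are parsed with memparse(), so the usual size
suffixes work on the mount command line (e.g. usrquota_block_hardlimit=1G). A
tiny sketch of the helper's behaviour:

    unsigned long long size;
    char *rest;

    size = memparse("1G", &rest);   /* size == 1073741824, *rest == '\0' */
    /* trailing junk would leave *rest != '\0', hence the goto bad_value */
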
@@ -3775,21 +4097,17 @@ static int shmem_parse_options(struct fs_context *fc, void *data)
/*
* Reconfigure a shmem filesystem.
- *
- * Note that we disallow change from limited->unlimited blocks/inodes while any
- * are in use; but we must separately disallow unlimited->limited, because in
- * that case we have no record of how much is already in use.
*/
static int shmem_reconfigure(struct fs_context *fc)
{
struct shmem_options *ctx = fc->fs_private;
struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
- unsigned long inodes;
+ unsigned long used_isp;
struct mempolicy *mpol = NULL;
const char *err;
raw_spin_lock(&sbinfo->stat_lock);
- inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+ used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
if (!sbinfo->max_blocks) {
@@ -3807,7 +4125,7 @@ static int shmem_reconfigure(struct fs_context *fc)
err = "Cannot retroactively limit inodes";
goto out;
}
- if (ctx->inodes < inodes) {
+ if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
err = "Too few inodes for current use";
goto out;
}
@@ -3827,6 +4145,24 @@ static int shmem_reconfigure(struct fs_context *fc)
goto out;
}
+ if (ctx->seen & SHMEM_SEEN_QUOTA &&
+ !sb_any_quota_loaded(fc->root->d_sb)) {
+ err = "Cannot enable quota on remount";
+ goto out;
+ }
+
+#ifdef CONFIG_TMPFS_QUOTA
+#define CHANGED_LIMIT(name) \
+ (ctx->qlimits.name## hardlimit && \
+ (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))
+
+ if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
+ CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
+ err = "Cannot change global quota limit on remount";
+ goto out;
+ }
+#endif /* CONFIG_TMPFS_QUOTA */
+
if (ctx->seen & SHMEM_SEEN_HUGE)
sbinfo->huge = ctx->huge;
if (ctx->seen & SHMEM_SEEN_INUMS)
@@ -3835,7 +4171,7 @@ static int shmem_reconfigure(struct fs_context *fc)
sbinfo->max_blocks = ctx->blocks;
if (ctx->seen & SHMEM_SEEN_INODES) {
sbinfo->max_inodes = ctx->inodes;
- sbinfo->free_inodes = ctx->inodes - inodes;
+ sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
}
/*
@@ -3918,6 +4254,9 @@ static void shmem_put_super(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+#ifdef CONFIG_TMPFS_QUOTA
+ shmem_disable_quotas(sb);
+#endif
free_percpu(sbinfo->ino_batch);
percpu_counter_destroy(&sbinfo->used_blocks);
mpol_put(sbinfo->mpol);
@@ -3930,12 +4269,13 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
struct shmem_options *ctx = fc->fs_private;
struct inode *inode;
struct shmem_sb_info *sbinfo;
+ int error = -ENOMEM;
/* Round up to L1_CACHE_BYTES to resist false sharing */
sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
L1_CACHE_BYTES), GFP_KERNEL);
if (!sbinfo)
- return -ENOMEM;
+ return error;
sb->s_fs_info = sbinfo;
@@ -3962,7 +4302,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOUSER;
#endif
sbinfo->max_blocks = ctx->blocks;
- sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+ sbinfo->max_inodes = ctx->inodes;
+ sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
if (sb->s_flags & SB_KERNMOUNT) {
sbinfo->ino_batch = alloc_percpu(ino_t);
if (!sbinfo->ino_batch)
@@ -3996,10 +4337,27 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#endif
uuid_gen(&sb->s_uuid);
+#ifdef CONFIG_TMPFS_QUOTA
+ if (ctx->seen & SHMEM_SEEN_QUOTA) {
+ sb->dq_op = &shmem_quota_operations;
+ sb->s_qcop = &dquot_quotactl_sysfile_ops;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+
+ /* Copy the default limits from ctx into sbinfo */
+ memcpy(&sbinfo->qlimits, &ctx->qlimits,
+ sizeof(struct shmem_quota_limits));
+
+ if (shmem_enable_quotas(sb, ctx->quota_types))
+ goto failed;
+ }
+#endif /* CONFIG_TMPFS_QUOTA */
+
inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
VM_NORESERVE);
- if (!inode)
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
goto failed;
+ }
inode->i_uid = sbinfo->uid;
inode->i_gid = sbinfo->gid;
sb->s_root = d_make_root(inode);
@@ -4009,7 +4367,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
failed:
shmem_put_super(sb);
- return -ENOMEM;
+ return error;
}
static int shmem_get_tree(struct fs_context *fc)
@@ -4059,6 +4417,8 @@ static void shmem_destroy_inode(struct inode *inode)
{
if (S_ISREG(inode->i_mode))
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
+ if (S_ISDIR(inode->i_mode))
+ simple_offset_destroy(shmem_get_offset_ctx(inode));
}
static void shmem_init_inode(void *foo)
@@ -4102,12 +4462,12 @@ EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
- .open = generic_file_open,
+ .open = shmem_file_open,
.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
.read_iter = shmem_file_read_iter,
- .write_iter = generic_file_write_iter,
+ .write_iter = shmem_file_write_iter,
.fsync = noop_fsync,
.splice_read = shmem_file_splice_read,
.splice_write = iter_file_splice_write,
@@ -4139,6 +4499,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
.mknod = shmem_mknod,
.rename = shmem_rename2,
.tmpfile = shmem_tmpfile,
+ .get_offset_ctx = shmem_get_offset_ctx,
#endif
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -4170,6 +4531,9 @@ static const struct super_operations shmem_ops = {
.statfs = shmem_statfs,
.show_options = shmem_show_options,
#endif
+#ifdef CONFIG_TMPFS_QUOTA
+ .get_dquots = shmem_get_dquots,
+#endif
.evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
@@ -4223,7 +4587,7 @@ static struct file_system_type shmem_fs_type = {
#endif
.kill_sb = kill_litter_super,
#ifdef CONFIG_SHMEM
- .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
+ .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
#else
.fs_flags = FS_USERNS_MOUNT,
#endif
@@ -4235,6 +4599,14 @@ void __init shmem_init(void)
shmem_init_inodecache();
+#ifdef CONFIG_TMPFS_QUOTA
+ error = register_quota_format(&shmem_quota_format);
+ if (error < 0) {
+ pr_err("Could not register quota format\n");
+ goto out3;
+ }
+#endif
+
error = register_filesystem(&shmem_fs_type);
if (error) {
pr_err("Could not register tmpfs\n");
@@ -4259,6 +4631,10 @@ void __init shmem_init(void)
out1:
unregister_filesystem(&shmem_fs_type);
out2:
+#ifdef CONFIG_TMPFS_QUOTA
+ unregister_quota_format(&shmem_quota_format);
+out3:
+#endif
shmem_destroy_inodecache();
shm_mnt = ERR_PTR(error);
}
@@ -4378,10 +4754,16 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
#define shmem_vm_ops generic_file_vm_ops
#define shmem_anon_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
-#define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
+static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
+{
+ struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
+ return inode ? inode : ERR_PTR(-ENOSPC);
+}
+
#endif /* CONFIG_SHMEM */
/* common code */
@@ -4406,9 +4788,10 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
- if (unlikely(!inode)) {
+
+ if (IS_ERR(inode)) {
shmem_unacct_size(flags, size);
- return ERR_PTR(-ENOSPC);
+ return ERR_CAST(inode);
}
inode->i_flags |= i_flags;
inode->i_size = size;
diff --git a/mm/shmem_quota.c b/mm/shmem_quota.c
new file mode 100644
index 000000000000..062d1c1097ae
--- /dev/null
+++ b/mm/shmem_quota.c
@@ -0,0 +1,350 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * The in-memory quota format relies on the quota infrastructure to store
+ * dquot information for us. While conventional quota formats for file
+ * systems with persistent storage can load quota information into dquots
+ * from storage on demand (and hence the dquot shrinker can free any dquot
+ * that is not currently being used), that must be avoided here. Otherwise
+ * we could lose valuable information, the user-provided limits, because
+ * there is no persistent storage to reload the information from afterwards.
+ *
+ * One piece of information that the in-memory quota format needs to keep
+ * track of is a sorted list of ids for each quota type. This is done with
+ * an rb tree whose root is stored in mem_dqinfo->dqi_priv for each quota
+ * type.
+ *
+ * This format can be used to support quota on file systems without persistent
+ * storage, such as tmpfs.
+ *
+ * Author: Lukas Czerner <lczerner@redhat.com>
+ * Carlos Maiolino <cmaiolino@redhat.com>
+ *
+ * Copyright (C) 2023 Red Hat, Inc.
+ */
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
+
+#include <linux/quotaops.h>
+#include <linux/quota.h>
+
+#ifdef CONFIG_TMPFS_QUOTA
+
+/*
+ * The following constants define the amount of time given a user
+ * before the soft limits are treated as hard limits (usually resulting
+ * in an allocation failure). The timer is started when the user crosses
+ * their soft limit; it is reset when they go below their soft limit.
+ */
+#define SHMEM_MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
+#define SHMEM_MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
+
+struct quota_id {
+ struct rb_node node;
+ qid_t id;
+ qsize_t bhardlimit;
+ qsize_t bsoftlimit;
+ qsize_t ihardlimit;
+ qsize_t isoftlimit;
+};
+
+static int shmem_check_quota_file(struct super_block *sb, int type)
+{
+ /* There is no real quota file, nothing to do */
+ return 1;
+}
+
+/*
+ * There is no real quota file. Just allocate rb_root for quota ids and
+ * set limits
+ */
+static int shmem_read_file_info(struct super_block *sb, int type)
+{
+ struct quota_info *dqopt = sb_dqopt(sb);
+ struct mem_dqinfo *info = &dqopt->info[type];
+
+ info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
+ if (!info->dqi_priv)
+ return -ENOMEM;
+
+ info->dqi_max_spc_limit = SHMEM_QUOTA_MAX_SPC_LIMIT;
+ info->dqi_max_ino_limit = SHMEM_QUOTA_MAX_INO_LIMIT;
+
+ info->dqi_bgrace = SHMEM_MAX_DQ_TIME;
+ info->dqi_igrace = SHMEM_MAX_IQ_TIME;
+ info->dqi_flags = 0;
+
+ return 0;
+}
+
+static int shmem_write_file_info(struct super_block *sb, int type)
+{
+ /* There is no real quota file, nothing to do */
+ return 0;
+}
+
+/*
+ * Free all the quota_id entries in the rb tree and rb_root.
+ */
+static int shmem_free_file_info(struct super_block *sb, int type)
+{
+ struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
+ struct rb_root *root = info->dqi_priv;
+ struct quota_id *entry;
+ struct rb_node *node;
+
+ info->dqi_priv = NULL;
+ node = rb_first(root);
+ while (node) {
+ entry = rb_entry(node, struct quota_id, node);
+ node = rb_next(&entry->node);
+
+ rb_erase(&entry->node, root);
+ kfree(entry);
+ }
+
+ kfree(root);
+ return 0;
+}
+
+static int shmem_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, qid->type);
+ struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
+ qid_t id = from_kqid(&init_user_ns, *qid);
+ struct quota_info *dqopt = sb_dqopt(sb);
+ struct quota_id *entry = NULL;
+ int ret = 0;
+
+ if (!sb_has_quota_active(sb, qid->type))
+ return -ESRCH;
+
+ down_read(&dqopt->dqio_sem);
+ while (node) {
+ entry = rb_entry(node, struct quota_id, node);
+
+ if (id < entry->id)
+ node = node->rb_left;
+ else if (id > entry->id)
+ node = node->rb_right;
+ else
+ goto got_next_id;
+ }
+
+ if (!entry) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ if (id > entry->id) {
+ node = rb_next(&entry->node);
+ if (!node) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ entry = rb_entry(node, struct quota_id, node);
+ }
+
+got_next_id:
+ *qid = make_kqid(&init_user_ns, qid->type, entry->id);
+out_unlock:
+ up_read(&dqopt->dqio_sem);
+ return ret;
+}
+
+/*
+ * Load dquot with limits from existing entry, or create the new entry if
+ * it does not exist.
+ */
+static int shmem_acquire_dquot(struct dquot *dquot)
+{
+ struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
+ struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
+ struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info;
+ struct rb_node *parent = NULL, *new_node = NULL;
+ struct quota_id *new_entry, *entry;
+ qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
+ struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+ int ret = 0;
+
+ mutex_lock(&dquot->dq_lock);
+
+ down_write(&dqopt->dqio_sem);
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct quota_id, node);
+
+ if (id < entry->id)
+ n = &(*n)->rb_left;
+ else if (id > entry->id)
+ n = &(*n)->rb_right;
+ else
+ goto found;
+ }
+
+	/* We don't have an entry for this id yet, create it */
+ new_entry = kzalloc(sizeof(struct quota_id), GFP_NOFS);
+ if (!new_entry) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ new_entry->id = id;
+ if (dquot->dq_id.type == USRQUOTA) {
+ new_entry->bhardlimit = sbinfo->qlimits.usrquota_bhardlimit;
+ new_entry->ihardlimit = sbinfo->qlimits.usrquota_ihardlimit;
+ } else if (dquot->dq_id.type == GRPQUOTA) {
+ new_entry->bhardlimit = sbinfo->qlimits.grpquota_bhardlimit;
+ new_entry->ihardlimit = sbinfo->qlimits.grpquota_ihardlimit;
+ }
+
+ new_node = &new_entry->node;
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, (struct rb_root *)info->dqi_priv);
+ entry = new_entry;
+
+found:
+ /* Load the stored limits from the tree */
+ spin_lock(&dquot->dq_dqb_lock);
+ dquot->dq_dqb.dqb_bhardlimit = entry->bhardlimit;
+ dquot->dq_dqb.dqb_bsoftlimit = entry->bsoftlimit;
+ dquot->dq_dqb.dqb_ihardlimit = entry->ihardlimit;
+ dquot->dq_dqb.dqb_isoftlimit = entry->isoftlimit;
+
+ if (!dquot->dq_dqb.dqb_bhardlimit &&
+ !dquot->dq_dqb.dqb_bsoftlimit &&
+ !dquot->dq_dqb.dqb_ihardlimit &&
+ !dquot->dq_dqb.dqb_isoftlimit)
+ set_bit(DQ_FAKE_B, &dquot->dq_flags);
+ spin_unlock(&dquot->dq_dqb_lock);
+
+ /* Make sure flags update is visible after dquot has been filled */
+ smp_mb__before_atomic();
+ set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+out_unlock:
+ up_write(&dqopt->dqio_sem);
+ mutex_unlock(&dquot->dq_lock);
+ return ret;
+}
+
+static bool shmem_is_empty_dquot(struct dquot *dquot)
+{
+ struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info;
+ qsize_t bhardlimit;
+ qsize_t ihardlimit;
+
+ if (dquot->dq_id.type == USRQUOTA) {
+ bhardlimit = sbinfo->qlimits.usrquota_bhardlimit;
+ ihardlimit = sbinfo->qlimits.usrquota_ihardlimit;
+ } else if (dquot->dq_id.type == GRPQUOTA) {
+ bhardlimit = sbinfo->qlimits.grpquota_bhardlimit;
+ ihardlimit = sbinfo->qlimits.grpquota_ihardlimit;
+ }
+
+ if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
+ (dquot->dq_dqb.dqb_curspace == 0 &&
+ dquot->dq_dqb.dqb_curinodes == 0 &&
+ dquot->dq_dqb.dqb_bhardlimit == bhardlimit &&
+ dquot->dq_dqb.dqb_ihardlimit == ihardlimit))
+ return true;
+
+ return false;
+}
+
+/*
+ * Store the limits from the dquot in the tree unless it is fake. If it
+ * is fake, remove the id from the tree since it holds no useful
+ * information.
+ */
+static int shmem_release_dquot(struct dquot *dquot)
+{
+ struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
+ struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
+ qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
+ struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+ struct quota_id *entry = NULL;
+
+ mutex_lock(&dquot->dq_lock);
+ /* Check whether we are not racing with some other dqget() */
+ if (dquot_is_busy(dquot))
+ goto out_dqlock;
+
+ down_write(&dqopt->dqio_sem);
+ while (node) {
+ entry = rb_entry(node, struct quota_id, node);
+
+ if (id < entry->id)
+ node = node->rb_left;
+ else if (id > entry->id)
+ node = node->rb_right;
+ else
+ goto found;
+ }
+
+ /* We should always find the entry in the rb tree */
+ WARN_ONCE(1, "quota id %u from dquot %p, not in rb tree!\n", id, dquot);
+ up_write(&dqopt->dqio_sem);
+ mutex_unlock(&dquot->dq_lock);
+ return -ENOENT;
+
+found:
+ if (shmem_is_empty_dquot(dquot)) {
+ /* Remove entry from the tree */
+ rb_erase(&entry->node, info->dqi_priv);
+ kfree(entry);
+ } else {
+ /* Store the limits in the tree */
+ spin_lock(&dquot->dq_dqb_lock);
+ entry->bhardlimit = dquot->dq_dqb.dqb_bhardlimit;
+ entry->bsoftlimit = dquot->dq_dqb.dqb_bsoftlimit;
+ entry->ihardlimit = dquot->dq_dqb.dqb_ihardlimit;
+ entry->isoftlimit = dquot->dq_dqb.dqb_isoftlimit;
+ spin_unlock(&dquot->dq_dqb_lock);
+ }
+
+ clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+ up_write(&dqopt->dqio_sem);
+
+out_dqlock:
+ mutex_unlock(&dquot->dq_lock);
+ return 0;
+}
+
+static int shmem_mark_dquot_dirty(struct dquot *dquot)
+{
+ return 0;
+}
+
+static int shmem_dquot_write_info(struct super_block *sb, int type)
+{
+ return 0;
+}
+
+static const struct quota_format_ops shmem_format_ops = {
+ .check_quota_file = shmem_check_quota_file,
+ .read_file_info = shmem_read_file_info,
+ .write_file_info = shmem_write_file_info,
+ .free_file_info = shmem_free_file_info,
+};
+
+struct quota_format_type shmem_quota_format = {
+ .qf_fmt_id = QFMT_SHMEM,
+ .qf_ops = &shmem_format_ops,
+ .qf_owner = THIS_MODULE
+};
+
+const struct dquot_operations shmem_quota_operations = {
+ .acquire_dquot = shmem_acquire_dquot,
+ .release_dquot = shmem_release_dquot,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
+ .write_info = shmem_dquot_write_info,
+ .mark_dirty = shmem_mark_dquot_dirty,
+ .get_next_id = shmem_get_next_id,
+};
+#endif /* CONFIG_TMPFS_QUOTA */
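
The same iterative rb-tree descent appears three times above (in
shmem_get_next_id(), shmem_acquire_dquot() and shmem_release_dquot()). A
minimal sketch of a shared lookup helper, assuming the caller already holds
dqio_sem; illustrative only, not part of the patch:

    static struct quota_id *shmem_find_quota_id(struct rb_root *root, qid_t id)
    {
            struct rb_node *node = root->rb_node;

            while (node) {
                    struct quota_id *entry = rb_entry(node, struct quota_id, node);

                    if (id < entry->id)
                            node = node->rb_left;
                    else if (id > entry->id)
                            node = node->rb_right;
                    else
                            return entry;
            }
            return NULL;
    }
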
diff --git a/mm/truncate.c b/mm/truncate.c
index 95d1291d269b..c3320e66d6ea 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -657,11 +657,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
folio_lock(folio);
- VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
- if (folio->mapping != mapping) {
+ if (unlikely(folio->mapping != mapping)) {
folio_unlock(folio);
continue;
}
+ VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
folio_wait_writeback(folio);
if (folio_mapped(folio))
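
The reordering works because folio->index is only stable while the folio is
locked and still attached to the mapping; asserting containment before the
mapping recheck could fire on a folio that truncation had just removed. A
minimal sketch of the lookup/lock/revalidate order, assuming
filemap_get_folio() as the lookup (simplified, not from the patch):

    static struct folio *lock_attached_folio(struct address_space *mapping,
                                             pgoff_t index)
    {
            struct folio *folio = filemap_get_folio(mapping, index);

            if (IS_ERR(folio))
                    return NULL;
            folio_lock(folio);
            if (folio->mapping != mapping) {
                    /* Raced with truncation/invalidation. */
                    folio_unlock(folio);
                    folio_put(folio);
                    return NULL;
            }
            /* Only now is the containment check meaningful. */
            VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
            return folio;
    }
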
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 93cf99aba335..228a4a5312f2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2979,6 +2979,10 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
free_vm_area(area);
return NULL;
}
+
+ flush_cache_vmap((unsigned long)area->addr,
+ (unsigned long)area->addr + count * PAGE_SIZE);
+
return area->addr;
}
EXPORT_SYMBOL_GPL(vmap_pfn);
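
flush_cache_vmap() is a no-op on most architectures but mandatory where
caches are virtually indexed: the new alias must be made coherent before the
caller dereferences it. A hedged usage sketch of vmap_pfn(); the PFN
arguments are caller-provided assumptions:

    static void *map_two_pfns(unsigned long pfn_a, unsigned long pfn_b)
    {
            unsigned long pfns[2] = { pfn_a, pfn_b };

            /* The returned alias is cache-coherent thanks to the
             * flush_cache_vmap() added above; unmap with vunmap(). */
            return vmap_pfn(pfns, ARRAY_SIZE(pfns), PAGE_KERNEL);
    }
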
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1080209a568b..2fe4a11d63f4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4284,6 +4284,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
static const struct mm_walk_ops mm_walk_ops = {
.test_walk = should_skip_vma,
.p4d_entry = walk_pud_range,
+ .walk_lock = PGWALK_RDLOCK,
};
int err;
@@ -4853,16 +4854,17 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg)
spin_lock_irq(&pgdat->memcg_lru.lock);
- VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+ if (hlist_nulls_unhashed(&lruvec->lrugen.list))
+ goto unlock;
gen = lruvec->lrugen.gen;
- hlist_nulls_del_rcu(&lruvec->lrugen.list);
+ hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
pgdat->memcg_lru.nr_memcgs[gen]--;
if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
-
+unlock:
spin_unlock_irq(&pgdat->memcg_lru.lock);
}
}
@@ -5434,8 +5436,10 @@ restart:
rcu_read_lock();
hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
- if (op)
+ if (op) {
lru_gen_rotate_memcg(lruvec, op);
+ op = 0;
+ }
mem_cgroup_put(memcg);
@@ -5443,7 +5447,7 @@ restart:
memcg = lruvec_memcg(lruvec);
if (!mem_cgroup_tryget(memcg)) {
- op = 0;
+ lru_gen_release_memcg(memcg);
memcg = NULL;
continue;
}
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index acff565849ae..1d704574e6bf 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -505,7 +505,7 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
struct batadv_elp_packet *elp_packet;
struct batadv_hard_iface *primary_if;
- struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ struct ethhdr *ethhdr;
bool res;
int ret = NET_RX_DROP;
@@ -513,6 +513,7 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
if (!res)
goto free_skb;
+ ethhdr = eth_hdr(skb);
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
goto free_skb;
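
Both batman-adv receive paths cached the Ethernet header pointer before the
validation helper ran; batadv_check_management_packet() may pskb_may_pull()
and reallocate skb->head, leaving such a pointer dangling. The rule, as a
hypothetical helper (not from the patch):

    static bool src_mac_is_ours(struct batadv_priv *bat_priv, struct sk_buff *skb)
    {
            struct ethhdr *ethhdr;

            /* pskb_may_pull() may reallocate skb->head, so header
             * pointers must be taken only after the last pull. */
            if (!pskb_may_pull(skb, ETH_HLEN))
                    return false;
            ethhdr = eth_hdr(skb);
            return batadv_is_my_mac(bat_priv, ethhdr->h_source);
    }
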
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index e710e9afe78f..e503ee0d896b 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -123,8 +123,10 @@ static void batadv_v_ogm_send_to_if(struct sk_buff *skb,
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
- if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ if (hard_iface->if_status != BATADV_IF_ACTIVE) {
+ kfree_skb(skb);
return;
+ }
batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX);
batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES,
@@ -985,7 +987,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
{
struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
struct batadv_ogm2_packet *ogm_packet;
- struct ethhdr *ethhdr = eth_hdr(skb);
+ struct ethhdr *ethhdr;
int ogm_offset;
u8 *packet_pos;
int ret = NET_RX_DROP;
@@ -999,6 +1001,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN))
goto free_skb;
+ ethhdr = eth_hdr(skb);
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
goto free_skb;
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 41c1ad33d009..24c9c0c3f316 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -630,7 +630,19 @@ out:
*/
void batadv_update_min_mtu(struct net_device *soft_iface)
{
- soft_iface->mtu = batadv_hardif_min_mtu(soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ int limit_mtu;
+ int mtu;
+
+ mtu = batadv_hardif_min_mtu(soft_iface);
+
+ if (bat_priv->mtu_set_by_user)
+ limit_mtu = bat_priv->mtu_set_by_user;
+ else
+ limit_mtu = ETH_DATA_LEN;
+
+ mtu = min(mtu, limit_mtu);
+ dev_set_mtu(soft_iface, mtu);
/* Check if the local translate table should be cleaned up to match a
* new (and smaller) MTU.
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index ad5714f737be..6efbc9275aec 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -495,7 +495,10 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
attr = info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED];
atomic_set(&bat_priv->fragmentation, !!nla_get_u8(attr));
+
+ rtnl_lock();
batadv_update_min_mtu(bat_priv->soft_iface);
+ rtnl_unlock();
}
if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]) {
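
Both MTU hunks serve one locking contract: dev_set_mtu() fires netdevice
notifiers and must run under RTNL, which the netlink handler previously did
not hold. A minimal sketch of the assumed contract:

    static void batadv_set_mtu_locked(struct net_device *dev, int mtu)
    {
            rtnl_lock();
            dev_set_mtu(dev, mtu);  /* requires RTNL held */
            rtnl_unlock();
    }
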
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index d3fdf82282af..85d00dc9ce32 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -153,11 +153,14 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p)
static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu)
{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+
/* check ranges */
if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev))
return -EINVAL;
dev->mtu = new_mtu;
+ bat_priv->mtu_set_by_user = new_mtu;
return 0;
}
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 36ca31252a73..b95c36765d04 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -774,7 +774,6 @@ check_roaming:
if (roamed_back) {
batadv_tt_global_free(bat_priv, tt_global,
"Roaming canceled");
- tt_global = NULL;
} else {
/* The global entry has to be marked as ROAMING and
* has to be kept for consistency purpose
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index ca9449ec9836..cf1a0eafe3ab 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1547,6 +1547,12 @@ struct batadv_priv {
struct net_device *soft_iface;
/**
+	 * @mtu_set_by_user: MTU set by the user (0 if never set);
+	 * protected by rtnl_lock
+ */
+ int mtu_set_by_user;
+
+ /**
* @bat_counters: mesh internal traffic statistic counters (see
* batadv_counters)
*/
diff --git a/net/can/isotp.c b/net/can/isotp.c
index 99770ed28531..f02b5d3e4733 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -188,12 +188,6 @@ static bool isotp_register_rxid(struct isotp_sock *so)
return (isotp_bc_flags(so) == 0);
}
-static bool isotp_register_txecho(struct isotp_sock *so)
-{
- /* all modes but SF_BROADCAST register for tx echo skbs */
- return (isotp_bc_flags(so) != CAN_ISOTP_SF_BROADCAST);
-}
-
static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer)
{
struct isotp_sock *so = container_of(hrtimer, struct isotp_sock,
@@ -1209,7 +1203,7 @@ static int isotp_release(struct socket *sock)
lock_sock(sk);
/* remove current filters & unregister */
- if (so->bound && isotp_register_txecho(so)) {
+ if (so->bound) {
if (so->ifindex) {
struct net_device *dev;
@@ -1332,14 +1326,12 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len)
can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id),
isotp_rcv, sk, "isotp", sk);
- if (isotp_register_txecho(so)) {
- /* no consecutive frame echo skb in flight */
- so->cfecho = 0;
+ /* no consecutive frame echo skb in flight */
+ so->cfecho = 0;
- /* register for echo skb's */
- can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id),
- isotp_rcv_echo, sk, "isotpe", sk);
- }
+ /* register for echo skb's */
+ can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id),
+ isotp_rcv_echo, sk, "isotpe", sk);
dev_put(dev);
@@ -1560,7 +1552,7 @@ static void isotp_notify(struct isotp_sock *so, unsigned long msg,
case NETDEV_UNREGISTER:
lock_sock(sk);
/* remove current filters & unregister */
- if (so->bound && isotp_register_txecho(so)) {
+ if (so->bound) {
if (isotp_register_rxid(so))
can_rx_unregister(dev_net(dev), dev, so->rxid,
SINGLE_MASK(so->rxid),
diff --git a/net/can/raw.c b/net/can/raw.c
index e10f59375659..d50c3f3d892f 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -85,6 +85,7 @@ struct raw_sock {
int bound;
int ifindex;
struct net_device *dev;
+ netdevice_tracker dev_tracker;
struct list_head notifier;
int loopback;
int recv_own_msgs;
@@ -285,8 +286,10 @@ static void raw_notify(struct raw_sock *ro, unsigned long msg,
case NETDEV_UNREGISTER:
lock_sock(sk);
/* remove current filters & unregister */
- if (ro->bound)
+ if (ro->bound) {
raw_disable_allfilters(dev_net(dev), dev, sk);
+ netdev_put(dev, &ro->dev_tracker);
+ }
if (ro->count > 1)
kfree(ro->filter);
@@ -391,10 +394,12 @@ static int raw_release(struct socket *sock)
/* remove current filters & unregister */
if (ro->bound) {
- if (ro->dev)
+ if (ro->dev) {
raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk);
- else
+ netdev_put(ro->dev, &ro->dev_tracker);
+ } else {
raw_disable_allfilters(sock_net(sk), NULL, sk);
+ }
}
if (ro->count > 1)
@@ -445,10 +450,10 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
goto out;
}
if (dev->type != ARPHRD_CAN) {
- dev_put(dev);
err = -ENODEV;
- goto out;
+ goto out_put_dev;
}
+
if (!(dev->flags & IFF_UP))
notify_enetdown = 1;
@@ -456,7 +461,9 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
/* filters set by default/setsockopt */
err = raw_enable_allfilters(sock_net(sk), dev, sk);
- dev_put(dev);
+ if (err)
+ goto out_put_dev;
+
} else {
ifindex = 0;
@@ -467,18 +474,28 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
if (!err) {
if (ro->bound) {
/* unregister old filters */
- if (ro->dev)
+ if (ro->dev) {
raw_disable_allfilters(dev_net(ro->dev),
ro->dev, sk);
- else
+ /* drop reference to old ro->dev */
+ netdev_put(ro->dev, &ro->dev_tracker);
+ } else {
raw_disable_allfilters(sock_net(sk), NULL, sk);
+ }
}
ro->ifindex = ifindex;
ro->bound = 1;
+ /* bind() ok -> hold a reference for new ro->dev */
ro->dev = dev;
+ if (ro->dev)
+ netdev_hold(ro->dev, &ro->dev_tracker, GFP_KERNEL);
}
- out:
+out_put_dev:
+ /* remove potential reference from dev_get_by_index() */
+ if (dev)
+ dev_put(dev);
+out:
release_sock(sk);
rtnl_unlock();
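
The raw socket now keeps a tracked reference on its bound device, so with
CONFIG_NET_DEV_REFCNT_TRACKER a leaked reference can be attributed to its
taker. A sketch of the hold/put pairing, using a hypothetical holder struct:

    struct bound_sock {
            struct net_device *dev;
            netdevice_tracker dev_tracker;
    };

    static void bound_sock_attach(struct bound_sock *b, struct net_device *dev)
    {
            b->dev = dev;
            netdev_hold(dev, &b->dev_tracker, GFP_KERNEL);
    }

    static void bound_sock_detach(struct bound_sock *b)
    {
            /* Must pair with the netdev_hold() via the same tracker. */
            netdev_put(b->dev, &b->dev_tracker);
            b->dev = NULL;
    }
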
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index aef25aa5cf1d..00c94d9622b4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2268,13 +2268,27 @@ out_err:
return err;
}
-int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len,
- struct netlink_ext_ack *exterr)
+int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
+ struct netlink_ext_ack *exterr)
{
- return nla_parse_deprecated(tb, IFLA_MAX, head, len, ifla_policy,
+ const struct ifinfomsg *ifmp;
+ const struct nlattr *attrs;
+ size_t len;
+
+ ifmp = nla_data(nla_peer);
+ attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg);
+ len = nla_len(nla_peer) - sizeof(struct ifinfomsg);
+
+ if (ifmp->ifi_index < 0) {
+ NL_SET_ERR_MSG_ATTR(exterr, nla_peer,
+ "ifindex can't be negative");
+ return -EINVAL;
+ }
+
+ return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy,
exterr);
}
-EXPORT_SYMBOL(rtnl_nla_parse_ifla);
+EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg);
struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
{
@@ -3547,6 +3561,9 @@ replay:
if (ifm->ifi_index > 0) {
link_specified = true;
dev = __dev_get_by_index(net, ifm->ifi_index);
+ } else if (ifm->ifi_index < 0) {
+ NL_SET_ERR_MSG(extack, "ifindex can't be negative");
+ return -EINVAL;
} else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) {
link_specified = true;
dev = rtnl_dev_get(net, tb);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index fa8079303cb0..a545ad71201c 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -130,7 +130,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_daddr,
inet->inet_sport,
inet->inet_dport);
- inet->inet_id = get_random_u16();
+ atomic_set(&inet->inet_id, get_random_u16());
err = dccp_connect(sk);
rt = NULL;
@@ -432,7 +432,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt));
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = ip_hdr(skb)->ttl;
- newinet->inet_id = get_random_u16();
+ atomic_set(&newinet->inet_id, get_random_u16());
if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
goto put_and_exit;
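
Converting inet_id to an atomic_t lets the IP ID advance without the socket
lock. A hedged sketch of the consumer shape this enables (simplified and
assumed; the real update lives in the IP ID selection path):

    static u16 next_ip_id(struct inet_sock *inet, int segs)
    {
            /* Reserve 'segs' IDs in one lockless step, return the first. */
            return (u16)atomic_fetch_add(segs, &inet->inet_id);
    }
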
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 4e3266e4d7c3..fcc5c9d64f46 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -315,11 +315,15 @@ EXPORT_SYMBOL_GPL(dccp_disconnect);
__poll_t dccp_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
- __poll_t mask;
struct sock *sk = sock->sk;
+ __poll_t mask;
+ u8 shutdown;
+ int state;
sock_poll_wait(file, sock, wait);
- if (sk->sk_state == DCCP_LISTEN)
+
+ state = inet_sk_state_load(sk);
+ if (state == DCCP_LISTEN)
return inet_csk_listen_poll(sk);
/* Socket is not locked. We are protected from async events
@@ -328,20 +332,21 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
*/
mask = 0;
- if (sk->sk_err)
+ if (READ_ONCE(sk->sk_err))
mask = EPOLLERR;
+ shutdown = READ_ONCE(sk->sk_shutdown);
- if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
+ if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED)
mask |= EPOLLHUP;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
+ if (shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
/* Connected? */
- if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
+ if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
if (atomic_read(&sk->sk_rmem_alloc) > 0)
mask |= EPOLLIN | EPOLLRDNORM;
- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+ if (!(shutdown & SEND_SHUTDOWN)) {
if (sk_stream_is_writeable(sk)) {
mask |= EPOLLOUT | EPOLLWRNORM;
} else { /* send SIGIO later */
@@ -359,7 +364,6 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
}
return mask;
}
-
EXPORT_SYMBOL_GPL(dccp_poll);
int dccp_ioctl(struct sock *sk, int cmd, int *karg)
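
dccp_poll() runs without the socket lock, so each field is sampled once with
READ_ONCE() and every test uses that snapshot. The annotation pairing it
assumes, as a minimal sketch (writer side shown hypothetically):

    static void publish_rx_shutdown(struct sock *sk)
    {
            /* Writers publish with WRITE_ONCE()... */
            WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
    }

    static bool rx_is_shut(const struct sock *sk)
    {
            /* ...so lockless readers such as poll may use READ_ONCE(). */
            return READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN;
    }
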
diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c
index 1f00f874471f..bfed7929a904 100644
--- a/net/devlink/leftover.c
+++ b/net/devlink/leftover.c
@@ -6704,6 +6704,7 @@ void devlink_notify_unregister(struct devlink *devlink)
struct devlink_param_item *param_item;
struct devlink_trap_item *trap_item;
struct devlink_port *devlink_port;
+ struct devlink_linecard *linecard;
struct devlink_rate *rate_node;
struct devlink_region *region;
unsigned long port_index;
@@ -6732,6 +6733,8 @@ void devlink_notify_unregister(struct devlink *devlink)
xa_for_each(&devlink->ports, port_index, devlink_port)
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+ list_for_each_entry_reverse(linecard, &devlink->linecard_list, list)
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL);
devlink_notify(devlink, DEVLINK_CMD_DEL);
}
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9b2ca2fcc5a1..02736b83c303 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -340,7 +340,7 @@ lookup_protocol:
else
inet->pmtudisc = IP_PMTUDISC_WANT;
- inet->inet_id = 0;
+ atomic_set(&inet->inet_id, 0);
sock_init_data(sock, sk);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 4d1af0cd7d99..cb5dbee9e018 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -73,7 +73,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
reuseport_has_conns_set(sk);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
- inet->inet_id = get_random_u16();
+ atomic_set(&inet->inet_id, get_random_u16());
sk_dst_set(sk, &rt->dst);
err = 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a59cc4b83861..2dbdc26da86e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -312,7 +312,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_daddr));
}
- inet->inet_id = get_random_u16();
+ atomic_set(&inet->inet_id, get_random_u16());
if (tcp_fastopen_defer_connect(sk, &err))
return err;
@@ -1596,7 +1596,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
- newinet->inet_id = get_random_u16();
+ atomic_set(&newinet->inet_id, get_random_u16());
/* Set ToS of the new socket based upon the value of incoming SYN.
* ECT bits are set later in tcp_init_transfer().
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 4f707d2a160f..0af2599c17e8 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1083,7 +1083,8 @@ static inline bool ieee80211_rx_reorder_ready(struct tid_ampdu_rx *tid_agg_rx,
struct sk_buff *tail = skb_peek_tail(frames);
struct ieee80211_rx_status *status;
- if (tid_agg_rx->reorder_buf_filtered & BIT_ULL(index))
+ if (tid_agg_rx->reorder_buf_filtered &&
+ tid_agg_rx->reorder_buf_filtered & BIT_ULL(index))
return true;
if (!tail)
@@ -1124,7 +1125,8 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata,
}
no_frame:
- tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index);
+ if (tid_agg_rx->reorder_buf_filtered)
+ tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index);
tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num);
}
@@ -4264,6 +4266,7 @@ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid,
u16 ssn, u64 filtered,
u16 received_mpdus)
{
+ struct ieee80211_local *local;
struct sta_info *sta;
struct tid_ampdu_rx *tid_agg_rx;
struct sk_buff_head frames;
@@ -4281,6 +4284,11 @@ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid,
sta = container_of(pubsta, struct sta_info, sta);
+ local = sta->sdata->local;
+ WARN_ONCE(local->hw.max_rx_aggregation_subframes > 64,
+ "RX BA marker can't support max_rx_aggregation_subframes %u > 64\n",
+ local->hw.max_rx_aggregation_subframes);
+
if (!ieee80211_rx_data_set_sta(&rx, sta, -1))
return;
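
The new WARN_ONCE documents a structural limit: reorder_buf_filtered is a
u64 with one bit per reorder slot, so the filtered-frames bookkeeping cannot
describe BA sessions with more than 64 subframes. The invariant, sketched:

    static bool slot_is_filtered(u64 filtered_map, unsigned int index)
    {
            /* BIT_ULL(index) is only defined for index < 64. */
            return index < 64 && (filtered_map & BIT_ULL(index));
    }
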
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3e841e45f2c0..eb8b1167dced 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1373,7 +1373,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
if (table == NULL)
goto err_kzalloc;
- table->validate_state = NFT_VALIDATE_SKIP;
+ table->validate_state = nft_net->validate_state;
table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
if (table->name == NULL)
goto err_strdup;
@@ -9051,9 +9051,8 @@ static int nf_tables_validate(struct net *net)
return -EAGAIN;
nft_validate_state_update(table, NFT_VALIDATE_SKIP);
+ break;
}
-
- break;
}
return 0;
@@ -9457,9 +9456,9 @@ static void nft_trans_gc_work(struct work_struct *work)
struct nft_trans_gc *trans, *next;
LIST_HEAD(trans_gc_list);
- spin_lock(&nf_tables_destroy_list_lock);
+ spin_lock(&nf_tables_gc_list_lock);
list_splice_init(&nf_tables_gc_list, &trans_gc_list);
- spin_unlock(&nf_tables_destroy_list_lock);
+ spin_unlock(&nf_tables_gc_list_lock);
list_for_each_entry_safe(trans, next, &trans_gc_list, list) {
list_del(&trans->list);
@@ -9799,8 +9798,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
}
/* 0. Validate ruleset, otherwise roll back for error reporting. */
- if (nf_tables_validate(net) < 0)
+ if (nf_tables_validate(net) < 0) {
+ nft_net->validate_state = NFT_VALIDATE_DO;
return -EAGAIN;
+ }
err = nft_flow_rule_offload_commit(net);
if (err < 0)
@@ -10059,6 +10060,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nf_tables_commit_audit_log(&adl, nft_net->base_seq);
nft_gc_seq_end(nft_net, gc_seq);
+ nft_net->validate_state = NFT_VALIDATE_SKIP;
nf_tables_commit_release(net);
return 0;
@@ -10335,8 +10337,12 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb,
enum nfnl_abort_action action)
{
struct nftables_pernet *nft_net = nft_pernet(net);
- int ret = __nf_tables_abort(net, action);
+ unsigned int gc_seq;
+ int ret;
+ gc_seq = nft_gc_seq_begin(nft_net);
+ ret = __nf_tables_abort(net, action);
+ nft_gc_seq_end(nft_net, gc_seq);
mutex_unlock(&nft_net->commit_mutex);
return ret;
@@ -11071,7 +11077,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
gc_seq = nft_gc_seq_begin(nft_net);
if (!list_empty(&nf_tables_destroy_list))
- rcu_barrier();
+ nf_tables_trans_destroy_flush_work();
again:
list_for_each_entry(table, &nft_net->tables, list) {
if (nft_table_has_owner(table) &&
@@ -11115,6 +11121,7 @@ static int __net_init nf_tables_init_net(struct net *net)
mutex_init(&nft_net->commit_mutex);
nft_net->base_seq = 1;
nft_net->gc_seq = 0;
+ nft_net->validate_state = NFT_VALIDATE_SKIP;
return 0;
}
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index cef5df846000..524763659f25 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -326,6 +326,9 @@ static void nft_rhash_gc(struct work_struct *work)
nft_net = nft_pernet(net);
gc_seq = READ_ONCE(nft_net->gc_seq);
+ if (nft_set_gc_is_pending(set))
+ goto done;
+
gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
if (!gc)
goto done;
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 3757fcc55723..6af9c9ed4b5c 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -902,12 +902,14 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
int mask_bits)
{
- int rule = f->rules++, group, ret, bit_offset = 0;
+ int rule = f->rules, group, ret, bit_offset = 0;
- ret = pipapo_resize(f, f->rules - 1, f->rules);
+ ret = pipapo_resize(f, f->rules, f->rules + 1);
if (ret)
return ret;
+ f->rules++;
+
for (group = 0; group < f->groups; group++) {
int i, v;
u8 mask;
@@ -1052,7 +1054,9 @@ static int pipapo_expand(struct nft_pipapo_field *f,
step++;
if (step >= len) {
if (!masks) {
- pipapo_insert(f, base, 0);
+ err = pipapo_insert(f, base, 0);
+ if (err < 0)
+ return err;
masks = 1;
}
goto out;
@@ -1235,6 +1239,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
else
ret = pipapo_expand(f, start, end, f->groups * f->bb);
+ if (ret < 0)
+ return ret;
+
if (f->bsize > bsize_max)
bsize_max = f->bsize;
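
The pipapo fix follows the usual resize-then-commit discipline: grow the
backing store first and bump the logical rule count only on success, so a
failed allocation leaves the field consistent. The shape, sketched
(illustrative, not the patch itself):

    static int grow_rules_by_one(struct nft_pipapo_field *f)
    {
            int ret = pipapo_resize(f, f->rules, f->rules + 1);

            if (ret)
                    return ret;     /* f->rules untouched on failure */
            f->rules++;
            return 0;
    }
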
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index f9d4c8fcbbf8..c6435e709231 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -611,6 +611,9 @@ static void nft_rbtree_gc(struct work_struct *work)
nft_net = nft_pernet(net);
gc_seq = READ_ONCE(nft_net->gc_seq);
+ if (nft_set_gc_is_pending(set))
+ goto done;
+
gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
if (!gc)
goto done;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index aa6b1fe65151..e9eaf637220e 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1547,10 +1547,28 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
return 0;
}
+static bool req_create_or_replace(struct nlmsghdr *n)
+{
+ return (n->nlmsg_flags & NLM_F_CREATE &&
+ n->nlmsg_flags & NLM_F_REPLACE);
+}
+
+static bool req_create_exclusive(struct nlmsghdr *n)
+{
+ return (n->nlmsg_flags & NLM_F_CREATE &&
+ n->nlmsg_flags & NLM_F_EXCL);
+}
+
+static bool req_change(struct nlmsghdr *n)
+{
+ return (!(n->nlmsg_flags & NLM_F_CREATE) &&
+ !(n->nlmsg_flags & NLM_F_REPLACE) &&
+ !(n->nlmsg_flags & NLM_F_EXCL));
+}
+
/*
* Create/change qdisc.
*/
-
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
struct netlink_ext_ack *extack)
{
@@ -1644,27 +1662,35 @@ replay:
*
* We know, that some child q is already
* attached to this parent and have choice:
- * either to change it or to create/graft new one.
+ * 1) change it or 2) create/graft new one.
+ * If the requested qdisc kind is different
+ * than the existing one, then we choose graft.
	 *   If they are the same then this is a "change"
	 *   operation - just let it fall through.
*
* 1. We are allowed to create/graft only
- * if CREATE and REPLACE flags are set.
	 *      if the request explicitly states
+ * "please create if it doesn't exist".
*
- * 2. If EXCL is set, requestor wanted to say,
- * that qdisc tcm_handle is not expected
+ * 2. If the request is to exclusive create
+ * then the qdisc tcm_handle is not expected
* to exist, so that we choose create/graft too.
*
* 3. The last case is when no flags are set.
	 *      This happens when, for example, the tc
	 *      utility issues a "change" command.
* Alas, it is sort of hole in API, we
* cannot decide what to do unambiguously.
- * For now we select create/graft, if
- * user gave KIND, which does not match existing.
+ * For now we select create/graft.
*/
- if ((n->nlmsg_flags & NLM_F_CREATE) &&
- (n->nlmsg_flags & NLM_F_REPLACE) &&
- ((n->nlmsg_flags & NLM_F_EXCL) ||
- (tca[TCA_KIND] &&
- nla_strcmp(tca[TCA_KIND], q->ops->id))))
- goto create_n_graft;
+ if (tca[TCA_KIND] &&
+ nla_strcmp(tca[TCA_KIND], q->ops->id)) {
+ if (req_create_or_replace(n) ||
+ req_create_exclusive(n))
+ goto create_n_graft;
+ else if (req_change(n))
+ goto create_n_graft2;
+ }
}
}
} else {
@@ -1698,6 +1724,7 @@ create_n_graft:
NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
return -ENOENT;
}
+create_n_graft2:
if (clid == TC_H_INGRESS) {
if (dev_ingress_queue(dev)) {
q = qdisc_create(dev, dev_ingress_queue(dev),
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9388d98aebc0..76f1bce49a8e 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -99,7 +99,7 @@ struct percpu_counter sctp_sockets_allocated;
static void sctp_enter_memory_pressure(struct sock *sk)
{
- sctp_memory_pressure = 1;
+ WRITE_ONCE(sctp_memory_pressure, 1);
}
@@ -9479,7 +9479,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newinet->inet_rcv_saddr = inet->inet_rcv_saddr;
newinet->inet_dport = htons(asoc->peer.port);
newinet->pmtudisc = inet->pmtudisc;
- newinet->inet_id = get_random_u16();
+ atomic_set(&newinet->inet_id, get_random_u16());
newinet->uc_ttl = inet->uc_ttl;
newinet->mc_loop = 1;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 0b6034fab9ab..f420d8457345 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -472,7 +472,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode)
return NULL;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_fop = &simple_dir_operations;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b098fde373ab..28c0771c4e8c 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -935,9 +935,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
if (!rep->rr_rdmabuf)
goto out_free;
- if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
- goto out_free_regbuf;
-
rep->rr_cid.ci_completion_id =
atomic_inc_return(&r_xprt->rx_ep->re_completion_ids);
@@ -956,8 +953,6 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
spin_unlock(&buf->rb_lock);
return rep;
-out_free_regbuf:
- rpcrdma_regbuf_free(rep->rr_rdmabuf);
out_free:
kfree(rep);
out:
@@ -1363,6 +1358,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
rep = rpcrdma_rep_create(r_xprt, temp);
if (!rep)
break;
+ if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) {
+ rpcrdma_rep_put(buf, rep);
+ break;
+ }
rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id;
trace_xprtrdma_post_recv(rep);
diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c
index e5ed08098ff3..e2a6a69352df 100644
--- a/samples/ftrace/ftrace-direct-modify.c
+++ b/samples/ftrace/ftrace-direct-modify.c
@@ -105,7 +105,7 @@ asm (
" .type my_tramp1, @function\n"
" .globl my_tramp1\n"
" my_tramp1:"
-" bti c\n"
+" hint 34\n" // bti c
" sub sp, sp, #16\n"
" stp x9, x30, [sp]\n"
" bl my_direct_func1\n"
@@ -117,7 +117,7 @@ asm (
" .type my_tramp2, @function\n"
" .globl my_tramp2\n"
" my_tramp2:"
-" bti c\n"
+" hint 34\n" // bti c
" sub sp, sp, #16\n"
" stp x9, x30, [sp]\n"
" bl my_direct_func2\n"
diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c
index 292cff2b3f5d..2e349834d63c 100644
--- a/samples/ftrace/ftrace-direct-multi-modify.c
+++ b/samples/ftrace/ftrace-direct-multi-modify.c
@@ -112,7 +112,7 @@ asm (
" .type my_tramp1, @function\n"
" .globl my_tramp1\n"
" my_tramp1:"
-" bti c\n"
+" hint 34\n" // bti c
" sub sp, sp, #32\n"
" stp x9, x30, [sp]\n"
" str x0, [sp, #16]\n"
@@ -127,7 +127,7 @@ asm (
" .type my_tramp2, @function\n"
" .globl my_tramp2\n"
" my_tramp2:"
-" bti c\n"
+" hint 34\n" // bti c
" sub sp, sp, #32\n"
" stp x9, x30, [sp]\n"
" str x0, [sp, #16]\n"
diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c
index b4391e08c913..9243dbfe4d0c 100644
--- a/samples/ftrace/ftrace-direct-multi.c
+++ b/samples/ftrace/ftrace-direct-multi.c
@@ -75,7 +75,7 @@ asm (
" .type my_tramp, @function\n"
" .globl my_tramp\n"
" my_tramp:"
-" bti c\n"
+" hint 34\n" // bti c
" sub sp, sp, #32\n"
" stp x9, x30, [sp]\n"
" str x0, [sp, #16]\n"
diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c
index e9804c5307c0..e39c3563ae4e 100644
--- a/samples/ftrace/ftrace-direct-too.c
+++ b/samples/ftrace/ftrace-direct-too.c
@@ -81,7 +81,7 @@ asm (
" .type my_tramp, @function\n"
" .globl my_tramp\n"
" my_tramp:"
-" bti c\n"
+" hint 34\n" // bti c
" sub sp, sp, #48\n"
" stp x9, x30, [sp]\n"
" stp x0, x1, [sp, #16]\n"
diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c
index 20f4a7caa810..32c477da1e9a 100644
--- a/samples/ftrace/ftrace-direct.c
+++ b/samples/ftrace/ftrace-direct.c
@@ -72,7 +72,7 @@ asm (
" .type my_tramp, @function\n"
" .globl my_tramp\n"
" my_tramp:"
-" bti c\n"
+" hint 34\n" // bti c
" sub sp, sp, #32\n"
" stp x9, x30, [sp]\n"
" str x0, [sp, #16]\n"
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 880fde13d9b8..a9841148cde2 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -7457,6 +7457,30 @@ sub process {
}
}
+# Complain about RCU Tasks Trace used outside of BPF (and of course, RCU).
+ our $rcu_trace_funcs = qr{(?x:
+ rcu_read_lock_trace |
+ rcu_read_lock_trace_held |
+ rcu_read_unlock_trace |
+ call_rcu_tasks_trace |
+ synchronize_rcu_tasks_trace |
+ rcu_barrier_tasks_trace |
+ rcu_request_urgent_qs_task
+ )};
+ our $rcu_trace_paths = qr{(?x:
+ kernel/bpf/ |
+ include/linux/bpf |
+ net/bpf/ |
+ kernel/rcu/ |
+ include/linux/rcu
+ )};
+ if ($line =~ /\b($rcu_trace_funcs)\s*\(/) {
+ if ($realfile !~ m{^$rcu_trace_paths}) {
+ WARN("RCU_TASKS_TRACE",
+ "use of RCU tasks trace is incorrect outside BPF or core RCU code\n" . $herecurr);
+ }
+ }
+
# check for lockdep_set_novalidate_class
if ($line =~ /^.\s*lockdep_set_novalidate_class\s*\(/ ||
$line =~ /__lockdep_no_validate__\s*\)/ ) {
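
For reference, a C fragment in a file outside the allowed paths (say a
hypothetical drivers/foo/bar.c) that the new RCU_TASKS_TRACE check would
flag:

    rcu_read_lock_trace();          /* flagged: file is not under kernel/bpf/,
                                     * net/bpf/, kernel/rcu/ or their headers */
    do_traced_work();               /* do_traced_work(): hypothetical */
    rcu_read_unlock_trace();
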
diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h
index 84c730da36dd..1ae39b9f4a95 100644
--- a/scripts/gcc-plugins/gcc-common.h
+++ b/scripts/gcc-plugins/gcc-common.h
@@ -440,4 +440,8 @@ static inline void debug_gimple_stmt(const_gimple s)
#define SET_DECL_MODE(decl, mode) DECL_MODE(decl) = (mode)
#endif
+#if BUILDING_GCC_VERSION >= 14000
+#define last_stmt(x) last_nondebug_stmt(x)
+#endif
+
#endif
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index 0f295961e773..2cff851ebfd7 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -279,6 +279,29 @@ config ZERO_CALL_USED_REGS
endmenu
+menu "Hardening of kernel data structures"
+
+config LIST_HARDENED
+ bool "Check integrity of linked list manipulation"
+ help
+ Minimal integrity checking in the linked-list manipulation routines
+ to catch memory corruptions that are not guaranteed to result in an
+ immediate access fault.
+
+ If unsure, say N.
+
+config BUG_ON_DATA_CORRUPTION
+ bool "Trigger a BUG when data corruption is detected"
+ select LIST_HARDENED
+ help
+ Select this option if the kernel should BUG when it encounters
+ data corruption in kernel memory structures when they get checked
+ for validity.
+
+ If unsure, say N.
+
+endmenu
+
config CC_HAS_RANDSTRUCT
def_bool $(cc-option,-frandomize-layout-seed-file=/dev/null)
# Randstruct was first added in Clang 15, but it isn't safe to use until
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index db7a51acf9db..bd6a910f6528 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -226,7 +226,7 @@ static int __aafs_setup_d_inode(struct inode *dir, struct dentry *dentry,
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_private = data;
if (S_ISDIR(mode)) {
inode->i_op = iops ? iops : &simple_dir_inode_operations;
@@ -1554,8 +1554,11 @@ void __aafs_profile_migrate_dents(struct aa_profile *old,
for (i = 0; i < AAFS_PROF_SIZEOF; i++) {
new->dents[i] = old->dents[i];
- if (new->dents[i])
- new->dents[i]->d_inode->i_mtime = current_time(new->dents[i]->d_inode);
+ if (new->dents[i]) {
+ struct inode *inode = d_inode(new->dents[i]);
+
+ inode->i_mtime = inode_set_ctime_current(inode);
+ }
old->dents[i] = NULL;
}
}
@@ -2540,7 +2543,7 @@ static int aa_mk_null_file(struct dentry *parent)
inode->i_ino = get_next_ino();
inode->i_mode = S_IFCHR | S_IRUGO | S_IWUGO;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
init_special_inode(inode, S_IFCHR | S_IRUGO | S_IWUGO,
MKDEV(MEM_MAJOR, 3));
d_instantiate(dentry, inode);
diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c
index 694fb7a09962..8b8846073e14 100644
--- a/security/apparmor/policy_unpack.c
+++ b/security/apparmor/policy_unpack.c
@@ -86,10 +86,13 @@ void __aa_loaddata_update(struct aa_loaddata *data, long revision)
data->revision = revision;
if ((data->dents[AAFS_LOADDATA_REVISION])) {
- d_inode(data->dents[AAFS_LOADDATA_DIR])->i_mtime =
- current_time(d_inode(data->dents[AAFS_LOADDATA_DIR]));
- d_inode(data->dents[AAFS_LOADDATA_REVISION])->i_mtime =
- current_time(d_inode(data->dents[AAFS_LOADDATA_REVISION]));
+ struct inode *inode;
+
+ inode = d_inode(data->dents[AAFS_LOADDATA_DIR]);
+ inode->i_mtime = inode_set_ctime_current(inode);
+
+ inode = d_inode(data->dents[AAFS_LOADDATA_REVISION]);
+ inode->i_mtime = inode_set_ctime_current(inode);
}
}
diff --git a/security/inode.c b/security/inode.c
index 6c326939750d..3aa75fffa8c9 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -145,7 +145,7 @@ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode,
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_private = data;
if (S_ISDIR(mode)) {
inode->i_op = &simple_dir_inode_operations;
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index c9b3bd8f1bb9..7a0420cf1a6a 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -68,7 +68,7 @@ enum policy_rule_list { IMA_DEFAULT_POLICY = 1, IMA_CUSTOM_POLICY };
struct ima_rule_opt_list {
size_t count;
- char *items[];
+ char *items[] __counted_by(count);
};
/*
@@ -342,6 +342,7 @@ static struct ima_rule_opt_list *ima_alloc_rule_opt_list(const substring_t *src)
kfree(src_copy);
return ERR_PTR(-ENOMEM);
}
+ opt_list->count = count;
/*
* strsep() has already replaced all instances of '|' with '\0',
@@ -357,7 +358,6 @@ static struct ima_rule_opt_list *ima_alloc_rule_opt_list(const substring_t *src)
opt_list->items[i] = cur;
cur = strchr(cur, '\0') + 1;
}
- opt_list->count = count;
return opt_list;
}
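
__counted_by(count) lets the compiler and runtime checkers bound items[] by
the count member, which is why the counter assignment moves above the first
array store (the loadpin hunk below makes the same change). A minimal
allocation sketch under that rule:

    static struct ima_rule_opt_list *opt_list_alloc(size_t count)
    {
            struct ima_rule_opt_list *l;
            size_t i;

            l = kzalloc(struct_size(l, items, count), GFP_KERNEL);
            if (!l)
                    return NULL;
            l->count = count;       /* set before any items[i] access */
            for (i = 0; i < count; i++)
                    l->items[i] = NULL;
            return l;
    }
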
diff --git a/security/integrity/platform_certs/load_ipl_s390.c b/security/integrity/platform_certs/load_ipl_s390.c
index e769dcb7ea94..c7c381a9ddaa 100644
--- a/security/integrity/platform_certs/load_ipl_s390.c
+++ b/security/integrity/platform_certs/load_ipl_s390.c
@@ -22,8 +22,8 @@ static int __init load_ipl_certs(void)
if (!ipl_cert_list_addr)
return 0;
- /* Copy the certificates to the system keyring */
- ptr = (void *) ipl_cert_list_addr;
+ /* Copy the certificates to the platform keyring */
+ ptr = __va(ipl_cert_list_addr);
end = ptr + ipl_cert_list_size;
while ((void *) ptr < end) {
len = *(unsigned int *) ptr;
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index ebae964f7cc9..a9d40456a064 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -336,6 +336,7 @@ static int read_trusted_verity_root_digests(unsigned int fd)
rc = -ENOMEM;
goto err;
}
+ trd->len = len;
if (hex2bin(trd->data, d, len)) {
kfree(trd);
@@ -343,8 +344,6 @@ static int read_trusted_verity_root_digests(unsigned int fd)
goto err;
}
- trd->len = len;
-
list_add_tail(&trd->node, &dm_verity_loadpin_trusted_root_digests);
}
diff --git a/security/security.c b/security/security.c
index b720424ca37d..549104a447e3 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1139,6 +1139,20 @@ void security_bprm_committed_creds(struct linux_binprm *bprm)
}
/**
+ * security_fs_context_submount() - Initialise fc->security
+ * @fc: new filesystem context
+ * @reference: reference superblock for the submount/remount
+ *
+ * Fill out the ->security field for a new fs_context.
+ *
+ * Return: Returns 0 on success or negative error code on failure.
+ */
+int security_fs_context_submount(struct fs_context *fc, struct super_block *reference)
+{
+ return call_int_hook(fs_context_submount, 0, fc, reference);
+}
+
+/**
* security_fs_context_dup() - Duplicate a fs_context LSM blob
* @fc: destination filesystem context
* @src_fc: source filesystem context
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index d06e350fedee..afd663744041 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2745,6 +2745,27 @@ static int selinux_umount(struct vfsmount *mnt, int flags)
FILESYSTEM__UNMOUNT, NULL);
}
+static int selinux_fs_context_submount(struct fs_context *fc,
+ struct super_block *reference)
+{
+ const struct superblock_security_struct *sbsec;
+ struct selinux_mnt_opts *opts;
+
+ opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ sbsec = selinux_superblock(reference);
+ if (sbsec->flags & FSCONTEXT_MNT)
+ opts->fscontext_sid = sbsec->sid;
+ if (sbsec->flags & CONTEXT_MNT)
+ opts->context_sid = sbsec->mntpoint_sid;
+ if (sbsec->flags & DEFCONTEXT_MNT)
+ opts->defcontext_sid = sbsec->def_sid;
+ fc->security = opts;
+ return 0;
+}
+
static int selinux_fs_context_dup(struct fs_context *fc,
struct fs_context *src_fc)
{
@@ -7182,6 +7203,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
/*
* PUT "CLONING" (ACCESSING + ALLOCATING) HOOKS HERE
*/
+ LSM_HOOK_INIT(fs_context_submount, selinux_fs_context_submount),
LSM_HOOK_INIT(fs_context_dup, selinux_fs_context_dup),
LSM_HOOK_INIT(fs_context_parse_param, selinux_fs_context_parse_param),
LSM_HOOK_INIT(sb_eat_lsm_opts, selinux_sb_eat_lsm_opts),
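
The hook is expected to run when the VFS builds a context for an automatic
submount, so the child superblock inherits the reference superblock's LSM
mount options without re-parsing. A hedged sketch of the assumed call-site
shape, modeled on fs_context_for_submount():

    static struct vfsmount *cross_automount(struct dentry *mntpt)
    {
            struct fs_context *fc;
            struct vfsmount *mnt;

            /* Allocating the submount context is where the new
             * fs_context_submount hook fires for the reference sb. */
            fc = fs_context_for_submount(mntpt->d_sb->s_type, mntpt);
            if (IS_ERR(fc))
                    return ERR_CAST(fc);
            mnt = fc_mount(fc);
            put_fs_context(fc);
            return mnt;
    }
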
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index bad1f6b685fd..9dafb6ff110d 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1197,7 +1197,7 @@ static struct inode *sel_make_inode(struct super_block *sb, int mode)
if (ret) {
ret->i_mode = mode;
- ret->i_atime = ret->i_mtime = ret->i_ctime = current_time(ret);
+ ret->i_atime = ret->i_mtime = inode_set_ctime_current(ret);
}
return ret;
}
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index 31b08b34c722..dc904865af58 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -2005,6 +2005,7 @@ static int filename_trans_read_helper(struct policydb *p, void *fp)
if (!datum)
goto out;
+ datum->next = NULL;
*dst = datum;
/* ebitmap_read() will at least init the bitmap */
@@ -2017,7 +2018,6 @@ static int filename_trans_read_helper(struct policydb *p, void *fp)
goto out;
datum->otype = le32_to_cpu(buf[0]);
- datum->next = NULL;
dst = &datum->next;
}
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 6e270cf3fd30..a8201cf22f20 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -615,6 +615,56 @@ out_opt_err:
}
/**
+ * smack_fs_context_submount - Initialise security data for a filesystem context
+ * @fc: The filesystem context.
+ * @reference: reference superblock
+ *
+ * Returns 0 on success or -ENOMEM on error.
+ */
+static int smack_fs_context_submount(struct fs_context *fc,
+ struct super_block *reference)
+{
+ struct superblock_smack *sbsp;
+ struct smack_mnt_opts *ctx;
+ struct inode_smack *isp;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ fc->security = ctx;
+
+ sbsp = smack_superblock(reference);
+ isp = smack_inode(reference->s_root->d_inode);
+
+ if (sbsp->smk_default) {
+ ctx->fsdefault = kstrdup(sbsp->smk_default->smk_known, GFP_KERNEL);
+ if (!ctx->fsdefault)
+ return -ENOMEM;
+ }
+
+ if (sbsp->smk_floor) {
+ ctx->fsfloor = kstrdup(sbsp->smk_floor->smk_known, GFP_KERNEL);
+ if (!ctx->fsfloor)
+ return -ENOMEM;
+ }
+
+ if (sbsp->smk_hat) {
+ ctx->fshat = kstrdup(sbsp->smk_hat->smk_known, GFP_KERNEL);
+ if (!ctx->fshat)
+ return -ENOMEM;
+ }
+
+ if (isp->smk_flags & SMK_INODE_TRANSMUTE) {
+ if (sbsp->smk_root) {
+ ctx->fstransmute = kstrdup(sbsp->smk_root->smk_known, GFP_KERNEL);
+ if (!ctx->fstransmute)
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+/**
* smack_fs_context_dup - Duplicate the security data on fs_context duplication
* @fc: The new filesystem context.
* @src_fc: The source filesystem context being duplicated.
@@ -4876,6 +4926,7 @@ static struct security_hook_list smack_hooks[] __ro_after_init = {
LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
LSM_HOOK_INIT(syslog, smack_syslog),
+ LSM_HOOK_INIT(fs_context_submount, smack_fs_context_submount),
LSM_HOOK_INIT(fs_context_dup, smack_fs_context_dup),
LSM_HOOK_INIT(fs_context_parse_param, smack_fs_context_parse_param),
diff --git a/sound/pci/ymfpci/ymfpci.c b/sound/pci/ymfpci/ymfpci.c
index b033bd290940..48444dda44de 100644
--- a/sound/pci/ymfpci/ymfpci.c
+++ b/sound/pci/ymfpci/ymfpci.c
@@ -152,8 +152,8 @@ static inline int snd_ymfpci_create_gameport(struct snd_ymfpci *chip, int dev, i
void snd_ymfpci_free_gameport(struct snd_ymfpci *chip) { }
#endif /* SUPPORT_JOYSTICK */
-static int snd_card_ymfpci_probe(struct pci_dev *pci,
- const struct pci_device_id *pci_id)
+static int __snd_card_ymfpci_probe(struct pci_dev *pci,
+ const struct pci_device_id *pci_id)
{
static int dev;
struct snd_card *card;
@@ -348,6 +348,12 @@ static int snd_card_ymfpci_probe(struct pci_dev *pci,
return 0;
}
+static int snd_card_ymfpci_probe(struct pci_dev *pci,
+ const struct pci_device_id *pci_id)
+{
+ return snd_card_free_on_error(&pci->dev, __snd_card_ymfpci_probe(pci, pci_id));
+}
+
static struct pci_driver ymfpci_driver = {
.name = KBUILD_MODNAME,
.id_table = snd_ymfpci_ids,
diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c
index a2fe3bd4f9a1..b304b3562c82 100644
--- a/sound/soc/amd/yc/acp6x-mach.c
+++ b/sound/soc/amd/yc/acp6x-mach.c
@@ -217,7 +217,7 @@ static const struct dmi_system_id yc_acp_quirk_table[] = {
.driver_data = &acp6x_card,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
- DMI_MATCH(DMI_PRODUCT_NAME, "82"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "82V2"),
}
},
{
@@ -251,6 +251,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = {
{
.driver_data = &acp6x_card,
.matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "M6500RC"),
+ }
+ },
+ {
+ .driver_data = &acp6x_card,
+ .matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "Alienware"),
DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m17 R5 AMD"),
}
diff --git a/sound/soc/codecs/cs35l41.c b/sound/soc/codecs/cs35l41.c
index 6ac501f008ec..8a879b6f4829 100644
--- a/sound/soc/codecs/cs35l41.c
+++ b/sound/soc/codecs/cs35l41.c
@@ -168,7 +168,7 @@ static int cs35l41_get_fs_mon_config_index(int freq)
static const DECLARE_TLV_DB_RANGE(dig_vol_tlv,
0, 0, TLV_DB_SCALE_ITEM(TLV_DB_GAIN_MUTE, 0, 1),
1, 913, TLV_DB_MINMAX_ITEM(-10200, 1200));
-static DECLARE_TLV_DB_SCALE(amp_gain_tlv, 0, 1, 1);
+static DECLARE_TLV_DB_SCALE(amp_gain_tlv, 50, 100, 0);
static const struct snd_kcontrol_new dre_ctrl =
SOC_DAPM_SINGLE("Switch", CS35L41_PWR_CTRL3, 20, 1, 0);
diff --git a/sound/soc/codecs/cs35l56-i2c.c b/sound/soc/codecs/cs35l56-i2c.c
index ed2a41943d97..40666e6698ba 100644
--- a/sound/soc/codecs/cs35l56-i2c.c
+++ b/sound/soc/codecs/cs35l56-i2c.c
@@ -62,10 +62,19 @@ static const struct i2c_device_id cs35l56_id_i2c[] = {
};
MODULE_DEVICE_TABLE(i2c, cs35l56_id_i2c);
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id cs35l56_asoc_acpi_match[] = {
+ { "CSC355C", 0 },
+ {},
+};
+MODULE_DEVICE_TABLE(acpi, cs35l56_asoc_acpi_match);
+#endif
+
static struct i2c_driver cs35l56_i2c_driver = {
.driver = {
.name = "cs35l56",
.pm = &cs35l56_pm_ops_i2c_spi,
+ .acpi_match_table = ACPI_PTR(cs35l56_asoc_acpi_match),
},
.id_table = cs35l56_id_i2c,
.probe = cs35l56_i2c_probe,
diff --git a/sound/soc/codecs/cs35l56-spi.c b/sound/soc/codecs/cs35l56-spi.c
index 996aab10500e..302f9c47407a 100644
--- a/sound/soc/codecs/cs35l56-spi.c
+++ b/sound/soc/codecs/cs35l56-spi.c
@@ -59,10 +59,19 @@ static const struct spi_device_id cs35l56_id_spi[] = {
};
MODULE_DEVICE_TABLE(spi, cs35l56_id_spi);
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id cs35l56_asoc_acpi_match[] = {
+ { "CSC355C", 0 },
+ {},
+};
+MODULE_DEVICE_TABLE(acpi, cs35l56_asoc_acpi_match);
+#endif
+
static struct spi_driver cs35l56_spi_driver = {
.driver = {
.name = "cs35l56",
.pm = &cs35l56_pm_ops_i2c_spi,
+ .acpi_match_table = ACPI_PTR(cs35l56_asoc_acpi_match),
},
.id_table = cs35l56_id_spi,
.probe = cs35l56_spi_probe,
diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c
index c03f9d3c9a13..fd06b9f9d496 100644
--- a/sound/soc/codecs/cs35l56.c
+++ b/sound/soc/codecs/cs35l56.c
@@ -5,7 +5,6 @@
// Copyright (C) 2023 Cirrus Logic, Inc. and
// Cirrus Logic International Semiconductor Ltd.
-#include <linux/acpi.h>
#include <linux/completion.h>
#include <linux/debugfs.h>
#include <linux/delay.h>
@@ -1354,26 +1353,22 @@ static int cs35l56_dsp_init(struct cs35l56_private *cs35l56)
return 0;
}
-static int cs35l56_acpi_get_name(struct cs35l56_private *cs35l56)
+static int cs35l56_get_firmware_uid(struct cs35l56_private *cs35l56)
{
- acpi_handle handle = ACPI_HANDLE(cs35l56->dev);
- const char *sub;
+ struct device *dev = cs35l56->dev;
+ const char *prop;
+ int ret;
- /* If there is no ACPI_HANDLE, there is no ACPI for this system, return 0 */
- if (!handle)
+ ret = device_property_read_string(dev, "cirrus,firmware-uid", &prop);
+	/* On a bad or missing sw node property, return 0 and fall back to the legacy firmware path */
+ if (ret < 0)
return 0;
- sub = acpi_get_subsystem_id(handle);
- if (IS_ERR(sub)) {
- /* If bad ACPI, return 0 and fallback to legacy firmware path, otherwise fail */
- if (PTR_ERR(sub) == -ENODATA)
- return 0;
- else
- return PTR_ERR(sub);
- }
+ cs35l56->dsp.system_name = devm_kstrdup(dev, prop, GFP_KERNEL);
+ if (cs35l56->dsp.system_name == NULL)
+ return -ENOMEM;
- cs35l56->dsp.system_name = sub;
- dev_dbg(cs35l56->dev, "Subsystem ID: %s\n", cs35l56->dsp.system_name);
+ dev_dbg(dev, "Firmware UID: %s\n", cs35l56->dsp.system_name);
return 0;
}
@@ -1417,7 +1412,7 @@ int cs35l56_common_probe(struct cs35l56_private *cs35l56)
gpiod_set_value_cansleep(cs35l56->reset_gpio, 1);
}
- ret = cs35l56_acpi_get_name(cs35l56);
+ ret = cs35l56_get_firmware_uid(cs35l56);
if (ret != 0)
goto err;
@@ -1604,8 +1599,6 @@ void cs35l56_remove(struct cs35l56_private *cs35l56)
regcache_cache_only(cs35l56->regmap, true);
- kfree(cs35l56->dsp.system_name);
-
gpiod_set_value_cansleep(cs35l56->reset_gpio, 0);
regulator_bulk_disable(ARRAY_SIZE(cs35l56->supplies), cs35l56->supplies);
}
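
With the lookup now going through device_property_read_string(), non-ACPI
platforms can supply the UID via a software node. A hypothetical sketch (the
property name comes from the patch; the node and UID string are made up):

    static const struct property_entry cs35l56_props[] = {
            PROPERTY_ENTRY_STRING("cirrus,firmware-uid", "10431234"),
            { }
    };

    static const struct software_node cs35l56_swnode = {
            .properties = cs35l56_props,
    };
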
diff --git a/sound/soc/codecs/tas2781-comlib.c b/sound/soc/codecs/tas2781-comlib.c
index a88c6c28a394..ffb26e4a7e2f 100644
--- a/sound/soc/codecs/tas2781-comlib.c
+++ b/sound/soc/codecs/tas2781-comlib.c
@@ -57,16 +57,17 @@ static int tasdevice_change_chn_book(struct tasdevice_priv *tas_priv,
if (client->addr != tasdev->dev_addr) {
client->addr = tasdev->dev_addr;
- if (tasdev->cur_book == book) {
- ret = regmap_write(map,
- TASDEVICE_PAGE_SELECT, 0);
- if (ret < 0) {
- dev_err(tas_priv->dev, "%s, E=%d\n",
- __func__, ret);
- goto out;
- }
+		/* All tas2781s share the same regmap, so clear the
+		 * cached page in the regmap when switching to another
+		 * tas2781. Register 0 is the same page-select register
+		 * on every page and in every book of the tas2781.
+		 */
+ ret = regmap_write(map, TASDEVICE_PAGE_SELECT, 0);
+ if (ret < 0) {
+ dev_err(tas_priv->dev, "%s, E=%d\n",
+ __func__, ret);
+ goto out;
}
- goto out;
}
if (tasdev->cur_book != book) {
diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c
index 0c905bd0fab4..027416eb2f50 100644
--- a/sound/soc/sof/ipc4-pcm.c
+++ b/sound/soc/sof/ipc4-pcm.c
@@ -708,6 +708,9 @@ static int sof_ipc4_pcm_hw_params(struct snd_soc_component *component,
struct snd_sof_pcm *spcm;
spcm = snd_sof_find_spcm_dai(component, rtd);
+ if (!spcm)
+ return -EINVAL;
+
time_info = spcm->stream[substream->stream].private;
/* delay calculation is not supported by current fw_reg ABI */
if (!time_info)
diff --git a/tools/arch/x86/include/uapi/asm/unistd_32.h b/tools/arch/x86/include/uapi/asm/unistd_32.h
index bc48a4dabe5d..4798f9d18fe8 100644
--- a/tools/arch/x86/include/uapi/asm/unistd_32.h
+++ b/tools/arch/x86/include/uapi/asm/unistd_32.h
@@ -26,3 +26,6 @@
#ifndef __NR_setns
#define __NR_setns 346
#endif
+#ifndef __NR_seccomp
+#define __NR_seccomp 354
+#endif
diff --git a/tools/arch/x86/include/uapi/asm/unistd_64.h b/tools/arch/x86/include/uapi/asm/unistd_64.h
index f70d2cada256..d0f2043d7132 100644
--- a/tools/arch/x86/include/uapi/asm/unistd_64.h
+++ b/tools/arch/x86/include/uapi/asm/unistd_64.h
@@ -26,3 +26,6 @@
#ifndef __NR_getcpu
#define __NR_getcpu 309
#endif
+#ifndef __NR_seccomp
+#define __NR_seccomp 317
+#endif
diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
index 9d36c8ce1fe7..1684216e826a 100644
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -42,6 +42,18 @@
# define __always_inline inline __attribute__((always_inline))
#endif
+#ifndef __always_unused
+#define __always_unused __attribute__((__unused__))
+#endif
+
+#ifndef __noreturn
+#define __noreturn __attribute__((__noreturn__))
+#endif
+
+#ifndef unreachable
+#define unreachable() __builtin_unreachable()
+#endif
+
#ifndef noinline
#define noinline
#endif
@@ -190,4 +202,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
#define ___PASTE(a, b) a##b
#define __PASTE(a, b) ___PASTE(a, b)
+#ifndef OPTIMIZER_HIDE_VAR
+/* Make the optimizer believe the variable can be manipulated arbitrarily. */
+#define OPTIMIZER_HIDE_VAR(var) \
+ __asm__ ("" : "=r" (var) : "0" (var))
+#endif
+
#endif /* _TOOLS_LINUX_COMPILER_H */
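OPTIMIZER_HIDE_VAR() routes the variable through an empty asm whose output constraint aliases the input, so the compiler must assume the value may have been rewritten and cannot prove the surrounding stores dead. A minimal sketch of how a caller might use it, assuming the tools/include path is on the include path; the wipe() helper is illustrative, not part of the patch, and mirrors the streaming-mode-safe memset added to the arm64 signal tests later in this series:

#include <linux/compiler.h>
#include <stddef.h>

/*
 * Zero a buffer without letting the compiler fold the loop into a
 * call to memset(): the empty asm claims to rewrite buf[0] on every
 * iteration, so the stores cannot be recognised as a pattern fill
 * or elided as dead.
 */
static void wipe(char *buf, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++) {
		buf[i] = 0;
		OPTIMIZER_HIDE_VAR(buf[0]);
	}
}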
diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
index 0f158dc8139b..07bbc449329e 100644
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -1,5 +1,6 @@
perf-y += sched-messaging.o
perf-y += sched-pipe.o
+perf-y += sched-seccomp-notify.o
perf-y += syscall.o
perf-y += mem-functions.o
perf-y += futex-hash.o
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index 0d2b65976212..a0625c77bea3 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -21,6 +21,7 @@ extern struct timeval bench__start, bench__end, bench__runtime;
int bench_numa(int argc, const char **argv);
int bench_sched_messaging(int argc, const char **argv);
int bench_sched_pipe(int argc, const char **argv);
+int bench_sched_seccomp_notify(int argc, const char **argv);
int bench_syscall_basic(int argc, const char **argv);
int bench_syscall_getpgid(int argc, const char **argv);
int bench_syscall_fork(int argc, const char **argv);
diff --git a/tools/perf/bench/sched-seccomp-notify.c b/tools/perf/bench/sched-seccomp-notify.c
new file mode 100644
index 000000000000..b04ebcde4036
--- /dev/null
+++ b/tools/perf/bench/sched-seccomp-notify.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <subcmd/parse-options.h>
+#include "bench.h"
+
+#include <uapi/linux/filter.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <linux/unistd.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <linux/time64.h>
+#include <linux/seccomp.h>
+#include <sys/prctl.h>
+
+#include <unistd.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <string.h>
+#include <errno.h>
+#include <err.h>
+#include <inttypes.h>
+
+#define LOOPS_DEFAULT 1000000UL
+static uint64_t loops = LOOPS_DEFAULT;
+static bool sync_mode;
+
+static const struct option options[] = {
+ OPT_U64('l', "loop", &loops, "Specify number of loops"),
+ OPT_BOOLEAN('s', "sync-mode", &sync_mode,
+ "Enable the synchronious mode for seccomp notifications"),
+ OPT_END()
+};
+
+static const char * const bench_seccomp_usage[] = {
+ "perf bench sched secccomp-notify <options>",
+ NULL
+};
+
+static int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+ return syscall(__NR_seccomp, op, flags, args);
+}
+
+static int user_notif_syscall(int nr, unsigned int flags)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+
+ return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
+}
+
+#define USER_NOTIF_MAGIC INT_MAX
+static void user_notification_sync_loop(int listener)
+{
+ struct seccomp_notif_resp resp;
+ struct seccomp_notif req;
+ uint64_t nr;
+
+ for (nr = 0; nr < loops; nr++) {
+ memset(&req, 0, sizeof(req));
+ if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req))
+ err(EXIT_FAILURE, "SECCOMP_IOCTL_NOTIF_RECV failed");
+
+ if (req.data.nr != __NR_gettid)
+ errx(EXIT_FAILURE, "unexpected syscall: %d", req.data.nr);
+
+ resp.id = req.id;
+ resp.error = 0;
+ resp.val = USER_NOTIF_MAGIC;
+ resp.flags = 0;
+ if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp))
+ err(EXIT_FAILURE, "SECCOMP_IOCTL_NOTIF_SEND failed");
+ }
+}
+
+#ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP
+#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
+#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64)
+#endif
+int bench_sched_seccomp_notify(int argc, const char **argv)
+{
+ struct timeval start, stop, diff;
+ unsigned long long result_usec = 0;
+ int status, listener;
+ pid_t pid;
+ long ret;
+
+ argc = parse_options(argc, argv, options, bench_seccomp_usage, 0);
+
+ gettimeofday(&start, NULL);
+
+ prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ listener = user_notif_syscall(__NR_gettid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ if (listener < 0)
+ err(EXIT_FAILURE, "can't create a notification descriptor");
+
+ pid = fork();
+ if (pid < 0)
+ err(EXIT_FAILURE, "fork");
+ if (pid == 0) {
+ if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0))
+ err(EXIT_FAILURE, "can't set the parent death signal");
+ while (1) {
+ ret = syscall(__NR_gettid);
+ if (ret == USER_NOTIF_MAGIC)
+ continue;
+ break;
+ }
+ _exit(1);
+ }
+
+ if (sync_mode) {
+ if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
+ SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0))
+ err(EXIT_FAILURE,
+ "can't set SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP");
+ }
+ user_notification_sync_loop(listener);
+
+ kill(pid, SIGKILL);
+ if (waitpid(pid, &status, 0) != pid)
+ err(EXIT_FAILURE, "waitpid(%d) failed", pid);
+ if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL)
+ errx(EXIT_FAILURE, "unexpected exit code: %d", status);
+
+ gettimeofday(&stop, NULL);
+ timersub(&stop, &start, &diff);
+
+ switch (bench_format) {
+ case BENCH_FORMAT_DEFAULT:
+ printf("# Executed %" PRIu64 " system calls\n\n",
+ loops);
+
+ result_usec = diff.tv_sec * USEC_PER_SEC;
+ result_usec += diff.tv_usec;
+
+ printf(" %14s: %lu.%03lu [sec]\n\n", "Total time",
+ (unsigned long) diff.tv_sec,
+ (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
+
+ printf(" %14lf usecs/op\n",
+ (double)result_usec / (double)loops);
+ printf(" %14d ops/sec\n",
+ (int)((double)loops /
+ ((double)result_usec / (double)USEC_PER_SEC)));
+ break;
+
+ case BENCH_FORMAT_SIMPLE:
+ printf("%lu.%03lu\n",
+ (unsigned long) diff.tv_sec,
+ (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
+ break;
+
+ default:
+ /* reaching here is a bug: unknown format */
+ fprintf(stderr, "Unknown format:%d\n", bench_format);
+ exit(1);
+ break;
+ }
+
+ return 0;
+}
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index db435b791a09..5033e8bab276 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -47,6 +47,7 @@ static struct bench numa_benchmarks[] = {
static struct bench sched_benchmarks[] = {
{ "messaging", "Benchmark for scheduling and IPC", bench_sched_messaging },
{ "pipe", "Benchmark for pipe() between two processes", bench_sched_pipe },
+ { "seccomp-notify", "Benchmark for seccomp user notify", bench_sched_seccomp_notify},
{ "all", "Run all scheduler benchmarks", NULL },
{ NULL, NULL, NULL }
};
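Once wired into sched_benchmarks[], the new entry is invoked as "perf bench sched seccomp-notify"; per the option table in the benchmark source above, -l/--loop sets the iteration count (default 1000000) and -s/--sync-mode opts the listener into the synchronous wake-up path added by SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP.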
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 8a36ba5df9f9..9a10512e3407 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -5447,7 +5447,7 @@ unsigned int intel_model_duplicates(unsigned int model)
case INTEL_FAM6_LAKEFIELD:
case INTEL_FAM6_ALDERLAKE:
case INTEL_FAM6_ALDERLAKE_L:
- case INTEL_FAM6_ALDERLAKE_N:
+ case INTEL_FAM6_ATOM_GRACEMONT:
case INTEL_FAM6_RAPTORLAKE:
case INTEL_FAM6_RAPTORLAKE_P:
case INTEL_FAM6_RAPTORLAKE_S:
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 666b56f22a41..8dca8acdb671 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -18,6 +18,7 @@ TARGETS += drivers/net/bonding
TARGETS += drivers/net/team
TARGETS += efivarfs
TARGETS += exec
+TARGETS += fchmodat2
TARGETS += filesystems
TARGETS += filesystems/binderfs
TARGETS += filesystems/epoll
diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile
index ace8b67fb22d..28b93cab8c0d 100644
--- a/tools/testing/selftests/arm64/Makefile
+++ b/tools/testing/selftests/arm64/Makefile
@@ -19,6 +19,8 @@ CFLAGS += -I$(top_srcdir)/tools/testing/selftests/
CFLAGS += $(KHDR_INCLUDES)
+CFLAGS += -I$(top_srcdir)/tools/include
+
export CFLAGS
export top_srcdir
diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index d4ad813fed10..e3d262831d91 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -19,19 +19,38 @@
#include "../../kselftest.h"
-#define TESTS_PER_HWCAP 2
+#define TESTS_PER_HWCAP 3
/*
- * Function expected to generate SIGILL when the feature is not
- * supported and return when it is supported. If SIGILL is generated
- * then the handler must be able to skip over the instruction safely.
+ * Function expected to generate an exception when the feature is
+ * not supported, and to return when it is supported. If the specific
+ * exception is generated then the handler must be able to skip over
+ * the instruction safely.
*
* Note that it is expected that for many architecture extensions
* there are no specific traps due to no architecture state being
* added so we may not fault if running on a kernel which doesn't know
* to add the hwcap.
*/
-typedef void (*sigill_fn)(void);
+typedef void (*sig_fn)(void);
+
+static void aes_sigill(void)
+{
+ /* AESE V0.16B, V0.16B */
+ asm volatile(".inst 0x4e284800" : : : );
+}
+
+static void atomics_sigill(void)
+{
+ /* STADD W0, [SP] */
+ asm volatile(".inst 0xb82003ff" : : : );
+}
+
+static void crc32_sigill(void)
+{
+ /* CRC32W W0, W0, W1 */
+ asm volatile(".inst 0x1ac14800" : : : );
+}
static void cssc_sigill(void)
{
@@ -39,6 +58,29 @@ static void cssc_sigill(void)
asm volatile(".inst 0xdac01c00" : : : "x0");
}
+static void fp_sigill(void)
+{
+ asm volatile("fmov s0, #1");
+}
+
+static void ilrcpc_sigill(void)
+{
+ /* LDAPUR W0, [SP, #8] */
+ asm volatile(".inst 0x994083e0" : : : );
+}
+
+static void jscvt_sigill(void)
+{
+ /* FJCVTZS W0, D0 */
+ asm volatile(".inst 0x1e7e0000" : : : );
+}
+
+static void lrcpc_sigill(void)
+{
+ /* LDAPR W0, [SP, #0] */
+ asm volatile(".inst 0xb8bfc3e0" : : : );
+}
+
static void mops_sigill(void)
{
char dst[1], src[1];
@@ -53,11 +95,35 @@ static void mops_sigill(void)
: "cc", "memory");
}
+static void pmull_sigill(void)
+{
+ /* PMULL V0.1Q, V0.1D, V0.1D */
+ asm volatile(".inst 0x0ee0e000" : : : );
+}
+
static void rng_sigill(void)
{
asm volatile("mrs x0, S3_3_C2_C4_0" : : : "x0");
}
+static void sha1_sigill(void)
+{
+ /* SHA1H S0, S0 */
+ asm volatile(".inst 0x5e280800" : : : );
+}
+
+static void sha2_sigill(void)
+{
+ /* SHA256H Q0, Q0, V0.4S */
+ asm volatile(".inst 0x5e004000" : : : );
+}
+
+static void sha512_sigill(void)
+{
+ /* SHA512H Q0, Q0, V0.2D */
+ asm volatile(".inst 0xce608000" : : : );
+}
+
static void sme_sigill(void)
{
/* RDSVL x0, #0 */
@@ -208,15 +274,46 @@ static void svebf16_sigill(void)
asm volatile(".inst 0x658aa000" : : : "z0");
}
+static void hbc_sigill(void)
+{
+ /* BC.EQ +4 */
+ asm volatile("cmp xzr, xzr\n"
+ ".inst 0x54000030" : : : "cc");
+}
+
+static void uscat_sigbus(void)
+{
+ /* unaligned atomic access */
+ asm volatile("ADD x1, sp, #2" : : : );
+ /* STADD W0, [X1] */
+ asm volatile(".inst 0xb820003f" : : : );
+}
+
static const struct hwcap_data {
const char *name;
unsigned long at_hwcap;
unsigned long hwcap_bit;
const char *cpuinfo;
- sigill_fn sigill_fn;
+ sig_fn sigill_fn;
bool sigill_reliable;
+ sig_fn sigbus_fn;
+ bool sigbus_reliable;
} hwcaps[] = {
{
+ .name = "AES",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_AES,
+ .cpuinfo = "aes",
+ .sigill_fn = aes_sigill,
+ },
+ {
+ .name = "CRC32",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_CRC32,
+ .cpuinfo = "crc32",
+ .sigill_fn = crc32_sigill,
+ },
+ {
.name = "CSSC",
.at_hwcap = AT_HWCAP2,
.hwcap_bit = HWCAP2_CSSC,
@@ -224,6 +321,50 @@ static const struct hwcap_data {
.sigill_fn = cssc_sigill,
},
{
+ .name = "FP",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_FP,
+ .cpuinfo = "fp",
+ .sigill_fn = fp_sigill,
+ },
+ {
+ .name = "JSCVT",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_JSCVT,
+ .cpuinfo = "jscvt",
+ .sigill_fn = jscvt_sigill,
+ },
+ {
+ .name = "LRCPC",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_LRCPC,
+ .cpuinfo = "lrcpc",
+ .sigill_fn = lrcpc_sigill,
+ },
+ {
+ .name = "LRCPC2",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_ILRCPC,
+ .cpuinfo = "ilrcpc",
+ .sigill_fn = ilrcpc_sigill,
+ },
+ {
+ .name = "LSE",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_ATOMICS,
+ .cpuinfo = "atomics",
+ .sigill_fn = atomics_sigill,
+ },
+ {
+ .name = "LSE2",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_USCAT,
+ .cpuinfo = "uscat",
+ .sigill_fn = atomics_sigill,
+ .sigbus_fn = uscat_sigbus,
+ .sigbus_reliable = true,
+ },
+ {
.name = "MOPS",
.at_hwcap = AT_HWCAP2,
.hwcap_bit = HWCAP2_MOPS,
@@ -232,6 +373,13 @@ static const struct hwcap_data {
.sigill_reliable = true,
},
{
+ .name = "PMULL",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_PMULL,
+ .cpuinfo = "pmull",
+ .sigill_fn = pmull_sigill,
+ },
+ {
.name = "RNG",
.at_hwcap = AT_HWCAP2,
.hwcap_bit = HWCAP2_RNG,
@@ -245,6 +393,27 @@ static const struct hwcap_data {
.cpuinfo = "rprfm",
},
{
+ .name = "SHA1",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_SHA1,
+ .cpuinfo = "sha1",
+ .sigill_fn = sha1_sigill,
+ },
+ {
+ .name = "SHA2",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_SHA2,
+ .cpuinfo = "sha2",
+ .sigill_fn = sha2_sigill,
+ },
+ {
+ .name = "SHA512",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_SHA512,
+ .cpuinfo = "sha512",
+ .sigill_fn = sha512_sigill,
+ },
+ {
.name = "SME",
.at_hwcap = AT_HWCAP2,
.hwcap_bit = HWCAP2_SME,
@@ -386,20 +555,32 @@ static const struct hwcap_data {
.hwcap_bit = HWCAP2_SVE_EBF16,
.cpuinfo = "sveebf16",
},
+ {
+ .name = "HBC",
+ .at_hwcap = AT_HWCAP2,
+ .hwcap_bit = HWCAP2_HBC,
+ .cpuinfo = "hbc",
+ .sigill_fn = hbc_sigill,
+ .sigill_reliable = true,
+ },
};
-static bool seen_sigill;
-
-static void handle_sigill(int sig, siginfo_t *info, void *context)
-{
- ucontext_t *uc = context;
-
- seen_sigill = true;
-
- /* Skip over the offending instruction */
- uc->uc_mcontext.pc += 4;
+typedef void (*sighandler_fn)(int, siginfo_t *, void *);
+
+#define DEF_SIGHANDLER_FUNC(SIG, NUM) \
+static bool seen_##SIG; \
+static void handle_##SIG(int sig, siginfo_t *info, void *context) \
+{ \
+ ucontext_t *uc = context; \
+ \
+ seen_##SIG = true; \
+ /* Skip over the offending instruction */ \
+ uc->uc_mcontext.pc += 4; \
}
+DEF_SIGHANDLER_FUNC(sigill, SIGILL);
+DEF_SIGHANDLER_FUNC(sigbus, SIGBUS);
+
bool cpuinfo_present(const char *name)
{
FILE *f;
@@ -442,24 +623,77 @@ bool cpuinfo_present(const char *name)
return false;
}
-int main(void)
+static int install_sigaction(int signum, sighandler_fn handler)
{
- const struct hwcap_data *hwcap;
- int i, ret;
- bool have_cpuinfo, have_hwcap;
+ int ret;
struct sigaction sa;
- ksft_print_header();
- ksft_set_plan(ARRAY_SIZE(hwcaps) * TESTS_PER_HWCAP);
-
memset(&sa, 0, sizeof(sa));
- sa.sa_sigaction = handle_sigill;
+ sa.sa_sigaction = handler;
sa.sa_flags = SA_RESTART | SA_SIGINFO;
sigemptyset(&sa.sa_mask);
- ret = sigaction(SIGILL, &sa, NULL);
+ ret = sigaction(signum, &sa, NULL);
if (ret < 0)
- ksft_exit_fail_msg("Failed to install SIGILL handler: %s (%d)\n",
+ ksft_exit_fail_msg("Failed to install SIGNAL handler: %s (%d)\n",
+ strerror(errno), errno);
+
+ return ret;
+}
+
+static void uninstall_sigaction(int signum)
+{
+ if (sigaction(signum, NULL, NULL) < 0)
+ ksft_exit_fail_msg("Failed to uninstall SIGNAL handler: %s (%d)\n",
strerror(errno), errno);
+}
+
+#define DEF_INST_RAISE_SIG(SIG, NUM) \
+static bool inst_raise_##SIG(const struct hwcap_data *hwcap, \
+ bool have_hwcap) \
+{ \
+ if (!hwcap->SIG##_fn) { \
+ ksft_test_result_skip(#SIG"_%s\n", hwcap->name); \
+ /* assume that it would raise the exception by default */ \
+ return true; \
+ } \
+ \
+ install_sigaction(NUM, handle_##SIG); \
+ \
+ seen_##SIG = false; \
+ hwcap->SIG##_fn(); \
+ \
+ if (have_hwcap) { \
+ /* Should be able to use the extension */ \
+ ksft_test_result(!seen_##SIG, \
+ #SIG"_%s\n", hwcap->name); \
+ } else if (hwcap->SIG##_reliable) { \
+ /* Guaranteed a SIGNAL */ \
+ ksft_test_result(seen_##SIG, \
+ #SIG"_%s\n", hwcap->name); \
+ } else { \
+ /* Missing SIGNAL might be fine */ \
+ ksft_print_msg(#SIG"_%sreported for %s\n", \
+ seen_##SIG ? "" : "not ", \
+ hwcap->name); \
+ ksft_test_result_skip(#SIG"_%s\n", \
+ hwcap->name); \
+ } \
+ \
+ uninstall_sigaction(NUM); \
+ return seen_##SIG; \
+}
+
+DEF_INST_RAISE_SIG(sigill, SIGILL);
+DEF_INST_RAISE_SIG(sigbus, SIGBUS);
+
+int main(void)
+{
+ int i;
+ const struct hwcap_data *hwcap;
+ bool have_cpuinfo, have_hwcap, raise_sigill;
+
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(hwcaps) * TESTS_PER_HWCAP);
for (i = 0; i < ARRAY_SIZE(hwcaps); i++) {
hwcap = &hwcaps[i];
@@ -473,30 +707,15 @@ int main(void)
ksft_test_result(have_hwcap == have_cpuinfo,
"cpuinfo_match_%s\n", hwcap->name);
- if (hwcap->sigill_fn) {
- seen_sigill = false;
- hwcap->sigill_fn();
-
- if (have_hwcap) {
- /* Should be able to use the extension */
- ksft_test_result(!seen_sigill, "sigill_%s\n",
- hwcap->name);
- } else if (hwcap->sigill_reliable) {
- /* Guaranteed a SIGILL */
- ksft_test_result(seen_sigill, "sigill_%s\n",
- hwcap->name);
- } else {
- /* Missing SIGILL might be fine */
- ksft_print_msg("SIGILL %sreported for %s\n",
- seen_sigill ? "" : "not ",
- hwcap->name);
- ksft_test_result_skip("sigill_%s\n",
- hwcap->name);
- }
- } else {
- ksft_test_result_skip("sigill_%s\n",
- hwcap->name);
- }
+ /*
+ * Testing for SIGBUS only makes sense after making sure
+ * that the instruction does not raise a SIGILL.
+ */
+ raise_sigill = inst_raise_sigill(hwcap, have_hwcap);
+ if (!raise_sigill)
+ inst_raise_sigbus(hwcap, have_hwcap);
+ else
+ ksft_test_result_skip("sigbus_%s\n", hwcap->name);
}
ksft_print_cnts();
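The DEF_SIGHANDLER_FUNC()/DEF_INST_RAISE_SIG() machinery above boils down to a simple probe: install a handler that records the signal and advances the PC past the 4-byte faulting instruction, execute the candidate encoding, then inspect the flag. A standalone sketch, assuming an arm64 target; it reuses the CSSC CNT encoding from the table above purely as an example:

#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <ucontext.h>

static volatile bool seen_sigill;

static void handle_sigill(int sig, siginfo_t *info, void *context)
{
	ucontext_t *uc = context;

	(void)sig;
	(void)info;
	seen_sigill = true;
	uc->uc_mcontext.pc += 4;	/* step past the faulting instruction */
}

int main(void)
{
	struct sigaction sa = {
		.sa_sigaction = handle_sigill,
		.sa_flags = SA_SIGINFO,
	};

	sigaction(SIGILL, &sa, NULL);

	/* CNT X0, X0 -- SIGILLs unless FEAT_CSSC is implemented */
	asm volatile(".inst 0xdac01c00" : : : "x0");

	printf("CSSC %ssupported\n", seen_sigill ? "not " : "");
	return 0;
}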
diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
index 18cc123e2347..d704511a0955 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi.c
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -20,12 +20,20 @@
#include "syscall-abi.h"
+/*
+ * The kernel defines a much larger SVE_VQ_MAX than is expressible
+ * in the architecture; this creates a *lot* of overhead filling the
+ * buffers (especially ZA) on emulated platforms, so use the actual
+ * architectural maximum instead.
+ */
+#define ARCH_SVE_VQ_MAX 16
+
static int default_sme_vl;
static int sve_vl_count;
-static unsigned int sve_vls[SVE_VQ_MAX];
+static unsigned int sve_vls[ARCH_SVE_VQ_MAX];
static int sme_vl_count;
-static unsigned int sme_vls[SVE_VQ_MAX];
+static unsigned int sme_vls[ARCH_SVE_VQ_MAX];
extern void do_syscall(int sve_vl, int sme_vl);
@@ -130,9 +138,9 @@ static int check_fpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
#define SVE_Z_SHARED_BYTES (128 / 8)
-static uint8_t z_zero[__SVE_ZREG_SIZE(SVE_VQ_MAX)];
-uint8_t z_in[SVE_NUM_ZREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
-uint8_t z_out[SVE_NUM_ZREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
+static uint8_t z_zero[__SVE_ZREG_SIZE(ARCH_SVE_VQ_MAX)];
+uint8_t z_in[SVE_NUM_ZREGS * __SVE_ZREG_SIZE(ARCH_SVE_VQ_MAX)];
+uint8_t z_out[SVE_NUM_ZREGS * __SVE_ZREG_SIZE(ARCH_SVE_VQ_MAX)];
static void setup_z(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
uint64_t svcr)
@@ -190,8 +198,8 @@ static int check_z(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
return errors;
}
-uint8_t p_in[SVE_NUM_PREGS * __SVE_PREG_SIZE(SVE_VQ_MAX)];
-uint8_t p_out[SVE_NUM_PREGS * __SVE_PREG_SIZE(SVE_VQ_MAX)];
+uint8_t p_in[SVE_NUM_PREGS * __SVE_PREG_SIZE(ARCH_SVE_VQ_MAX)];
+uint8_t p_out[SVE_NUM_PREGS * __SVE_PREG_SIZE(ARCH_SVE_VQ_MAX)];
static void setup_p(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
uint64_t svcr)
@@ -222,8 +230,8 @@ static int check_p(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
return errors;
}
-uint8_t ffr_in[__SVE_PREG_SIZE(SVE_VQ_MAX)];
-uint8_t ffr_out[__SVE_PREG_SIZE(SVE_VQ_MAX)];
+uint8_t ffr_in[__SVE_PREG_SIZE(ARCH_SVE_VQ_MAX)];
+uint8_t ffr_out[__SVE_PREG_SIZE(ARCH_SVE_VQ_MAX)];
static void setup_ffr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
uint64_t svcr)
@@ -300,8 +308,8 @@ static int check_svcr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
return errors;
}
-uint8_t za_in[ZA_SIG_REGS_SIZE(SVE_VQ_MAX)];
-uint8_t za_out[ZA_SIG_REGS_SIZE(SVE_VQ_MAX)];
+uint8_t za_in[ZA_SIG_REGS_SIZE(ARCH_SVE_VQ_MAX)];
+uint8_t za_out[ZA_SIG_REGS_SIZE(ARCH_SVE_VQ_MAX)];
static void setup_za(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
uint64_t svcr)
@@ -470,9 +478,9 @@ void sve_count_vls(void)
return;
/*
- * Enumerate up to SVE_VQ_MAX vector lengths
+ * Enumerate up to ARCH_SVE_VQ_MAX vector lengths
*/
- for (vq = SVE_VQ_MAX; vq > 0; vq /= 2) {
+ for (vq = ARCH_SVE_VQ_MAX; vq > 0; vq /= 2) {
vl = prctl(PR_SVE_SET_VL, vq * 16);
if (vl == -1)
ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n",
@@ -496,9 +504,9 @@ void sme_count_vls(void)
return;
/*
- * Enumerate up to SVE_VQ_MAX vector lengths
+ * Enumerate up to ARCH_SVE_VQ_MAX vector lengths
*/
- for (vq = SVE_VQ_MAX; vq > 0; vq /= 2) {
+ for (vq = ARCH_SVE_VQ_MAX; vq > 0; vq /= 2) {
vl = prctl(PR_SME_SET_VL, vq * 16);
if (vl == -1)
ksft_exit_fail_msg("PR_SME_SET_VL failed: %s (%d)\n",
diff --git a/tools/testing/selftests/arm64/bti/Makefile b/tools/testing/selftests/arm64/bti/Makefile
index ccdac414ad94..05e4ee523a53 100644
--- a/tools/testing/selftests/arm64/bti/Makefile
+++ b/tools/testing/selftests/arm64/bti/Makefile
@@ -2,8 +2,6 @@
TEST_GEN_PROGS := btitest nobtitest
-PROGS := $(patsubst %,gen/%,$(TEST_GEN_PROGS))
-
# These tests are built as freestanding binaries since otherwise BTI
# support in ld.so is required which is not currently widespread; when
# it is available it will still be useful to test this separately as the
@@ -18,44 +16,41 @@ CFLAGS_COMMON = -ffreestanding -Wall -Wextra $(CFLAGS)
BTI_CC_COMMAND = $(CC) $(CFLAGS_BTI) $(CFLAGS_COMMON) -c -o $@ $<
NOBTI_CC_COMMAND = $(CC) $(CFLAGS_NOBTI) $(CFLAGS_COMMON) -c -o $@ $<
-%-bti.o: %.c
+$(OUTPUT)/%-bti.o: %.c
$(BTI_CC_COMMAND)
-%-bti.o: %.S
+$(OUTPUT)/%-bti.o: %.S
$(BTI_CC_COMMAND)
-%-nobti.o: %.c
+$(OUTPUT)/%-nobti.o: %.c
$(NOBTI_CC_COMMAND)
-%-nobti.o: %.S
+$(OUTPUT)/%-nobti.o: %.S
$(NOBTI_CC_COMMAND)
BTI_OBJS = \
- test-bti.o \
- signal-bti.o \
- start-bti.o \
- syscall-bti.o \
- system-bti.o \
- teststubs-bti.o \
- trampoline-bti.o
-gen/btitest: $(BTI_OBJS)
+ $(OUTPUT)/test-bti.o \
+ $(OUTPUT)/signal-bti.o \
+ $(OUTPUT)/start-bti.o \
+ $(OUTPUT)/syscall-bti.o \
+ $(OUTPUT)/system-bti.o \
+ $(OUTPUT)/teststubs-bti.o \
+ $(OUTPUT)/trampoline-bti.o
+$(OUTPUT)/btitest: $(BTI_OBJS)
$(CC) $(CFLAGS_BTI) $(CFLAGS_COMMON) -nostdlib -static -o $@ $^
NOBTI_OBJS = \
- test-nobti.o \
- signal-nobti.o \
- start-nobti.o \
- syscall-nobti.o \
- system-nobti.o \
- teststubs-nobti.o \
- trampoline-nobti.o
-gen/nobtitest: $(NOBTI_OBJS)
+ $(OUTPUT)/test-nobti.o \
+ $(OUTPUT)/signal-nobti.o \
+ $(OUTPUT)/start-nobti.o \
+ $(OUTPUT)/syscall-nobti.o \
+ $(OUTPUT)/system-nobti.o \
+ $(OUTPUT)/teststubs-nobti.o \
+ $(OUTPUT)/trampoline-nobti.o
+$(OUTPUT)/nobtitest: $(NOBTI_OBJS)
$(CC) $(CFLAGS_BTI) $(CFLAGS_COMMON) -nostdlib -static -o $@ $^
# Including KSFT lib.mk here will also mangle the TEST_GEN_PROGS list
# to account for any OUTPUT target-dirs optionally provided by
# the toplevel makefile
include ../../lib.mk
-
-$(TEST_GEN_PROGS): $(PROGS)
- cp $(PROGS) $(OUTPUT)/
diff --git a/tools/testing/selftests/arm64/bti/compiler.h b/tools/testing/selftests/arm64/bti/compiler.h
deleted file mode 100644
index ebb6204f447a..000000000000
--- a/tools/testing/selftests/arm64/bti/compiler.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2019 Arm Limited
- * Original author: Dave Martin <Dave.Martin@arm.com>
- */
-
-#ifndef COMPILER_H
-#define COMPILER_H
-
-#define __always_unused __attribute__((__unused__))
-#define __noreturn __attribute__((__noreturn__))
-#define __unreachable() __builtin_unreachable()
-
-/* curse(e) has value e, but the compiler cannot assume so */
-#define curse(e) ({ \
- __typeof__(e) __curse_e = (e); \
- asm ("" : "+r" (__curse_e)); \
- __curse_e; \
-})
-
-#endif /* ! COMPILER_H */
diff --git a/tools/testing/selftests/arm64/bti/gen/.gitignore b/tools/testing/selftests/arm64/bti/gen/.gitignore
deleted file mode 100644
index 73869fabada4..000000000000
--- a/tools/testing/selftests/arm64/bti/gen/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-btitest
-nobtitest
diff --git a/tools/testing/selftests/arm64/bti/system.c b/tools/testing/selftests/arm64/bti/system.c
index 6385d8d4973b..93d772b00bfe 100644
--- a/tools/testing/selftests/arm64/bti/system.c
+++ b/tools/testing/selftests/arm64/bti/system.c
@@ -8,12 +8,10 @@
#include <asm/unistd.h>
-#include "compiler.h"
-
void __noreturn exit(int n)
{
syscall(__NR_exit, n);
- __unreachable();
+ unreachable();
}
ssize_t write(int fd, const void *buf, size_t size)
diff --git a/tools/testing/selftests/arm64/bti/system.h b/tools/testing/selftests/arm64/bti/system.h
index aca118589705..2e9ee1284a0c 100644
--- a/tools/testing/selftests/arm64/bti/system.h
+++ b/tools/testing/selftests/arm64/bti/system.h
@@ -14,12 +14,12 @@ typedef __kernel_size_t size_t;
typedef __kernel_ssize_t ssize_t;
#include <linux/errno.h>
+#include <linux/compiler.h>
+
#include <asm/hwcap.h>
#include <asm/ptrace.h>
#include <asm/unistd.h>
-#include "compiler.h"
-
long syscall(int nr, ...);
void __noreturn exit(int n);
diff --git a/tools/testing/selftests/arm64/bti/test.c b/tools/testing/selftests/arm64/bti/test.c
index 2cd8dcee5aec..28a8e8a28a84 100644
--- a/tools/testing/selftests/arm64/bti/test.c
+++ b/tools/testing/selftests/arm64/bti/test.c
@@ -17,7 +17,6 @@
typedef struct ucontext ucontext_t;
#include "btitest.h"
-#include "compiler.h"
#include "signal.h"
#define EXPECTED_TESTS 18
diff --git a/tools/testing/selftests/arm64/fp/vec-syscfg.c b/tools/testing/selftests/arm64/fp/vec-syscfg.c
index 9bcfcdc34ee9..5f648b97a06f 100644
--- a/tools/testing/selftests/arm64/fp/vec-syscfg.c
+++ b/tools/testing/selftests/arm64/fp/vec-syscfg.c
@@ -6,6 +6,7 @@
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
+#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
@@ -39,9 +40,11 @@ struct vec_data {
int max_vl;
};
+#define VEC_SVE 0
+#define VEC_SME 1
static struct vec_data vec_data[] = {
- {
+ [VEC_SVE] = {
.name = "SVE",
.hwcap_type = AT_HWCAP,
.hwcap = HWCAP_SVE,
@@ -51,7 +54,7 @@ static struct vec_data vec_data[] = {
.prctl_set = PR_SVE_SET_VL,
.default_vl_file = "/proc/sys/abi/sve_default_vector_length",
},
- {
+ [VEC_SME] = {
.name = "SME",
.hwcap_type = AT_HWCAP2,
.hwcap = HWCAP2_SME,
@@ -551,7 +554,8 @@ static void prctl_set_onexec(struct vec_data *data)
/* For each VQ verify that setting via prctl() does the right thing */
static void prctl_set_all_vqs(struct vec_data *data)
{
- int ret, vq, vl, new_vl;
+ int ret, vq, vl, new_vl, i;
+ int orig_vls[ARRAY_SIZE(vec_data)];
int errors = 0;
if (!data->min_vl || !data->max_vl) {
@@ -560,6 +564,9 @@ static void prctl_set_all_vqs(struct vec_data *data)
return;
}
+ for (i = 0; i < ARRAY_SIZE(vec_data); i++)
+ orig_vls[i] = vec_data[i].rdvl();
+
for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) {
vl = sve_vl_from_vq(vq);
@@ -582,6 +589,22 @@ static void prctl_set_all_vqs(struct vec_data *data)
errors++;
}
+ /* Did any other VLs change? */
+ for (i = 0; i < ARRAY_SIZE(vec_data); i++) {
+ if (&vec_data[i] == data)
+ continue;
+
+ if (!(getauxval(vec_data[i].hwcap_type) & vec_data[i].hwcap))
+ continue;
+
+ if (vec_data[i].rdvl() != orig_vls[i]) {
+ ksft_print_msg("%s VL changed from %d to %d\n",
+ vec_data[i].name, orig_vls[i],
+ vec_data[i].rdvl());
+ errors++;
+ }
+ }
+
/* Was that the VL we asked for? */
if (new_vl == vl)
continue;
@@ -644,18 +667,107 @@ static const test_type tests[] = {
prctl_set_all_vqs,
};
+static inline void smstart(void)
+{
+ asm volatile("msr S0_3_C4_C7_3, xzr");
+}
+
+static inline void smstart_sm(void)
+{
+ asm volatile("msr S0_3_C4_C3_3, xzr");
+}
+
+static inline void smstop(void)
+{
+ asm volatile("msr S0_3_C4_C6_3, xzr");
+}
+
+
+/*
+ * Verify we can change the SVE vector length while SME is active and
+ * continue to use SME afterwards.
+ */
+static void change_sve_with_za(void)
+{
+ struct vec_data *sve_data = &vec_data[VEC_SVE];
+ bool pass = true;
+ int ret, i;
+
+ if (sve_data->min_vl == sve_data->max_vl) {
+ ksft_print_msg("Only one SVE VL supported, can't change\n");
+ ksft_test_result_skip("change_sve_while_sme\n");
+ return;
+ }
+
+ /* Ensure we will trigger a change when we set the maximum */
+ ret = prctl(sve_data->prctl_set, sve_data->min_vl);
+ if (ret != sve_data->min_vl) {
+ ksft_print_msg("Failed to set SVE VL %d: %d\n",
+ sve_data->min_vl, ret);
+ pass = false;
+ }
+
+ /* Enable SM and ZA */
+ smstart();
+
+ /* Trigger another VL change */
+ ret = prctl(sve_data->prctl_set, sve_data->max_vl);
+ if (ret != sve_data->max_vl) {
+ ksft_print_msg("Failed to set SVE VL %d: %d\n",
+ sve_data->max_vl, ret);
+ pass = false;
+ }
+
+ /*
+ * Spin for a bit with SM enabled to try to trigger another
+ * save/restore. We can't use syscalls without exiting
+ * streaming mode.
+ */
+ for (i = 0; i < 100000000; i++)
+ smstart_sm();
+
+ /*
+ * TODO: Verify that ZA was preserved over the VL change and
+ * spin.
+ */
+
+ /* Clean up after ourselves */
+ smstop();
+ ret = prctl(sve_data->prctl_set, sve_data->default_vl);
+ if (ret != sve_data->default_vl) {
+ ksft_print_msg("Failed to restore SVE VL %d: %d\n",
+ sve_data->default_vl, ret);
+ pass = false;
+ }
+
+ ksft_test_result(pass, "change_sve_with_za\n");
+}
+
+typedef void (*test_all_type)(void);
+
+static const struct {
+ const char *name;
+ test_all_type test;
+} all_types_tests[] = {
+ { "change_sve_with_za", change_sve_with_za },
+};
+
int main(void)
{
+ bool all_supported = true;
int i, j;
ksft_print_header();
- ksft_set_plan(ARRAY_SIZE(tests) * ARRAY_SIZE(vec_data));
+ ksft_set_plan(ARRAY_SIZE(tests) * ARRAY_SIZE(vec_data) +
+ ARRAY_SIZE(all_types_tests));
for (i = 0; i < ARRAY_SIZE(vec_data); i++) {
struct vec_data *data = &vec_data[i];
unsigned long supported;
supported = getauxval(data->hwcap_type) & data->hwcap;
+ if (!supported)
+ all_supported = false;
for (j = 0; j < ARRAY_SIZE(tests); j++) {
if (supported)
@@ -666,5 +778,12 @@ int main(void)
}
}
+ for (i = 0; i < ARRAY_SIZE(all_types_tests); i++) {
+ if (all_supported)
+ all_types_tests[i].test();
+ else
+ ksft_test_result_skip("%s\n", all_types_tests[i].name);
+ }
+
ksft_exit_pass();
}
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h
index 222093f51b67..762c8fe9c54a 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.h
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h
@@ -8,6 +8,8 @@
#include <stdio.h>
#include <string.h>
+#include <linux/compiler.h>
+
#include "test_signals.h"
int test_init(struct tdescr *td);
@@ -60,13 +62,25 @@ static __always_inline bool get_current_context(struct tdescr *td,
size_t dest_sz)
{
static volatile bool seen_already;
+ int i;
+ char *uc = (char *)dest_uc;
assert(td && dest_uc);
/* it's a genuine invocation..reinit */
seen_already = 0;
td->live_uc_valid = 0;
td->live_sz = dest_sz;
- memset(dest_uc, 0x00, td->live_sz);
+
+ /*
+ * This is a memset() but we don't want the compiler to
+ * optimise it into either instructions or a library call
+ * which might be incompatible with streaming mode.
+ */
+ for (i = 0; i < td->live_sz; i++) {
+ uc[i] = 0;
+ OPTIMIZER_HIDE_VAR(uc[0]);
+ }
+
td->live_uc = dest_uc;
/*
* Grab ucontext_t triggering a SIGTRAP.
@@ -104,6 +118,17 @@ static __always_inline bool get_current_context(struct tdescr *td,
: "memory");
/*
+ * If we were grabbing a streaming mode context then we may
+ * have entered streaming mode behind the system's back and
+ * libc or compiler generated code might decide to do
+ * something invalid in streaming mode, or potentially even
+ * corrupt the state of ZA. Issue an SMSTOP to exit both now
+ * that we have grabbed the state.
+ */
+ if (td->feats_supported & FEAT_SME)
+ asm volatile("msr S0_3_C4_C6_3, xzr");
+
+ /*
* If we get here with seen_already==1 it implies the td->live_uc
* context has been used to get back here....this probably means
* a test has failed to cause a SEGV...anyway live_uc does not
diff --git a/tools/testing/selftests/arm64/signal/testcases/zt_regs.c b/tools/testing/selftests/arm64/signal/testcases/zt_regs.c
index e1eb4d5c027a..2e384d731618 100644
--- a/tools/testing/selftests/arm64/signal/testcases/zt_regs.c
+++ b/tools/testing/selftests/arm64/signal/testcases/zt_regs.c
@@ -65,6 +65,7 @@ int zt_regs_run(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
if (memcmp(zeros, (char *)zt + ZT_SIG_REGS_OFFSET,
ZT_SIG_REGS_SIZE(zt->nregs)) != 0) {
fprintf(stderr, "ZT data invalid\n");
+ free(zeros);
return 1;
}
diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c
index 54d09b820ed4..c037553cfd92 100644
--- a/tools/testing/selftests/cachestat/test_cachestat.c
+++ b/tools/testing/selftests/cachestat/test_cachestat.c
@@ -4,10 +4,12 @@
#include <stdio.h>
#include <stdbool.h>
#include <linux/kernel.h>
+#include <linux/magic.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/shm.h>
#include <sys/syscall.h>
+#include <sys/vfs.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
@@ -15,6 +17,8 @@
#include "../kselftest.h"
+#define NR_TESTS 9
+
static const char * const dev_files[] = {
"/dev/zero", "/dev/null", "/dev/urandom",
"/proc/version", "/proc"
@@ -91,19 +95,33 @@ out:
}
/*
+ * fsync() is implemented via noop_fsync() on tmpfs. This makes the fsync()
+ * test fail below, so we need to check whether the test file lives on a tmpfs.
+ */
+static bool is_on_tmpfs(int fd)
+{
+ struct statfs statfs_buf;
+
+ if (fstatfs(fd, &statfs_buf))
+ return false;
+
+ return statfs_buf.f_type == TMPFS_MAGIC;
+}
+
+/*
* Open/create the file at filename, (optionally) write random data to it
* (exactly num_pages), then test the cachestat syscall on this file.
*
* If test_fsync == true, fsync the file, then check the number of dirty
* pages.
*/
-bool test_cachestat(const char *filename, bool write_random, bool create,
- bool test_fsync, unsigned long num_pages, int open_flags,
- mode_t open_mode)
+static int test_cachestat(const char *filename, bool write_random, bool create,
+ bool test_fsync, unsigned long num_pages,
+ int open_flags, mode_t open_mode)
{
size_t PS = sysconf(_SC_PAGESIZE);
int filesize = num_pages * PS;
- bool ret = true;
+ int ret = KSFT_PASS;
long syscall_ret;
struct cachestat cs;
struct cachestat_range cs_range = { 0, filesize };
@@ -112,7 +130,7 @@ bool test_cachestat(const char *filename, bool write_random, bool create,
if (fd == -1) {
ksft_print_msg("Unable to create/open file.\n");
- ret = false;
+ ret = KSFT_FAIL;
goto out;
} else {
ksft_print_msg("Create/open %s\n", filename);
@@ -121,7 +139,7 @@ bool test_cachestat(const char *filename, bool write_random, bool create,
if (write_random) {
if (!write_exactly(fd, filesize)) {
ksft_print_msg("Unable to access urandom.\n");
- ret = false;
+ ret = KSFT_FAIL;
goto out1;
}
}
@@ -132,7 +150,7 @@ bool test_cachestat(const char *filename, bool write_random, bool create,
if (syscall_ret) {
ksft_print_msg("Cachestat returned non-zero.\n");
- ret = false;
+ ret = KSFT_FAIL;
goto out1;
} else {
@@ -142,15 +160,17 @@ bool test_cachestat(const char *filename, bool write_random, bool create,
if (cs.nr_cache + cs.nr_evicted != num_pages) {
ksft_print_msg(
"Total number of cached and evicted pages is off.\n");
- ret = false;
+ ret = KSFT_FAIL;
}
}
}
if (test_fsync) {
- if (fsync(fd)) {
+ if (is_on_tmpfs(fd)) {
+ ret = KSFT_SKIP;
+ } else if (fsync(fd)) {
ksft_print_msg("fsync fails.\n");
- ret = false;
+ ret = KSFT_FAIL;
} else {
syscall_ret = syscall(cachestat_nr, fd, &cs_range, &cs, 0);
@@ -161,13 +181,13 @@ bool test_cachestat(const char *filename, bool write_random, bool create,
print_cachestat(&cs);
if (cs.nr_dirty) {
- ret = false;
+ ret = KSFT_FAIL;
ksft_print_msg(
"Number of dirty should be zero after fsync.\n");
}
} else {
ksft_print_msg("Cachestat (after fsync) returned non-zero.\n");
- ret = false;
+ ret = KSFT_FAIL;
goto out1;
}
}
@@ -236,13 +256,29 @@ out:
int main(void)
{
- int ret = 0;
+ int ret;
+
+ ksft_print_header();
+
+ ret = syscall(__NR_cachestat, -1, NULL, NULL, 0);
+ if (ret == -1 && errno == ENOSYS)
+ ksft_exit_skip("cachestat syscall not available\n");
+
+ ksft_set_plan(NR_TESTS);
+
+ if (ret == -1 && errno == EBADF) {
+ ksft_test_result_pass("bad file descriptor recognized\n");
+ ret = 0;
+ } else {
+ ksft_test_result_fail("bad file descriptor ignored\n");
+ ret = 1;
+ }
for (int i = 0; i < 5; i++) {
const char *dev_filename = dev_files[i];
if (test_cachestat(dev_filename, false, false, false,
- 4, O_RDONLY, 0400))
+ 4, O_RDONLY, 0400) == KSFT_PASS)
ksft_test_result_pass("cachestat works with %s\n", dev_filename);
else {
ksft_test_result_fail("cachestat fails with %s\n", dev_filename);
@@ -251,13 +287,27 @@ int main(void)
}
if (test_cachestat("tmpfilecachestat", true, true,
- true, 4, O_CREAT | O_RDWR, 0400 | 0600))
+ false, 4, O_CREAT | O_RDWR, 0600) == KSFT_PASS)
ksft_test_result_pass("cachestat works with a normal file\n");
else {
ksft_test_result_fail("cachestat fails with normal file\n");
ret = 1;
}
+ switch (test_cachestat("tmpfilecachestat", true, true,
+ true, 4, O_CREAT | O_RDWR, 0600)) {
+ case KSFT_FAIL:
+ ksft_test_result_fail("cachestat fsync fails with normal file\n");
+ ret = KSFT_FAIL;
+ break;
+ case KSFT_PASS:
+ ksft_test_result_pass("cachestat fsync works with a normal file\n");
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("tmpfilecachestat is on tmpfs\n");
+ break;
+ }
+
if (test_cachestat_shmem())
ksft_test_result_pass("cachestat works with a shmem file\n");
else {
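For reference, the syscall under test takes a file descriptor, a byte range, and a result struct; both structs come from <linux/mman.h>, as included above, and a zero range length means "to the end of the file". A minimal hedged sketch of a direct invocation (the fallback syscall number and the sample path are assumptions for illustration):

#include <fcntl.h>
#include <linux/mman.h>		/* struct cachestat, struct cachestat_range */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_cachestat
#define __NR_cachestat 451	/* most 64-bit ABIs; assumption */
#endif

int main(int argc, char **argv)
{
	struct cachestat_range range = { 0, 0 };	/* len 0: whole file */
	struct cachestat cs;
	int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);

	if (fd < 0 || syscall(__NR_cachestat, fd, &range, &cs, 0))
		return 1;

	printf("cached %llu dirty %llu writeback %llu evicted %llu\n",
	       (unsigned long long)cs.nr_cache,
	       (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback,
	       (unsigned long long)cs.nr_evicted);
	close(fd);
	return 0;
}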
diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c
index 1b2cec9d18a4..ed2e50bb1e76 100644
--- a/tools/testing/selftests/cgroup/test_kmem.c
+++ b/tools/testing/selftests/cgroup/test_kmem.c
@@ -75,11 +75,11 @@ static int test_kmem_basic(const char *root)
sleep(1);
slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
- if (slab1 <= 0)
+ if (slab1 < 0)
goto cleanup;
current = cg_read_long(cg, "memory.current");
- if (current <= 0)
+ if (current < 0)
goto cleanup;
if (slab1 < slab0 / 2 && current < slab0 / 2)
diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile
index 03f92d7aeb19..8a72bb7de70f 100644
--- a/tools/testing/selftests/drivers/net/bonding/Makefile
+++ b/tools/testing/selftests/drivers/net/bonding/Makefile
@@ -9,10 +9,12 @@ TEST_PROGS := \
mode-1-recovery-updelay.sh \
mode-2-recovery-updelay.sh \
bond_options.sh \
- bond-eth-type-change.sh
+ bond-eth-type-change.sh \
+ bond_macvlan.sh
TEST_FILES := \
lag_lib.sh \
+ bond_topo_2d1c.sh \
bond_topo_3d1c.sh \
net_forwarding_lib.sh
diff --git a/tools/testing/selftests/drivers/net/bonding/bond-break-lacpdu-tx.sh b/tools/testing/selftests/drivers/net/bonding/bond-break-lacpdu-tx.sh
index 47ab90596acb..6358df5752f9 100755
--- a/tools/testing/selftests/drivers/net/bonding/bond-break-lacpdu-tx.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond-break-lacpdu-tx.sh
@@ -57,8 +57,8 @@ ip link add name veth2-bond type veth peer name veth2-end
# add ports
ip link set fbond master fab-br0
-ip link set veth1-bond down master fbond
-ip link set veth2-bond down master fbond
+ip link set veth1-bond master fbond
+ip link set veth2-bond master fbond
# bring up
ip link set veth1-end up
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
new file mode 100755
index 000000000000..b609fb6231f4
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/bonding/bond_macvlan.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test macvlan over balance-alb
+
+lib_dir=$(dirname "$0")
+source ${lib_dir}/bond_topo_2d1c.sh
+
+m1_ns="m1-$(mktemp -u XXXXXX)"
+m2_ns="m1-$(mktemp -u XXXXXX)"
+m1_ip4="192.0.2.11"
+m1_ip6="2001:db8::11"
+m2_ip4="192.0.2.12"
+m2_ip6="2001:db8::12"
+
+cleanup()
+{
+ ip -n ${m1_ns} link del macv0
+ ip netns del ${m1_ns}
+ ip -n ${m2_ns} link del macv0
+ ip netns del ${m2_ns}
+
+ client_destroy
+ server_destroy
+ gateway_destroy
+}
+
+check_connection()
+{
+ local ns=${1}
+ local target=${2}
+ local message=${3:-"macvlan_over_bond"}
+ RET=0
+
+
+ ip netns exec ${ns} ping ${target} -c 4 -i 0.1 &>/dev/null
+ check_err $? "ping failed"
+ log_test "$mode: $message"
+}
+
+macvlan_over_bond()
+{
+ local param="$1"
+ RET=0
+
+ # setup new bond mode
+ bond_reset "${param}"
+
+ ip -n ${s_ns} link add link bond0 name macv0 type macvlan mode bridge
+ ip -n ${s_ns} link set macv0 netns ${m1_ns}
+ ip -n ${m1_ns} link set dev macv0 up
+ ip -n ${m1_ns} addr add ${m1_ip4}/24 dev macv0
+ ip -n ${m1_ns} addr add ${m1_ip6}/24 dev macv0
+
+ ip -n ${s_ns} link add link bond0 name macv0 type macvlan mode bridge
+ ip -n ${s_ns} link set macv0 netns ${m2_ns}
+ ip -n ${m2_ns} link set dev macv0 up
+ ip -n ${m2_ns} addr add ${m2_ip4}/24 dev macv0
+ ip -n ${m2_ns} addr add ${m2_ip6}/24 dev macv0
+
+ sleep 2
+
+ check_connection "${c_ns}" "${s_ip4}" "IPv4: client->server"
+ check_connection "${c_ns}" "${s_ip6}" "IPv6: client->server"
+ check_connection "${c_ns}" "${m1_ip4}" "IPv4: client->macvlan_1"
+ check_connection "${c_ns}" "${m1_ip6}" "IPv6: client->macvlan_1"
+ check_connection "${c_ns}" "${m2_ip4}" "IPv4: client->macvlan_2"
+ check_connection "${c_ns}" "${m2_ip6}" "IPv6: client->macvlan_2"
+ check_connection "${m1_ns}" "${m2_ip4}" "IPv4: macvlan_1->macvlan_2"
+ check_connection "${m1_ns}" "${m2_ip6}" "IPv6: macvlan_1->macvlan_2"
+
+
+ sleep 5
+
+ check_connection "${s_ns}" "${c_ip4}" "IPv4: server->client"
+ check_connection "${s_ns}" "${c_ip6}" "IPv6: server->client"
+ check_connection "${m1_ns}" "${c_ip4}" "IPv4: macvlan_1->client"
+ check_connection "${m1_ns}" "${c_ip6}" "IPv6: macvlan_1->client"
+ check_connection "${m2_ns}" "${c_ip4}" "IPv4: macvlan_2->client"
+ check_connection "${m2_ns}" "${c_ip6}" "IPv6: macvlan_2->client"
+ check_connection "${m2_ns}" "${m1_ip4}" "IPv4: macvlan_2->macvlan_2"
+ check_connection "${m2_ns}" "${m1_ip6}" "IPv6: macvlan_2->macvlan_2"
+
+ ip -n ${c_ns} neigh flush dev eth0
+}
+
+trap cleanup EXIT
+
+setup_prepare
+ip netns add ${m1_ns}
+ip netns add ${m2_ns}
+
+modes="active-backup balance-tlb balance-alb"
+
+for mode in $modes; do
+ macvlan_over_bond "mode $mode"
+done
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
index 607ba5c38977..c54d1697f439 100755
--- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
@@ -9,10 +9,7 @@ ALL_TESTS="
num_grat_arp
"
-REQUIRE_MZ=no
-NUM_NETIFS=0
lib_dir=$(dirname "$0")
-source ${lib_dir}/net_forwarding_lib.sh
source ${lib_dir}/bond_topo_3d1c.sh
skip_prio()
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh
new file mode 100644
index 000000000000..a509ef949dcf
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Topology for Bond mode 1,5,6 testing
+#
+# +-------------------------+
+# | bond0 | Server
+# | + | 192.0.2.1/24
+# | eth0 | eth1 | 2001:db8::1/24
+# | +---+---+ |
+# | | | |
+# +-------------------------+
+# | |
+# +-------------------------+
+# | | | |
+# | +---+-------+---+ | Gateway
+# | | br0 | | 192.0.2.254/24
+# | +-------+-------+ | 2001:db8::254/24
+# | | |
+# +-------------------------+
+# |
+# +-------------------------+
+# | | | Client
+# | + | 192.0.2.10/24
+# | eth0 | 2001:db8::10/24
+# +-------------------------+
+
+REQUIRE_MZ=no
+NUM_NETIFS=0
+lib_dir=$(dirname "$0")
+source ${lib_dir}/net_forwarding_lib.sh
+
+s_ns="s-$(mktemp -u XXXXXX)"
+c_ns="c-$(mktemp -u XXXXXX)"
+g_ns="g-$(mktemp -u XXXXXX)"
+s_ip4="192.0.2.1"
+c_ip4="192.0.2.10"
+g_ip4="192.0.2.254"
+s_ip6="2001:db8::1"
+c_ip6="2001:db8::10"
+g_ip6="2001:db8::254"
+
+gateway_create()
+{
+ ip netns add ${g_ns}
+ ip -n ${g_ns} link add br0 type bridge
+ ip -n ${g_ns} link set br0 up
+ ip -n ${g_ns} addr add ${g_ip4}/24 dev br0
+ ip -n ${g_ns} addr add ${g_ip6}/24 dev br0
+}
+
+gateway_destroy()
+{
+ ip -n ${g_ns} link del br0
+ ip netns del ${g_ns}
+}
+
+server_create()
+{
+ ip netns add ${s_ns}
+ ip -n ${s_ns} link add bond0 type bond mode active-backup miimon 100
+
+ for i in $(seq 0 1); do
+ ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns}
+
+ ip -n ${g_ns} link set s${i} up
+ ip -n ${g_ns} link set s${i} master br0
+ ip -n ${s_ns} link set eth${i} master bond0
+
+ tc -n ${g_ns} qdisc add dev s${i} clsact
+ done
+
+ ip -n ${s_ns} link set bond0 up
+ ip -n ${s_ns} addr add ${s_ip4}/24 dev bond0
+ ip -n ${s_ns} addr add ${s_ip6}/24 dev bond0
+ sleep 2
+}
+
+# Reset bond with new mode and options
+bond_reset()
+{
+ # Count the eth links at run time, as this function
+ # may be called from other topologies.
+ local link_num=$(ip -n ${s_ns} -br link show | grep -c "^eth")
+ local param="$1"
+ link_num=$((link_num -1))
+
+ ip -n ${s_ns} link set bond0 down
+ ip -n ${s_ns} link del bond0
+
+ ip -n ${s_ns} link add bond0 type bond $param
+ for i in $(seq 0 ${link_num}); do
+ ip -n ${s_ns} link set eth$i master bond0
+ done
+
+ ip -n ${s_ns} link set bond0 up
+ ip -n ${s_ns} addr add ${s_ip4}/24 dev bond0
+ ip -n ${s_ns} addr add ${s_ip6}/24 dev bond0
+ sleep 2
+}
+
+server_destroy()
+{
+ # Count the eth links at run time, as this function
+ # may be called from other topologies.
+ local link_num=$(ip -n ${s_ns} -br link show | grep -c "^eth")
+ link_num=$((link_num -1))
+ for i in $(seq 0 ${link_num}); do
+ ip -n ${s_ns} link del eth${i}
+ done
+ ip netns del ${s_ns}
+}
+
+client_create()
+{
+ ip netns add ${c_ns}
+ ip -n ${c_ns} link add eth0 type veth peer name c0 netns ${g_ns}
+
+ ip -n ${g_ns} link set c0 up
+ ip -n ${g_ns} link set c0 master br0
+
+ ip -n ${c_ns} link set eth0 up
+ ip -n ${c_ns} addr add ${c_ip4}/24 dev eth0
+ ip -n ${c_ns} addr add ${c_ip6}/24 dev eth0
+}
+
+client_destroy()
+{
+ ip -n ${c_ns} link del eth0
+ ip netns del ${c_ns}
+}
+
+setup_prepare()
+{
+ gateway_create
+ server_create
+ client_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ client_destroy
+ server_destroy
+ gateway_destroy
+}
+
+bond_check_connection()
+{
+ local msg=${1:-"check connection"}
+
+ sleep 2
+ ip netns exec ${s_ns} ping ${c_ip4} -c5 -i 0.1 &>/dev/null
+ check_err $? "${msg}: ping failed"
+ ip netns exec ${s_ns} ping6 ${c_ip6} -c5 -i 0.1 &>/dev/null
+ check_err $? "${msg}: ping6 failed"
+}
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh
index 69ab99a56043..3a1333d9a85b 100644
--- a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh
@@ -25,121 +25,19 @@
# | eth0 | 2001:db8::10/24
# +-------------------------------------+
-s_ns="s-$(mktemp -u XXXXXX)"
-c_ns="c-$(mktemp -u XXXXXX)"
-g_ns="g-$(mktemp -u XXXXXX)"
-s_ip4="192.0.2.1"
-c_ip4="192.0.2.10"
-g_ip4="192.0.2.254"
-s_ip6="2001:db8::1"
-c_ip6="2001:db8::10"
-g_ip6="2001:db8::254"
-
-gateway_create()
-{
- ip netns add ${g_ns}
- ip -n ${g_ns} link add br0 type bridge
- ip -n ${g_ns} link set br0 up
- ip -n ${g_ns} addr add ${g_ip4}/24 dev br0
- ip -n ${g_ns} addr add ${g_ip6}/24 dev br0
-}
-
-gateway_destroy()
-{
- ip -n ${g_ns} link del br0
- ip netns del ${g_ns}
-}
-
-server_create()
-{
- ip netns add ${s_ns}
- ip -n ${s_ns} link add bond0 type bond mode active-backup miimon 100
-
- for i in $(seq 0 2); do
- ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns}
-
- ip -n ${g_ns} link set s${i} up
- ip -n ${g_ns} link set s${i} master br0
- ip -n ${s_ns} link set eth${i} master bond0
-
- tc -n ${g_ns} qdisc add dev s${i} clsact
- done
-
- ip -n ${s_ns} link set bond0 up
- ip -n ${s_ns} addr add ${s_ip4}/24 dev bond0
- ip -n ${s_ns} addr add ${s_ip6}/24 dev bond0
- sleep 2
-}
-
-# Reset bond with new mode and options
-bond_reset()
-{
- local param="$1"
-
- ip -n ${s_ns} link set bond0 down
- ip -n ${s_ns} link del bond0
-
- ip -n ${s_ns} link add bond0 type bond $param
- for i in $(seq 0 2); do
- ip -n ${s_ns} link set eth$i master bond0
- done
-
- ip -n ${s_ns} link set bond0 up
- ip -n ${s_ns} addr add ${s_ip4}/24 dev bond0
- ip -n ${s_ns} addr add ${s_ip6}/24 dev bond0
- sleep 2
-}
-
-server_destroy()
-{
- for i in $(seq 0 2); do
- ip -n ${s_ns} link del eth${i}
- done
- ip netns del ${s_ns}
-}
-
-client_create()
-{
- ip netns add ${c_ns}
- ip -n ${c_ns} link add eth0 type veth peer name c0 netns ${g_ns}
-
- ip -n ${g_ns} link set c0 up
- ip -n ${g_ns} link set c0 master br0
-
- ip -n ${c_ns} link set eth0 up
- ip -n ${c_ns} addr add ${c_ip4}/24 dev eth0
- ip -n ${c_ns} addr add ${c_ip6}/24 dev eth0
-}
-
-client_destroy()
-{
- ip -n ${c_ns} link del eth0
- ip netns del ${c_ns}
-}
+source bond_topo_2d1c.sh
setup_prepare()
{
gateway_create
server_create
client_create
-}
-
-cleanup()
-{
- pre_cleanup
-
- client_destroy
- server_destroy
- gateway_destroy
-}
-
-bond_check_connection()
-{
- local msg=${1:-"check connection"}
- sleep 2
- ip netns exec ${s_ns} ping ${c_ip4} -c5 -i 0.1 &>/dev/null
- check_err $? "${msg}: ping failed"
- ip netns exec ${s_ns} ping6 ${c_ip6} -c5 -i 0.1 &>/dev/null
- check_err $? "${msg}: ping6 failed"
+ # Add the extra device, as this topology uses 3 downlinks for bond0
+ local i=2
+ ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns}
+ ip -n ${g_ns} link set s${i} up
+ ip -n ${g_ns} link set s${i} master br0
+ ip -n ${s_ns} link set eth${i} master bond0
+ tc -n ${g_ns} qdisc add dev s${i} clsact
}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh
index 7d9e73a43a49..0c47faff9274 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh
@@ -98,12 +98,12 @@ sb_occ_etc_check()
port_pool_test()
{
- local exp_max_occ=288
+ local exp_max_occ=$(devlink_cell_size_get)
local max_occ
devlink sb occupancy clearmax $DEVLINK_DEV
- $MZ $h1 -c 1 -p 160 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \
+ $MZ $h1 -c 1 -p 10 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \
-t ip -q
devlink sb occupancy snapshot $DEVLINK_DEV
@@ -126,12 +126,12 @@ port_pool_test()
port_tc_ip_test()
{
- local exp_max_occ=288
+ local exp_max_occ=$(devlink_cell_size_get)
local max_occ
devlink sb occupancy clearmax $DEVLINK_DEV
- $MZ $h1 -c 1 -p 160 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \
+ $MZ $h1 -c 1 -p 10 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \
-t ip -q
devlink sb occupancy snapshot $DEVLINK_DEV
@@ -154,16 +154,12 @@ port_tc_ip_test()
port_tc_arp_test()
{
- local exp_max_occ=96
+ local exp_max_occ=$(devlink_cell_size_get)
local max_occ
- if [[ $MLXSW_CHIP != "mlxsw_spectrum" ]]; then
- exp_max_occ=144
- fi
-
devlink sb occupancy clearmax $DEVLINK_DEV
- $MZ $h1 -c 1 -p 160 -a $h1mac -A 192.0.1.1 -t arp -q
+ $MZ $h1 -c 1 -p 10 -a $h1mac -A 192.0.1.1 -t arp -q
devlink sb occupancy snapshot $DEVLINK_DEV
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/.gitignore b/tools/testing/selftests/fchmodat2/.gitignore
index 24e27957efcc..82a4846cbc4b 100644
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/.gitignore
+++ b/tools/testing/selftests/fchmodat2/.gitignore
@@ -1,2 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
-srcu.c
+/*_test
diff --git a/tools/testing/selftests/fchmodat2/Makefile b/tools/testing/selftests/fchmodat2/Makefile
new file mode 100644
index 000000000000..20839f8e43f2
--- /dev/null
+++ b/tools/testing/selftests/fchmodat2/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined $(KHDR_INCLUDES)
+TEST_GEN_PROGS := fchmodat2_test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/fchmodat2/fchmodat2_test.c b/tools/testing/selftests/fchmodat2/fchmodat2_test.c
new file mode 100644
index 000000000000..e0319417124d
--- /dev/null
+++ b/tools/testing/selftests/fchmodat2/fchmodat2_test.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+int sys_fchmodat2(int dfd, const char *filename, mode_t mode, int flags)
+{
+ int ret = syscall(__NR_fchmodat2, dfd, filename, mode, flags);
+
+ return ret >= 0 ? ret : -errno;
+}
+
+int setup_testdir(void)
+{
+ int dfd, ret;
+ char dirname[] = "/tmp/ksft-fchmodat2.XXXXXX";
+
+ /* Make the top-level directory. */
+ if (!mkdtemp(dirname))
+ ksft_exit_fail_msg("%s: failed to create tmpdir\n", __func__);
+
+ dfd = open(dirname, O_PATH | O_DIRECTORY);
+ if (dfd < 0)
+ ksft_exit_fail_msg("%s: failed to open tmpdir\n", __func__);
+
+ ret = openat(dfd, "regfile", O_CREAT | O_WRONLY | O_TRUNC, 0644);
+ if (ret < 0)
+ ksft_exit_fail_msg("%s: failed to create file in tmpdir\n",
+ __func__);
+ close(ret);
+
+ ret = symlinkat("regfile", dfd, "symlink");
+ if (ret < 0)
+ ksft_exit_fail_msg("%s: failed to create symlink in tmpdir\n",
+ __func__);
+
+ return dfd;
+}
+
+int expect_mode(int dfd, const char *filename, mode_t expect_mode)
+{
+ struct stat st;
+ int ret = fstatat(dfd, filename, &st, AT_SYMLINK_NOFOLLOW);
+
+ if (ret)
+ ksft_exit_fail_msg("%s: %s: fstatat failed\n",
+ __func__, filename);
+
+ return (st.st_mode == expect_mode);
+}
+
+void test_regfile(void)
+{
+ int dfd, ret;
+
+ dfd = setup_testdir();
+
+ ret = sys_fchmodat2(dfd, "regfile", 0640, 0);
+
+ if (ret < 0)
+ ksft_exit_fail_msg("%s: fchmodat2(noflag) failed\n", __func__);
+
+ if (!expect_mode(dfd, "regfile", 0100640))
+ ksft_exit_fail_msg("%s: wrong file mode bits after fchmodat2\n",
+ __func__);
+
+ ret = sys_fchmodat2(dfd, "regfile", 0600, AT_SYMLINK_NOFOLLOW);
+
+ if (ret < 0)
+ ksft_exit_fail_msg("%s: fchmodat2(AT_SYMLINK_NOFOLLOW) failed\n",
+ __func__);
+
+ if (!expect_mode(dfd, "regfile", 0100600))
+ ksft_exit_fail_msg("%s: wrong file mode bits after fchmodat2 with nofollow\n",
+ __func__);
+
+ ksft_test_result_pass("fchmodat2(regfile)\n");
+}
+
+void test_symlink(void)
+{
+ int dfd, ret;
+
+ dfd = setup_testdir();
+
+ ret = sys_fchmodat2(dfd, "symlink", 0640, 0);
+
+ if (ret < 0)
+ ksft_exit_fail_msg("%s: fchmodat2(noflag) failed\n", __func__);
+
+ if (!expect_mode(dfd, "regfile", 0100640))
+ ksft_exit_fail_msg("%s: wrong file mode bits after fchmodat2\n",
+ __func__);
+
+ if (!expect_mode(dfd, "symlink", 0120777))
+ ksft_exit_fail_msg("%s: wrong symlink mode bits after fchmodat2\n",
+ __func__);
+
+ ret = sys_fchmodat2(dfd, "symlink", 0600, AT_SYMLINK_NOFOLLOW);
+
+ /*
+ * On certain filesystems (xfs or btrfs), the chmod operation fails on
+ * symlinks. So we check the symlink's mode bits only if the call
+ * succeeded; if the operation failed we mark the test as skipped.
+ *
+ * https://sourceware.org/legacy-ml/libc-alpha/2020-02/msg00467.html
+ */
+ if (ret == 0 && !expect_mode(dfd, "symlink", 0120600))
+ ksft_exit_fail_msg("%s: wrong symlink mode bits after fchmodat2 with nofollow\n",
+ __func__);
+
+ if (!expect_mode(dfd, "regfile", 0100640))
+ ksft_exit_fail_msg("%s: wrong file mode bits after fchmodat2 with nofollow\n",
+ __func__);
+
+ if (ret != 0)
+ ksft_test_result_skip("fchmodat2(symlink)\n");
+ else
+ ksft_test_result_pass("fchmodat2(symlink)\n");
+}
+
+#define NUM_TESTS 2
+
+int main(int argc, char **argv)
+{
+ ksft_print_header();
+ ksft_set_plan(NUM_TESTS);
+
+ test_regfile();
+ test_symlink();
+
+ if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0)
+ ksft_exit_fail();
+ else
+ ksft_exit_pass();
+}
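A hedged sketch of the raw call the wrapper above makes, operating on a symlink's own mode bits via AT_SYMLINK_NOFOLLOW; "mylink" and the fallback syscall number are illustrative assumptions, and, as the comment in the test notes, some filesystems reject chmod on symlinks:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_fchmodat2
#define __NR_fchmodat2 452	/* most 64-bit ABIs; assumption */
#endif

int main(void)
{
	/* Change the mode bits of the link itself, not its target. */
	if (syscall(__NR_fchmodat2, AT_FDCWD, "mylink", 0600,
		    AT_SYMLINK_NOFOLLOW)) {
		perror("fchmodat2");
		return 1;
	}
	return 0;
}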
diff --git a/tools/testing/selftests/filelock/Makefile b/tools/testing/selftests/filelock/Makefile
new file mode 100644
index 000000000000..478e82f8b464
--- /dev/null
+++ b/tools/testing/selftests/filelock/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_GEN_PROGS := ofdlocks
+
+include ../lib.mk
diff --git a/tools/testing/selftests/filelock/ofdlocks.c b/tools/testing/selftests/filelock/ofdlocks.c
new file mode 100644
index 000000000000..a55b79810ab2
--- /dev/null
+++ b/tools/testing/selftests/filelock/ofdlocks.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include "../kselftest.h"
+
+static int lock_set(int fd, struct flock *fl)
+{
+ int ret;
+
+ fl->l_pid = 0; // needed for OFD locks
+ fl->l_whence = SEEK_SET;
+ ret = fcntl(fd, F_OFD_SETLK, fl);
+ if (ret)
+ perror("fcntl()");
+ return ret;
+}
+
+static int lock_get(int fd, struct flock *fl)
+{
+ int ret;
+
+ fl->l_pid = 0; // needed for OFD locks
+ fl->l_whence = SEEK_SET;
+ ret = fcntl(fd, F_OFD_GETLK, fl);
+ if (ret)
+ perror("fcntl()");
+ return ret;
+}
+
+int main(void)
+{
+ int rc;
+ struct flock fl, fl2;
+ int fd = open("/tmp/aa", O_RDWR | O_CREAT | O_EXCL, 0600);
+ int fd2 = open("/tmp/aa", O_RDONLY);
+
+ unlink("/tmp/aa");
+ assert(fd != -1);
+ assert(fd2 != -1);
+ ksft_print_msg("[INFO] opened fds %i %i\n", fd, fd2);
+
+ /* Set some read lock */
+ fl.l_type = F_RDLCK;
+ fl.l_start = 5;
+ fl.l_len = 3;
+ rc = lock_set(fd, &fl);
+ if (rc == 0) {
+ ksft_print_msg
+ ("[SUCCESS] set OFD read lock on first fd\n");
+ } else {
+ ksft_print_msg("[FAIL] to set OFD read lock on first fd\n");
+ return -1;
+ }
+ /* Make sure read locks do not conflict on different fds. */
+ fl.l_type = F_RDLCK;
+ fl.l_start = 5;
+ fl.l_len = 1;
+ rc = lock_get(fd2, &fl);
+ if (rc != 0)
+ return -1;
+ if (fl.l_type != F_UNLCK) {
+ ksft_print_msg("[FAIL] read locks conflicted\n");
+ return -1;
+ }
+ /* Make sure read/write locks do conflict on different fds. */
+ fl.l_type = F_WRLCK;
+ fl.l_start = 5;
+ fl.l_len = 1;
+ rc = lock_get(fd2, &fl);
+ if (rc != 0)
+ return -1;
+ if (fl.l_type != F_UNLCK) {
+ ksft_print_msg
+ ("[SUCCESS] read and write locks conflicted\n");
+ } else {
+		ksft_print_msg
+		    ("[FAIL] read and write locks did not conflict\n");
+ return -1;
+ }
+ /* Get info about the lock on first fd. */
+ fl.l_type = F_UNLCK;
+ fl.l_start = 5;
+ fl.l_len = 1;
+ rc = lock_get(fd, &fl);
+ if (rc != 0) {
+ ksft_print_msg
+ ("[FAIL] F_OFD_GETLK with F_UNLCK not supported\n");
+ return -1;
+ }
+ if (fl.l_type != F_UNLCK) {
+ ksft_print_msg
+ ("[SUCCESS] F_UNLCK test returns: locked, type %i pid %i len %zi\n",
+ fl.l_type, fl.l_pid, fl.l_len);
+ } else {
+ ksft_print_msg
+ ("[FAIL] F_OFD_GETLK with F_UNLCK did not return lock info\n");
+ return -1;
+ }
+ /* Try the same but by locking everything by len==0. */
+ fl2.l_type = F_UNLCK;
+ fl2.l_start = 0;
+ fl2.l_len = 0;
+ rc = lock_get(fd, &fl2);
+ if (rc != 0) {
+ ksft_print_msg
+ ("[FAIL] F_OFD_GETLK with F_UNLCK not supported\n");
+ return -1;
+ }
+ if (memcmp(&fl, &fl2, sizeof(fl))) {
+ ksft_print_msg
+		    ("[FAIL] F_UNLCK with len==0 returned different info: type %i pid %i len %zi\n",
+ fl.l_type, fl.l_pid, fl.l_len);
+ return -1;
+ }
+ ksft_print_msg("[SUCCESS] F_UNLCK with len==0 returned the same\n");
+ /* Get info about the lock on second fd - no locks on it. */
+ fl.l_type = F_UNLCK;
+ fl.l_start = 0;
+ fl.l_len = 0;
+ lock_get(fd2, &fl);
+ if (fl.l_type != F_UNLCK) {
+ ksft_print_msg
+		    ("[FAIL] F_OFD_GETLK with F_UNLCK returned lock info from another fd\n");
+ return -1;
+ }
+ return 0;
+}
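
The defining property this test exercises: OFD (open file description) locks are owned by the open file description rather than the process, and the F_OFD_SETLK/F_OFD_GETLK commands require l_pid == 0. A minimal sketch of taking the same read lock outside the harness, using only the standard fcntl(2) API (nothing assumed beyond _GNU_SOURCE for the F_OFD_* constants):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    /* Take an OFD read lock on bytes [5, 8) of fd. The lock lives with the
     * open file description: it survives fork() and is released only when
     * the last descriptor referring to that description is closed. */
    static int ofd_rdlock(int fd)
    {
    	struct flock fl;

    	memset(&fl, 0, sizeof(fl));
    	fl.l_type = F_RDLCK;
    	fl.l_whence = SEEK_SET;
    	fl.l_start = 5;
    	fl.l_len = 3;
    	fl.l_pid = 0;	/* mandatory for OFD locks */
    	return fcntl(fd, F_OFD_SETLK, &fl);
    }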
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/snapshot1.tc b/tools/testing/selftests/ftrace/test.d/00basic/snapshot1.tc
new file mode 100644
index 000000000000..63b76cf2a360
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/snapshot1.tc
@@ -0,0 +1,31 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Snapshot and tracing_cpumask
+# requires: trace_marker tracing_cpumask snapshot
+# flags: instance
+
+# This test case is contrived to reproduce a problem where the CPU buffers
+# become unavailable because 'record_disabled' of array_buffer and
+# max_buffer is mishandled.
+
+# Store the original cpumask
+ORIG_CPUMASK=`cat tracing_cpumask`
+
+# Stop tracing on all CPUs
+echo 0 > tracing_cpumask
+
+# Take a snapshot of the main buffer
+echo 1 > snapshot
+
+# Restore the original cpumask; note that some CPUs should now be traced
+echo ${ORIG_CPUMASK} > tracing_cpumask
+
+# Set tracing on
+echo 1 > tracing_on
+
+# Write a log into buffer
+echo "test input 1" > trace_marker
+
+# Ensure the log was written, i.e. the CPU buffers are still available
+grep -q "test input 1" trace
+exit 0
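
The test drives tracefs entirely from the shell; the same trace_marker write can also be issued from C, which is sometimes handier in other selftests. A sketch, assuming tracefs is mounted at /sys/kernel/tracing (older kernels expose it under /sys/kernel/debug/tracing instead):

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    /* Emit a marker into the ftrace ring buffer, as the shell test does
     * with "echo ... > trace_marker". Returns 0 on success, -1 on error. */
    static int write_trace_marker(const char *msg)
    {
    	int fd = open("/sys/kernel/tracing/trace_marker", O_WRONLY);

    	if (fd < 0)
    		return -1;
    	if (write(fd, msg, strlen(msg)) < 0) {
    		close(fd);
    		return -1;
    	}
    	return close(fd);
    }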
diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
index 5fd49ad0c696..e05ac8261046 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -938,7 +938,11 @@ void __wait_for_test(struct __test_metadata *t)
fprintf(TH_LOG_STREAM,
"# %s: Test terminated by timeout\n", t->name);
} else if (WIFEXITED(status)) {
- if (t->termsig != -1) {
+ if (WEXITSTATUS(status) == 255) {
+ /* SKIP */
+ t->passed = 1;
+ t->skip = 1;
+ } else if (t->termsig != -1) {
t->passed = 0;
fprintf(TH_LOG_STREAM,
"# %s: Test exited normally instead of by signal (code: %d)\n",
@@ -950,11 +954,6 @@ void __wait_for_test(struct __test_metadata *t)
case 0:
t->passed = 1;
break;
- /* SKIP */
- case 255:
- t->passed = 1;
- t->skip = 1;
- break;
/* Other failure, assume step report. */
default:
t->passed = 0;
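
In this harness, a skipping test child exits with status 255; the hunk above moves that check ahead of the termsig handling so that a test which declares an expected death-by-signal can still report SKIP instead of being marked failed for exiting normally. A minimal sketch of a test exercising that path via the harness's own SKIP() macro (the test name is illustrative):

    #include <signal.h>
    #include "../kselftest_harness.h"

    /* The child exits with 255 when skipped, which __wait_for_test()
     * now recognizes before the expected-signal (termsig) check. */
    TEST_SIGNAL(skipped_signal_test, SIGSEGV)
    {
    	SKIP(return, "demonstrating skip from a signal-expecting test");
    	/* never reached: would otherwise need to fault to pass */
    }

    TEST_HARNESS_MAIN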
diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c
index 4adaad1b822f..20294553a5dd 100644
--- a/tools/testing/selftests/mm/hmm-tests.c
+++ b/tools/testing/selftests/mm/hmm-tests.c
@@ -57,9 +57,14 @@ enum {
#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
/* Just the flags we need, copied from mm.h: */
+
+#ifndef FOLL_WRITE
#define FOLL_WRITE 0x01 /* check pte is writable */
-#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite */
+#endif
+#ifndef FOLL_LONGTERM
+#define FOLL_LONGTERM 0x100 /* mapping lifetime is indefinite */
+#endif
FIXTURE(hmm)
{
int fd;
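
The pattern used above, guarded fallback defines, lets the test build against both old and new system headers: the header's value wins when present, and the fallback (here tracking the mm.h renumbering of FOLL_LONGTERM to 0x100) is used otherwise. A self-contained sketch of the idiom:

    #include <stdio.h>

    /* Prefer the system header's values when defined; otherwise supply
     * fallbacks matching the kernel's current numbering (fallbacks only). */
    #ifndef FOLL_WRITE
    #define FOLL_WRITE	0x01
    #endif
    #ifndef FOLL_LONGTERM
    #define FOLL_LONGTERM	0x100
    #endif

    int main(void)
    {
    	printf("FOLL_WRITE=%#x FOLL_LONGTERM=%#x\n", FOLL_WRITE, FOLL_LONGTERM);
    	return 0;
    }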
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 501854a89cc0..2f9d378edec3 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -15,6 +15,7 @@ ip_local_port_range
ipsec
ipv6_flowlabel
ipv6_flowlabel_mgr
+log.txt
msg_zerocopy
nettest
psock_fanout
@@ -45,6 +46,7 @@ test_unix_oob
timestamping
tls
toeplitz
+tools
tun
txring_overwrite
txtimestamp
diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 486334981e60..970df9e55131 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -577,7 +577,6 @@ int run_syscall(int min, int max)
CASE_TEST(chdir_root); EXPECT_SYSZR(1, chdir("/")); break;
CASE_TEST(chdir_dot); EXPECT_SYSZR(1, chdir(".")); break;
CASE_TEST(chdir_blah); EXPECT_SYSER(1, chdir("/blah"), -1, ENOENT); break;
- CASE_TEST(chmod_net); EXPECT_SYSZR(proc, chmod("/proc/self/net", 0555)); break;
CASE_TEST(chmod_self); EXPECT_SYSER(proc, chmod("/proc/self", 0555), -1, EPERM); break;
CASE_TEST(chown_self); EXPECT_SYSER(proc, chown("/proc/self", 0, 0), -1, EPERM); break;
CASE_TEST(chroot_root); EXPECT_SYSZR(euid0, chroot("/")); break;
diff --git a/tools/testing/selftests/rcutorture/bin/configcheck.sh b/tools/testing/selftests/rcutorture/bin/configcheck.sh
index b92dfeb7fbbf..99162d18bad3 100755
--- a/tools/testing/selftests/rcutorture/bin/configcheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/configcheck.sh
@@ -3,6 +3,8 @@
#
# Usage: configcheck.sh .config .config-template
#
+# Non-empty output if errors detected.
+#
# Copyright (C) IBM Corporation, 2011
#
# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
@@ -10,32 +12,35 @@
T="`mktemp -d ${TMPDIR-/tmp}/configcheck.sh.XXXXXX`"
trap 'rm -rf $T' 0
-sed -e 's/"//g' < $1 > $T/.config
+# function test_kconfig_enabled ( Kconfig-var=val )
+function test_kconfig_enabled () {
+ if ! grep -q "^$1$" $T/.config
+ then
+ echo :$1: improperly set
+ return 1
+ fi
+ return 0
+}
-sed -e 's/"//g' -e 's/\(.*\)=n/# \1 is not set/' -e 's/^#CHECK#//' < $2 |
-awk '
-{
- print "if grep -q \"" $0 "\" < '"$T/.config"'";
- print "then";
- print "\t:";
- print "else";
- if ($1 == "#") {
- print "\tif grep -q \"" $2 "\" < '"$T/.config"'";
- print "\tthen";
- print "\t\tif test \"$firsttime\" = \"\""
- print "\t\tthen"
- print "\t\t\tfirsttime=1"
- print "\t\tfi"
- print "\t\techo \":" $2 ": improperly set\"";
- print "\telse";
- print "\t\t:";
- print "\tfi";
- } else {
- print "\tif test \"$firsttime\" = \"\""
- print "\tthen"
- print "\t\tfirsttime=1"
- print "\tfi"
- print "\techo \":" $0 ": improperly set\"";
- }
- print "fi";
- }' | sh
+# function test_kconfig_disabled ( Kconfig-var )
+function test_kconfig_disabled () {
+ if grep -q "^$1=n$" $T/.config
+ then
+ return 0
+ fi
+ if grep -q "^$1=" $T/.config
+ then
+ echo :$1=n: improperly set
+ return 1
+ fi
+ return 0
+}
+
+sed -e 's/"//g' < $1 > $T/.config
+sed -e 's/^#CHECK#//' < $2 > $T/ConfigFragment
+grep '^CONFIG_.*=n$' $T/ConfigFragment |
+ sed -e 's/^/test_kconfig_disabled /' -e 's/=n$//' > $T/kconfig-n.sh
+. $T/kconfig-n.sh
+grep -v '^CONFIG_.*=n$' $T/ConfigFragment | grep '^CONFIG_' |
+ sed -e 's/^/test_kconfig_enabled /' > $T/kconfig-not-n.sh
+. $T/kconfig-not-n.sh
diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
index 48b9147e8c91..b8e2ea23cb3f 100644
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -45,7 +45,7 @@ checkarg () {
configfrag_boot_params () {
if test -r "$2.boot"
then
- echo $1 `grep -v '^#' "$2.boot" | tr '\012' ' '`
+ echo `grep -v '^#' "$2.boot" | tr '\012' ' '` $1
else
echo $1
fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh
index b582113178ac..f683e424ddd5 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh
@@ -40,6 +40,10 @@ awk '
sum += $5 / 1000.;
}
+/rcu_scale: Grace-period kthread CPU time/ {
+ cputime = $6;
+}
+
END {
newNR = asort(gptimes);
if (newNR <= 0) {
@@ -78,6 +82,8 @@ END {
print "90th percentile grace-period duration: " gptimes[pct90];
print "99th percentile grace-period duration: " gptimes[pct99];
print "Maximum grace-period duration: " gptimes[newNR];
- print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches;
+ if (cputime != "")
+ cpustr = " CPU: " cputime;
+ print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches cpustr;
print "Computed from rcuscale printk output.";
}'
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index 1df7e695edf7..5be670dd4009 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -16,6 +16,8 @@
T=/tmp/kvm-recheck.sh.$$
trap 'rm -f $T' 0 2
+configerrors=0
+
PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH
. functions.sh
for rd in "$@"
@@ -32,7 +34,7 @@ do
fi
TORTURE_SUITE="`cat $i/../torture_suite`" ; export TORTURE_SUITE
configfile=`echo $i | sed -e 's,^.*/,,'`
- rm -f $i/console.log.*.diags
+ rm -f $i/console.log.*.diags $i/ConfigFragment.diags
case "${TORTURE_SUITE}" in
X*)
;;
@@ -49,8 +51,21 @@ do
then
echo QEMU killed
fi
- configcheck.sh $i/.config $i/ConfigFragment > $T 2>&1
- cat $T
+ configcheck.sh $i/.config $i/ConfigFragment > $i/ConfigFragment.diags 2>&1
+ if grep -q '^CONFIG_KCSAN=y$' $i/ConfigFragment.input
+ then
+ # KCSAN forces a number of Kconfig options, so remove
+ # complaints about those Kconfig options in KCSAN runs.
+ mv $i/ConfigFragment.diags $i/ConfigFragment.diags.kcsan
+ grep -v -E 'CONFIG_PROVE_RCU|CONFIG_PREEMPT_COUNT' $i/ConfigFragment.diags.kcsan > $i/ConfigFragment.diags
+ fi
+ if test -s $i/ConfigFragment.diags
+ then
+ cat $i/ConfigFragment.diags
+ configerrors=$((configerrors+1))
+ else
+ rm $i/ConfigFragment.diags
+ fi
if test -r $i/Make.oldconfig.err
then
cat $i/Make.oldconfig.err
@@ -65,7 +80,14 @@ do
if test -f "$i/buildonly"
then
echo Build-only run, no boot/test
- configcheck.sh $i/.config $i/ConfigFragment
+ configcheck.sh $i/.config $i/ConfigFragment > $i/ConfigFragment.diags 2>&1
+ if test -s $i/ConfigFragment.diags
+ then
+ cat $i/ConfigFragment.diags
+ configerrors=$((configerrors+1))
+ else
+ rm $i/ConfigFragment.diags
+ fi
parse-build.sh $i/Make.out $configfile
elif test -f "$i/qemu-cmd"
then
@@ -79,10 +101,10 @@ do
done
if test -f "$rd/kcsan.sum"
then
- if ! test -f $T
+ if ! test -f $i/ConfigFragment.diags
then
:
- elif grep -q CONFIG_KCSAN=y $T
+ elif grep -q CONFIG_KCSAN=y $i/ConfigFragment.diags
then
echo "Compiler or architecture does not support KCSAN!"
echo Did you forget to switch your compiler with '--kmake-arg CC=<cc-that-supports-kcsan>'?
@@ -94,17 +116,23 @@ do
fi
fi
done
+
+if test "$configerrors" -gt 0
+then
+ echo $configerrors runs with .config errors.
+ ret=1
+fi
EDITOR=echo kvm-find-errors.sh "${@: -1}" > $T 2>&1
builderrors="`tr ' ' '\012' < $T | grep -c '/Make.out.diags'`"
if test "$builderrors" -gt 0
then
echo $builderrors runs with build errors.
- ret=1
+ ret=2
fi
runerrors="`tr ' ' '\012' < $T | grep -c '/console.log.diags'`"
if test "$runerrors" -gt 0
then
echo $runerrors runs with runtime errors.
- ret=2
+ ret=3
fi
exit $ret
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
index a2328163eba1..134cdef5a6e0 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
@@ -137,14 +137,20 @@ chmod +x $T/bin/kvm-remote-*.sh
# Check first to avoid the need for cleanup for system-name typos
for i in $systems
do
- ncpus="`ssh -o BatchMode=yes $i getconf _NPROCESSORS_ONLN 2> /dev/null`"
+ ssh -o BatchMode=yes $i getconf _NPROCESSORS_ONLN > $T/ssh.stdout 2> $T/ssh.stderr
ret=$?
if test "$ret" -ne 0
then
- echo System $i unreachable, giving up. | tee -a "$oldrun/remote-log"
+ echo "System $i unreachable ($ret), giving up." | tee -a "$oldrun/remote-log"
+ echo ' --- ssh stdout: vvv' | tee -a "$oldrun/remote-log"
+ cat $T/ssh.stdout | tee -a "$oldrun/remote-log"
+ echo ' --- ssh stdout: ^^^' | tee -a "$oldrun/remote-log"
+ echo ' --- ssh stderr: vvv' | tee -a "$oldrun/remote-log"
+ cat $T/ssh.stderr | tee -a "$oldrun/remote-log"
+ echo ' --- ssh stderr: ^^^' | tee -a "$oldrun/remote-log"
exit 4
fi
- echo $i: $ncpus CPUs " " `date` | tee -a "$oldrun/remote-log"
+ echo $i: `cat $T/ssh.stdout` CPUs " " `date` | tee -a "$oldrun/remote-log"
done
# Download and expand the tarball on all systems.
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index d2a3710a5f2a..b33cd8753689 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -9,9 +9,10 @@
#
# Usage: kvm-test-1-run.sh config resdir seconds qemu-args boot_args_in
#
-# qemu-args defaults to "-enable-kvm -nographic", along with arguments
-# specifying the number of CPUs and other options
-# generated from the underlying CPU architecture.
+# qemu-args defaults to "-enable-kvm -display none -no-reboot", along
+#	with arguments specifying the number of CPUs and other options
+#	generated from the underlying CPU architecture.
# boot_args_in defaults to value returned by the per_version_boot_params
# shell function.
#
@@ -57,7 +58,6 @@ config_override_param () {
cat $T/Kconfig_args >> $resdir/ConfigFragment.input
config_override.sh $T/$2 $T/Kconfig_args > $T/$2.tmp
mv $T/$2.tmp $T/$2
- # Note that "#CHECK#" is not permitted on commandline.
fi
}
@@ -140,7 +140,7 @@ then
fi
# Generate -smp qemu argument.
-qemu_args="-enable-kvm -nographic $qemu_args"
+qemu_args="-enable-kvm -display none -no-reboot $qemu_args"
cpu_count=`configNR_CPUS.sh $resdir/ConfigFragment`
cpu_count=`configfrag_boot_cpus "$boot_args_in" "$config_template" "$cpu_count"`
if test "$cpu_count" -gt "$TORTURE_ALLOTED_CPUS"
@@ -163,7 +163,7 @@ boot_args="`configfrag_boot_params "$boot_args_in" "$config_template"`"
boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`"
if test -n "$TORTURE_BOOT_GDB_ARG"
then
- boot_args="$boot_args $TORTURE_BOOT_GDB_ARG"
+ boot_args="$TORTURE_BOOT_GDB_ARG $boot_args"
fi
# Give bare-metal advice
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index d3cdc2d33d4b..b0f36a638a69 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -186,7 +186,7 @@ do
fi
;;
--kconfig|--kconfigs)
- checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\)*$' '^error$'
+ checkarg --kconfig "(Kconfig options)" $# "$2" '^\(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\( \(#CHECK#\)\?CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\|"[^"]*"\)\)*$' '^error$'
TORTURE_KCONFIG_ARG="`echo "$TORTURE_KCONFIG_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`"
shift
;;
diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
index 71f0dfbb2a6d..212c52ca90b5 100755
--- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
+++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
@@ -10,7 +10,6 @@
D=tools/testing/selftests/rcutorture
# Prerequisite checks
-[ -z "$D" ] && echo >&2 "No argument supplied" && exit 1
if [ ! -d "$D" ]; then
echo >&2 "$D does not exist: Malformed kernel source tree?"
exit 1
@@ -34,12 +33,16 @@ cat > init.c << '___EOF___'
volatile unsigned long delaycount;
-int main(int argc, int argv[])
+int main(int argc, char *argv[])
{
int i;
struct timeval tv;
struct timeval tvb;
+ printf("Torture-test rudimentary init program started, command line:\n");
+ for (i = 0; i < argc; i++)
+ printf(" %s", argv[i]);
+ printf("\n");
for (;;) {
sleep(1);
/* Need some userspace time. */
@@ -64,15 +67,23 @@ ___EOF___
# build using nolibc on supported archs (smaller executable) and fall
# back to regular glibc on other ones.
if echo -e "#if __x86_64__||__i386__||__i486__||__i586__||__i686__" \
- "||__ARM_EABI__||__aarch64__||__s390x__\nyes\n#endif" \
+ "||__ARM_EABI__||__aarch64__||__s390x__||__loongarch__\nyes\n#endif" \
| ${CROSS_COMPILE}gcc -E -nostdlib -xc - \
| grep -q '^yes'; then
# architecture supported by nolibc
${CROSS_COMPILE}gcc -fno-asynchronous-unwind-tables -fno-ident \
-nostdlib -include ../../../../include/nolibc/nolibc.h \
-s -static -Os -o init init.c -lgcc
+ ret=$?
else
${CROSS_COMPILE}gcc -s -static -Os -o init init.c
+ ret=$?
+fi
+
+if [ "$ret" -ne 0 ]
+then
+ echo "Failed to create a statically linked C-language initrd"
+ exit "$ret"
fi
rm init.c
diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh
index 5a2ae2264403..12b50a4a881a 100755
--- a/tools/testing/selftests/rcutorture/bin/torture.sh
+++ b/tools/testing/selftests/rcutorture/bin/torture.sh
@@ -55,6 +55,8 @@ do_kasan=yes
do_kcsan=no
do_clocksourcewd=yes
do_rt=yes
+do_rcutasksflavors=yes
+do_srcu_lockdep=yes
# doyesno - Helper function for yes/no arguments
function doyesno () {
@@ -73,18 +75,20 @@ usage () {
echo " --configs-locktorture \"config-file list w/ repeat factor (10*LOCK01)\""
echo " --configs-scftorture \"config-file list w/ repeat factor (2*CFLIST)\""
echo " --do-all"
- echo " --do-allmodconfig / --do-no-allmodconfig"
- echo " --do-clocksourcewd / --do-no-clocksourcewd"
- echo " --do-kasan / --do-no-kasan"
- echo " --do-kcsan / --do-no-kcsan"
- echo " --do-kvfree / --do-no-kvfree"
- echo " --do-locktorture / --do-no-locktorture"
+ echo " --do-allmodconfig / --do-no-allmodconfig / --no-allmodconfig"
+ echo " --do-clocksourcewd / --do-no-clocksourcewd / --no-clocksourcewd"
+ echo " --do-kasan / --do-no-kasan / --no-kasan"
+ echo " --do-kcsan / --do-no-kcsan / --no-kcsan"
+ echo " --do-kvfree / --do-no-kvfree / --no-kvfree"
+ echo " --do-locktorture / --do-no-locktorture / --no-locktorture"
echo " --do-none"
- echo " --do-rcuscale / --do-no-rcuscale"
- echo " --do-rcutorture / --do-no-rcutorture"
- echo " --do-refscale / --do-no-refscale"
- echo " --do-rt / --do-no-rt"
- echo " --do-scftorture / --do-no-scftorture"
+ echo " --do-rcuscale / --do-no-rcuscale / --no-rcuscale"
+ echo " --do-rcutasksflavors / --do-no-rcutasksflavors / --no-rcutasksflavors"
+ echo " --do-rcutorture / --do-no-rcutorture / --no-rcutorture"
+ echo " --do-refscale / --do-no-refscale / --no-refscale"
+ echo " --do-rt / --do-no-rt / --no-rt"
+ echo " --do-scftorture / --do-no-scftorture / --no-scftorture"
+ echo " --do-srcu-lockdep / --do-no-srcu-lockdep / --no-srcu-lockdep"
echo " --duration [ <minutes> | <hours>h | <days>d ]"
echo " --kcsan-kmake-arg kernel-make-arguments"
exit 1
@@ -115,6 +119,7 @@ do
;;
--do-all|--doall)
do_allmodconfig=yes
+		do_rcutasksflavors=yes
do_rcutorture=yes
do_locktorture=yes
do_scftorture=yes
@@ -125,27 +130,29 @@ do
do_kasan=yes
do_kcsan=yes
do_clocksourcewd=yes
+ do_srcu_lockdep=yes
;;
- --do-allmodconfig|--do-no-allmodconfig)
+ --do-allmodconfig|--do-no-allmodconfig|--no-allmodconfig)
do_allmodconfig=`doyesno "$1" --do-allmodconfig`
;;
- --do-clocksourcewd|--do-no-clocksourcewd)
+ --do-clocksourcewd|--do-no-clocksourcewd|--no-clocksourcewd)
do_clocksourcewd=`doyesno "$1" --do-clocksourcewd`
;;
- --do-kasan|--do-no-kasan)
+ --do-kasan|--do-no-kasan|--no-kasan)
do_kasan=`doyesno "$1" --do-kasan`
;;
- --do-kcsan|--do-no-kcsan)
+ --do-kcsan|--do-no-kcsan|--no-kcsan)
do_kcsan=`doyesno "$1" --do-kcsan`
;;
- --do-kvfree|--do-no-kvfree)
+ --do-kvfree|--do-no-kvfree|--no-kvfree)
do_kvfree=`doyesno "$1" --do-kvfree`
;;
- --do-locktorture|--do-no-locktorture)
+ --do-locktorture|--do-no-locktorture|--no-locktorture)
do_locktorture=`doyesno "$1" --do-locktorture`
;;
--do-none|--donone)
do_allmodconfig=no
+ do_rcutasksflavors=no
do_rcutorture=no
do_locktorture=no
do_scftorture=no
@@ -156,22 +163,29 @@ do
do_kasan=no
do_kcsan=no
do_clocksourcewd=no
+ do_srcu_lockdep=no
;;
- --do-rcuscale|--do-no-rcuscale)
+ --do-rcuscale|--do-no-rcuscale|--no-rcuscale)
do_rcuscale=`doyesno "$1" --do-rcuscale`
;;
- --do-rcutorture|--do-no-rcutorture)
+ --do-rcutasksflavors|--do-no-rcutasksflavors|--no-rcutasksflavors)
+ do_rcutasksflavors=`doyesno "$1" --do-rcutasksflavors`
+ ;;
+ --do-rcutorture|--do-no-rcutorture|--no-rcutorture)
do_rcutorture=`doyesno "$1" --do-rcutorture`
;;
- --do-refscale|--do-no-refscale)
+ --do-refscale|--do-no-refscale|--no-refscale)
do_refscale=`doyesno "$1" --do-refscale`
;;
- --do-rt|--do-no-rt)
+ --do-rt|--do-no-rt|--no-rt)
do_rt=`doyesno "$1" --do-rt`
;;
- --do-scftorture|--do-no-scftorture)
+ --do-scftorture|--do-no-scftorture|--no-scftorture)
do_scftorture=`doyesno "$1" --do-scftorture`
;;
+ --do-srcu-lockdep|--do-no-srcu-lockdep|--no-srcu-lockdep)
+ do_srcu_lockdep=`doyesno "$1" --do-srcu-lockdep`
+ ;;
--duration)
checkarg --duration "(minutes)" $# "$2" '^[0-9][0-9]*\(m\|h\|d\|\)$' '^error'
mult=1
@@ -361,6 +375,40 @@ then
fi
fi
+# Test building RCU Tasks flavors in isolation, both SMP and !SMP
+if test "$do_rcutasksflavors" = "yes"
+then
+ echo " --- rcutasksflavors:" Start `date` | tee -a $T/log
+ rtfdir="tools/testing/selftests/rcutorture/res/$ds/results-rcutasksflavors"
+ mkdir -p "$rtfdir"
+ cat > $T/rcutasksflavors << __EOF__
+#CHECK#CONFIG_TASKS_RCU=n
+#CHECK#CONFIG_TASKS_RUDE_RCU=n
+#CHECK#CONFIG_TASKS_TRACE_RCU=n
+__EOF__
+ for flavor in CONFIG_TASKS_RCU CONFIG_TASKS_RUDE_RCU CONFIG_TASKS_TRACE_RCU
+ do
+ forceflavor="`echo $flavor | sed -e 's/^CONFIG/CONFIG_FORCE/'`"
+ deselectedflavors="`grep -v $flavor $T/rcutasksflavors | tr '\012' ' ' | tr -s ' ' | sed -e 's/ *$//'`"
+		echo " --- Running RCU Tasks flavor $flavor `date`" >> $rtfdir/log
+ tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1
+ retcode=$?
+ if test "$retcode" -ne 0
+ then
+ break
+ fi
+ done
+ if test "$retcode" -eq 0
+ then
+ echo "rcutasksflavors($retcode)" $rtfdir >> $T/successes
+ echo Success >> $rtfdir/log
+ else
+ echo "rcutasksflavors($retcode)" $rtfdir >> $T/failures
+ echo " --- rcutasksflavors Test summary:" >> $rtfdir/log
+ echo " --- Summary: Exit code $retcode from $flavor, see Make.out" >> $rtfdir/log
+ fi
+fi
+
# --torture rcu
if test "$do_rcutorture" = "yes"
then
@@ -376,8 +424,10 @@ fi
if test "$do_scftorture" = "yes"
then
+ # Scale memory based on the number of CPUs.
+ scfmem=$((2+HALF_ALLOTED_CPUS/16))
torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1"
- torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make
+ torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make
fi
if test "$do_rt" = "yes"
@@ -391,6 +441,23 @@ then
torture_set "rcurttorture-exp" tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration "$duration_rcutorture" --configs "TREE03" --trust-make
fi
+if test "$do_srcu_lockdep" = "yes"
+then
+ echo " --- do-srcu-lockdep:" Start `date` | tee -a $T/log
+ tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh --datestamp "$ds/results-srcu-lockdep" > $T/srcu_lockdep.sh.out 2>&1
+ retcode=$?
+ cp $T/srcu_lockdep.sh.out "tools/testing/selftests/rcutorture/res/$ds/results-srcu-lockdep/log"
+ if test "$retcode" -eq 0
+ then
+ echo "srcu_lockdep($retcode)" "tools/testing/selftests/rcutorture/res/$ds/results-srcu-lockdep" >> $T/successes
+ echo Success >> "tools/testing/selftests/rcutorture/res/$ds/results-srcu-lockdep/log"
+ else
+ echo "srcu_lockdep($retcode)" "tools/testing/selftests/rcutorture/res/$ds/results-srcu-lockdep" >> $T/failures
+ echo " --- srcu_lockdep Test Summary:" >> "tools/testing/selftests/rcutorture/res/$ds/results-srcu-lockdep/log"
+		echo " --- Summary: Exit code $retcode from srcu_lockdep.sh, see $ds/results-srcu-lockdep" >> "tools/testing/selftests/rcutorture/res/$ds/results-srcu-lockdep/log"
+ fi
+fi
+
if test "$do_refscale" = yes
then
primlist="`grep '\.name[ ]*=' kernel/rcu/refscale.c | sed -e 's/^[^"]*"//' -e 's/".*$//'`"
@@ -541,11 +608,23 @@ then
fi
echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log
echo Summary: Successes: $nsuccesses Failures: $nfailures. | tee -a $T/log
+tdir="`cat $T/successes $T/failures | head -1 | awk '{ print $NF }' | sed -e 's,/[^/]\+/*$,,'`"
+find "$tdir" -name 'ConfigFragment.diags' -print > $T/configerrors
+find "$tdir" -name 'Make.out.diags' -print > $T/builderrors
+if test -s "$T/configerrors"
+then
+ echo " Scenarios with .config errors: `wc -l "$T/configerrors" | awk '{ print $1 }'`"
+ nonkcsanbug="yes"
+fi
+if test -s "$T/builderrors"
+then
+ echo " Scenarios with build errors: `wc -l "$T/builderrors" | awk '{ print $1 }'`"
+ nonkcsanbug="yes"
+fi
if test -z "$nonkcsanbug" && test -s "$T/failuresum"
then
echo " All bugs were KCSAN failures."
fi
-tdir="`cat $T/successes $T/failures | head -1 | awk '{ print $NF }' | sed -e 's,/[^/]\+/*$,,'`"
if test -n "$tdir" && test $compress_concurrency -gt 0
then
# KASAN vmlinux files can approach 1GB in size, so compress them.
diff --git a/tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh
index d3e4b2971f92..e7bb32709d78 100644
--- a/tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh
+++ b/tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh
@@ -22,8 +22,9 @@ locktorture_param_onoff () {
#
# Adds per-version torture-module parameters to kernels supporting them.
per_version_boot_params () {
- echo $1 `locktorture_param_onoff "$1" "$2"` \
+ echo `locktorture_param_onoff "$1" "$2"` \
locktorture.stat_interval=15 \
locktorture.shutdown_secs=$3 \
- locktorture.verbose=1
+ locktorture.verbose=1 \
+ $1
}
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
index dea26c568678..2ef2fb69c360 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
@@ -6,6 +6,5 @@ CONFIG_PREEMPT=y
CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=n
CONFIG_NO_HZ_FULL=y
-#CHECK#CONFIG_RCU_EXPERT=n
CONFIG_TASKS_RCU=y
CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
index 04831ef1f9b5..8ae41d5f81a3 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
@@ -15,4 +15,3 @@ CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_RCU_BOOST=n
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_RCU_EXPERT=y
-CONFIG_BOOTPARAM_HOTPLUG_CPU0=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh
index e2bc99c785e7..c044df386876 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh
+++ b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh
@@ -46,10 +46,11 @@ rcutorture_param_stat_interval () {
#
# Adds per-version torture-module parameters to kernels supporting them.
per_version_boot_params () {
- echo $1 `rcutorture_param_onoff "$1" "$2"` \
+ echo `rcutorture_param_onoff "$1" "$2"` \
`rcutorture_param_n_barrier_cbs "$1"` \
`rcutorture_param_stat_interval "$1"` \
rcutorture.shutdown_secs=$3 \
rcutorture.test_no_idle_hz=1 \
- rcutorture.verbose=1
+ rcutorture.verbose=1 \
+ $1
}
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon
index 6a00157bee5b..b1ffd7c67604 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon
@@ -2,5 +2,7 @@ CONFIG_RCU_SCALE_TEST=y
CONFIG_PRINTK_TIME=y
CONFIG_FORCE_TASKS_RCU=y
#CHECK#CONFIG_TASKS_RCU=y
+CONFIG_FORCE_TASKS_RUDE_RCU=y
+#CHECK#CONFIG_TASKS_RUDE_RCU=y
CONFIG_FORCE_TASKS_TRACE_RCU=y
#CHECK#CONFIG_TASKS_TRACE_RCU=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01
index 227aba7783af..0059592c7408 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01
@@ -2,6 +2,8 @@ CONFIG_SMP=y
CONFIG_PREEMPT_NONE=y
CONFIG_PREEMPT_VOLUNTARY=n
CONFIG_PREEMPT=n
+CONFIG_PREEMPT_DYNAMIC=n
+#CHECK#CONFIG_TREE_RCU=y
CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh
index ffbe15109f0d..28070b43f017 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh
@@ -11,6 +11,7 @@
#
# Adds per-version torture-module parameters to kernels supporting them.
per_version_boot_params () {
- echo $1 rcuscale.shutdown=1 \
- rcuscale.verbose=0
+ echo rcuscale.shutdown=1 \
+ rcuscale.verbose=0 \
+ $1
}
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
index ef2b501a6971..67f9d2998afd 100644
--- a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
+++ b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
@@ -2,6 +2,7 @@ CONFIG_SMP=y
CONFIG_PREEMPT_NONE=y
CONFIG_PREEMPT_VOLUNTARY=n
CONFIG_PREEMPT=n
+CONFIG_PREEMPT_DYNAMIC=n
#CHECK#CONFIG_PREEMPT_RCU=n
CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=y
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh
index f81fa2c541a6..748465627601 100644
--- a/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh
+++ b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh
@@ -11,6 +11,7 @@
#
# Adds per-version torture-module parameters to kernels supporting them.
per_version_boot_params () {
- echo $1 refscale.shutdown=1 \
- refscale.verbose=0
+ echo refscale.shutdown=1 \
+ refscale.verbose=0 \
+ $1
}
diff --git a/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT
index 3a59346b3de7..6133f54ce2a7 100644
--- a/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT
+++ b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT
@@ -2,6 +2,8 @@ CONFIG_SMP=y
CONFIG_PREEMPT_NONE=y
CONFIG_PREEMPT_VOLUNTARY=n
CONFIG_PREEMPT=n
+CONFIG_PREEMPT_DYNAMIC=n
+#CHECK#CONFIG_PREEMPT_RCU=n
CONFIG_HZ_PERIODIC=n
CONFIG_NO_HZ_IDLE=n
CONFIG_NO_HZ_FULL=y
diff --git a/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh
index 2d949e58f5a5..7637f68ef0ce 100644
--- a/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh
+++ b/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh
@@ -22,8 +22,9 @@ scftorture_param_onoff () {
#
# Adds per-version torture-module parameters to kernels supporting them.
per_version_boot_params () {
- echo $1 `scftorture_param_onoff "$1" "$2"` \
+ echo `scftorture_param_onoff "$1" "$2"` \
scftorture.stat_interval=15 \
scftorture.shutdown_secs=$3 \
- scftorture.verbose=1
+ scftorture.verbose=1 \
+ $1
}
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/Makefile b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/Makefile
deleted file mode 100644
index 4bed0b678f8b..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-all: srcu.c store_buffering
-
-LINUX_SOURCE = ../../../../../..
-
-modified_srcu_input = $(LINUX_SOURCE)/include/linux/srcu.h \
- $(LINUX_SOURCE)/kernel/rcu/srcu.c
-
-modified_srcu_output = include/linux/srcu.h srcu.c
-
-include/linux/srcu.h: srcu.c
-
-srcu.c: modify_srcu.awk Makefile $(modified_srcu_input)
- awk -f modify_srcu.awk $(modified_srcu_input) $(modified_srcu_output)
-
-store_buffering:
- @cd tests/store_buffering; make
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/delay.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/delay.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/delay.h
+++ /dev/null
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/export.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/export.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/export.h
+++ /dev/null
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/mutex.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/mutex.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/mutex.h
+++ /dev/null
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/percpu.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/percpu.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/percpu.h
+++ /dev/null
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/preempt.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/preempt.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/preempt.h
+++ /dev/null
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/rcupdate.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/rcupdate.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/rcupdate.h
+++ /dev/null
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/sched.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/sched.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/sched.h
+++ /dev/null
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/smp.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/smp.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/smp.h
+++ /dev/null
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/workqueue.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/workqueue.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/workqueue.h
+++ /dev/null
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/uapi/linux/types.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/uapi/linux/types.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/uapi/linux/types.h
+++ /dev/null
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore
deleted file mode 100644
index 57d296341304..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-srcu.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/kconfig.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/kconfig.h
deleted file mode 100644
index f2860dd1b407..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/kconfig.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <LINUX_SOURCE/linux/kconfig.h>
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h
deleted file mode 100644
index 8bc960e5e713..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This header has been modifies to remove definitions of types that
- * are defined in standard userspace headers or are problematic for some
- * other reason.
- */
-
-#ifndef _LINUX_TYPES_H
-#define _LINUX_TYPES_H
-
-#define __EXPORTED_HEADERS__
-#include <uapi/linux/types.h>
-
-#ifndef __ASSEMBLY__
-
-#define DECLARE_BITMAP(name, bits) \
- unsigned long name[BITS_TO_LONGS(bits)]
-
-typedef __u32 __kernel_dev_t;
-
-/* bsd */
-typedef unsigned char u_char;
-typedef unsigned short u_short;
-typedef unsigned int u_int;
-typedef unsigned long u_long;
-
-/* sysv */
-typedef unsigned char unchar;
-typedef unsigned short ushort;
-typedef unsigned int uint;
-typedef unsigned long ulong;
-
-#ifndef __BIT_TYPES_DEFINED__
-#define __BIT_TYPES_DEFINED__
-
-typedef __u8 u_int8_t;
-typedef __s8 int8_t;
-typedef __u16 u_int16_t;
-typedef __s16 int16_t;
-typedef __u32 u_int32_t;
-typedef __s32 int32_t;
-
-#endif /* !(__BIT_TYPES_DEFINED__) */
-
-typedef __u8 uint8_t;
-typedef __u16 uint16_t;
-typedef __u32 uint32_t;
-
-/* this is a special 64bit data type that is 8-byte aligned */
-#define aligned_u64 __u64 __attribute__((aligned(8)))
-#define aligned_be64 __be64 __attribute__((aligned(8)))
-#define aligned_le64 __le64 __attribute__((aligned(8)))
-
-/**
- * The type used for indexing onto a disc or disc partition.
- *
- * Linux always considers sectors to be 512 bytes long independently
- * of the devices real block size.
- *
- * blkcnt_t is the type of the inode's block count.
- */
-typedef u64 sector_t;
-
-/*
- * The type of an index into the pagecache.
- */
-#define pgoff_t unsigned long
-
-/*
- * A dma_addr_t can hold any valid DMA address, i.e., any address returned
- * by the DMA API.
- *
- * If the DMA API only uses 32-bit addresses, dma_addr_t need only be 32
- * bits wide. Bus addresses, e.g., PCI BARs, may be wider than 32 bits,
- * but drivers do memory-mapped I/O to ioremapped kernel virtual addresses,
- * so they don't care about the size of the actual bus addresses.
- */
-#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
-typedef u64 dma_addr_t;
-#else
-typedef u32 dma_addr_t;
-#endif
-
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
-typedef u64 phys_addr_t;
-#else
-typedef u32 phys_addr_t;
-#endif
-
-typedef phys_addr_t resource_size_t;
-
-/*
- * This type is the placeholder for a hardware interrupt number. It has to be
- * big enough to enclose whatever representation is used by a given platform.
- */
-typedef unsigned long irq_hw_number_t;
-
-typedef struct {
- int counter;
-} atomic_t;
-
-#ifdef CONFIG_64BIT
-typedef struct {
- long counter;
-} atomic64_t;
-#endif
-
-struct list_head {
- struct list_head *next, *prev;
-};
-
-struct hlist_head {
- struct hlist_node *first;
-};
-
-struct hlist_node {
- struct hlist_node *next, **pprev;
-};
-
-/**
- * struct callback_head - callback structure for use with RCU and task_work
- * @next: next update requests in a list
- * @func: actual update function to call after the grace period.
- *
- * The struct is aligned to size of pointer. On most architectures it happens
- * naturally due ABI requirements, but some architectures (like CRIS) have
- * weird ABI and we need to ask it explicitly.
- *
- * The alignment is required to guarantee that bits 0 and 1 of @next will be
- * clear under normal conditions -- as long as we use call_rcu() or
- * call_srcu() to queue callback.
- *
- * This guarantee is important for few reasons:
- * - future call_rcu_lazy() will make use of lower bits in the pointer;
- * - the structure shares storage spacer in struct page with @compound_head,
- * which encode PageTail() in bit 0. The guarantee is needed to avoid
- * false-positive PageTail().
- */
-struct callback_head {
- struct callback_head *next;
- void (*func)(struct callback_head *head);
-} __attribute__((aligned(sizeof(void *))));
-#define rcu_head callback_head
-
-typedef void (*rcu_callback_t)(struct rcu_head *head);
-typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);
-
-/* clocksource cycle base type */
-typedef u64 cycle_t;
-
-#endif /* __ASSEMBLY__ */
-#endif /* _LINUX_TYPES_H */
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk
deleted file mode 100755
index e05182d3e47d..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk
+++ /dev/null
@@ -1,376 +0,0 @@
-#!/usr/bin/awk -f
-# SPDX-License-Identifier: GPL-2.0
-
-# Modify SRCU for formal verification. The first argument should be srcu.h and
-# the second should be srcu.c. Outputs modified srcu.h and srcu.c into the
-# current directory.
-
-BEGIN {
- if (ARGC != 5) {
- print "Usange: input.h input.c output.h output.c" > "/dev/stderr";
- exit 1;
- }
- h_output = ARGV[3];
- c_output = ARGV[4];
- ARGC = 3;
-
- # Tokenize using FS and not RS as FS supports regular expressions. Each
- # record is one line of source, except that backslashed lines are
- # combined. Comments are treated as field separators, as are quotes.
- quote_regexp="\"([^\\\\\"]|\\\\.)*\"";
- comment_regexp="\\/\\*([^*]|\\*+[^*/])*\\*\\/|\\/\\/.*(\n|$)";
- FS="([ \\\\\t\n\v\f;,.=(){}+*/<>&|^-]|\\[|\\]|" comment_regexp "|" quote_regexp ")+";
-
- inside_srcu_struct = 0;
- inside_srcu_init_def = 0;
- srcu_init_param_name = "";
- in_macro = 0;
- brace_nesting = 0;
- paren_nesting = 0;
-
- # Allow the manipulation of the last field separator after has been
- # seen.
- last_fs = "";
- # Whether the last field separator was intended to be output.
- last_fs_print = 0;
-
- # rcu_batches stores the initialization for each instance of struct
- # rcu_batch
-
- in_comment = 0;
-
- outputfile = "";
-}
-
-{
- prev_outputfile = outputfile;
- if (FILENAME ~ /\.h$/) {
- outputfile = h_output;
- if (FNR != NR) {
- print "Incorrect file order" > "/dev/stderr";
- exit 1;
- }
- }
- else
- outputfile = c_output;
-
- if (prev_outputfile && outputfile != prev_outputfile) {
- new_outputfile = outputfile;
- outputfile = prev_outputfile;
- update_fieldsep("", 0);
- outputfile = new_outputfile;
- }
-}
-
-# Combine the next line into $0.
-function combine_line() {
- ret = getline next_line;
- if (ret == 0) {
- # Don't allow two consecutive getlines at the end of the file
- if (eof_found) {
- print "Error: expected more input." > "/dev/stderr";
- exit 1;
- } else {
- eof_found = 1;
- }
- } else if (ret == -1) {
- print "Error reading next line of file" FILENAME > "/dev/stderr";
- exit 1;
- }
- $0 = $0 "\n" next_line;
-}
-
-# Combine backslashed lines and multiline comments.
-function combine_backslashes() {
- while (/\\$|\/\*([^*]|\*+[^*\/])*\**$/) {
- combine_line();
- }
-}
-
-function read_line() {
- combine_line();
- combine_backslashes();
-}
-
-# Print out field separators and update variables that depend on them. Only
-# print if p is true. Call with sep="" and p=0 to print out the last field
-# separator.
-function update_fieldsep(sep, p) {
- # Count braces
- sep_tmp = sep;
- gsub(quote_regexp "|" comment_regexp, "", sep_tmp);
- while (1)
- {
- if (sub("[^{}()]*\\{", "", sep_tmp)) {
- brace_nesting++;
- continue;
- }
- if (sub("[^{}()]*\\}", "", sep_tmp)) {
- brace_nesting--;
- if (brace_nesting < 0) {
- print "Unbalanced braces!" > "/dev/stderr";
- exit 1;
- }
- continue;
- }
- if (sub("[^{}()]*\\(", "", sep_tmp)) {
- paren_nesting++;
- continue;
- }
- if (sub("[^{}()]*\\)", "", sep_tmp)) {
- paren_nesting--;
- if (paren_nesting < 0) {
- print "Unbalanced parenthesis!" > "/dev/stderr";
- exit 1;
- }
- continue;
- }
-
- break;
- }
-
- if (last_fs_print)
- printf("%s", last_fs) > outputfile;
- last_fs = sep;
- last_fs_print = p;
-}
-
-# Shifts the fields down by n positions. Calls next if there are no more. If p
-# is true then print out field separators.
-function shift_fields(n, p) {
- do {
- if (match($0, FS) > 0) {
- update_fieldsep(substr($0, RSTART, RLENGTH), p);
- if (RSTART + RLENGTH <= length())
- $0 = substr($0, RSTART + RLENGTH);
- else
- $0 = "";
- } else {
- update_fieldsep("", 0);
- print "" > outputfile;
- next;
- }
- } while (--n > 0);
-}
-
-# Shifts and prints the first n fields.
-function print_fields(n) {
- do {
- update_fieldsep("", 0);
- printf("%s", $1) > outputfile;
- shift_fields(1, 1);
- } while (--n > 0);
-}
-
-{
- combine_backslashes();
-}
-
-# Print leading FS
-{
- if (match($0, "^(" FS ")+") > 0) {
- update_fieldsep(substr($0, RSTART, RLENGTH), 1);
- if (RSTART + RLENGTH <= length())
- $0 = substr($0, RSTART + RLENGTH);
- else
- $0 = "";
- }
-}
-
-# Parse the line.
-{
- while (NF > 0) {
- if ($1 == "struct" && NF < 3) {
- read_line();
- continue;
- }
-
- if (FILENAME ~ /\.h$/ && !inside_srcu_struct &&
- brace_nesting == 0 && paren_nesting == 0 &&
- $1 == "struct" && $2 == "srcu_struct" &&
- $0 ~ "^struct(" FS ")+srcu_struct(" FS ")+\\{") {
- inside_srcu_struct = 1;
- print_fields(2);
- continue;
- }
- if (inside_srcu_struct && brace_nesting == 0 &&
- paren_nesting == 0) {
- inside_srcu_struct = 0;
- update_fieldsep("", 0);
- for (name in rcu_batches)
- print "extern struct rcu_batch " name ";" > outputfile;
- }
-
- if (inside_srcu_struct && $1 == "struct" && $2 == "rcu_batch") {
- # Move rcu_batches outside of the struct.
- rcu_batches[$3] = "";
- shift_fields(3, 1);
- sub(/;[[:space:]]*$/, "", last_fs);
- continue;
- }
-
- if (FILENAME ~ /\.h$/ && !inside_srcu_init_def &&
- $1 == "#define" && $2 == "__SRCU_STRUCT_INIT") {
- inside_srcu_init_def = 1;
- srcu_init_param_name = $3;
- in_macro = 1;
- print_fields(3);
- continue;
- }
- if (inside_srcu_init_def && brace_nesting == 0 &&
- paren_nesting == 0) {
- inside_srcu_init_def = 0;
- in_macro = 0;
- continue;
- }
-
- if (inside_srcu_init_def && brace_nesting == 1 &&
- paren_nesting == 0 && last_fs ~ /\.[[:space:]]*$/ &&
- $1 ~ /^[[:alnum:]_]+$/) {
- name = $1;
- if (name in rcu_batches) {
- # Remove the dot.
- sub(/\.[[:space:]]*$/, "", last_fs);
-
- old_record = $0;
- do
- shift_fields(1, 0);
- while (last_fs !~ /,/ || paren_nesting > 0);
- end_loc = length(old_record) - length($0);
- end_loc += index(last_fs, ",") - length(last_fs);
-
- last_fs = substr(last_fs, index(last_fs, ",") + 1);
- last_fs_print = 1;
-
- match(old_record, "^"name"("FS")+=");
- start_loc = RSTART + RLENGTH;
-
- len = end_loc - start_loc;
- initializer = substr(old_record, start_loc, len);
- gsub(srcu_init_param_name "\\.", "", initializer);
- rcu_batches[name] = initializer;
- continue;
- }
- }
-
- # Don't include a nonexistent file
- if (!in_macro && $1 == "#include" && /^#include[[:space:]]+"rcu\.h"/) {
- update_fieldsep("", 0);
- next;
- }
-
- # Ignore most preprocessor stuff.
- if (!in_macro && $1 ~ /#/) {
- break;
- }
-
- if (brace_nesting > 0 && $1 ~ "^[[:alnum:]_]+$" && NF < 2) {
- read_line();
- continue;
- }
- if (brace_nesting > 0 &&
- $0 ~ "^[[:alnum:]_]+[[:space:]]*(\\.|->)[[:space:]]*[[:alnum:]_]+" &&
- $2 in rcu_batches) {
- # Make uses of rcu_batches global. Somewhat unreliable.
- shift_fields(1, 0);
- print_fields(1);
- continue;
- }
-
- if ($1 == "static" && NF < 3) {
- read_line();
- continue;
- }
- if ($1 == "static" && ($2 == "bool" && $3 == "try_check_zero" ||
- $2 == "void" && $3 == "srcu_flip")) {
- shift_fields(1, 1);
- print_fields(2);
- continue;
- }
-
- # Distinguish between read-side and write-side memory barriers.
- if ($1 == "smp_mb" && NF < 2) {
- read_line();
- continue;
- }
- if (match($0, /^smp_mb[[:space:]();\/*]*[[:alnum:]]/)) {
- barrier_letter = substr($0, RLENGTH, 1);
- if (barrier_letter ~ /A|D/)
- new_barrier_name = "sync_smp_mb";
- else if (barrier_letter ~ /B|C/)
- new_barrier_name = "rs_smp_mb";
- else {
- print "Unrecognized memory barrier." > "/dev/null";
- exit 1;
- }
-
- shift_fields(1, 1);
- printf("%s", new_barrier_name) > outputfile;
- continue;
- }
-
- # Skip definition of rcu_synchronize, since it is already
- # defined in misc.h. Only present in old versions of srcu.
- if (brace_nesting == 0 && paren_nesting == 0 &&
- $1 == "struct" && $2 == "rcu_synchronize" &&
- $0 ~ "^struct(" FS ")+rcu_synchronize(" FS ")+\\{") {
- shift_fields(2, 0);
- while (brace_nesting) {
- if (NF < 2)
- read_line();
- shift_fields(1, 0);
- }
- }
-
- # Skip definition of wakeme_after_rcu for the same reason
- if (brace_nesting == 0 && $1 == "static" && $2 == "void" &&
- $3 == "wakeme_after_rcu") {
- while (NF < 5)
- read_line();
- shift_fields(3, 0);
- do {
- while (NF < 3)
- read_line();
- shift_fields(1, 0);
- } while (paren_nesting || brace_nesting);
- }
-
- if ($1 ~ /^(unsigned|long)$/ && NF < 3) {
- read_line();
- continue;
- }
-
- # Give srcu_batches_completed the correct type for old SRCU.
- if (brace_nesting == 0 && $1 == "long" &&
- $2 == "srcu_batches_completed") {
- update_fieldsep("", 0);
- printf("unsigned ") > outputfile;
- print_fields(2);
- continue;
- }
- if (brace_nesting == 0 && $1 == "unsigned" && $2 == "long" &&
- $3 == "srcu_batches_completed") {
- print_fields(3);
- continue;
- }
-
- # Just print out the input code by default.
- print_fields(1);
- }
- update_fieldsep("", 0);
- print > outputfile;
- next;
-}
-
-END {
- update_fieldsep("", 0);
-
- if (brace_nesting != 0) {
- print "Unbalanced braces!" > "/dev/stderr";
- exit 1;
- }
-
- # Define the rcu_batches
- for (name in rcu_batches)
- print "struct rcu_batch " name " = " rcu_batches[name] ";" > c_output;
-}
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/assume.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/assume.h
deleted file mode 100644
index 570a49d9da7e..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/assume.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ASSUME_H
-#define ASSUME_H
-
-/* Provide an assumption macro that can be disabled for gcc. */
-#ifdef RUN
-#define assume(x) \
- do { \
- /* Evaluate x to suppress warnings. */ \
- (void) (x); \
- } while (0)
-
-#else
-#define assume(x) __CPROVER_assume(x)
-#endif
-
-#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h
deleted file mode 100644
index 3f95a768a03b..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef BARRIERS_H
-#define BARRIERS_H
-
-#define barrier() __asm__ __volatile__("" : : : "memory")
-
-#ifdef RUN
-#define smp_mb() __sync_synchronize()
-#define smp_mb__after_unlock_lock() __sync_synchronize()
-#else
-/*
- * Copied from CBMC's implementation of __sync_synchronize(), which
- * seems to be disabled by default.
- */
-#define smp_mb() __CPROVER_fence("WWfence", "RRfence", "RWfence", "WRfence", \
- "WWcumul", "RRcumul", "RWcumul", "WRcumul")
-#define smp_mb__after_unlock_lock() __CPROVER_fence("WWfence", "RRfence", "RWfence", "WRfence", \
- "WWcumul", "RRcumul", "RWcumul", "WRcumul")
-#endif
-
-/*
- * Allow memory barriers to be disabled in either the read or write side
- * of SRCU individually.
- */
-
-#ifndef NO_SYNC_SMP_MB
-#define sync_smp_mb() smp_mb()
-#else
-#define sync_smp_mb() do {} while (0)
-#endif
-
-#ifndef NO_READ_SIDE_SMP_MB
-#define rs_smp_mb() smp_mb()
-#else
-#define rs_smp_mb() do {} while (0)
-#endif
-
-#define READ_ONCE(x) (*(volatile typeof(x) *) &(x))
-#define WRITE_ONCE(x) ((*(volatile typeof(x) *) &(x)) = (val))
-
-#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/bug_on.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/bug_on.h
deleted file mode 100644
index 5e7912c6a521..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/bug_on.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef BUG_ON_H
-#define BUG_ON_H
-
-#include <assert.h>
-
-#define BUG() assert(0)
-#define BUG_ON(x) assert(!(x))
-
-/* Does it make sense to treat warnings as errors? */
-#define WARN() BUG()
-#define WARN_ON(x) (BUG_ON(x), false)
-
-#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/combined_source.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/combined_source.c
deleted file mode 100644
index e67ee5b3dd7c..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/combined_source.c
+++ /dev/null
@@ -1,14 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <config.h>
-
-/* Include all source files. */
-
-#include "include_srcu.c"
-
-#include "preempt.c"
-#include "misc.c"
-
-/* Used by test.c files */
-#include <pthread.h>
-#include <stdlib.h>
-#include <linux/srcu.h>
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h
deleted file mode 100644
index 283d7103334f..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* "Cheater" definitions based on restricted Kconfig choices. */
-
-#undef CONFIG_TINY_RCU
-#undef __CHECKER__
-#undef CONFIG_DEBUG_LOCK_ALLOC
-#undef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-#undef CONFIG_HOTPLUG_CPU
-#undef CONFIG_MODULES
-#undef CONFIG_NO_HZ_FULL_SYSIDLE
-#undef CONFIG_PREEMPT_COUNT
-#undef CONFIG_PREEMPT_RCU
-#undef CONFIG_PROVE_RCU
-#undef CONFIG_RCU_NOCB_CPU
-#undef CONFIG_RCU_NOCB_CPU_ALL
-#undef CONFIG_RCU_STALL_COMMON
-#undef CONFIG_RCU_TRACE
-#undef CONFIG_RCU_USER_QS
-#undef CONFIG_TASKS_RCU
-#define CONFIG_TREE_RCU
-
-#define CONFIG_GENERIC_ATOMIC64
-
-#if NR_CPUS > 1
-#define CONFIG_SMP
-#else
-#undef CONFIG_SMP
-#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/include_srcu.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/include_srcu.c
deleted file mode 100644
index e5202d4cff30..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/include_srcu.c
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <config.h>
-
-#include <assert.h>
-#include <errno.h>
-#include <inttypes.h>
-#include <pthread.h>
-#include <stddef.h>
-#include <string.h>
-#include <sys/types.h>
-
-#include "int_typedefs.h"
-
-#include "barriers.h"
-#include "bug_on.h"
-#include "locks.h"
-#include "misc.h"
-#include "preempt.h"
-#include "percpu.h"
-#include "workqueues.h"
-
-#ifdef USE_SIMPLE_SYNC_SRCU
-#define synchronize_srcu(sp) synchronize_srcu_original(sp)
-#endif
-
-#include <srcu.c>
-
-#ifdef USE_SIMPLE_SYNC_SRCU
-#undef synchronize_srcu
-
-#include "simple_sync_srcu.c"
-#endif
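
Editor's note: the #define/#undef pair around the srcu.c include is the usual single-translation-unit interposition trick: a function-like macro renames the kernel's definition (and every internal call to it) at preprocessing time, freeing the original name for a simpler replacement. A self-contained sketch of the pattern, with hypothetical names:

/* Rename the 'real' implementation before its definition is seen. */
#define do_work(x) do_work_original(x)

/* Stands in for '#include "real_impl.c"'; the macro above turns this
 * definition into do_work_original(). */
static int do_work(int x)
{
	return x * 2;
}

#undef do_work

/* The simplified stand-in now owns the original name. */
static int do_work(int x)
{
	return do_work_original(x);
}

int main(void)
{
	return do_work(21) == 42 ? 0 : 1;
}
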
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/int_typedefs.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/int_typedefs.h
deleted file mode 100644
index 0dd27aa517a7..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/int_typedefs.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef INT_TYPEDEFS_H
-#define INT_TYPEDEFS_H
-
-#include <inttypes.h>
-
-typedef int8_t s8;
-typedef uint8_t u8;
-typedef int16_t s16;
-typedef uint16_t u16;
-typedef int32_t s32;
-typedef uint32_t u32;
-typedef int64_t s64;
-typedef uint64_t u64;
-
-typedef int8_t __s8;
-typedef uint8_t __u8;
-typedef int16_t __s16;
-typedef uint16_t __u16;
-typedef int32_t __s32;
-typedef uint32_t __u32;
-typedef int64_t __s64;
-typedef uint64_t __u64;
-
-#define S8_C(x) INT8_C(x)
-#define U8_C(x) UINT8_C(x)
-#define S16_C(x) INT16_C(x)
-#define U16_C(x) UINT16_C(x)
-#define S32_C(x) INT32_C(x)
-#define U32_C(x) UINT32_C(x)
-#define S64_C(x) INT64_C(x)
-#define U64_C(x) UINT64_C(x)
-
-#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h
deleted file mode 100644
index 1e24827f96f1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef LOCKS_H
-#define LOCKS_H
-
-#include <limits.h>
-#include <pthread.h>
-#include <stdbool.h>
-
-#include "assume.h"
-#include "bug_on.h"
-#include "preempt.h"
-
-int nondet_int(void);
-
-#define __acquire(x)
-#define __acquires(x)
-#define __release(x)
-#define __releases(x)
-
-/* Only use one lock mechanism. Select which one. */
-#ifdef PTHREAD_LOCK
-struct lock_impl {
- pthread_mutex_t mutex;
-};
-
-static inline void lock_impl_lock(struct lock_impl *lock)
-{
- BUG_ON(pthread_mutex_lock(&lock->mutex));
-}
-
-static inline void lock_impl_unlock(struct lock_impl *lock)
-{
- BUG_ON(pthread_mutex_unlock(&lock->mutex));
-}
-
-static inline bool lock_impl_trylock(struct lock_impl *lock)
-{
- int err = pthread_mutex_trylock(&lock->mutex);
-
- if (!err)
- return true;
- else if (err == EBUSY)
- return false;
- BUG();
-}
-
-static inline void lock_impl_init(struct lock_impl *lock)
-{
- pthread_mutex_init(&lock->mutex, NULL);
-}
-
-#define LOCK_IMPL_INITIALIZER {.mutex = PTHREAD_MUTEX_INITIALIZER}
-
-#else /* !defined(PTHREAD_LOCK) */
-/* Spinlock that assumes that it always gets the lock immediately. */
-
-struct lock_impl {
- bool locked;
-};
-
-static inline bool lock_impl_trylock(struct lock_impl *lock)
-{
-#ifdef RUN
- /* TODO: Should this be a test and set? */
- return __sync_bool_compare_and_swap(&lock->locked, false, true);
-#else
- __CPROVER_atomic_begin();
- bool old_locked = lock->locked;
- lock->locked = true;
- __CPROVER_atomic_end();
-
- /* Minimal barrier to prevent accesses leaking out of lock. */
- __CPROVER_fence("RRfence", "RWfence");
-
- return !old_locked;
-#endif
-}
-
-static inline void lock_impl_lock(struct lock_impl *lock)
-{
- /*
- * CBMC doesn't support busy waiting, so just assume that the
- * lock is available.
- */
- assume(lock_impl_trylock(lock));
-
- /*
- * If the lock was already held by this thread then the assumption
- * is unsatisfiable (deadlock).
- */
-}
-
-static inline void lock_impl_unlock(struct lock_impl *lock)
-{
-#ifdef RUN
- BUG_ON(!__sync_bool_compare_and_swap(&lock->locked, true, false));
-#else
- /* Minimal barrier to prevent accesses leaking out of lock. */
- __CPROVER_fence("RWfence", "WWfence");
-
- __CPROVER_atomic_begin();
- bool old_locked = lock->locked;
- lock->locked = false;
- __CPROVER_atomic_end();
-
- BUG_ON(!old_locked);
-#endif
-}
-
-static inline void lock_impl_init(struct lock_impl *lock)
-{
- lock->locked = false;
-}
-
-#define LOCK_IMPL_INITIALIZER {.locked = false}
-
-#endif /* !defined(PTHREAD_LOCK) */
-
-/*
- * Implement spinlocks using the lock mechanism. Wrap the lock to prevent mixing
- * locks of different types.
- */
-typedef struct {
- struct lock_impl internal_lock;
-} spinlock_t;
-
-#define SPIN_LOCK_UNLOCKED {.internal_lock = LOCK_IMPL_INITIALIZER}
-#define __SPIN_LOCK_UNLOCKED(x) SPIN_LOCK_UNLOCKED
-#define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED
-
-static inline void spin_lock_init(spinlock_t *lock)
-{
- lock_impl_init(&lock->internal_lock);
-}
-
-static inline void spin_lock(spinlock_t *lock)
-{
- /*
- * Spin locks also need to be removed in order to eliminate all
- * memory barriers. They are only used by the write side anyway.
- */
-#ifndef NO_SYNC_SMP_MB
- preempt_disable();
- lock_impl_lock(&lock->internal_lock);
-#endif
-}
-
-static inline void spin_unlock(spinlock_t *lock)
-{
-#ifndef NO_SYNC_SMP_MB
- lock_impl_unlock(&lock->internal_lock);
- preempt_enable();
-#endif
-}
-
-/* Don't bother with interrupts */
-#define spin_lock_irq(lock) spin_lock(lock)
-#define spin_unlock_irq(lock) spin_unlock(lock)
-#define spin_lock_irqsave(lock, flags) spin_lock(lock)
-#define spin_unlock_irqrestore(lock, flags) spin_unlock(lock)
-
-/*
- * The kernel's spin_trylock() returns int, but a bool works just as
- * well here.
- */
-static inline bool spin_trylock(spinlock_t *lock)
-{
-#ifndef NO_SYNC_SMP_MB
- preempt_disable();
- return lock_impl_trylock(&lock->internal_lock);
-#else
- return true;
-#endif
-}
-
-struct completion {
- /* Hopefully this won't overflow. */
- unsigned int count;
-};
-
-#define COMPLETION_INITIALIZER(x) {.count = 0}
-#define DECLARE_COMPLETION(x) struct completion x = COMPLETION_INITIALIZER(x)
-#define DECLARE_COMPLETION_ONSTACK(x) DECLARE_COMPLETION(x)
-
-static inline void init_completion(struct completion *c)
-{
- c->count = 0;
-}
-
-static inline void wait_for_completion(struct completion *c)
-{
- unsigned int prev_count = __sync_fetch_and_sub(&c->count, 1);
-
- assume(prev_count);
-}
-
-static inline void complete(struct completion *c)
-{
- unsigned int prev_count = __sync_fetch_and_add(&c->count, 1);
-
- BUG_ON(prev_count == UINT_MAX);
-}
-
-/* This function probably isn't very useful for CBMC. */
-static inline bool try_wait_for_completion(struct completion *c)
-{
- BUG();
-}
-
-static inline bool completion_done(struct completion *c)
-{
- return c->count;
-}
-
-/* TODO: Implement complete_all */
-static inline void complete_all(struct completion *c)
-{
- BUG();
-}
-
-#endif
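
Editor's note: the non-pthread lock above leans on assume(): since CBMC enumerates states rather than spinning, assuming the trylock succeeded simply prunes every execution in which the lock was contended, and a self-deadlock becomes unsatisfiable rather than hanging. A runnable approximation with assume() collapsed to assert() (the __CPROVER_* intrinsics exist only under CBMC):

#include <assert.h>
#include <stdbool.h>

#define assume(x) assert(x)	/* __CPROVER_assume(x) under CBMC */

struct lock_impl { bool locked; };

static bool lock_impl_trylock(struct lock_impl *lock)
{
	/* Atomic test-and-set stands in for the __CPROVER_atomic block. */
	return !__sync_lock_test_and_set(&lock->locked, true);
}

static void lock_impl_lock(struct lock_impl *lock)
{
	/* No busy waiting: drop all executions where the lock is held. */
	assume(lock_impl_trylock(lock));
}

static void lock_impl_unlock(struct lock_impl *lock)
{
	__sync_lock_release(&lock->locked);
}

int main(void)
{
	struct lock_impl l = { .locked = false };

	lock_impl_lock(&l);
	lock_impl_unlock(&l);
	return 0;
}
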
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.c
deleted file mode 100644
index 9440cc39e3c6..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.c
+++ /dev/null
@@ -1,12 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <config.h>
-
-#include "misc.h"
-#include "bug_on.h"
-
-struct rcu_head;
-
-void wakeme_after_rcu(struct rcu_head *head)
-{
- BUG();
-}
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.h
deleted file mode 100644
index aca50030f954..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef MISC_H
-#define MISC_H
-
-#include "assume.h"
-#include "int_typedefs.h"
-#include "locks.h"
-
-#include <linux/types.h>
-
-/* Probably won't need to deal with bottom halves. */
-static inline void local_bh_disable(void) {}
-static inline void local_bh_enable(void) {}
-
-#define MODULE_ALIAS(X)
-#define module_param(...)
-#define EXPORT_SYMBOL_GPL(x)
-
-#define container_of(ptr, type, member) ({ \
- const typeof(((type *)0)->member) *__mptr = (ptr); \
- (type *)((char *)__mptr - offsetof(type, member)); \
-})
-
-#ifndef USE_SIMPLE_SYNC_SRCU
-/* Abuse udelay to make sure that busy loops terminate. */
-#define udelay(x) assume(0)
-
-#else
-
-/* The simple custom synchronize_srcu is ok with try_check_zero failing. */
-#define udelay(x) do { } while (0)
-#endif
-
-#define trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
- do { } while (0)
-
-#define notrace
-
-/* Avoid including rcupdate.h */
-struct rcu_synchronize {
- struct rcu_head head;
- struct completion completion;
-};
-
-void wakeme_after_rcu(struct rcu_head *head);
-
-#define rcu_lock_acquire(a) do { } while (0)
-#define rcu_lock_release(a) do { } while (0)
-#define rcu_lockdep_assert(c, s) do { } while (0)
-#define RCU_LOCKDEP_WARN(c, s) do { } while (0)
-
-/* Let CBMC non-deterministically choose switch between normal and expedited. */
-bool rcu_gp_is_normal(void);
-bool rcu_gp_is_expedited(void);
-
-/* Do the same for old versions of rcu. */
-#define rcu_expedited (rcu_gp_is_expedited())
-
-#endif
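
Editor's note: the container_of() definition above is the standard kernel idiom: given a pointer to a member, subtract that member's offset within the enclosing type to recover a pointer to the whole structure. A self-contained usage sketch:

#include <assert.h>
#include <stddef.h>

#define container_of(ptr, type, member) ({			\
	const typeof(((type *)0)->member) *__mptr = (ptr);	\
	(type *)((char *)__mptr - offsetof(type, member));	\
})

struct item {
	int key;
	int value;
};

int main(void)
{
	struct item it = { .key = 1, .value = 2 };
	int *vp = &it.value;

	/* Walk back from the member pointer to its enclosing struct. */
	assert(container_of(vp, struct item, value) == &it);
	return 0;
}
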
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/percpu.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/percpu.h
deleted file mode 100644
index 27e67a3f291f..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/percpu.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef PERCPU_H
-#define PERCPU_H
-
-#include <stddef.h>
-#include "bug_on.h"
-#include "preempt.h"
-
-#define __percpu
-
-/* Maximum size of any percpu data. */
-#define PERCPU_OFFSET (4 * sizeof(long))
-
-/* Ignore alignment, as CBMC doesn't care about false sharing. */
-#define alloc_percpu(type) __alloc_percpu(sizeof(type), 1)
-
-static inline void *__alloc_percpu(size_t size, size_t align)
-{
- BUG();
- return NULL;
-}
-
-static inline void free_percpu(void *ptr)
-{
- BUG();
-}
-
-#define per_cpu_ptr(ptr, cpu) \
- ((typeof(ptr)) ((char *) (ptr) + PERCPU_OFFSET * cpu))
-
-#define __this_cpu_inc(pcp) __this_cpu_add(pcp, 1)
-#define __this_cpu_dec(pcp) __this_cpu_sub(pcp, 1)
-#define __this_cpu_sub(pcp, n) __this_cpu_add(pcp, -(typeof(pcp)) (n))
-
-#define this_cpu_inc(pcp) this_cpu_add(pcp, 1)
-#define this_cpu_dec(pcp) this_cpu_sub(pcp, 1)
-#define this_cpu_sub(pcp, n) this_cpu_add(pcp, -(typeof(pcp)) (n))
-
-/* Make CBMC use atomics to work around a CBMC bug. */
-#ifdef RUN
-#define THIS_CPU_ADD_HELPER(ptr, x) (*(ptr) += (x))
-#else
-/*
- * Split the atomic into a read and a write so that it has the least
- * possible ordering.
- */
-#define THIS_CPU_ADD_HELPER(ptr, x) \
- do { \
- typeof(ptr) this_cpu_add_helper_ptr = (ptr); \
- typeof(ptr) this_cpu_add_helper_x = (x); \
- typeof(*ptr) this_cpu_add_helper_temp; \
- __CPROVER_atomic_begin(); \
- this_cpu_add_helper_temp = *(this_cpu_add_helper_ptr); \
- __CPROVER_atomic_end(); \
- this_cpu_add_helper_temp += this_cpu_add_helper_x; \
- __CPROVER_atomic_begin(); \
- *(this_cpu_add_helper_ptr) = this_cpu_add_helper_temp; \
- __CPROVER_atomic_end(); \
- } while (0)
-#endif
-
-/*
- * For some reason CBMC needs an atomic operation even though this is percpu
- * data.
- */
-#define __this_cpu_add(pcp, n) \
- do { \
- BUG_ON(preemptible()); \
- THIS_CPU_ADD_HELPER(per_cpu_ptr(&(pcp), thread_cpu_id), \
- (typeof(pcp)) (n)); \
- } while (0)
-
-#define this_cpu_add(pcp, n) \
- do { \
- int this_cpu_add_impl_cpu = get_cpu(); \
- THIS_CPU_ADD_HELPER(per_cpu_ptr(&(pcp), this_cpu_add_impl_cpu), \
- (typeof(pcp)) (n)); \
- put_cpu(); \
- } while (0)
-
-/*
- * The cast from char[][] to type * causes a compiler warning, and the
- * negative array size causes a compile-time error if type is larger
- * than PERCPU_OFFSET.
- */
-#define DEFINE_PER_CPU(type, name) \
- char name[NR_CPUS][PERCPU_OFFSET]; \
- typedef char percpu_too_big_##name \
- [sizeof(type) > PERCPU_OFFSET ? -1 : 1]
-
-#define for_each_possible_cpu(cpu) \
- for ((cpu) = 0; (cpu) < NR_CPUS; ++(cpu))
-
-#endif
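
Editor's note: per_cpu_ptr() above lays the per-CPU copies out at a fixed PERCPU_OFFSET stride, so selecting a CPU's copy is plain pointer arithmetic; alignment and false sharing are deliberately ignored. A minimal sketch of the same layout (NR_CPUS fixed here for illustration):

#include <assert.h>

#define NR_CPUS		4
#define PERCPU_OFFSET	(4 * sizeof(long))

/* One PERCPU_OFFSET-sized slot per CPU. */
static char counter[NR_CPUS][PERCPU_OFFSET];

static int *per_cpu_counter(int cpu)
{
	/* Each CPU's copy lives PERCPU_OFFSET bytes past the previous. */
	return (int *)((char *)counter + PERCPU_OFFSET * cpu);
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		*per_cpu_counter(cpu) = cpu;
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		assert(*per_cpu_counter(cpu) == cpu);
	return 0;
}
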
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.c
deleted file mode 100644
index b4083ae348fb..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.c
+++ /dev/null
@@ -1,79 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <config.h>
-
-#include "preempt.h"
-
-#include "assume.h"
-#include "locks.h"
-
-/* Support NR_CPUS of at most 64 */
-#define CPU_PREEMPTION_LOCKS_INIT0 LOCK_IMPL_INITIALIZER
-#define CPU_PREEMPTION_LOCKS_INIT1 \
- CPU_PREEMPTION_LOCKS_INIT0, CPU_PREEMPTION_LOCKS_INIT0
-#define CPU_PREEMPTION_LOCKS_INIT2 \
- CPU_PREEMPTION_LOCKS_INIT1, CPU_PREEMPTION_LOCKS_INIT1
-#define CPU_PREEMPTION_LOCKS_INIT3 \
- CPU_PREEMPTION_LOCKS_INIT2, CPU_PREEMPTION_LOCKS_INIT2
-#define CPU_PREEMPTION_LOCKS_INIT4 \
- CPU_PREEMPTION_LOCKS_INIT3, CPU_PREEMPTION_LOCKS_INIT3
-#define CPU_PREEMPTION_LOCKS_INIT5 \
- CPU_PREEMPTION_LOCKS_INIT4, CPU_PREEMPTION_LOCKS_INIT4
-
-/*
- * Simulate disabling preemption by locking a particular cpu. NR_CPUS
- * should be the actual number of cpus, not just the maximum.
- */
-struct lock_impl cpu_preemption_locks[NR_CPUS] = {
- CPU_PREEMPTION_LOCKS_INIT0
-#if (NR_CPUS - 1) & 1
- , CPU_PREEMPTION_LOCKS_INIT0
-#endif
-#if (NR_CPUS - 1) & 2
- , CPU_PREEMPTION_LOCKS_INIT1
-#endif
-#if (NR_CPUS - 1) & 4
- , CPU_PREEMPTION_LOCKS_INIT2
-#endif
-#if (NR_CPUS - 1) & 8
- , CPU_PREEMPTION_LOCKS_INIT3
-#endif
-#if (NR_CPUS - 1) & 16
- , CPU_PREEMPTION_LOCKS_INIT4
-#endif
-#if (NR_CPUS - 1) & 32
- , CPU_PREEMPTION_LOCKS_INIT5
-#endif
-};
-
-#undef CPU_PREEMPTION_LOCKS_INIT0
-#undef CPU_PREEMPTION_LOCKS_INIT1
-#undef CPU_PREEMPTION_LOCKS_INIT2
-#undef CPU_PREEMPTION_LOCKS_INIT3
-#undef CPU_PREEMPTION_LOCKS_INIT4
-#undef CPU_PREEMPTION_LOCKS_INIT5
-
-__thread int thread_cpu_id;
-__thread int preempt_disable_count;
-
-void preempt_disable(void)
-{
- BUG_ON(preempt_disable_count < 0 || preempt_disable_count == INT_MAX);
-
- if (preempt_disable_count++)
- return;
-
- thread_cpu_id = nondet_int();
- assume(thread_cpu_id >= 0);
- assume(thread_cpu_id < NR_CPUS);
- lock_impl_lock(&cpu_preemption_locks[thread_cpu_id]);
-}
-
-void preempt_enable(void)
-{
- BUG_ON(preempt_disable_count < 1);
-
- if (--preempt_disable_count)
- return;
-
- lock_impl_unlock(&cpu_preemption_locks[thread_cpu_id]);
-}
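
Editor's note: preempt_disable() above models non-preemptibility by taking a per-CPU lock for a nondeterministically chosen CPU, so at most one thread runs non-preemptibly per CPU while nesting is tracked with a plain counter. A native-build approximation (nondet_int() replaced by a fixed choice, the per-CPU lock elided):

#include <assert.h>
#include <limits.h>

static _Thread_local int thread_cpu_id;
static _Thread_local int preempt_disable_count;

static void preempt_disable(void)
{
	assert(preempt_disable_count >= 0 &&
	       preempt_disable_count < INT_MAX);
	if (preempt_disable_count++)
		return;		/* nested: just deepen the count */

	/* CBMC would pick any CPU in [0, NR_CPUS) and lock it. */
	thread_cpu_id = 0;
}

static void preempt_enable(void)
{
	assert(preempt_disable_count >= 1);
	if (--preempt_disable_count)
		return;
	/* ...and here release cpu_preemption_locks[thread_cpu_id]. */
}

int main(void)
{
	preempt_disable();
	preempt_disable();	/* recursive disabling is supported */
	preempt_enable();
	preempt_enable();
	assert(preempt_disable_count == 0);
	return 0;
}
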
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.h
deleted file mode 100644
index f8b762cd214c..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef PREEMPT_H
-#define PREEMPT_H
-
-#include <stdbool.h>
-
-#include "bug_on.h"
-
-/* This flag contains garbage if preempt_disable_count is 0. */
-extern __thread int thread_cpu_id;
-
-/* Support recursive preemption disabling. */
-extern __thread int preempt_disable_count;
-
-void preempt_disable(void);
-void preempt_enable(void);
-
-static inline void preempt_disable_notrace(void)
-{
- preempt_disable();
-}
-
-static inline void preempt_enable_no_resched(void)
-{
- preempt_enable();
-}
-
-static inline void preempt_enable_notrace(void)
-{
- preempt_enable();
-}
-
-static inline int preempt_count(void)
-{
- return preempt_disable_count;
-}
-
-static inline bool preemptible(void)
-{
- return !preempt_count();
-}
-
-static inline int get_cpu(void)
-{
- preempt_disable();
- return thread_cpu_id;
-}
-
-static inline void put_cpu(void)
-{
- preempt_enable();
-}
-
-static inline void might_sleep(void)
-{
- BUG_ON(preempt_disable_count);
-}
-
-#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/simple_sync_srcu.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/simple_sync_srcu.c
deleted file mode 100644
index 97f592048e0b..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/simple_sync_srcu.c
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <config.h>
-
-#include <assert.h>
-#include <errno.h>
-#include <inttypes.h>
-#include <pthread.h>
-#include <stddef.h>
-#include <string.h>
-#include <sys/types.h>
-
-#include "int_typedefs.h"
-
-#include "barriers.h"
-#include "bug_on.h"
-#include "locks.h"
-#include "misc.h"
-#include "preempt.h"
-#include "percpu.h"
-#include "workqueues.h"
-
-#include <linux/srcu.h>
-
-/* Functions needed from modify_srcu.c */
-bool try_check_zero(struct srcu_struct *sp, int idx, int trycount);
-void srcu_flip(struct srcu_struct *sp);
-
-/* Simpler implementation of synchronize_srcu that ignores batching. */
-void synchronize_srcu(struct srcu_struct *sp)
-{
- int idx;
- /*
- * This code assumes that try_check_zero will succeed anyway,
- * so there is no point in multiple tries.
- */
- const int trycount = 1;
-
- might_sleep();
-
-	/* Ignore the lock; multiple concurrent writers aren't exercised yet anyway. */
-
- idx = 1 ^ (sp->completed & 1);
-
- /* For comments see srcu_advance_batches. */
-
- assume(try_check_zero(sp, idx, trycount));
-
- srcu_flip(sp);
-
- assume(try_check_zero(sp, idx^1, trycount));
-}
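
Editor's note: the simplified synchronize_srcu() above is the two-phase SRCU grace period in miniature: wait for readers on the inactive index to drain, flip which index new readers use, then wait for the stragglers on the old index. A toy sequential model of that flip-and-drain scheme (counters and names simplified; not the kernel's algorithm):

#include <assert.h>

/* Two reader counters, selected by the low bit of 'completed'. */
static int reader_count[2];
static unsigned int completed;

static int read_lock_model(void)
{
	int idx = completed & 1;

	reader_count[idx]++;
	return idx;
}

static void read_unlock_model(int idx)
{
	reader_count[idx]--;
}

static void synchronize_model(void)
{
	int idx = 1 ^ (completed & 1);

	assert(reader_count[idx] == 0);		/* phase 1: drain inactive side */
	completed++;				/* flip: new readers switch sides */
	assert(reader_count[idx ^ 1] == 0);	/* phase 2: drain the old side */
}

int main(void)
{
	int idx = read_lock_model();

	read_unlock_model(idx);
	synchronize_model();	/* no readers left: both phases pass */
	return 0;
}
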
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/workqueues.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/workqueues.h
deleted file mode 100644
index 28b960300971..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/workqueues.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef WORKQUEUES_H
-#define WORKQUEUES_H
-
-#include <stdbool.h>
-
-#include "barriers.h"
-#include "bug_on.h"
-#include "int_typedefs.h"
-
-#include <linux/types.h>
-
-/* Stub workqueue implementation. */
-
-struct work_struct;
-typedef void (*work_func_t)(struct work_struct *work);
-void delayed_work_timer_fn(unsigned long __data);
-
-struct work_struct {
-/* atomic_long_t data; */
- unsigned long data;
-
- struct list_head entry;
- work_func_t func;
-#ifdef CONFIG_LOCKDEP
- struct lockdep_map lockdep_map;
-#endif
-};
-
-struct timer_list {
- struct hlist_node entry;
- unsigned long expires;
- void (*function)(unsigned long);
- unsigned long data;
- u32 flags;
- int slack;
-};
-
-struct delayed_work {
- struct work_struct work;
- struct timer_list timer;
-
- /* target workqueue and CPU ->timer uses to queue ->work */
- struct workqueue_struct *wq;
- int cpu;
-};
-
-
-static inline bool schedule_work(struct work_struct *work)
-{
- BUG();
- return true;
-}
-
-static inline bool schedule_work_on(int cpu, struct work_struct *work)
-{
- BUG();
- return true;
-}
-
-static inline bool queue_work(struct workqueue_struct *wq,
- struct work_struct *work)
-{
- BUG();
- return true;
-}
-
-static inline bool queue_delayed_work(struct workqueue_struct *wq,
- struct delayed_work *dwork,
- unsigned long delay)
-{
- BUG();
- return true;
-}
-
-#define INIT_WORK(w, f) \
- do { \
- (w)->data = 0; \
- (w)->func = (f); \
- } while (0)
-
-#define INIT_DELAYED_WORK(w, f) INIT_WORK(&(w)->work, (f))
-
-#define __WORK_INITIALIZER(n, f) { \
- .data = 0, \
- .entry = { &(n).entry, &(n).entry }, \
- .func = f \
- }
-
-/* Don't bother initializing timer. */
-#define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \
- .work = __WORK_INITIALIZER((n).work, (f)), \
- }
-
-#define DECLARE_WORK(n, f) \
-	struct work_struct n = __WORK_INITIALIZER(n, f)
-
-#define DECLARE_DELAYED_WORK(n, f) \
- struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0)
-
-#define system_power_efficient_wq ((struct workqueue_struct *) NULL)
-
-#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore
deleted file mode 100644
index d65462d64816..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-*.out
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/Makefile b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/Makefile
deleted file mode 100644
index ad21b925fbb4..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-CBMC_FLAGS = -I../.. -I../../src -I../../include -I../../empty_includes -32 -pointer-check -mm pso
-
-all:
- for i in ./*.pass; do \
- echo $$i ; \
- CBMC_FLAGS="$(CBMC_FLAGS)" sh ../test_script.sh --should-pass $$i > $$i.out 2>&1 ; \
- done
- for i in ./*.fail; do \
- echo $$i ; \
- CBMC_FLAGS="$(CBMC_FLAGS)" sh ../test_script.sh --should-fail $$i > $$i.out 2>&1 ; \
- done
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/assert_end.fail b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/assert_end.fail
deleted file mode 100644
index 40c8075919d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/assert_end.fail
+++ /dev/null
@@ -1 +0,0 @@
-test_cbmc_options="-DASSERT_END"
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force.fail b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force.fail
deleted file mode 100644
index ada5baf0b60d..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force.fail
+++ /dev/null
@@ -1 +0,0 @@
-test_cbmc_options="-DFORCE_FAILURE"
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force2.fail b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force2.fail
deleted file mode 100644
index 8fe00c8db466..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force2.fail
+++ /dev/null
@@ -1 +0,0 @@
-test_cbmc_options="-DFORCE_FAILURE_2"
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force3.fail b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force3.fail
deleted file mode 100644
index 612ed6772844..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force3.fail
+++ /dev/null
@@ -1 +0,0 @@
-test_cbmc_options="-DFORCE_FAILURE_3"
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/main.pass b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/main.pass
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/main.pass
+++ /dev/null
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/test.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/test.c
deleted file mode 100644
index 2ce2016f7871..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/test.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <src/combined_source.c>
-
-int x;
-int y;
-
-int __unbuffered_tpr_x;
-int __unbuffered_tpr_y;
-
-DEFINE_SRCU(ss);
-
-void rcu_reader(void)
-{
- int idx;
-
-#ifndef FORCE_FAILURE_3
- idx = srcu_read_lock(&ss);
-#endif
- might_sleep();
-
- __unbuffered_tpr_y = READ_ONCE(y);
-#ifdef FORCE_FAILURE
- srcu_read_unlock(&ss, idx);
- idx = srcu_read_lock(&ss);
-#endif
- WRITE_ONCE(x, 1);
-
-#ifndef FORCE_FAILURE_3
- srcu_read_unlock(&ss, idx);
-#endif
- might_sleep();
-}
-
-void *thread_update(void *arg)
-{
- WRITE_ONCE(y, 1);
-#ifndef FORCE_FAILURE_2
- synchronize_srcu(&ss);
-#endif
- might_sleep();
- __unbuffered_tpr_x = READ_ONCE(x);
-
- return NULL;
-}
-
-void *thread_process_reader(void *arg)
-{
- rcu_reader();
-
- return NULL;
-}
-
-int main(int argc, char *argv[])
-{
- pthread_t tu;
- pthread_t tpr;
-
- if (pthread_create(&tu, NULL, thread_update, NULL))
- abort();
- if (pthread_create(&tpr, NULL, thread_process_reader, NULL))
- abort();
- if (pthread_join(tu, NULL))
- abort();
- if (pthread_join(tpr, NULL))
- abort();
- assert(__unbuffered_tpr_y != 0 || __unbuffered_tpr_x != 0);
-
-#ifdef ASSERT_END
- assert(0);
-#endif
-
- return 0;
-}
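
Editor's note: the directory name refers to the store-buffering (SB) litmus pattern, in which each thread writes one variable and reads the other, and the weak-memory outcome is both reads returning 0. The test above embeds a variant of that pattern in SRCU, so the final assert checks that the grace period forbids the both-zero outcome. For contrast, a bare pthread version of classic SB, where nothing forbids it (under PSO modeling or on weakly ordered hardware, rx == ry == 0 is reachable):

#include <pthread.h>
#include <stdio.h>

static volatile int x, y;
static int rx, ry;

static void *thread_a(void *arg)
{
	x = 1;
	ry = y;		/* may observe 0 despite thread_b's store */
	return NULL;
}

static void *thread_b(void *arg)
{
	y = 1;
	rx = x;
	return NULL;
}

int main(void)
{
	pthread_t ta, tb;

	pthread_create(&ta, NULL, thread_a, NULL);
	pthread_create(&tb, NULL, thread_b, NULL);
	pthread_join(ta, NULL);
	pthread_join(tb, NULL);

	/* rx == 0 && ry == 0 is the outcome SRCU rules out above. */
	printf("rx=%d ry=%d\n", rx, ry);
	return 0;
}
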
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/test_script.sh b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/test_script.sh
deleted file mode 100755
index 2fe1f0339b4f..000000000000
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/test_script.sh
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-# This script expects a mode (either --should-pass or --should-fail) followed by
-# an input file. The script uses the following environment variables. The test C
-# source file is expected to be named test.c in the directory containing the
-# input file.
-#
-# CBMC: The command to run CBMC. Default: cbmc
-# CBMC_FLAGS: Additional flags to pass to CBMC
-# NR_CPUS: Number of cpus to run tests with. Default specified by the test
-# SYNC_SRCU_MODE: Choose implementation of synchronize_srcu. Defaults to simple.
-# kernel: Version included in the linux kernel source.
-# simple: Use try_check_zero directly.
-#
-# The input file is a script that is sourced by this file. It can define any of
-# the following variables to configure the test.
-#
-# test_cbmc_options: Extra options to pass to CBMC.
-# min_cpus_fail: Minimum number of CPUs (NR_CPUS) for verification to fail.
-# The test is expected to pass if it is run with fewer. (Only
-# useful for .fail files)
-# default_cpus: Quantity of CPUs to use for the test, if not specified on the
-# command line. Default: the larger of 2 and min_cpus_fail.
-
-set -e
-
-if test "$#" -ne 2; then
- echo "Expected one option followed by an input file" 1>&2
- exit 99
-fi
-
-if test "x$1" = "x--should-pass"; then
- should_pass="yes"
-elif test "x$1" = "x--should-fail"; then
- should_pass="no"
-else
- echo "Unrecognized argument '$1'" 1>&2
-
- # Exit code 99 indicates a hard error.
- exit 99
-fi
-
-CBMC=${CBMC:-cbmc}
-
-SYNC_SRCU_MODE=${SYNC_SRCU_MODE:-simple}
-
-case ${SYNC_SRCU_MODE} in
-kernel) sync_srcu_mode_flags="" ;;
-simple) sync_srcu_mode_flags="-DUSE_SIMPLE_SYNC_SRCU" ;;
-
-*)
- echo "Unrecognized argument '${SYNC_SRCU_MODE}'" 1>&2
- exit 99
- ;;
-esac
-
-min_cpus_fail=1
-
-c_file=`dirname "$2"`/test.c
-
-# Source the input file.
-. $2
-
-if test ${min_cpus_fail} -gt 2; then
- default_default_cpus=${min_cpus_fail}
-else
- default_default_cpus=2
-fi
-default_cpus=${default_cpus:-${default_default_cpus}}
-cpus=${NR_CPUS:-${default_cpus}}
-
-# Check if there are too few cpus to make the test fail.
-if test $cpus -lt ${min_cpus_fail:-0}; then
- should_pass="yes"
-fi
-
-cbmc_opts="-DNR_CPUS=${cpus} ${sync_srcu_mode_flags} ${test_cbmc_options} ${CBMC_FLAGS}"
-
-echo "Running CBMC: ${CBMC} ${cbmc_opts} ${c_file}"
-if ${CBMC} ${cbmc_opts} "${c_file}"; then
- # Verification successful. Make sure that it was supposed to verify.
- test "x${should_pass}" = xyes
-else
- cbmc_exit_status=$?
-
- # An exit status of 10 indicates a failed verification.
- # (see cbmc_parse_optionst::do_bmc in the CBMC source code)
- if test ${cbmc_exit_status} -eq 10 && test "x${should_pass}" = xno; then
- :
- else
- echo "CBMC returned ${cbmc_exit_status} exit status" 1>&2
-
- # Parse errors have exit status 6. Any other type of error
- # should be considered a hard error.
- if test ${cbmc_exit_status} -ne 6 && \
- test ${cbmc_exit_status} -ne 10; then
- exit 99
- else
- exit 1
- fi
- fi
-fi
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 43ec36b179dc..38f651469968 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -2184,6 +2184,9 @@ FIXTURE_TEARDOWN(TRACE_syscall)
TEST(negative_ENOSYS)
{
+#if defined(__arm__)
+ SKIP(return, "arm32 does not support calling syscall -1");
+#endif
/*
* There should be no difference between an "internal" skip
* and userspace asking for syscall "-1".
@@ -3072,7 +3075,8 @@ TEST(syscall_restart)
timeout.tv_sec = 1;
errno = 0;
EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
- TH_LOG("Call to nanosleep() failed (errno %d)", errno);
+ TH_LOG("Call to nanosleep() failed (errno %d: %s)",
+ errno, strerror(errno));
}
/* Read final sync from parent. */
@@ -3908,6 +3912,9 @@ TEST(user_notification_filter_empty)
TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
}
+ if (__NR_clone3 < 0)
+ SKIP(return, "Test not built with clone3 support");
+
pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
@@ -3962,6 +3969,9 @@ TEST(user_notification_filter_empty_threaded)
TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
}
+ if (__NR_clone3 < 0)
+ SKIP(return, "Test not built with clone3 support");
+
pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
@@ -4255,6 +4265,61 @@ TEST(user_notification_addfd_rlimit)
close(memfd);
}
+#ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP
+#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
+#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64)
+#endif
+
+TEST(user_notification_sync)
+{
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+ int status, listener;
+ pid_t pid;
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ listener = user_notif_syscall(__NR_getppid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ ASSERT_GE(listener, 0);
+
+ /* Try to set invalid flags. */
+ EXPECT_SYSCALL_RETURN(-EINVAL,
+ ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS, 0xffffffff, 0));
+
+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
+ SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0) {
+ ret = syscall(__NR_getppid);
+ ASSERT_EQ(ret, USER_NOTIF_MAGIC) {
+ _exit(1);
+ }
+ _exit(0);
+ }
+
+ req.pid = 0;
+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+ ASSERT_EQ(req.data.nr, __NR_getppid);
+
+ resp.id = req.id;
+ resp.error = 0;
+ resp.val = USER_NOTIF_MAGIC;
+ resp.flags = 0;
+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+ ASSERT_EQ(waitpid(pid, &status, 0), pid);
+ ASSERT_EQ(status, 0);
+}
+
+
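Editor's note: SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP asks the kernel to use a synchronous wakeup when handing a notification to the supervisor, which hints the scheduler to keep the notifying task and the supervisor on the same CPU and so cuts round-trip latency. A minimal supervisor helper using the same RECV/SEND ioctls as the test (error handling elided; a sketch, not the test's code):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/seccomp.h>

/* Serve one notification on 'listener', making the target's
 * intercepted syscall return 'retval'. */
static int serve_one(int listener, int64_t retval)
{
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};

	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req) < 0)
		return -1;

	resp.id = req.id;	/* must echo the request id */
	resp.val = retval;

	return ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
}
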
/* Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN. */
FIXTURE(O_SUSPEND_SECCOMP) {
pid_t pid;